# Libraries setup

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from unicodedata import normalize


import datetime
from datetime import date
%matplotlib inline

import re
import unicodedata
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK stop-word corpus so that `stopwords.words(...)` can be used below.
nltk.download('stopwords')
# NOTE: despite the `_es` suffix, this list holds the *English* stop words.
# `preprocess_text` reads this module-level name directly.
stopwords_es = stopwords.words("english")

# Apply the "ggplot" matplotlib style to every figure drawn in this notebook.
plt.style.use("ggplot")

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sebastian/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Data

In [11]:
# Load the Upworthy archive export into a DataFrame.
RAW_PACKAGES_CSV = '../datasets/upworthy-archive-exploratory-packages-03.12.2020.csv'
df = pd.read_csv(RAW_PACKAGES_CSV)

In [12]:
print(df.dtypes)
# The first CSV column ("Unnamed: 0") is just the saved row index.
# Drop it by name instead of by position so that re-running this cell is
# harmless: a positional `iloc[:, 1:]` would remove one more real column
# on every execution.
df = df.drop(columns="Unnamed: 0", errors="ignore")
df.head()

Unnamed: 0                int64
created_at               object
updated_at               object
clickability_test_id     object
excerpt                  object
headline                 object
lede                     object
slug                     object
eyecatcher_id            object
impressions               int64
clicks                    int64
significance            float64
first_place                bool
winner                     bool
share_text               object
square                   object
test_week                 int64
dtype: object


Unnamed: 0,created_at,updated_at,clickability_test_id,excerpt,headline,lede,slug,eyecatcher_id,impressions,clicks,significance,first_place,winner,share_text,square,test_week
0,2014-11-20 06:43:16.005,2016-04-02 16:33:38.062,546d88fb84ad38b2ce000024,Things that matter. Pass 'em on.,They're Being Called 'Walmart's Worst Nightmar...,"<p>When I saw *why* people are calling them ""W...",theyre-being-called-walmarts-worst-nightmare-a...,546d6fa19ad54eec8d00002d,3052,150,100.0,True,True,Anyone who's ever felt guilty about shopping a...,,201446
1,2014-11-20 06:43:44.646,2016-04-02 16:25:54.021,546d88fb84ad38b2ce000024,Things that matter. Pass 'em on.,They're Being Called 'Walmart's Worst Nightmar...,"<p>When I saw *why* people are calling them ""W...",theyre-being-called-walmarts-worst-nightmare-a...,546d6fa19ad54eec8d00002d,3033,122,14.0,False,False,Walmart is getting schooled by another retaile...,,201446
2,2014-11-20 06:44:59.804,2016-04-02 16:25:54.024,546d88fb84ad38b2ce000024,Things that matter. Pass 'em on.,They're Being Called 'Walmart's Worst Nightmar...,"<p>When I saw *why* people are calling them ""W...",theyre-being-called-walmarts-worst-nightmare-a...,546d6fa19ad54eec8d00002d,3092,110,1.8,False,False,Walmart may not be crapping their pants over t...,,201446
3,2014-11-20 06:54:36.335,2016-04-02 16:25:54.027,546d902c26714c6c44000039,Things that matter. Pass 'em on.,This Is What Sexism Against Men Sounds Like,<p>DISCLOSURE: I'm a dude. I have cried on mul...,this-is-what-sexism-against-men-sounds-like-am...,546bc55335992b86c8000043,3526,90,4.1,False,False,"If you ever wondered, ""but what about the men?...",,201446
4,2014-11-20 06:54:57.878,2016-04-02 16:31:45.671,546d902c26714c6c44000039,Things that matter. Pass 'em on.,This Is What Sexism Against Men Sounds Like,<p>DISCLOSURE: I'm a dude. I have cried on mul...,this-is-what-sexism-against-men-sounds-like-am...,546d900426714cd2dd00002e,3506,120,100.0,True,False,"If you ever wondered, ""but what about the men?...",,201446


In [13]:
# Eyeball one arbitrary row: its headline and its excerpt.
idx = 4353
for column in ("headline", "excerpt"):
    print(df[column].iloc[idx])

First You See Them Standing There Half-Naked. Then You Hear What They Say And It's Glorious.
Things that matter. Pass 'em on.


In [14]:
# Let pandas pick the nullable extension dtypes first (string, boolean, ...).
df = df.convert_dtypes()

# An explicit unit is required: recent pandas versions reject the unit-less
# `np.datetime64` as an astype target and ask for e.g. "datetime64[ns]".
date_format_columns = {column: "datetime64[ns]" for column in ["created_at", "updated_at"]}
# NOTE(review): `significance` is read from the CSV as float64 (values such as
# 1.8 and 4.1 appear in the preview), so casting it to int64 discards the
# fractional part. Kept as-is to preserve the exported data; confirm it is intended.
integer_columns = {column: np.int64 for column in ["clicks", "impressions", "significance", "test_week"]}

df = df.astype({**date_format_columns, **integer_columns})
df.dtypes

created_at              datetime64[ns]
updated_at              datetime64[ns]
clickability_test_id            string
excerpt                         string
headline                        string
lede                            string
slug                            string
eyecatcher_id                   string
impressions                      int64
clicks                           int64
significance                     int64
first_place                    boolean
winner                         boolean
share_text                      string
square                          string
test_week                        int64
dtype: object

In [15]:
# Most recent creation timestamp present in the data.
df["created_at"].max()

Timestamp('2015-04-29 22:46:43.618000')

## Text preprocessing

In [16]:
from unicodedata import normalize

def preprocess_text(texts,
                    links=False,
                    hashtags_and_mentions=False,
                    stopwords=False,
                    tildes=True,
                    min_letters=0,
                    punctuation=True,
                    lower=True,
                    numbers=False):
    """
    Clean an iterable of documents and return them as a NumPy array of strings.

    Apart from `lower` and `min_letters`, every flag means "keep this kind of
    token": passing False removes it from the text.

    Parameters
    ----------
    texts : iterable of str
        Documents to clean.
    links : bool
        If False, remove http/https URLs.
    hashtags_and_mentions : bool
        If False, remove `#hashtags`, `@mentions` and `pic.twitter.com/...` links.
    stopwords : bool
        If False, drop every whitespace-separated word found in the
        module-level `stopwords_es` list. (Inside this function the parameter
        shadows the imported `stopwords` module.)
    tildes : bool
        If False, strip the combining acute accent and diaeresis marks.
    min_letters : int
        If greater than 0, keep only words strictly longer than this value.
    punctuation : bool
        If False, replace runs of characters that are not word characters,
        apostrophes or whitespace with a single space.
    lower : bool
        If True, lower-case every document first.
    numbers : bool
        If False, delete the digits 0-9.

    Returns
    -------
    numpy.ndarray
        One cleaned string per input document, with runs of whitespace
        collapsed to a single space.
    """
    docs = list(texts)

    if lower:
        docs = [doc.lower() for doc in docs]

    if not links:
        # Match only the URL itself. The previous greedy pattern
        # (`http...:\/\/.*\s+`) swallowed everything between the link and the
        # last whitespace of the document, deleting the text after a URL.
        links_regex = re.compile(r"https?://\S+")
        docs = [links_regex.sub("", doc) for doc in docs]

    if not hashtags_and_mentions:
        # Dots are escaped so that only the literal host name is matched.
        hashtags_and_mentions_regex = re.compile(
            r"pic\.twitter\.com/(\w*)|#(\w+)|@(\w*)")
        docs = [hashtags_and_mentions_regex.sub("", doc) for doc in docs]

    if not stopwords:
        # A set makes each membership test O(1) instead of scanning the list.
        stop_set = set(stopwords_es)
        docs = [
            " ".join(word for word in doc.split() if word not in stop_set)
            for doc in docs
        ]

    if min_letters > 0:
        docs = [
            " ".join(word for word in doc.split() if len(word) > min_letters)
            for doc in docs
        ]

    if not tildes:
        # Decompose, drop U+0301 (acute) and U+0308 (diaeresis), recompose.
        trans_tab = dict.fromkeys(map(ord, u'\u0301\u0308'), None)
        docs = [
            normalize('NFKC', normalize('NFKD', doc).translate(trans_tab))
            for doc in docs
        ]

    if not punctuation:
        punctuation_regex = re.compile(r"[^\w\d'\s]+")
        docs = [punctuation_regex.sub(" ", doc) for doc in docs]

    if not numbers:
        digits_regex = re.compile(r"[0-9]")
        docs = [digits_regex.sub("", doc) for doc in docs]

    # The removals above can leave gaps; squeeze them to single spaces.
    spaces_regex = re.compile(r"\s{2,}")
    docs = [spaces_regex.sub(" ", doc) for doc in docs]

    # dtype=str keeps an empty input as an (empty) string array.
    return np.array(docs, dtype=str)

In [17]:
# Lower-case every headline with the vectorised string accessor instead of a
# row-by-row `apply`. The cast to `object` keeps the dtype the column had before.
df["headline_lowercase"] = df["headline"].str.lower().astype(object)
df.headline_lowercase

0        they're being called 'walmart's worst nightmar...
1        they're being called 'walmart's worst nightmar...
2        they're being called 'walmart's worst nightmar...
3              this is what sexism against men sounds like
4              this is what sexism against men sounds like
                               ...                        
22661    5 reasons you may need to plan a vacation - ri...
22662    the next time you encounter a small minded big...
22663    i've never wanted to buy a plane ticket more t...
22664    3 ladies having too much fun at the epicenter ...
22665    they're being called 'walmart's worst nightmar...
Name: headline_lowercase, Length: 22666, dtype: object

In [19]:
# Number of rows that are exact duplicates of an earlier row.
is_duplicate_row = df.duplicated()
is_duplicate_row.sum()

0

# Comparing two headlines

## Cleaning dataset

In [20]:
# Work on a copy from here on so the typed frame `df` stays untouched.
comp_df = df.copy()

# Count the distinct test ids (a missing id, if any, counts as one value).
num_packages = df["clickability_test_id"].nunique(dropna=False)
print(f"There are {num_packages} number of packages")

There are 4873 number of packages


In [21]:
# Count the distinct headline strings across all packages.
num_headlines = comp_df["headline"].nunique(dropna=False)
print(f"There are {num_headlines} number of headlines")

There are 12387 number of headlines


In [23]:
# Aggregate per package. The columns to sum are listed explicitly: a bare
# `.sum()` also tries to reduce the datetime and string columns, which recent
# pandas versions no longer drop silently.
# NOTE(review): the summed `test_week` is not a meaningful quantity; it is
# kept only so the resulting columns stay the same as before.
summed_columns = ["impressions", "clicks", "significance", "first_place", "winner", "test_week"]
packages_df = (
    comp_df.groupby("clickability_test_id")[summed_columns]
    .sum()
    .reset_index(drop=False)
)
packages_df

Unnamed: 0,clickability_test_id,impressions,clicks,significance,first_place,winner,test_week
0,51436061220cb800020001e7,10380,219,172,1,1,805232
1,51436069220cb800020005ae,9797,111,101,1,1,1006555
2,51436069220cb800020005bd,10573,192,123,1,1,402610
3,5143606a220cb800020005c6,11079,152,101,1,1,805228
4,5143606b220cb800020005d7,11062,178,145,1,1,1006545
...,...,...,...,...,...,...,...
4868,554036c1393131000c330100,12236,52,215,1,0,1209102
4869,55403be0393131002cf60000,8178,29,110,1,0,806068
4870,55413ef4333531000c180000,4105,50,102,1,1,403034
4871,554141c4383063001c110000,10140,24,0,1,0,1007585


In [24]:
# Count headlines per package with a single groupby instead of re-filtering
# the whole frame once per package id (which also shadowed the builtin `id`).
headlines_by_package = comp_df.groupby("clickability_test_id")["headline"]
unique_headline_counts = headlines_by_package.nunique(dropna=False)  # distinct headlines
total_headline_counts = headlines_by_package.size()                  # all rows

# `map` looks each package id up in the per-package counts.
packages_df["num_unique_headlines"] = packages_df["clickability_test_id"].map(unique_headline_counts)
packages_df["num_total_headlines"] = packages_df["clickability_test_id"].map(total_headline_counts)
packages_df

Unnamed: 0,clickability_test_id,impressions,clicks,significance,first_place,winner,test_week,num_unique_headlines,num_total_headlines
0,51436061220cb800020001e7,10380,219,172,1,1,805232,4,4
1,51436069220cb800020005ae,9797,111,101,1,1,1006555,5,5
2,51436069220cb800020005bd,10573,192,123,1,1,402610,2,2
3,5143606a220cb800020005c6,11079,152,101,1,1,805228,4,4
4,5143606b220cb800020005d7,11062,178,145,1,1,1006545,5,5
...,...,...,...,...,...,...,...,...,...
4868,554036c1393131000c330100,12236,52,215,1,0,1209102,1,6
4869,55403be0393131002cf60000,8178,29,110,1,0,806068,4,4
4870,55413ef4333531000c180000,4105,50,102,1,1,403034,2,2
4871,554141c4383063001c110000,10140,24,0,1,0,1007585,1,5


#### How many packages have repeated headlines?

In [25]:
# Packages in which at least one headline appears more than once.
has_repeated_headline = packages_df["num_unique_headlines"] < packages_df["num_total_headlines"]
packages_df[has_repeated_headline]

Unnamed: 0,clickability_test_id,impressions,clicks,significance,first_place,winner,test_week,num_unique_headlines,num_total_headlines
5,5143606d220cb80002000643,16385,130,147,1,1,805616,1,4
14,51436074220cb800020007b2,10557,140,0,0,0,805236,3,4
18,51436079220cb800020008b8,10553,306,142,1,1,805216,3,4
19,5143607b220cb80002000918,17937,473,102,1,0,1409415,1,7
31,51436086220cb800020009a1,12211,323,129,1,1,805240,2,4
...,...,...,...,...,...,...,...,...,...
4865,553fdeee316638000cad0000,15215,44,255,1,0,1007585,1,5
4867,55400040396536001c4e0200,15213,42,436,1,0,1007585,1,5
4868,554036c1393131000c330100,12236,52,215,1,0,1209102,1,6
4871,554141c4383063001c110000,10140,24,0,1,0,1007585,1,5


In [26]:
# Share of packages that contain a repeated headline, computed from the data
# so it cannot go stale. The cell previously hard-coded `2650 / 4873`,
# presumably the row count of the filter above over the number of packages.
has_repeated_headline = packages_df.num_unique_headlines < packages_df.num_total_headlines
int(has_repeated_headline.sum()) / len(packages_df)

0.5438128462959163

#### How many packages have more than a single headline?

In [27]:
# Packages that tested at least two different headlines.
has_several_headlines = packages_df.num_unique_headlines > 1
packages_df[has_several_headlines]

Unnamed: 0,clickability_test_id,impressions,clicks,significance,first_place,winner,test_week,num_unique_headlines,num_total_headlines
0,51436061220cb800020001e7,10380,219,172,1,1,805232,4,4
1,51436069220cb800020005ae,9797,111,101,1,1,1006555,5,5
2,51436069220cb800020005bd,10573,192,123,1,1,402610,2,2
3,5143606a220cb800020005c6,11079,152,101,1,1,805228,4,4
4,5143606b220cb800020005d7,11062,178,145,1,1,1006545,5,5
...,...,...,...,...,...,...,...,...,...
4863,553fd64a356133000c450100,18252,73,280,1,0,1209102,6,6
4864,553fdd3c316638000c800000,18293,40,228,1,1,1209102,5,6
4866,553fdfd2316638000cbf0000,21296,153,263,1,1,1410619,7,7
4869,55403be0393131002cf60000,8178,29,110,1,0,806068,4,4


In [28]:
# Ids of the packages that tested at least two different headlines.
multi_headline_packages = packages_df[packages_df.num_unique_headlines > 1]
packages_ids = multi_headline_packages["clickability_test_id"].values
len(packages_ids)

2607

### 1. Removing duplicate headlines

In [29]:
# Build one frame per selected package, keeping the first row of each
# headline. The dedupe and grouping are done once for the whole frame instead
# of re-filtering `df` for every package id (which also shadowed builtin `id`).
deduplicated_rows = (
    df[df.clickability_test_id.isin(packages_ids)]
    .drop_duplicates(["clickability_test_id", "headline"])
)
frames_by_package = {
    package_id: frame
    for package_id, frame in deduplicated_rows.groupby("clickability_test_id", sort=False)
}
# Same order as `packages_ids`; rows keep their original order within a package.
sub_train_dfs = [frames_by_package[package_id] for package_id in packages_ids]

In [30]:
# Stack the per-package frames into one frame with a fresh 0..n-1 index.
filtered_df = pd.concat(sub_train_dfs).reset_index(drop=True)
filtered_df

Unnamed: 0,created_at,updated_at,clickability_test_id,excerpt,headline,lede,slug,eyecatcher_id,impressions,clicks,significance,first_place,winner,share_text,square,test_week,headline_lowercase
0,2013-02-24 07:10:53.000,2016-04-02 16:24:07.081,51436061220cb800020001e7,Religion and science can indeed be friends.,Creationism Has Nothing To Do With Christianit...,"<p>In a debate about creationism, one guy got ...",creationism-has-nothing-to-do-with-christianit...,5332ba141fae79f09f002c4f,2551,39,0,False,False,,A_Priest_Ridicules_Creationist_-_YouTube.jpg,201308,creationism has nothing to do with christianit...
1,2013-02-24 07:08:08.000,2016-04-02 16:26:25.064,51436061220cb800020001e7,"Good show, father.",The One Where A Creationist Picks A Fight And ...,"<p>In a debate about creationism, one guy got ...",the-one-where-a-creationist-picks-a-fight-and-...,5332ba141fae79f09f002c4f,2629,68,100,True,True,,A_Priest_Ridicules_Creationist_-_YouTube.jpg,201308,the one where a creationist picks a fight and ...
2,2013-02-24 07:07:13.000,2016-04-02 16:26:28.801,51436061220cb800020001e7,Why is he wearing that costume?,Creationism Shouldn't Be Taught In Science Cla...,"<p>In a debate about creationism, one guy got ...",creationism-shouldnt-be-taught-in-science-clas...,5332ba141fae79f09f002c4f,2539,49,11,False,False,,A_Priest_Ridicules_Creationist_-_YouTube.jpg,201308,creationism shouldn't be taught in science cla...
3,2013-02-24 07:09:18.000,2016-04-02 16:26:28.804,51436061220cb800020001e7,"Well played, God. Well played.",God Finds Out About Creationism And Sends A Re...,"<p>In a debate about creationism, one guy got ...",god-finds-out-about-creationism-and-sends-a-re...,5332ba141fae79f09f002c4f,2661,63,61,False,False,,A_Priest_Ridicules_Creationist_-_YouTube.jpg,201308,god finds out about creationism and sends a re...
4,2013-03-17 07:38:21.000,2016-04-02 16:24:11.525,51436069220cb800020005ae,"Honesty is so refreshing. Also, rude.",The One Where Your Advertisements Talk To You ...,Learn what advertisements say to each other be...,the-one-where-your-advertisements-talk-to-you-...,5332b5961fae79f09f000636,1950,15,0,False,False,,Screenshot_3_17_13_1_32_AM_MDT.jpg,201311,the one where your advertisements talk to you ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11115,2015-04-29 02:03:57.892,2016-04-02 16:32:29.144,55403be0393131002cf60000,,"A plastic bottle finds its way home, along wit...",<p>There's something timeless about this story...,a-plastic-bottle-finds-its-way-home-along-with...,553a6c16646537000c210100,2094,5,3,False,False,,,201517,"a plastic bottle finds its way home, along wit..."
11116,2015-04-29 02:18:05.646,2016-04-02 16:32:29.121,55403be0393131002cf60000,,"A plastic bottle finds its way home, but its o...",<p>There's something timeless about this story...,a-plastic-bottle-finds-its-way-home-but-its-or...,553a6c16646537000c210100,2040,14,100,True,False,,,201517,"a plastic bottle finds its way home, but its o..."
11117,2015-04-29 02:18:16.117,2016-04-02 16:32:29.124,55403be0393131002cf60000,,The epic journey of a plastic bottle shows us ...,<p>There's something timeless about this story...,the-epic-journey-of-a-plastic-bottle-shows-us-...,553a6c16646537000c210100,2061,6,6,False,False,,,201517,the epic journey of a plastic bottle shows us ...
11118,2015-04-29 20:28:36.176,2016-04-02 16:32:29.583,55413ef4333531000c180000,It was hiding in the trees.,They spent 4 days in the swamp looking for a c...,<p>They proved lots of people wrong.</p>,they-spent-4-days-in-the-swamp-looking-for-a-c...,55412a79643265000ce10200,2091,32,100,True,True,These two guys proved lots of people wrong.,,201517,they spent 4 days in the swamp looking for a c...


In [32]:
# From here on `packages_ids` holds one entry per row of `filtered_df`,
# so a package id is repeated once for each of its headlines.
packages_ids = np.array(filtered_df["clickability_test_id"].values)
print(f"packages_ids: {packages_ids}\nlength: {len(packages_ids)}")

packages_ids: ['51436061220cb800020001e7' '51436061220cb800020001e7'
 '51436061220cb800020001e7' ... '55403be0393131002cf60000'
 '55413ef4333531000c180000' '55413ef4333531000c180000']
length: 11120


## Train, validation and test partitioning

In [33]:
# Nominal split sizes (80% / 10% / 10% of the rows).
train_samples, val_samples, test_samples = (
    int(fraction * len(packages_ids)) for fraction in (0.8, 0.1, 0.1)
)


def _next_package_boundary(ids, cut):
    """Move `cut` forward until `ids[cut]` starts a new package (or the end is reached)."""
    while 0 < cut < len(ids) and ids[cut] == ids[cut - 1]:
        cut += 1
    return cut


# `packages_ids` has one entry per row and the rows of a package are adjacent
# (the frame was built by concatenating per-package frames). Cutting at an
# arbitrary row could put the same package in two splits, so each cut is
# pushed to the next package boundary.
train_end = _next_package_boundary(packages_ids, train_samples)
val_end = _next_package_boundary(packages_ids, max(train_end, train_samples + val_samples))

train_package_ids = packages_ids[:train_end]
val_package_ids = packages_ids[train_end:val_end]
# Take the remainder: a `[-test_samples:]` slice would return the whole
# array when `test_samples` is 0 and can skip rows lost to rounding.
test_package_ids = packages_ids[val_end:]

print(train_package_ids.shape)
print(val_package_ids.shape)
print(test_package_ids.shape)

(8896,)
(1112,)
(1112,)


In [35]:
# Lower-cased headlines of every row whose package id is in the train split.
# NOTE(review): rows are taken from `df`, not from `filtered_df`, so repeated
# headlines within a package are included here - confirm this is intended.
in_train_split = df["clickability_test_id"].isin(train_package_ids)
train_headlines = df.loc[in_train_split, "headline_lowercase"]
train_texts = np.array(train_headlines.tolist())
train_texts

array(['why democracy loves social media',
       'it’s the time of year people buy new cars, here’s the chart you need to see before you buy',
       'this might seem overblown, but think deeply about it', ...,
       'why this canadian restaurant wants you to order your food without speaking',
       "you'll walk into this restaurant for food, and walk out with a specific new skill",
       'what the waitstaff at one restaurant is teaching its customers - about ordering'],
      dtype='<U112')

## Exporting datasets

In [39]:
# Directory (relative to the notebook) where the pickled dataset dictionaries are written.
EXPORT_DIR = "../datasets"

In [44]:
import pickle

# Bundle the feature-enriched frame with the three id splits.
# NOTE(review): "11k.csv" is not written anywhere in the visible part of this
# notebook - presumably it is produced by a separate step; confirm it exists
# before running this cell.
dataset_with_features = dict(
    df=pd.read_csv("../datasets/11k.csv"),
    train_package_ids=train_package_ids,
    val_package_ids=val_package_ids,
    test_package_ids=test_package_ids,
)

features_pickle_path = f"{EXPORT_DIR}/dataset_dict_features.pickle"
with open(features_pickle_path, 'wb') as pickle_file:
    pickle.dump(dataset_with_features, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

In [45]:
import pickle

# Bundle the de-duplicated frame (no extra features) with the three id splits.
dataset_without_features = dict(
    df=filtered_df,
    train_package_ids=train_package_ids,
    val_package_ids=val_package_ids,
    test_package_ids=test_package_ids,
)

no_features_pickle_path = f"{EXPORT_DIR}/dataset_dict_no_features.pickle"
with open(no_features_pickle_path, 'wb') as pickle_file:
    pickle.dump(dataset_without_features, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)