In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Original dataset, indexed by article URL
raw = pd.read_csv("../OnlineNewsPopularity.csv", index_col='url')

# Results of my own scraping, indexed the same way
update_data = pd.read_csv("Updates.csv", index_col='url')

print(len(update_data))

39644


# Updates Dataframe
Since this data was scraped from the articles, it's bound to be a bit messy.

In [2]:
update = update_data.copy()

# Separate the time stamp into weekday and date.
# The raw text is split once at the first comma: the part before it is kept
# as the weekday label, the remainder is parsed as the date.
date_parts = update['date'].str.split(",", n=1, expand=True)
update['weekday'] = date_parts[0]
update['date'] = pd.to_datetime(date_parts[1])

# Edit text of titles and keywords
def remove_text(column, text_list, regex=True):
    """Lower-case a text column of ``update`` and delete unwanted fragments.

    Parameters
    ----------
    column : str
        Name of the column in the notebook-level ``update`` frame to clean.
        The column is overwritten with the cleaned text.
    text_list : iterable of str
        Patterns to delete, applied one after another in the given order.
    regex : bool, default True
        Forwarded to ``Series.str.replace``. It is passed explicitly so the
        result does not depend on the library's default for this flag.
    """
    cleaned = update[column].str.lower()
    for pattern in text_list:
        cleaned = cleaned.str.replace(pattern, '', regex=regex)
    update[column] = cleaned

# Every entry is interpreted as a regular expression, so the literal
# square brackets are escaped.
remove = [r"\<.*?\>", 'amp;', r"\[", r"\]"]
remove_text('keywords', remove)
remove_text('title', remove)

update.head()

Unnamed: 0_level_0,channel,date,title,keywords,weekday
url,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
http://mashable.com/2013/01/07/astronaut-notre-dame-bcs/,Entertainment,2013-01-07,this astronaut is rooting for notre dame tonight,"space, college football, entertainment, sports",Mon
http://mashable.com/2013/01/07/earth-size-planets-milky-way/,World,2013-01-07,there are 17 billion earth-size alien planets ...,"alien planets, earth, space, world",Mon
http://mashable.com/2013/01/07/apple-40-billion-app-downloads/,Business,2013-01-07,apple's app store passes 40 billion downloads,"apple, apps, apps and software, business, mobile",Mon
http://mashable.com/2013/01/07/downton-abbey-tumblrs/,Culture,2013-01-07,8 'downton abbey' tumblrs suitable for aristoc...,"downton abbey, gallery, memes, tumblr, televis...",Mon
http://mashable.com/2013/01/07/att-u-verse-apps/,Tech,2013-01-07,new u-verse apps simplify sharing photos and v...,"apps, apps and software, at&t, ces, tech",Mon


## Keywords
Split the keywords column into something usable.

In [3]:
# Split the keywords into separate entries: one row per (url, keyword) pair
keywords_df = (
    update['keywords']
    .dropna()
    .str.split(', ', expand=True)
    .reset_index()
    .melt(id_vars=['url'], value_name='keyword')
    .dropna()
    .drop(columns='variable')
)

# Create a new data frame made up of keywords, shares, and url
keyword_shares = keywords_df.merge(raw['shares'], on='url', how='left')
has_url = keyword_shares['url'] != ''
keyword_shares = keyword_shares.loc[has_url].set_index('url')
keyword_shares.head()

Unnamed: 0_level_0,keyword,shares
url,Unnamed: 1_level_1,Unnamed: 2_level_1
http://mashable.com/2013/01/07/astronaut-notre-dame-bcs/,space,1200
http://mashable.com/2013/01/07/earth-size-planets-milky-way/,alien planets,1600
http://mashable.com/2013/01/07/apple-40-billion-app-downloads/,apple,1500
http://mashable.com/2013/01/07/downton-abbey-tumblrs/,downton abbey,761
http://mashable.com/2013/01/07/att-u-verse-apps/,apps,505


In [4]:
# Drop keywords only used once
keyword_usage = keyword_shares['keyword'].value_counts()
keyword_counts = keyword_usage[keyword_usage > 1].index

keyword_shares = (
    keyword_shares
    .reset_index()
    .set_index('keyword')
    .loc[keyword_counts]
    # Pin the axis name so the reset_index below always produces a
    # 'keyword' column, whatever name the selector index carries.
    .rename_axis('keyword')
    .reset_index()
    .set_index('url')
)

In [5]:
# Get the max, avg, and min shares of each keyword in a single aggregation
kw_min_max = (
    keyword_shares
    .groupby('keyword')['shares']
    .agg(['max', 'mean', 'min'])
    .rename(columns={'mean': 'avg'})
)
# Averages are stored as whole numbers
kw_min_max['avg'] = kw_min_max['avg'].astype('int')

In [None]:
# Select the features of the worst, middle, and best keyword of each article.
# For every article its keywords are ranked by their average shares; the
# lowest-ranked, the middle and the highest-ranked keyword are recorded
# together with that keyword's min / avg / max shares.

kw_picks = ['kw_min', 'kw_avg', 'kw_max']
kw_stats = ['min', 'avg', 'max']

# Column layout: url, then each picked keyword followed by its three stats
kw_dict = {'url': []}
for pick in kw_picks:
    kw_dict[pick] = []
    for stat in kw_stats:
        kw_dict[pick + '_' + stat] = []

# A single grouped pass replaces one index lookup per URL; sort=False keeps
# the articles in order of first appearance.
for url, article in keyword_shares.groupby(level='url', sort=False):
    # Stats of this article's keywords, lowest average shares first
    article_kw = (
        kw_min_max
        .loc[article['keyword'].values]
        .sort_values('avg')
        .reset_index()
    )

    # With a single keyword all three picks are the same row
    chosen = {
        'kw_min': article_kw.iloc[0],
        'kw_avg': article_kw.iloc[len(article_kw) // 2],
        'kw_max': article_kw.iloc[-1],
    }

    kw_dict['url'].append(url)
    for pick in kw_picks:
        picked_row = chosen[pick]
        kw_dict[pick].append(picked_row['keyword'])
        for stat in kw_stats:
            kw_dict[pick + '_' + stat].append(picked_row[stat])

kw_min_max_all = pd.DataFrame(kw_dict).set_index('url')
kw_min_max_all.head()

In [None]:
# Replace the original keyword features with our new ones
old_kw_columns = [
    'kw_' + pick + '_' + stat
    for pick in ('min', 'avg', 'max')
    for stat in ('min', 'avg', 'max')
]
temp_data = raw.drop(columns=old_kw_columns)

# Keep exactly the articles that received keyword features
update = update.merge(kw_min_max_all, left_index=True, right_index=True, how='right')

# Join Dataframes

In [None]:
# Match every original article with its scraped update by URL, keep only the
# articles found in both frames, then discard rows with any missing value
data = (
    temp_data
    .reset_index()
    .join(update, on='url', how='inner')
    .set_index('url')
    .dropna()
)

# Days of the Week
First off, since we have a new day-of-the-week column, I'm going to drop the originals.

In [None]:
# The new `weekday` column replaces the original one-hot weekday flags
weekday_flags = [
    'weekday_is_' + day
    for day in ('monday', 'tuesday', 'wednesday', 'thursday',
                'friday', 'saturday', 'sunday')
]
data = data.drop(columns=weekday_flags + ['is_weekend'])

In [None]:
def break_down(column, sort='total articles', show=True):
    """Summarise article counts and share totals of ``data`` per group.

    Parameters
    ----------
    column : str
        Column of the notebook-level ``data`` frame to group by.
    sort : str, default 'total articles'
        Name of the summary column used to order the result, largest first.
    show : bool, default True
        Accepted for backward compatibility; it has no effect. The returned
        frame is what a notebook cell renders when the call is its last
        expression.

    Returns
    -------
    pandas.DataFrame
        Indexed by the values of ``column`` with the columns
        'total articles', 'sum shares', 'shares per article ratio',
        'percent of all shares' and 'percent of articles'.
    """
    group = (
        data
        .groupby(column)['shares']
        .agg(['count', 'sum'])
        .rename(columns={'count': 'total articles', 'sum': 'sum shares'})
    )

    # add the shares per article ratio
    group['shares per article ratio'] = group['sum shares'] / group['total articles']

    # add percentages to make it easier to see
    group['percent of all shares'] = group['sum shares'] / group['sum shares'].sum() * 100
    group['percent of articles'] = group['total articles'] / group['total articles'].sum() * 100

    # order by the requested summary column, largest first
    return group.sort_values(sort, ascending=False)

# Article counts and share totals per weekday
break_down('weekday')

It seems fewer articles are published on Saturday and Sunday, so I combined both of these entries as `Weekend`.

In [None]:
# Fold both weekend days into a single 'Weekend' label
weekend_labels = {'Sat': 'Weekend', 'Sun': 'Weekend'}
data['weekday'] = data['weekday'].replace(weekend_labels)
break_down('weekday')

# Data Channels
Do the same with the data channels: drop the original channel flag columns in favor of the new `channel` column.

In [None]:
# The new `channel` column replaces the original one-hot channel flags.
# Labels are compared with surrounding whitespace stripped, so a flag column
# is dropped whether or not its header carries a stray leading space, and
# re-running the cell after the columns are gone is harmless.
channel_flags = {
    'data_channel_is_lifestyle', 'data_channel_is_entertainment',
    'data_channel_is_bus', 'data_channel_is_socmed',
    'data_channel_is_tech', 'data_channel_is_world',
}
data = data.drop(
    columns=[col for col in data.columns if str(col).strip() in channel_flags]
)

In [None]:
# Article counts and share totals per channel
break_down('channel')

Most articles are from the `Entertainment` category, followed by `World` and `Tech`. Understandably, the highest share totals come from the same categories.

Since the `Social Good`, `U.S.`, and `Unlabeled` data channels are so small, I grouped them into one category labeled `Other`.

In [None]:
# Collapse the three smallest channels into a single 'Other' label
small_channels = {'Social Good': 'Other', 'U.S.': 'Other', 'Unlabeled': 'Other'}
data['channel'] = data['channel'].replace(small_channels)
break_down('channel')

# Distribution of Shares
Now that the weekdays and channels have been updated, let's look at the shares as a whole.

In [None]:
# Two side-by-side histograms of shares: the full range on the left and
# the 0-10,000 range on the right
fig, (ax_full, ax_zoom) = plt.subplots(1, 2, figsize=(10, 5), dpi=200)

ax_full.hist(data['shares'], bins=20)
ax_full.set(title='Distribution of Shares',
            xlabel='Shares', ylabel='Number of articles')

ax_zoom.hist(data['shares'], bins=20, range=(0, 10000))
ax_zoom.set(title='Distribution of Shares - Zoomed In',
            xlabel='Shares', ylabel='Number of articles')

# trailing semicolon keeps the cell from echoing a return value
fig.tight_layout();

It seems there are a lot of outliers. Viral articles get far more shares than the typical article.

# Save Data

Saved the data as `OnlineNewsPopularity_Clean.csv`.

In [None]:
# Preview the first rows of the cleaned frame before saving
data.head()

In [None]:
# Write the cleaned frame to the parent directory
data.to_csv('../OnlineNewsPopularity_Clean.csv')

In [None]:
# Column dtypes and non-null counts of the saved frame
data.info()