# Data Cleaning
Take the web-scraped results and create features similar to those in `OnlineNewsPopularity.csv`.

In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from textblob import TextBlob
from nltk.sentiment import SentimentIntensityAnalyzer

# Scraped articles: the CSV's unnamed first column is renamed to 'URLs' and
# used as the index; rows with any missing value are dropped.
raw = (
    pd.read_csv('Data/Raw.csv')
    .rename(columns={'Unnamed: 0': 'URLs'})
    .set_index('URLs')
    .dropna()
)

# Original dataset, indexed by its 'url' column
orig = pd.read_csv('Data/OnlineNewsPopularity.csv', index_col='url')

# Output table: starts out as the original `shares` of every scraped article
clean = orig.loc[raw.index, 'shares'].to_frame()

raw.head()

Unnamed: 0_level_0,title,text,refs,images,videos,topics,date,time,channel
URLs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
http://mashable.com/2013/01/07/cosmic-events-doomsday/,3 Cosmic Events That Will Spark Doomsday Rumors,"Take a deep breath; Dec 21, 2012 is behind us....",['http://news.discovery.com/space/doomsday-pla...,5,0,"['Apocalypse', 'Asteroid', 'comet', 'Conversat...",2013-01-07,21:25:29,World
http://mashable.com/2013/01/07/astronaut-notre-dame-bcs/,This Astronaut Is Rooting for Notre Dame Tonight,"When it comes to college football, NASA astron...",['http://www.space.com/16748-international-spa...,3,0,"['Space', 'college football', 'Entertainment',...",2013-01-07,22:23:38,Entertainment
http://mashable.com/2013/01/07/ap-samsung-sponsored-tweets/,AP's Twitter to Begin Displaying Sponsored Tweets,The Associated Press is the latest news organi...,['https://mashable.com/category/associated-pre...,1,0,"['Business', 'Media', 'Twitter']",2013-01-07,19:08:52,Business
http://mashable.com/2013/01/07/iheartradio-app-perfect-for/,"iHeartRadio Launches Stations Based on Moods, ...",LAS VEGAS — Popular digital radio service iHea...,['https://mashable.com/follow/topics/iHeartRad...,1,0,"['Apps', 'Apps and Software', 'CES', 'iHeartRa...",2013-01-07,10:27:49,Tech
http://mashable.com/2013/01/07/lego-taking-robotics-to-next-level-with-mindstorms-ev3/,LEGO Taking Robotics to Next Level with Mindst...,LEGO Mindstorms EV3LEGO Mindstorms EV3 BoxLego...,"['https://mashable.com/category/CES/', 'https:...",20,0,"['CES', 'Gadgets', 'gallery', 'Tech']",2013-01-07,13:11:42,Tech


# Text Features

In [5]:
def text_features(data, column):
    """Compute sentiment features for one text column of ``data``.

    Each value of ``data[column]`` is scored with
    ``SentimentIntensityAnalyzer.polarity_scores`` and with TextBlob's
    ``subjectivity``. A progress line is printed while the rows are processed.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the text to analyse.
    column : str
        Name of the text column; also used as the prefix of the output columns.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``data`` (same index) with the columns
        ``<column>_neg_polarity``, ``<column>_neu_polarity``,
        ``<column>_pos_polarity``, ``<column>_avg_polarity`` and
        ``<column>_subjectivity``.
    """
    print('\nCreating text features from "%s"\n' % column)
    sia = SentimentIntensityAnalyzer()
    total = len(data.index)

    # Collect one dict per row and build the frame once at the end; growing a
    # DataFrame with .append() inside the loop is quadratic and the method no
    # longer exists in pandas >= 2.0.
    rows = []
    # Iterate over the values positionally so duplicate index labels cannot
    # turn a single text into a Series.
    for n, text in enumerate(data[column], start=1):
        features = sia.polarity_scores(text)
        features['subjectivity'] = TextBlob(text).subjectivity
        rows.append(features)
        # n starts at 1 so the last line reads total/total (100%)
        print('\r%5d/%d Complete (%d%%)' % (n, total, n / total * 100), end='')

    # Map the analyser keys to output names explicitly instead of relying on
    # the positional order of the dictionary.
    names = {
        'neg': 'neg_polarity',
        'neu': 'neu_polarity',
        'pos': 'pos_polarity',
        'compound': 'avg_polarity',
        'subjectivity': 'subjectivity',
    }
    result = pd.DataFrame(rows, index=data.index, columns=list(names))
    result.columns = [column + '_' + names[key] for key in result.columns]
    return result

def count_tokens(data):
    """Return the number of items in ``data`` (e.g. the tokens of one split string)."""
    n_tokens = len(data)
    return n_tokens

# title analysis
clean = clean.join(text_features(raw,'title'))
# str.split() without a separator splits on any run of whitespace, so double
# spaces do not produce empty tokens and newlines/tabs still separate words
# (str.split(' ') gets both of these wrong).
clean['n_tokens_title'] = raw['title'].str.split().apply(count_tokens)
# content analysis
clean = clean.join(text_features(raw,'text'))
clean['n_tokens_text'] = raw['text'].str.split().apply(count_tokens)


Creating text features from "title"

38257/38258 Complete (99%)
Creating text features from "text"

38257/38258 Complete (99%)

# References

In [6]:
# Turn each stringified list, e.g. "['a', 'b']", into a list of strings:
# strip the surrounding brackets, remove the quote characters, split on ', '.
references = (
    raw['refs']
    .str.strip('[]')
    .str.replace('\'', '')
    .str.split(', ')
)
# NOTE(review): sample entry kept from exploration; nothing visible in this
# notebook reads `ref` afterwards -- confirm before removing.
ref = references.iloc[0]

def n_refs(ref):
    """Count the links in ``ref`` and how many of them point to Mashable.

    Empty strings are ignored: an empty reference list ("[]") is parsed
    upstream into ``['']``, which must count as zero references, not one.

    Parameters
    ----------
    ref : iterable of str
        The reference URLs of one article.

    Returns
    -------
    str
        ``"<n> <m>"`` where ``n`` is the number of non-empty links and ``m``
        the number of those containing ``'mashable'``.
    """
    links = [link for link in ref if link]
    n = len(links)
    m = sum('mashable' in link for link in links)

    return str(n)+' '+str(m)

# n_refs returns "<n> <m>" per article; split that into two columns and cast
# them to integers so the counts are numeric rather than strings (object dtype).
references = references.apply(n_refs).str.split(' ', expand=True).astype(int)
references.columns = ['n_refs','n_self_refs']

clean = clean.join(references)

# Weekday

In [7]:
# Parse the scraped date strings into datetimes ...
clean = clean.assign(date=pd.to_datetime(raw['date']))
# ... and derive the name of the weekday from them
clean = clean.assign(weekday=clean['date'].dt.day_name())

# Media

In [8]:
# Carry the scraped image and video columns over unchanged (joined on the URL index)
clean = clean.join(raw.loc[:, ['images', 'videos']])

# Data Channel and Keywords
Creating the keyword features is the longest step. For each article we rank its keywords by their mean shares, computed with the aggregations excluding the article itself, pick the worst, median, and best performing keyword, and record the min, average, and max shares of each.

In [9]:
# Data channel: copied straight from the scraped data, aligned on the URL index
clean = clean.assign(channel=raw['channel'])

In [12]:
# parse the stringified keyword list into one column per keyword position
# i.e. "['example_1', 'example_2']" -> columns "example_1" | "example_2"
keywords = raw['topics'].str.strip('[]').str.replace('\'','').str.split(', ',expand=True).reset_index()

# melt such that each article and keyword pair gets its own entry
# i.e.                                          "example_1": "key_1"
#      "example_1": "key_1, key_2, key_3"  ->   "example_1": "key_2"
#                                               "example_1": "key_3"
keywords_melt = pd.melt(keywords,id_vars=['URLs'],value_name='keyword')
keywords_melt = keywords_melt.set_index('URLs')['keyword'].dropna()
# an empty topics list ("[]") parses to a single empty string; drop it so that
# '' is not treated as a keyword shared by every article without topics
keywords_melt = keywords_melt[keywords_melt != '']

# join with shares
keyword_shares = pd.DataFrame(keywords_melt).join(orig['shares'])

In [13]:
# How often is each keyword used, and which keywords appear exactly once?
keyword_counts = keyword_shares['keyword'].value_counts()
only_once = keyword_counts.loc[keyword_counts.eq(1)]

# Number of distinct keywords overall vs. the number used a single time
total = keyword_shares['keyword'].nunique(dropna=False)
use_1 = only_once.size
shared_pct = (total - use_1) / total * 100

print('Of %d total keywords, %d are used only once.' % (total, use_1))
print('This means that only %d%% of keywords are shared between two or more articles.' % (shared_pct,))

Of 16460 total keywords, 8557 are used only once.
This means that only 48% of keywords are shared between two or more articles.


In [14]:
# drop the keywords only used once
# A boolean filter keeps the same rows in the same order as dropping the
# labels from a keyword index, but does not raise KeyError when the cell is
# re-run after the rows are already gone.
used_once = keyword_shares['keyword'].isin(only_once.index)
# rename_axis keeps the index name ('index') that the reset_index/set_index
# round trip used to produce.
keyword_shares = keyword_shares[~used_once].rename_axis('index')

In [15]:
# Build, for every article, the statistics of its worst, median and best
# performing keyword. Keywords are ranked by the mean shares of all OTHER
# articles that use them (the article itself is excluded).
print('\nCreating keyword features\n')
URLs = keyword_shares.index.unique()

# Loop invariants, hoisted so the frame is not rebuilt on every iteration
article_urls = keyword_shares.index
all_keywords = keyword_shares['keyword']

# One dict per article; the frame is built once after the loop because
# DataFrame.append is quadratic here and was removed in pandas 2.0.
records = []
for n, index in enumerate(URLs, start=1):
    own_rows = article_urls == index
    keys = all_keywords[own_rows].unique()

    keyword_entry = {'n_keywords': len(keys)}

    # Rows of every other article that uses one of this article's keywords
    other_rows = ~own_rows & all_keywords.isin(keys).to_numpy()
    # Aggregate the numeric `shares` column only, then rank keywords by mean
    key_group = (
        keyword_shares[other_rows]
        .groupby('keyword')['shares']
        .agg(['min', 'mean', 'max'])
        .sort_values('mean')
    )

    # If no other article shares a keyword there is nothing to rank; the
    # kw_* features of this article are then left missing.
    if not key_group.empty:
        ranked = (
            ('kw_min', key_group.iloc[0]),                    # worst performing keyword
            ('kw_avg', key_group.iloc[len(key_group)//2]),    # median performing keyword
            ('kw_max', key_group.iloc[-1]),                   # best performing keyword
        )
        for prefix, stats in ranked:
            keyword_entry[prefix] = stats.name
            keyword_entry[prefix+'_min'] = stats['min']
            keyword_entry[prefix+'_avg'] = stats['mean']
            keyword_entry[prefix+'_max'] = stats['max']

    records.append(keyword_entry)
    # n starts at 1 so the final line reads total/total (100%)
    print('\r%5d/%d Complete (%d%%)' % (n,len(URLs),n/len(URLs)*100),end='')

# list(URLs) gives the result an unnamed index, as the appended frames had
keyword_features = pd.DataFrame(records, index=list(URLs))


Creating keyword features

38256/38257 Complete (99%)

# Join Dataframes
Add these new keyword features to the dataset

In [16]:
# Left-join on the URL index: every article in `clean` is kept, and articles
# without a row in `keyword_features` get missing values in the new columns.
clean = clean.join(keyword_features, how='left')

# Save Data

In [17]:
# Write the assembled feature table (index included) to disk, then preview it
output_path = 'Data/Clean.csv'
clean.to_csv(output_path)
clean.head()

Unnamed: 0_level_0,shares,title_neg_polarity,title_neu_polarity,title_pos_polarity,title_avg_polarity,title_subjectivity,n_tokens_title,text_neg_polarity,text_neu_polarity,text_pos_polarity,...,kw_min_avg,kw_min_max,kw_avg,kw_avg_min,kw_avg_avg,kw_avg_max,kw_max,kw_max_min,kw_max_avg,kw_max_max
URLs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
http://mashable.com/2013/01/07/cosmic-events-doomsday/,2200,0.355,0.467,0.178,-0.4404,0.0,8,0.079,0.853,0.068,...,1528.571429,4900.0,World,22.0,3226.473702,310800.0,Conversations,36.0,4405.444294,690400.0
http://mashable.com/2013/01/07/astronaut-notre-dame-bcs/,1200,0.0,1.0,0.0,0.0,0.0,8,0.005,0.847,0.148,...,1363.142857,3100.0,Sports,43.0,3117.970755,200100.0,Entertainment,5.0,3261.614561,652900.0
http://mashable.com/2013/01/07/ap-samsung-sponsored-tweets/,711,0.0,1.0,0.0,0.0,0.0,7,0.0,0.938,0.062,...,2947.087408,74100.0,Twitter,52.0,3339.586439,145500.0,Business,44.0,3438.132293,690400.0
http://mashable.com/2013/01/07/iheartradio-app-perfect-for/,1500,0.0,1.0,0.0,0.0,0.0,7,0.008,0.863,0.129,...,2577.0,12300.0,Apps and Software,47.0,3454.003844,211600.0,Music,42.0,3661.511628,652900.0
http://mashable.com/2013/01/07/lego-taking-robotics-to-next-level-with-mindstorms-ev3/,3900,0.0,1.0,0.0,0.0,0.0,9,0.007,0.914,0.079,...,2681.208511,53100.0,Gadgets,1.0,3771.445918,843300.0,gallery,399.0,3969.972222,71800.0
