In [56]:
from __future__ import print_function
import json
import string
import operator
import itertools
import numpy as np
import seaborn as sns
from scipy import stats
from pprint import pprint
from nltk import word_tokenize
import matplotlib.pyplot as plt
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
%matplotlib inline

# General Feature Exploration and Analysis
The following notebook performs general exploration of the dataset to find potential correlations and patterns within the dataset itself. 

This will be used to guide the development of the modeling process.

## General Dataset Information

In [2]:
# Application Directory Constants
# Relative directory holding the dataset files; paths are built below by plain string
# concatenation (DATA_DIR + filename), so the trailing slash is required.
DATA_DIR = '../Data/dataset/'

In [5]:
# Load Dataset
def load_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Parameters
    ----------
    path : str
        Location of a file holding one JSON document per line.

    Returns
    -------
    list
        One decoded object per non-blank line, in file order.
    """
    # The context manager closes the handle even if a line fails to parse.
    with open(path, 'rb') as handle:
        # Blank lines (typically just the trailing newline) carry no record; skipping
        # them also avoids dropping a real last record when the file lacks a final newline.
        return [json.loads(line) for line in handle if line.strip()]


# Feature records and their truth (label) records, as lists so len() and indexing work.
data_train_X = load_jsonl(DATA_DIR + 'instances_train.jsonl')
data_train_Y = load_jsonl(DATA_DIR + 'truth_train.jsonl')

In [6]:
# Sanity-check the load: number of feature records, then the keys present on the
# first feature record and on the first truth record.
print('TOTAL RECORDS: ' + str(len(data_train_X)))
print('TRAIN X - DICTIONARY SCHEMA: ' + str(data_train_X[0].keys()))
print('TRAIN Y - DICTIONARY SCHEMA: ' + str(data_train_Y[0].keys()))

TOTAL RECORDS: 17581
TRAIN X - DICTIONARY SCHEMA: [u'postText', u'targetCaptions', u'postMedia', u'targetKeywords', u'targetParagraphs', u'postTimestamp', u'targetTitle', u'id', u'targetDescription']
TRAIN Y - DICTIONARY SCHEMA: [u'truthMedian', u'truthClass', u'truthJudgments', u'truthMean', u'truthMode', u'id']


### Sample Data Instance

In [85]:
# Non-Clickbait Example
# Feature record at index 0 ...
pprint(data_train_X[0])
print()

# ... followed by the truth record at the same index.
pprint(data_train_Y[0])
print()

{u'id': u'0',
 u'postMedia': [],
 u'postText': [u"Apple's iOS 9 'App thinning' feature will give your phone's storage a boost"],
 u'postTimestamp': u'Tue Jun 09 16:31:10 +0000 2015',
 u'targetCaptions': [u"'App thinning' will be supported on Apple's iOS 9 and later models. It ensures apps use the lowest amount of storage space on a device by only downloading the parts it needs to run on individual handsets. It 'slices' the app into 'app variants' that only need to access the specific files on that specific device",
                     u"'App thinning' will be supported on Apple's iOS 9 and later models. It ensures apps use the lowest amount of storage space on a device by only downloading the parts it needs to run on individual handsets. It 'slices' the app into 'app variants' that only need to access the specific files on that specific device",
                     u"The guidelines also discuss so-called 'on-demand resources.' This allows developers to omit features from an app until

In [87]:
# Clickbait Example
# Feature record at index 2 ...
pprint(data_train_X[2])
print()

# ... followed by the truth record at the same index.
pprint(data_train_Y[2])
print()

{u'id': u'2',
 u'postMedia': [],
 u'postText': [u'U.S. Soccer should start answering tough questions about Hope Solo, @eric_adelson writes.'],
 u'postTimestamp': u'Fri Jun 12 23:36:05 +0000 2015',
 u'targetCaptions': [u'US to vote for Ali in FIFA election and not Blatter',
                     u'US to vote for Ali in FIFA election and not Blatter',
                     u"FILE - This Oct. 10, 2014, file photo shows Sunil Gulati, president of the United States Soccer Federation, during a press conference in Bristol, Conn. The United States says it will vote for Jordan's Prince Ali bin Al-Hussein for FIFA president Friday, May 29, 2015 and not for incumbent Sepp Blatter. (AP Photo/Elise Amendola, File)"],
 u'targetDescription': u"A U.S. Senator's scathing letter questioned U.S. Soccer's inadequate handling of Solo's domestic violence charges. It's time for Sunil Gulati to respond.",
 u'targetKeywords': u'',
 u'targetParagraphs': [u"WINNIPEG, Manitoba \u2013 The bubble U.S. Soccer is putti

## Data Class Label Distribution Analysis

In [7]:
print('Discrete Class Counts')
# 0 = 'no-clickbait', 1 = any other label, so summing the list counts the positives.
c_list = [0 if truth['truthClass'] == 'no-clickbait' else 1 for truth in data_train_Y]
# Real lists (not lazy map objects) keep len() and repeated passes valid on Python 3;
# the count and ratio are computed once instead of re-summing for every print.
pos_count = sum(c_list)
pos_ratio = pos_count / float(len(c_list))
print('CLICKBAIT POS: ' + str(pos_count) + '\t' + str(pos_ratio))
print('CLICKBAIT NEG: ' + str(len(c_list) - pos_count) + '\t' + str(1 - pos_ratio))
print()

print('Median Distribution')
med_list = [truth['truthMedian'] for truth in data_train_Y]
pprint(stats.describe(med_list))
print()

print('Mean Distribution')
mean_list = [truth['truthMean'] for truth in data_train_Y]
pprint(stats.describe(mean_list))
print()

print('Mode Distribution')
mode_list = [truth['truthMode'] for truth in data_train_Y]
pprint(stats.describe(mode_list))
print()

print('Overall Distribution')
# Pool every individual judgment from every record into one flat list.
overall_list = list(itertools.chain.from_iterable(
    truth['truthJudgments'] for truth in data_train_Y))
pprint(stats.describe(overall_list))

Discrete Class Counts
CLICKBAIT POS: 4433	0.252147204368
CLICKBAIT NEG: 13148	0.747852795632

Median Distribution
DescribeResult(nobs=17581, minmax=(0.0, 1.0), mean=0.28597159927869292, variance=0.11023303798566474, skewness=0.8424582531710086, kurtosis=-0.49802822996455554)

Mean Distribution
DescribeResult(nobs=17581, minmax=(0.0, 1.0), mean=0.32994331090893236, variance=0.063558726054432441, skewness=0.6992524359835718, kurtosis=-0.4036965464625224)

Mode Distribution
DescribeResult(nobs=17581, minmax=(0.0, 1.0), mean=0.27558159451935615, variance=0.12979563066539043, skewness=0.9663085836264188, kurtosis=-0.5125765548566874)

Overall Distribution
DescribeResult(nobs=87905, minmax=(0.0, 1.0), mean=0.32994331102621804, variance=0.13623815425012464, skewness=0.6748670366487401, kurtosis=-0.9773744894072753)


### Overall Notes and Observations
* Dataset is generally biased towards negative samples - for every 3 negative, we have 1 positive.
* Distribution of the overall data is generally positively skewed.
* The mean is consistent with the median and mode, indicating that most of the news in the data is judged not to be clickbait.
* Variance is quite small, so the confusion rate of the end model should approach this distribution at the very least.
* **TODO**: Identify the articles that fall outside the typical variance and investigate why they are controversial.
* **TODO**: Devise oversampling strategies for training the model - bootstrapping, SMOTE, stratified k-fold, etc.

## Data Feature Analysis
The following analysis is based on the various features provided by the data source.

In [17]:
# Split Training Based on Bait or Not-Bait
# Instances and truth records are paired by position (same pairing the original zip used);
# an instance is kept when its paired truth record carries the wanted class label.
# Filtering directly avoids the '' placeholder and the fragile `is not ''` identity test.
data_train_X_bait = [instance for instance, truth in zip(data_train_X, data_train_Y)
                     if truth['truthClass'] == 'clickbait']
data_train_X_notbait = [instance for instance, truth in zip(data_train_X, data_train_Y)
                        if truth['truthClass'] == 'no-clickbait']

## Target Title Analysis

### Target Title Character/Word Count Distributions
Analyze the character level distribution details of the title.

**TODO:** Perform statistical testing to check significance of the distribution differences.

In [20]:
def char_word_dist(input_data):
    """Print summary statistics describing the shape of each record's ``targetTitle``.

    Five ``scipy.stats.describe`` summaries are printed, each under its own heading
    and followed by a blank line: title length in characters, length of every
    space-separated word (pooled over all titles), and the per-title counts of
    upper-case, lower-case and punctuation characters.

    Parameters
    ----------
    input_data : iterable of dict
        Records carrying a string under the ``'targetTitle'`` key. The iterable is
        consumed once.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    # Materialise once: every statistic below needs its own pass over the titles, and
    # stats.describe needs a real sequence (a lazy map object fails on Python 3).
    titles = [record['targetTitle'] for record in input_data]
    punctuation = frozenset(string.punctuation)

    print('Character Length Distribution')
    pprint(stats.describe([len(title) for title in titles]))
    print()

    print('Word Length Distribution (Assume Separation by Whitespace)')
    # split(' ') rather than split(): runs of spaces and leading/trailing spaces produce
    # empty "words" of length 0, which is kept so the statistics stay comparable.
    word_lengths = [len(word) for title in titles for word in title.split(' ')]
    pprint(stats.describe(word_lengths))
    print()

    print('Upper Case Characters Distribution')
    pprint(stats.describe([sum(1 for ch in title if ch.isupper()) for title in titles]))
    print()

    print('Lower Case Characters Distribution')
    pprint(stats.describe([sum(1 for ch in title if ch.islower()) for title in titles]))
    print()

    print('Punctuation Distribution')
    # string.punctuation does not contain the space character, so no extra exclusion
    # is needed (the former `is not ' '` identity check was redundant).
    pprint(stats.describe([sum(1 for ch in title if ch in punctuation) for title in titles]))
    print()

#### Entire Distribution

In [21]:
# Title statistics over every training record (both classes together).
char_word_dist(data_train_X)

Character Length Distribution
DescribeResult(nobs=17581, minmax=(4, 4038), mean=80.607360218417611, variance=22675.621422502139, skewness=15.056280752353373, kurtosis=268.055872703193)

Word Length Distribution (Assume Separation by Whitespace)
DescribeResult(nobs=232465, minmax=(0, 31), mean=5.1718495257350572, variance=6.6723698434872043, skewness=0.7323561694237745, kurtosis=0.9709286012182616)

Upper Case Characters Distribution
DescribeResult(nobs=17581, minmax=(0, 199), mean=7.4869461350321371, variance=97.60456508179692, skewness=10.383947623179736, kurtosis=140.83544204595051)

Lower Case Characters Distribution
DescribeResult(nobs=17581, minmax=(0, 3096), mean=58.27302201239975, variance=13117.971416060323, skewness=15.129069031202686, kurtosis=272.816959987369)

Punctuation Distribution
DescribeResult(nobs=17581, minmax=(0, 74), mean=1.6065070246288606, variance=9.4275207961537237, skewness=10.631201733725636, kurtosis=157.1301561576207)



#### Clickbait Title Distribution Analysis

In [22]:
# Title statistics restricted to records labelled 'clickbait'.
char_word_dist(data_train_X_bait)

Character Length Distribution
DescribeResult(nobs=4433, minmax=(4, 4038), mean=79.248364538687113, variance=21845.357750596322, skewness=16.706889125188475, kurtosis=330.58211989180535)

Word Length Distribution (Assume Separation by Whitespace)
DescribeResult(nobs=57667, minmax=(0, 29), mean=5.1688834168588622, variance=6.6759658100650556, skewness=0.7591308305442004, kurtosis=1.0895376951683469)

Upper Case Characters Distribution
DescribeResult(nobs=4433, minmax=(0, 194), mean=7.4432664110083469, variance=88.180952505047046, skewness=10.689453923145273, kurtosis=153.87172985695204)

Lower Case Characters Distribution
DescribeResult(nobs=4433, minmax=(0, 3096), mean=57.193097225355288, variance=12714.446008297627, skewness=16.78385777183443, kurtosis=335.1615601802185)

Punctuation Distribution
DescribeResult(nobs=4433, minmax=(0, 64), mean=1.5851567787051657, variance=8.8240305315972005, skewness=11.18981279316539, kurtosis=177.6721286470377)



#### Non-Clickbait Title Distribution Analysis

In [23]:
# Title statistics restricted to records labelled 'no-clickbait'.
char_word_dist(data_train_X_notbait)

Character Length Distribution
DescribeResult(nobs=13148, minmax=(4, 4038), mean=81.065561302099184, variance=22956.404615970001, skewness=14.5367689965689, kurtosis=248.84125546177148)

Word Length Distribution (Assume Separation by Whitespace)
DescribeResult(nobs=174798, minmax=(0, 31), mean=5.1728280643943299, variance=6.6712178365455204, skewness=0.7235151550621862, kurtosis=0.9317997014531723)

Upper Case Characters Distribution
DescribeResult(nobs=13148, minmax=(0, 199), mean=7.5016732582902339, variance=100.78793361134231, skewness=10.283432372583922, kurtosis=136.89659270240287)

Lower Case Characters Distribution
DescribeResult(nobs=13148, minmax=(0, 3096), mean=58.637131122604195, variance=13254.476287562635, skewness=14.603099399783527, kurtosis=253.4003772389118)

Punctuation Distribution
DescribeResult(nobs=13148, minmax=(0, 74), mean=1.6137055065409187, variance=9.631475642178545, skewness=10.458613154710031, kurtosis=151.0343731438817)



### Target Title Word Frequency Distribution
Analyze the most frequently used words in the entire corpus set.

The objective is to find the most 'baity' words possible - useful for crafting a 'bait' corpus.

In [71]:
# Preprocess Text
tkn = RegexpTokenizer(r'\w+')  # NOTE(review): not referenced by any cell shown here - confirm before removing

# Built once instead of on every preprocess() call: the function runs once per record,
# and rebuilding the stopword list through stopwords.words() each time is wasted work.
_NOISE_TOKENS = frozenset(string.punctuation) | frozenset(stopwords.words('english'))


def preprocess(title):
    """Return the set of distinct, lower-cased, informative tokens in ``title``.

    The text is lower-cased and split with ``word_tokenize``; tokens that are a single
    punctuation character or an English stopword are discarded.

    Parameters
    ----------
    title : str
        Raw text to tokenize.

    Returns
    -------
    set
        Unique surviving tokens. Order and repetition are lost, so counting these
        sets across documents yields "number of documents containing the token",
        not raw occurrence counts.

    Notes
    -----
    Multi-character tokens produced by the tokenizer such as ``'s``, ``n't``, the
    quote marks `` and '' are neither single punctuation characters nor stopwords,
    which is why they survive the filtering and show up in the frequency tables.
    """
    tokens = word_tokenize(title.lower())  # Tokenize & Normalize Text
    return set(tokens) - _NOISE_TOKENS  # Drop punctuation and common stopwords

def token_freq_dist(titles, k=10):
    """Pretty-print the ``k`` most frequent ``targetTitle`` tokens, then a blank line.

    Every record's title is reduced to its token set by ``preprocess``; the sets are
    tallied together and the ``(token, count)`` pairs are shown in descending count
    order. Because each title contributes a set, a token is counted at most once
    per title.
    """
    tally = Counter()
    for record in titles:
        tally.update(preprocess(record['targetTitle']))
    ranked = sorted(tally.items(), key=lambda pair: pair[1], reverse=True)
    pprint(ranked[:k])
    print()

#### Entire Distribution

In [78]:
# Top 30 title tokens over every training record (both classes together).
token_freq_dist(data_train_X, k=30)

[(u"'s", 2617),
 (u'trump', 1869),
 (u'new', 1001),
 (u'says', 689),
 (u'video', 497),
 (u'2017', 488),
 (u'donald', 459),
 (u'us', 451),
 (u"n't", 425),
 (u'first', 402),
 (u'man', 383),
 (u'people', 369),
 (u'world', 352),
 (u'house', 344),
 (u'news', 337),
 (u'woman', 307),
 (u'live', 297),
 (u'time', 293),
 (u'white', 290),
 (u'2016', 289),
 (u'police', 287),
 (u'president', 282),
 (u'could', 281),
 (u'one', 280),
 (u'women', 277),
 (u'may', 272),
 (u'best', 266),
 (u'watch', 261),
 (u'china', 255),
 (u'get', 251)]



#### Clickbait Distribution

In [79]:
# Top 30 title tokens among records labelled 'clickbait'.
token_freq_dist(data_train_X_bait, k=30)

[(u"'s", 650),
 (u'trump', 431),
 (u'new', 218),
 (u'says', 176),
 (u'video', 122),
 (u'2017', 120),
 (u'us', 114),
 (u"n't", 112),
 (u'people', 105),
 (u'first', 102),
 (u'world', 99),
 (u'man', 98),
 (u'donald', 91),
 (u'house', 86),
 (u'white', 82),
 (u'news', 81),
 (u'2016', 78),
 (u'woman', 76),
 (u'president', 72),
 (u'watch', 72),
 (u'time', 71),
 (u'one', 69),
 (u'could', 69),
 (u'women', 69),
 (u'top', 68),
 (u'police', 65),
 (u'life', 64),
 (u'get', 64),
 (u'live', 63),
 (u'may', 63)]



#### Non-Clickbait Distribution

In [80]:
# Top 30 title tokens among records labelled 'no-clickbait'.
token_freq_dist(data_train_X_notbait, k=30)

[(u"'s", 1967),
 (u'trump', 1438),
 (u'new', 783),
 (u'says', 513),
 (u'video', 375),
 (u'2017', 368),
 (u'donald', 368),
 (u'us', 337),
 (u"n't", 313),
 (u'first', 300),
 (u'man', 285),
 (u'people', 264),
 (u'house', 258),
 (u'news', 256),
 (u'world', 253),
 (u'live', 234),
 (u'woman', 231),
 (u'police', 222),
 (u'time', 222),
 (u'could', 212),
 (u'one', 211),
 (u'2016', 211),
 (u'president', 210),
 (u'may', 209),
 (u'white', 208),
 (u'women', 208),
 (u'best', 204),
 (u'china', 199),
 (u'u.s.', 194),
 (u'day', 192)]



## Post Text Analysis
The following analysis performs analytics over the data from the actual content of the Tweet.

### Target Post Character/Word Count Distributions

In [92]:
def char_word_dist(input_data):
    """Print summary statistics describing the shape of each record's ``postText``.

    ``postText`` is a list of strings; its entries are joined with single spaces
    before measuring. Five ``scipy.stats.describe`` summaries are printed, each under
    its own heading and followed by a blank line: post length in characters, length
    of every space-separated word (pooled over all posts), and the per-post counts
    of upper-case, lower-case and punctuation characters.

    Parameters
    ----------
    input_data : iterable of dict
        Records carrying a list of strings under the ``'postText'`` key. The
        iterable is consumed once.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    # Join each post once (instead of once per statistic) and materialise the result:
    # stats.describe needs a real sequence, and a lazy map object fails on Python 3.
    posts = [' '.join(record['postText']) for record in input_data]
    punctuation = frozenset(string.punctuation)

    print('Character Length Distribution')
    pprint(stats.describe([len(post) for post in posts]))
    print()

    print('Word Length Distribution (Assume Separation by Whitespace)')
    # split(' ') rather than split(): empty posts and runs of spaces produce empty
    # "words" of length 0, which is kept so the statistics stay comparable.
    word_lengths = [len(word) for post in posts for word in post.split(' ')]
    pprint(stats.describe(word_lengths))
    print()

    print('Upper Case Characters Distribution')
    pprint(stats.describe([sum(1 for ch in post if ch.isupper()) for post in posts]))
    print()

    print('Lower Case Characters Distribution')
    pprint(stats.describe([sum(1 for ch in post if ch.islower()) for post in posts]))
    print()

    print('Punctuation Distribution')
    # string.punctuation does not contain the space character, so no extra exclusion
    # is needed (the former `is not ' '` identity check was redundant).
    pprint(stats.describe([sum(1 for ch in post if ch in punctuation) for post in posts]))
    print()

#### Entire Distribution

In [93]:
# Post-text statistics over every training record (both classes together).
char_word_dist(data_train_X)

Character Length Distribution
DescribeResult(nobs=17581, minmax=(0, 143), mean=71.538820317388087, variance=471.88184045127315, skewness=-0.251818048095352, kurtosis=0.5456269562150058)

Word Length Distribution (Assume Separation by Whitespace)
DescribeResult(nobs=206198, minmax=(0, 107), mean=5.1848563031649189, variance=7.6582143886296254, skewness=1.3589774606453782, kurtosis=12.317022199937528)

Upper Case Characters Distribution
DescribeResult(nobs=17581, minmax=(0, 40), mean=3.9819122916785168, variance=9.6353952280292248, skewness=1.8560016831070376, kurtosis=6.144072662794134)

Lower Case Characters Distribution
DescribeResult(nobs=17581, minmax=(0, 108), mean=53.955861441328707, variance=287.44424052778561, skewness=-0.26173039093673234, kurtosis=0.4969114777660284)

Punctuation Distribution
DescribeResult(nobs=17581, minmax=(0, 100), mean=1.9985211307661681, variance=4.2937407024687104, skewness=7.256441834179071, kurtosis=286.8242625948423)



#### Clickbait Distribution 

In [94]:
# Post-text statistics restricted to records labelled 'clickbait'.
char_word_dist(data_train_X_bait)

Character Length Distribution
DescribeResult(nobs=4433, minmax=(0, 139), mean=70.407850214301831, variance=480.76593114001406, skewness=-0.2553805424820163, kurtosis=0.4826434739885137)

Word Length Distribution (Assume Separation by Whitespace)
DescribeResult(nobs=51266, minmax=(0, 107), mean=5.1746771739554482, variance=7.7848193876373557, skewness=1.977029309633079, kurtosis=36.435208962177725)

Upper Case Characters Distribution
DescribeResult(nobs=4433, minmax=(0, 24), mean=3.7769005188360025, variance=8.5321197231788819, skewness=1.7029072924633517, kurtosis=4.5521407296839715)

Lower Case Characters Distribution
DescribeResult(nobs=4433, minmax=(0, 106), mean=53.216106474170992, variance=294.69967734606138, skewness=-0.2625641700090724, kurtosis=0.48109122179964503)

Punctuation Distribution
DescribeResult(nobs=4433, minmax=(0, 100), mean=1.977667493796526, variance=5.7194921213641372, skewness=16.25141985154566, kurtosis=636.5186559948596)



#### Non-Clickbait Distribution

In [95]:
# Post-text statistics restricted to records labelled 'no-clickbait'.
char_word_dist(data_train_X_notbait)

Character Length Distribution
DescribeResult(nobs=13148, minmax=(0, 143), mean=71.920139945238816, variance=468.34609773655592, skewness=-0.24958304899520672, kurtosis=0.5675080146958917)

Word Length Distribution (Assume Separation by Whitespace)
DescribeResult(nobs=154932, minmax=(0, 66), mean=5.1882245113985492, variance=7.6163259567071515, skewness=1.1476799501954635, kurtosis=3.9823364586298755)

Upper Case Characters Distribution
DescribeResult(nobs=13148, minmax=(0, 40), mean=4.0510343778521447, variance=9.9891044293345406, skewness=1.8885664647086793, kurtosis=6.462052710872953)

Lower Case Characters Distribution
DescribeResult(nobs=13148, minmax=(0, 108), mean=54.20527836933374, variance=284.77348092197212, skewness=-0.26018270421688167, kurtosis=0.5013843344248716)

Punctuation Distribution
DescribeResult(nobs=13148, minmax=(0, 20), mean=2.0055521752357772, variance=3.813234554743119, skewness=1.5363956815581272, kurtosis=3.8486335353845815)



### Target Post Word Frequency Distribution

In [103]:
def token_freq_dist(titles, k=10):
    """Pretty-print the ``k`` most frequent ``postText`` tokens, then a blank line.

    Each record's ``postText`` entries are joined with single spaces and reduced to a
    token set by ``preprocess``; the sets are tallied together and the
    ``(token, count)`` pairs are shown in descending count order. Because each post
    contributes a set, a token is counted at most once per post.
    """
    tally = Counter()
    for record in titles:
        post = ' '.join(record['postText'])
        tally.update(preprocess(post))
    ranked = sorted(tally.items(), key=lambda pair: pair[1], reverse=True)
    pprint(ranked[:k])
    print()

#### Entire Distribution

In [105]:
# Top 30 post-text tokens over every training record (both classes together).
token_freq_dist(data_train_X, k=30)

[(u"'s", 2910),
 (u'trump', 1572),
 (u'``', 1379),
 (u"''", 1371),
 (u'new', 949),
 (u'says', 614),
 (u'via', 524),
 (u"n't", 494),
 (u'rt', 450),
 (u'people', 429),
 (u'president', 377),
 (u'one', 369),
 (u'first', 364),
 (u'donald', 360),
 (u'us', 337),
 (u'world', 334),
 (u'could', 319),
 (u'man', 305),
 (u'day', 285),
 (u'house', 284),
 (u'say', 283),
 (u'police', 280),
 (u'may', 279),
 (u'watch', 277),
 (u'video', 274),
 (u'best', 269),
 (u'get', 267),
 (u'year', 263),
 (u'u.s.', 260),
 (u'years', 255)]



#### Clickbait Distribution

In [106]:
# Top 30 post-text tokens among records labelled 'clickbait'.
token_freq_dist(data_train_X_bait, k=30)

[(u"'s", 739),
 (u'trump', 367),
 (u"''", 354),
 (u'``', 352),
 (u'new', 223),
 (u'says', 155),
 (u"n't", 131),
 (u'via', 126),
 (u'rt', 113),
 (u'people', 110),
 (u'president', 103),
 (u'one', 98),
 (u'first', 92),
 (u'us', 91),
 (u'world', 87),
 (u'man', 87),
 (u'could', 83),
 (u'donald', 81),
 (u'get', 76),
 (u'year', 71),
 (u'may', 71),
 (u'day', 71),
 (u'best', 69),
 (u'house', 69),
 (u'watch', 67),
 (u'video', 65),
 (u'life', 64),
 (u'say', 64),
 (u'years', 64),
 (u'time', 62)]



#### Non-Clickbait Distribution

In [107]:
# Top 30 post-text tokens among records labelled 'no-clickbait'.
token_freq_dist(data_train_X_notbait, k=30)

[(u"'s", 2171),
 (u'trump', 1205),
 (u'``', 1027),
 (u"''", 1017),
 (u'new', 726),
 (u'says', 459),
 (u'via', 398),
 (u"n't", 363),
 (u'rt', 337),
 (u'people', 319),
 (u'donald', 279),
 (u'president', 274),
 (u'first', 272),
 (u'one', 271),
 (u'world', 247),
 (u'us', 246),
 (u'could', 236),
 (u'police', 222),
 (u'say', 219),
 (u'man', 218),
 (u'house', 215),
 (u'day', 214),
 (u'u.s.', 210),
 (u'watch', 210),
 (u'video', 209),
 (u'may', 208),
 (u'best', 200),
 (u'amp', 195),
 (u'year', 192),
 (u'years', 191)]



### Target Keyword Frequency Distributions

In [None]:
def token_freq_dist(titles, k=10):
    """Pretty-print the ``k`` most frequent ``targetKeywords`` tokens, then a blank line.

    Previously the whole body was wrapped in a string literal (and contained an
    unbalanced parenthesis), so running this cell replaced the working
    ``token_freq_dist`` with a function that did nothing.

    Parameters
    ----------
    titles : iterable of dict
        Records carrying a string under the ``'targetKeywords'`` key (an empty
        string when the record has no keywords).
        NOTE(review): the keyword string is presumably comma-separated; separators
        are dropped by ``preprocess`` as punctuation - confirm against the data.
    k : int, optional
        Number of ``(token, count)`` pairs to display (default 10).

    Returns
    -------
    None
        Results are printed, not returned.
    """
    token_X_list = [preprocess(record['targetKeywords']) for record in titles]
    token_X_list = list(itertools.chain.from_iterable(token_X_list))
    token_X_dist = sorted(Counter(token_X_list).items(), key=operator.itemgetter(1), reverse=True)
    pprint(token_X_dist[0:k])
    print()