In [1]:
import pandas as pd
import json

data = pd.read_csv("stumbleupon.tsv", sep='\t')
data['title'] = data.boilerplate.map(lambda x: json.loads(x).get('title', ''))
data['body'] = data.boilerplate.map(lambda x: json.loads(x).get('body', ''))
data.head()

Unnamed: 0,url,urlid,boilerplate,alchemy_category,alchemy_category_score,avglinksize,commonlinkratio_1,commonlinkratio_2,commonlinkratio_3,commonlinkratio_4,...,linkwordscore,news_front_page,non_markup_alphanum_characters,numberOfLinks,numwords_in_url,parametrizedLinkRatio,spelling_errors_ratio,label,title,body
0,http://www.bloomberg.com/news/2010-12-23/ibm-p...,4042,"{""title"":""IBM Sees Holographic Calls Air Breat...",business,0.789131,2.055556,0.676471,0.205882,0.047059,0.023529,...,24,0,5424,170,8,0.152941,0.07913,0,IBM Sees Holographic Calls Air Breathing Batte...,A sign stands outside the International Busine...
1,http://www.popsci.com/technology/article/2012-...,8471,"{""title"":""The Fully Electronic Futuristic Star...",recreation,0.574147,3.677966,0.508021,0.28877,0.213904,0.144385,...,40,0,4973,187,9,0.181818,0.125448,1,The Fully Electronic Futuristic Starting Gun T...,And that can be carried on a plane without the...
2,http://www.menshealth.com/health/flu-fighting-...,1164,"{""title"":""Fruits that Fight the Flu fruits tha...",health,0.996526,2.382883,0.562016,0.321705,0.120155,0.042636,...,55,0,2240,258,11,0.166667,0.057613,1,Fruits that Fight the Flu fruits that fight th...,Apples The most popular source of antioxidants...
3,http://www.dumblittleman.com/2007/12/10-foolpr...,6684,"{""title"":""10 Foolproof Tips for Better Sleep ""...",health,0.801248,1.543103,0.4,0.1,0.016667,0.0,...,24,0,2737,120,5,0.041667,0.100858,1,10 Foolproof Tips for Better Sleep,There was a period in my life when I had a lot...
4,http://bleacherreport.com/articles/1205138-the...,9006,"{""title"":""The 50 Coolest Jerseys You Didn t Kn...",sports,0.719157,2.676471,0.5,0.222222,0.123457,0.04321,...,14,0,12032,162,10,0.098765,0.082569,0,The 50 Coolest Jerseys You Didn t Know Existed...,Jersey sales is a curious business Whether you...


## Predicting "Greenness" Of Content

This dataset comes from [stumbleupon](https://www.stumbleupon.com/), a web page recommender.  

A description of the columns is below

FieldName|Type|Description
---------|----|-----------
url|string|Url of the webpage to be classified
title|string|Title of the article
body|string|Body text of article
urlid|integer| StumbleUpon's unique identifier for each url
boilerplate|json|Boilerplate text
alchemy_category|string|Alchemy category (per the publicly available Alchemy API found at www.alchemyapi.com)
alchemy_category_score|double|Alchemy category score (per the publicly available Alchemy API found at www.alchemyapi.com)
avglinksize| double|Average number of words in each link
commonlinkratio_1|double|# of links sharing at least 1 word with 1 other links / # of links
commonlinkratio_2|double|# of links sharing at least 1 word with 2 other links / # of links
commonlinkratio_3|double|# of links sharing at least 1 word with 3 other links / # of links
commonlinkratio_4|double|# of links sharing at least 1 word with 4 other links / # of links
compression_ratio|double|Compression achieved on this page via gzip (measure of redundancy)
embed_ratio|double|Count of number of <embed> usage
frameBased|integer (0 or 1)|A page is frame-based (1) if it has no body markup but have a frameset markup
frameTagRatio|double|Ratio of iframe markups over total number of markups
hasDomainLink|integer (0 or 1)|True (1) if it contains an <a> with an url with domain
html_ratio|double|Ratio of tags vs text in the page
image_ratio|double|Ratio of <img> tags vs text in the page
is_news|integer (0 or 1) | True (1) if StumbleUpon's news classifier determines that this webpage is news
lengthyLinkDomain| integer (0 or 1)|True (1) if at least 3 <a> 's text contains more than 30 alphanumeric characters
linkwordscore|double|Percentage of words on the page that are in hyperlink's text
news_front_page| integer (0 or 1)|True (1) if StumbleUpon's news classifier determines that this webpage is front-page news
non_markup_alphanum_characters|integer| Page's text's number of alphanumeric characters
numberOfLinks|integer Number of <a>|markups
numwords_in_url| double|Number of words in url
parametrizedLinkRatio|double|A link is parametrized if it's url contains parameters or has an attached onClick event
spelling_errors_ratio|double|Ratio of words not found in wiki (considered to be a spelling mistake)
label|integer (0 or 1)|User-determined label. Either evergreen (1) or non-evergreen (0); available for train.tsv only

> ### Let's try extracting some of the text content.
> ### Create a feature for the title containing 'recipe'. Is the % of evegreen websites higher or lower on pages that have recipe in the the title?

In [27]:
# Option 1: Create a function to check for this

def has_recipe(text_in):
    try:
        if 'recipe' in str(text_in).lower():
            return 1
        else:
            return 0
    except: 
        return 0
        
data['recipe'] = data['title'].map(has_recipe)

# Option 2: lambda functions

#data['recipe'] = data['title'].map(lambda t: 1 if 'recipe' in str(t).lower() else 0)


# Option 3: string functions
data['recipe'] = data['title'].str.contains('recipe')

In [28]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import cross_val_score


model = DecisionTreeClassifier()

X = data[['image_ratio', 'html_ratio', 'recipe', 'label']].dropna()
y = X['label']
X.drop('label', axis=1, inplace=True)
    
    
# Fits the model
model.fit(X, y)

# ... #

scores = cross_val_score(model, X, y, scoring='roc_auc', cv=5)
print('CV AUC {}, Average AUC {}'.format(scores, scores.mean()))



CV AUC [ 0.54305863  0.52532651  0.54423384  0.5251413   0.56274208], Average AUC 0.5401004714441456


In [29]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators = 20, oob_score= True)
    
model.fit(X, y)

scores = cross_val_score(model, X, y, scoring='roc_auc', cv=5)
print('CV AUC {}, Average AUC {}'.format(scores, scores.mean()))

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])


CV AUC [ 0.56295158  0.57559422  0.59012318  0.56907472  0.57980152], Average AUC 0.5755090437733441


 ### Demo: Use of Spacy

In [7]:
from spacy.en import English
nlp_toolkit = English()

title = "IBM sees holographic calls, air breathing batteries"
parsed = nlp_toolkit(title)

for (i, word) in enumerate(parsed):
    print("Word: {}".format(word))
    print("\t Phrase type: {}".format(word.dep_))
    print("\t Is the word a known entity type? {}".format(word.ent_type_  if word.ent_type_ else "No"))
    print("\t Lemma: {}".format(word.lemma_))
    print("\t Parent of this word: {}".format(word.head.lemma_))

# nsubj: nominal subject
# ROOT: root
# amod: adjectival modifier
# dobj: direct object

# https://spacy.io/docs/usage/pos-tagging
# https://spacy.io/docs/api/annotation

Word: IBM
	 Phrase type: nsubj
	 Is the word a known entity type? ORG
	 Lemma: ibm
	 Parent of this word: see
Word: sees
	 Phrase type: ROOT
	 Is the word a known entity type? No
	 Lemma: see
	 Parent of this word: see
Word: holographic
	 Phrase type: amod
	 Is the word a known entity type? No
	 Lemma: holographic
	 Parent of this word: call
Word: calls
	 Phrase type: dobj
	 Is the word a known entity type? No
	 Lemma: call
	 Parent of this word: see
Word: ,
	 Phrase type: punct
	 Is the word a known entity type? No
	 Lemma: ,
	 Parent of this word: call
Word: air
	 Phrase type: compound
	 Is the word a known entity type? No
	 Lemma: air
	 Parent of this word: breathing
Word: breathing
	 Phrase type: compound
	 Is the word a known entity type? No
	 Lemma: breathing
	 Parent of this word: battery
Word: batteries
	 Phrase type: appos
	 Is the word a known entity type? No
	 Lemma: battery
	 Parent of this word: call


In [8]:
title = "Tom likes eating food"
parsed = nlp_toolkit(title)

for (i, word) in enumerate(parsed):
    print("Word: {}".format(word))
    print("\t Phrase type: {}".format(word.dep_))
    print("\t Is the word a known entity type? {}".format(word.ent_type_  if word.ent_type_ else "No"))
    print("\t Lemma: {}".format(word.lemma_))
    print("\t Parent of this word: {}".format(word.head.lemma_))
    
# note: you can make spacy learn using your own functions

Word: Tom
	 Phrase type: nsubj
	 Is the word a known entity type? PERSON
	 Lemma: tom
	 Parent of this word: like
Word: likes
	 Phrase type: ROOT
	 Is the word a known entity type? No
	 Lemma: like
	 Parent of this word: like
Word: eating
	 Phrase type: xcomp
	 Is the word a known entity type? No
	 Lemma: eat
	 Parent of this word: like
Word: food
	 Phrase type: dobj
	 Is the word a known entity type? No
	 Lemma: food
	 Parent of this word: eat


## Investigate Page Titles

Let's see if we can find organizations in our page titles.

In [12]:
def references_organization(title):
    parsed = nlp_toolkit(title)
    return any([word.ent_type_ == 'ORG' for word in parsed])

data['references_organization'] = data['title'].fillna(u'').map(references_organization)

# Take a look
data[data['references_organization']][['title','references_organization']].head() 
# gives you results where references_organization is True

Unnamed: 0,title,references_organization
0,IBM Sees Holographic Calls Air Breathing Batte...,True
5,Genital Herpes Treatment,True
6,fashion lane American Wild Child,True
8,Valet The Handbook 31 Days 31 days,True
10,Business Financial News Breaking US Internatio...,True


## Exercise:

Lets write a function to identify titles that mention an organization (ORG) and a person (PERSON).

In [14]:
## Exercise solution
def references_org_person(title):
    parsed = nlp_toolkit(title)
    contains_org = any([word.ent_type_ == 'ORG' for word in parsed])
    contains_person = any([word.ent_type_ == 'PERSON' for word in parsed])
    return contains_org and contains_person

data['references_org_person'] = data['title'].fillna(u'').map(references_org_person)

# Take a look
data[data['references_org_person']][['title','references_org_person']].head()

Unnamed: 0,title,references_org_person
11,A Tip of the Cap to The Greatest Iron Man of T...,True
29,Genevieve Morton Swimsuit by Tyler Rose Swimwe...,True
44,Alyssa Miller Swimsuit by Charlie by Matthew Z...,True
114,Baby Gorilla Tries To Act Tough Video,True
115,BBC News UK Sweet message in a bottle,True


In [15]:
## Exercise solution
def references_product(title):
    parsed = nlp_toolkit(title)
    contains_prod = any([word.ent_type_ == 'PRODUCT' for word in parsed])
    return contains_prod

data['references_product'] = data['title'].fillna(u'').map(references_product)

# Take a look
data[data['references_product']][['title','references_product']].head()

Unnamed: 0,title,references_product
471,Why Ford Won t Sell Its 65mpg Car in the US Mi...,True
1091,No Churn homemade icecream The Wanna be Countr...,True
1112,Lego Milling Machine Creates Amazing 3D Sculpt...,True
1134,Watch This Snowboarder Survive an Avalanche By...,True
1324,Monster Marshmallow Cookies Cinnamon Girl Reci...,True


In [21]:
## Exercise solution
def references_gpe(title):
    parsed = nlp_toolkit(title)
    contains_gpe = any([word.ent_type_ == 'GPE' for word in parsed]) #Countries, cities, states
    return contains_gpe

data['references_gpe'] = data['title'].fillna(u'').map(references_gpe)

# Take a look
print(data[data['references_gpe']][['title','references_gpe']].head())

print(data['references_gpe'].value_counts())

                                                title  references_gpe
28      Supermodels show off their bikini bods Humor             True
40                          The bottled water fad UK             True
61  Banned s Afternoon Picdump 110 Pictures Banned...            True
66              NewsNow co uk The UK s 1 news portal             True
73  Clean and Disinfect with Tap Water The Activei...            True
False    6983
True      412
Name: references_gpe, dtype: int64


In [22]:
def contains_adjective(title):
    parsed = nlp_toolkit(title)
    contains_adj = any([word.dep_ == 'amod' for word in parsed])
    return contains_adj

data['contains_adjective'] = data['title'].fillna(u'').map(contains_adjective)

# Take a look
data[data['contains_adjective']][['title','contains_adjective']].head()

Unnamed: 0,title,contains_adjective
0,IBM Sees Holographic Calls Air Breathing Batte...,True
1,The Fully Electronic Futuristic Starting Gun T...,True
4,The 50 Coolest Jerseys You Didn t Know Existed...,True
5,Genital Herpes Treatment,True
7,Racing For Recovery by Dean Johnson racing for...,True


 ### Demo: Use of the Count Vectorizer

In [31]:
titles = data['title'].fillna('')

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(max_features = 1000, # use only 1000 most frequent words
                             ngram_range=(1, 2), # 1-grams and bigrams, ngram_range=(2,2) would be bigrams only
                             stop_words='english', # remove words like 'it', 'to' etc.
                             binary=True) # return T/F rather than count
                            #,tokenizer=None this is where you can provide your own dictionary of words to search for

# Use `fit` to learn the vocabulary of the titles
vectorizer.fit(titles)

# Use `tranform` to generate the sample X word matrix - one column per feature (word or n-grams)
X = vectorizer.transform(titles)

# words which appear in the test set but not in the train will be excluded

In [32]:
vectorizer.get_feature_names()

['000',
 '10',
 '10 best',
 '10 things',
 '10 ways',
 '100',
 '101',
 '101 cookbooks',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '20',
 '2007',
 '2008',
 '2008 sports',
 '2009',
 '2010',
 '2010 sports',
 '2011',
 '2011 sports',
 '2012',
 '2013',
 '2013 check',
 '2013 sports',
 '22',
 '24',
 '25',
 '30',
 '3d',
 '50',
 '8211',
 '8217',
 '8230',
 'abs',
 'accessories',
 'actually',
 'adventures',
 'advice',
 'air',
 'alcohol',
 'allrecipes',
 'allrecipes com',
 'almond',
 'alton brown',
 'amazing',
 'america',
 'american',
 'analysis',
 'anderson',
 'android',
 'angeles',
 'angeles slideshows',
 'anxiety',
 'apple',
 'apple pie',
 'apples',
 'apps',
 'archive',
 'art',
 'artichoke',
 'asian',
 'athletes',
 'atlantic',
 'attack',
 'avocado',
 'awesome',
 'baby',
 'bacon',
 'bad',
 'bake',
 'baked',
 'baker',
 'bakers',
 'baking',
 'ball',
 'balls',
 'balsamic',
 'banana',
 'banana bread',
 'bar',
 'bar refaeli',
 'bars',
 'basil',
 'bbc',
 'bbc food',
 'bbc news',
 'bbq',
 

In [34]:
X.shape

(7395, 1000)

In [35]:
print(X)

# read as (title ID, word ID)   frequency

  (0, 43)	1
  (2, 209)	1
  (2, 357)	1
  (2, 384)	1
  (2, 435)	1
  (2, 564)	1
  (2, 565)	1
  (3, 1)	1
  (3, 102)	1
  (3, 797)	1
  (3, 907)	1
  (4, 34)	1
  (4, 240)	1
  (4, 504)	1
  (6, 51)	1
  (6, 347)	1
  (6, 970)	1
  (7, 217)	1
  (7, 476)	1
  (7, 477)	1
  (8, 273)	1
  (9, 137)	1
  (9, 231)	1
  (9, 232)	1
  (9, 245)	1
  :	:
  (7391, 213)	1
  (7392, 311)	1
  (7392, 312)	1
  (7392, 691)	1
  (7392, 863)	1
  (7394, 23)	1
  (7394, 24)	1
  (7394, 217)	1
  (7394, 314)	1
  (7394, 316)	1
  (7394, 393)	1
  (7394, 462)	1
  (7394, 463)	1
  (7394, 583)	1
  (7394, 584)	1
  (7394, 662)	1
  (7394, 663)	1
  (7394, 787)	1
  (7394, 788)	1
  (7394, 789)	1
  (7394, 829)	1
  (7394, 830)	1
  (7394, 865)	1
  (7394, 868)	1
  (7394, 869)	1


In [37]:
vectorizer.get_feature_names()[43] in titles[0]

True

In [38]:
vectorizer.get_feature_names()[43]

'air'

In [39]:
titles[0]

'IBM Sees Holographic Calls Air Breathing Batteries ibm sees holographic calls, air-breathing batteries'

In [40]:
titles[1]

'The Fully Electronic Futuristic Starting Gun That Eliminates Advantages in Races the fully electronic, futuristic starting gun that eliminates advantages in races the fully electronic, futuristic starting gun that eliminates advantages in races'

 ### Demo: Build a random forest model to predict evergreeness of a website using the title features

In [30]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators = 20)
    
# Use `fit` to learn the vocabulary of the titles
vectorizer.fit(titles)

# Use `tranform` to generate the sample X word matrix - one column per feature (word or n-grams)
X = vectorizer.transform(titles).toarray()
y = data['label']

from sklearn.cross_validation import cross_val_score

scores = cross_val_score(model, X, y, scoring='roc_auc')
print('CV AUC {}, Average AUC {}'.format(scores, scores.mean()))

CV AUC [ 0.79093865  0.80496739  0.80564403], Average AUC 0.800516691549724


### Exercise: Build a random forest model to predict evergreeness of a website using the title features and quantitative features

In [47]:
model = RandomForestClassifier(n_estimators = 20)
vectorizer.fit(titles)

X = pd.merge(pd.DataFrame(vectorizer.transform(titles).toarray())
             ,data[['html_ratio','image_ratio','numwords_in_url']]
             ,left_index=True
             ,right_index=True)
y = data['label']

scores = cross_val_score(model, X, y, scoring='roc_auc')
print('CV AUC {}, Average AUC {}'.format(scores, scores.mean()))

CV AUC [ 0.79284558  0.81219005  0.79194586], Average AUC 0.7989938289015477


 ### Exercise: Build a random forest model to predict evergreeness of a website using the body features

In [49]:
## TODO

body = data['body'].fillna('')

model = RandomForestClassifier(n_estimators = 20)

vectorizer.fit(body)

X = vectorizer.transform(body).toarray()
y = data['label']

scores = cross_val_score(model, X, y, scoring='roc_auc')
print('CV AUC {}, Average AUC {}'.format(scores, scores.mean()))

CV AUC [ 0.83393793  0.85358465  0.83556719], Average AUC 0.8410299224814045


 ### Exercise: Use `TfIdfVectorizer` instead of `CountVectorizer` - is this an improvement?

In [53]:
## TODO

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features = 1000,
                             ngram_range=(1, 2),
                             stop_words='english',
                             binary=True)

body = data['body'].fillna('')

model = RandomForestClassifier(n_estimators = 20)

vectorizer.fit(body)

X = vectorizer.transform(body).toarray()
y = data['label']

scores = cross_val_score(model, X, y, scoring='roc_auc')
print('CV AUC {}, Average AUC {}'.format(scores, scores.mean()))

CV AUC [ 0.84093569  0.8538083   0.84409999], Average AUC 0.8462813271104385
