In [1]:
import pandas as pd
import json

# Load the StumbleUpon evergreen dataset (tab-separated).
data = pd.read_csv("../../lesson-12/code/data/stumbleupon.tsv", sep='\t')

# Each `boilerplate` value is a JSON document. Parse it once per row and pull
# both fields from the parsed result instead of decoding the JSON twice.
boilerplate_docs = data.boilerplate.map(json.loads)
data['title'] = boilerplate_docs.map(lambda doc: doc.get('title', ''))
data['body'] = boilerplate_docs.map(lambda doc: doc.get('body', ''))
data.head()

Unnamed: 0,url,urlid,boilerplate,alchemy_category,alchemy_category_score,avglinksize,commonlinkratio_1,commonlinkratio_2,commonlinkratio_3,commonlinkratio_4,...,linkwordscore,news_front_page,non_markup_alphanum_characters,numberOfLinks,numwords_in_url,parametrizedLinkRatio,spelling_errors_ratio,label,title,body
0,http://www.bloomberg.com/news/2010-12-23/ibm-p...,4042,"{""title"":""IBM sees holographic calls, air brea...",business,0.789131,2.055556,0.676471,0.205882,0.047059,0.023529,...,24,0,5424,170,8,0.152941,0.07913,0,"IBM sees holographic calls, air breathing batt...",A sign stands outside the International Busine...
1,http://www.popsci.com/technology/article/2012-...,8471,"{""title"":""The fully electronic futuristic star...",recreation,0.574147,3.677966,0.508021,0.28877,0.213904,0.144385,...,40,0,4973,187,9,0.181818,0.125448,1,The fully electronic futuristic starting gun t...,And that can be carried on a plane without the...
2,http://www.menshealth.com/health/flu-fighting-...,1164,"{""title"":""Fruits that Fight the Flu fruits tha...",health,0.996526,2.382883,0.562016,0.321705,0.120155,0.042636,...,55,0,2240,258,11,0.166667,0.057613,1,Fruits that Fight the Flu fruits that fight th...,Apples The most popular source of antioxidants...
3,http://www.dumblittleman.com/2007/12/10-foolpr...,6684,"{""title"":""10 Foolproof Tips for Better Sleep ""...",health,0.801248,1.543103,0.4,0.1,0.016667,0.0,...,24,0,2737,120,5,0.041667,0.100858,1,10 Foolproof Tips for Better Sleep,There was a period in my life when I had a lot...
4,http://bleacherreport.com/articles/1205138-the...,9006,"{""title"":""The 50 Coolest Jerseys You Didn t Kn...",sports,0.719157,2.676471,0.5,0.222222,0.123457,0.04321,...,14,0,12032,162,10,0.098765,0.082569,0,The 50 Coolest Jerseys You Didn t Know Existed...,Jersey sales is a curious business Whether you...


## Predicting "Greenness" Of Content

This dataset comes from [stumbleupon](https://www.stumbleupon.com/), a web page recommender.  

A description of the columns is below

FieldName|Type|Description
---------|----|-----------
url|string|Url of the webpage to be classified
title|string|Title of the article
body|string|Body text of article
urlid|integer| StumbleUpon's unique identifier for each url
boilerplate|json|Boilerplate text
alchemy_category|string|Alchemy category (per the publicly available Alchemy API found at www.alchemyapi.com)
alchemy_category_score|double|Alchemy category score (per the publicly available Alchemy API found at www.alchemyapi.com)
avglinksize| double|Average number of words in each link
commonlinkratio_1|double|# of links sharing at least 1 word with 1 other link / # of links
commonlinkratio_2|double|# of links sharing at least 1 word with 2 other links / # of links
commonlinkratio_3|double|# of links sharing at least 1 word with 3 other links / # of links
commonlinkratio_4|double|# of links sharing at least 1 word with 4 other links / # of links
compression_ratio|double|Compression achieved on this page via gzip (measure of redundancy)
embed_ratio|double|Count of number of &lt;embed&gt; usage
frameBased|integer (0 or 1)|A page is frame-based (1) if it has no body markup but has a frameset markup
frameTagRatio|double|Ratio of iframe markups over total number of markups
hasDomainLink|integer (0 or 1)|True (1) if it contains an &lt;a&gt; with a URL with domain
html_ratio|double|Ratio of tags vs text in the page
image_ratio|double|Ratio of &lt;img&gt; tags vs text in the page
is_news|integer (0 or 1) | True (1) if StumbleUpon's news classifier determines that this webpage is news
lengthyLinkDomain| integer (0 or 1)|True (1) if at least 3 &lt;a&gt; 's text contains more than 30 alphanumeric characters
linkwordscore|double|Percentage of words on the page that are in hyperlink's text
news_front_page| integer (0 or 1)|True (1) if StumbleUpon's news classifier determines that this webpage is front-page news
non_markup_alphanum_characters|integer| Page's text's number of alphanumeric characters
numberOfLinks|integer| Number of &lt;a&gt; markups
numwords_in_url| double|Number of words in url
parametrizedLinkRatio|double|A link is parametrized if its URL contains parameters or has an attached onClick event
spelling_errors_ratio|double|Ratio of words not found in wiki (considered to be a spelling mistake)
label|integer (0 or 1)|User-determined label. Either evergreen (1) or non-evergreen (0); available for train.tsv only

 ### Review: Use of the Count Vectorizer
 
 We previously used the Count Vectorizer to extract text features for this classification task

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

# Missing titles become empty strings so the vectorizer only ever sees text.
titles = data['title'].fillna('')

# Bag-of-words features: at most 1000 unigrams and bigrams, English stop
# words removed, raw counts rather than 0/1 indicators.
vectorizer = CountVectorizer(max_features=1000,
                             ngram_range=(1, 2),
                             stop_words='english',
                             binary=False)

# `fit_transform` learns the vocabulary of the titles (`fit`) and generates the
# sample x feature matrix (`transform`) - one column per word or n-gram -
# in a single pass over the text.
X = vectorizer.fit_transform(titles)

 ### Review: Build a model to predict evergreenness of a website
 
 Then we used those features to build a classification model

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# L1-regularised logistic regression. The solver is named explicitly because
# scikit-learn's newer default solver (lbfgs) does not support the L1 penalty;
# liblinear does, and it is the solver shown in the fitted-model output.
model = LogisticRegression(penalty='l1', solver='liblinear')
y = data['label']

# 3-fold cross-validated ROC AUC. `cv` is pinned so the number of folds does
# not depend on the installed scikit-learn version's default.
scores = cross_val_score(model, X, y, scoring='roc_auc', cv=3)
print('CV AUC {}, Average AUC {}'.format(scores, scores.mean()))

CV AUC [ 0.8024141   0.81964822  0.81211154], Average AUC 0.811391287699


 ### Demo: Pipelines
 
Often we will want to combine these steps to evaluate on some future dataset. For that incoming, future dataset, we need to make sure we perform the **exact same** transformations on the data. If `has_brownies_in_text` is column 19, we need to make sure it is column 19 when it comes to evaluation time. 

Pipelines combine all of the pre-processing steps and model building into a single object.

Rather than manually evaluating the transformers and then feeding them into the model, pipelines tie these steps together. Similar to models and vectorizers in scikit-learn, they are equipped with `fit` and `predict` or `predict_proba` methods as any model would be, but they ensure the proper data transformations are performed

In [4]:
from sklearn.pipeline import Pipeline

# Chain the text vectorizer and the classifier into one estimator, so that
# fitting and predicting always apply the same transformation to the text.
pipeline = Pipeline(steps=[
    ('features', vectorizer),
    ('model', model),
])

In [5]:
# Treat the first 6000 rows as the data available at training time.
training_data = data.iloc[:6000]
X_train = training_data['title'].fillna('')
y_train = training_data['label']

# The remaining rows stand in for pages that arrive in the future and are
# unavailable at training time; only their titles are used here.
X_new = data['title'].iloc[6000:].fillna('')

In [6]:
# Fit the full pipeline on the training titles and labels.
# This performs the steps laid out above in order:
# first the vectorizer is fit on the raw titles and used to transform them,
# then the resulting feature matrix is fed into the model's `fit`.

pipeline.fit(X_train, y_train)

Pipeline(steps=[('features', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=1000, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words='english',
 ...ty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [7]:
# Apply the full fitted pipeline to the unseen titles.
# The raw text is transformed automatically with the vectorizer fit above, so
# the features match the ones the model was trained on.
# The result has one row per title and one probability column per class.
pipeline.predict_proba(X_new)

array([[ 0.45563224,  0.54436776],
       [ 0.36441937,  0.63558063],
       [ 0.07973325,  0.92026675],
       ..., 
       [ 0.41561022,  0.58438978],
       [ 0.57713583,  0.42286417],
       [ 0.6516715 ,  0.3483285 ]])

> ### Exercise: Add a MaxAbsScaler scaling step to the pipeline as well; this should occur after the vectorization

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MaxAbsScaler

# Same vectorizer and model as before, with a scaling step inserted between
# them so the counts are rescaled before they reach the classifier.
pipeline = Pipeline(steps=[
    ('features', vectorizer),
    ('scaling', MaxAbsScaler()),
    ('model', model),
])

# Fit all three steps, in order, on the training titles and labels.
pipeline.fit(X_train, y_train)

Pipeline(steps=[('features', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=1000, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words='english',
 ...ty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [9]:
# Score the unseen titles again, this time through the pipeline that includes
# the scaling step.
pipeline.predict_proba(X_new)

array([[ 0.45314334,  0.54685666],
       [ 0.4705122 ,  0.5294878 ],
       [ 0.08209681,  0.91790319],
       ..., 
       [ 0.4009298 ,  0.5990702 ],
       [ 0.57225252,  0.42774748],
       [ 0.65554346,  0.34445654]])

Additionally, if we want to merge many different feature sets automatically, we can use `FeatureUnion`.

> ### Exercise: Combining GridSearchCV with Pipelines
http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

In [12]:
from sklearn.model_selection import GridSearchCV

In [13]:
# Candidate vectorizer settings to search over. Keys follow the
# `<step name>__<parameter>` convention, so both entries target the
# 'features' step of the pipeline. Despite the variable's name, this dict is
# the parameter grid passed to the search.
fit_params = {
    'features__max_features': [500, 1000],
    'features__ngram_range': [(1, 2), (1, 1)],
}

# Evaluate every combination in the grid with 5-fold cross-validation.
gcv = GridSearchCV(estimator=pipeline, param_grid=fit_params, cv=5)

In [14]:
# Run the search: each parameter combination is cross-validated on the
# training titles and labels.
gcv.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('features', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=1000, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words='english',
 ...ty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'features__max_features': [500, 1000], 'features__ngram_range': [(1, 2), (1, 1)]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [15]:
# The pipeline configured with the best-scoring parameter combination.
gcv.best_estimator_

Pipeline(steps=[('features', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=1000, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
 ...ty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [16]:
# Mean cross-validated score of the best parameter combination.
# NOTE: no `scoring` was passed to GridSearchCV, so this comes from the
# estimator's default `score` method (mean accuracy for LogisticRegression,
# per the scikit-learn docs) and is not comparable to the ROC AUC values
# computed earlier with scoring='roc_auc'.
gcv.best_score_

0.75049999999999994

In [17]:
# Full search log: timings, parameters, and train/test scores for every
# parameter combination that was tried.
gcv.cv_results_

{'mean_fit_time': array([ 0.1839838 ,  0.07799544,  0.18360839,  0.07768188]),
 'mean_score_time': array([ 0.02881379,  0.01642962,  0.03053474,  0.01590395]),
 'mean_test_score': array([ 0.74416667,  0.746     ,  0.74966667,  0.7505    ]),
 'mean_train_score': array([ 0.76820814,  0.76924984,  0.78345821,  0.7848749 ]),
 'param_features__max_features': masked_array(data = [500 500 1000 1000],
              mask = [False False False False],
        fill_value = ?),
 'param_features__ngram_range': masked_array(data = [(1, 2) (1, 1) (1, 2) (1, 1)],
              mask = [False False False False],
        fill_value = ?),
 'params': ({'features__max_features': 500, 'features__ngram_range': (1, 2)},
  {'features__max_features': 500, 'features__ngram_range': (1, 1)},
  {'features__max_features': 1000, 'features__ngram_range': (1, 2)},
  {'features__max_features': 1000, 'features__ngram_range': (1, 1)}),
 'rank_test_score': array([4, 3, 2, 1], dtype=int32),
 'split0_test_score': array([ 0.733