# Class 10 - Starter Code

Predicting Evergreeness of Content using Decision Trees and Random Forests

In [None]:
import numpy as np
import pandas as pd
import json
from sklearn import cross_validation
from sklearn import grid_search
from sklearn import metrics

import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style="whitegrid", font_scale=1.5)
%matplotlib inline

### Load Dataset and Pre-Process

In [None]:
# Read the tab-separated StumbleUpon dataset into a DataFrame
data = pd.read_csv("../../assets/dataset/stumbleupon.tsv", sep='\t')

# Pull the `title` and `body` fields out of the JSON in `boilerplate`;
# a key that is absent from the JSON object becomes an empty string
for field in ('title', 'body'):
    data[field] = data['boilerplate'].map(
        lambda raw, key=field: json.loads(raw).get(key, ''))

# Preview the first rows
data.head()

### Predicting "Greenness" Of Content

### Data Dictionary

This exercise uses the [Kaggle StumbleUpon Evergreen Classification Challenge](https://www.kaggle.com/c/stumbleupon)

This dataset comes from [StumbleUpon](https://www.stumbleupon.com/), a web page recommender. A description of the columns is below:

FieldName|Type|Description
---------|----|-----------
url|string|Url of the webpage to be classified
title|string|Title of the article
body|string|Body text of article
urlid|integer| StumbleUpon's unique identifier for each url
boilerplate|json|Boilerplate text
alchemy_category|string|Alchemy category (per the publicly available Alchemy API found at www.alchemyapi.com)
alchemy_category_score|double|Alchemy category score (per the publicly available Alchemy API found at www.alchemyapi.com)
avglinksize| double|Average number of words in each link
commonlinkratio_1|double|# of links sharing at least 1 word with 1 other links / # of links
commonlinkratio_2|double|# of links sharing at least 1 word with 2 other links / # of links
commonlinkratio_3|double|# of links sharing at least 1 word with 3 other links / # of links
commonlinkratio_4|double|# of links sharing at least 1 word with 4 other links / # of links
compression_ratio|double|Compression achieved on this page via gzip (measure of redundancy)
embed_ratio|double|Count of number of <embed> usage
frameBased|integer (0 or 1)|A page is frame-based (1) if it has no body markup but has frameset markup
frameTagRatio|double|Ratio of iframe markups over total number of markups
hasDomainLink|integer (0 or 1)|True (1) if it contains an <a> with an url with domain
html_ratio|double|Ratio of tags vs text in the page
image_ratio|double|Ratio of <img> tags vs text in the page
is_news|integer (0 or 1) | True (1) if StumbleUpon's news classifier determines that this webpage is news
lengthyLinkDomain| integer (0 or 1)|True (1) if at least 3 <a> 's text contains more than 30 alphanumeric characters
linkwordscore|double|Percentage of words on the page that are in hyperlink's text
news_front_page| integer (0 or 1)|True (1) if StumbleUpon's news classifier determines that this webpage is front-page news
non_markup_alphanum_characters|integer| Page's text's number of alphanumeric characters
numberOfLinks|integer|Number of <a> markups
numwords_in_url| double|Number of words in url
parametrizedLinkRatio|double|A link is parametrized if its url contains parameters or has an attached onClick event
spelling_errors_ratio|double|Ratio of words not found in wiki (considered to be a spelling mistake)
label|integer (0 or 1)|User-determined label. Either evergreen (1) or non-evergreen (0); available for train.tsv only

### What are 'evergreen' sites?

> #### Evergreen sites are those that are always relevant.  As opposed to breaking news or current events, evergreen websites are relevant no matter the time or season. 

> #### A sample of URLs is below, where label = 1 are 'evergreen' websites

In [None]:
# Preview each URL next to its label (1 = evergreen, 0 = non-evergreen)
data.loc[:, ['url', 'label']].head()

# Part 1: Explore the dataset

### 1.1 Does being a news site affect evergreeness? 
Compute or plot the percentage of news related evergreen sites.

In [None]:
# Using groupby()
### FILL IN ###

In [None]:
# Using a plot
### FILL IN ###

### 1.2 Does category in general affect evergreeness? 
Plot the rate of evergreen sites for all Alchemy categories.

In [None]:
# Using groupby()
### FILL IN ###

In [None]:
# Using a plot
### FILL IN ###

### 1.3 How many articles are there per category?

In [None]:
# Using groupby()
### FILL IN ###

In [None]:
# Using a plot
### FILL IN ###

### 1.4 Explore additional relationships
Are there any other relationships you brainstormed?

In [None]:
### FILL IN ###

### 1.5 Can you create any additional features?
Create a feature that indicates whether the title contains the word 'recipe'. Is the percentage of evergreen websites higher or lower on pages that have 'recipe' in the title?

Hint: http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.str.contains.html

In [None]:
# Check if title contains the word 'recipe'.
# case=False makes the match case-insensitive, so 'Recipe' and 'RECIPE' count too.
# na=False marks a missing title as False, which keeps the column a plain
# boolean instead of a True/False/NaN mix.
data['recipe_in_title'] = data['title'].str.contains('recipe', case=False, na=False)

In [None]:
# Mean of `label` (i.e. the evergreen rate) for each value of `recipe_in_title`
data[['recipe_in_title', 'label']].groupby('recipe_in_title').mean()

In [None]:
# Bar chart of the mean `label` for each value of `recipe_in_title`
sns.factorplot(data=data, kind='bar', x='recipe_in_title', y='label')

# Part 2:  Let's Explore Some Decision Trees

Demo: Build a decision tree model to predict the "evergreeness" of a given website. 

### 2.1 Pre-process dataset

In [None]:
# Per-column summary: the dtype and the number of null entries
column_dtypes = data.dtypes
null_counts = data.isnull().sum()
pd.DataFrame({'dtypes': column_dtypes, 'missing': null_counts})

In [None]:
# Remove every row that has at least one missing value
data = data.dropna()

# One-hot encode alchemy_category (columns prefixed 'alchemy_cat_') and
# replace the original column with the dummy columns
category_dummies = pd.get_dummies(data['alchemy_category'], prefix='alchemy_cat')
data = data.join(category_dummies).drop(['alchemy_category'], axis=1)

### 2.2 Build a Decision Tree Model

In [None]:
from sklearn import tree

# Input columns: two page ratios, the recipe flag and every category dummy
category_columns = [col for col in data.columns if col.startswith('alchemy_cat_')]
features = ['image_ratio', 'html_ratio', 'recipe_in_title'] + category_columns

# Name of the column to predict
target = 'label'

# Feature matrix and target vector
X = data[features]
y = data[target]

# Hold out 40% of the rows as a test set (seeded so the split is repeatable)
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X, y, test_size=0.4, random_state=42)

# Decision tree with default parameters, fitted on the training rows
tm = tree.DecisionTreeClassifier()
tm.fit(X_train, y_train)

# Accuracy on the held-out rows
test_accuracy = tm.score(X_test, y_test)
print("Accuracy: %0.3f" % test_accuracy)

# ROC AUC on the held-out rows, scored with the second predict_proba column
test_scores = tm.predict_proba(X_test)[:, 1]
print('ROC AUC: %0.3f' % metrics.roc_auc_score(y_test, test_scores))

### 2.3 Evaluate the Decision Tree Model

In [None]:
# Confusion matrix of the test-set predictions
y_pred = tm.predict(X_test)
cm = metrics.confusion_matrix(y_test, y_pred)

# Divide every row by its own total so each row sums to 1
row_totals = cm.sum(axis=1, keepdims=True)
cm_normalized = cm.astype('float') / row_totals

# Heatmap of the normalised matrix with labelled axes
ax = plt.axes()
sns.heatmap(cm_normalized, annot=True, ax=ax)
ax.set_xlabel('Pred')
ax.set_ylabel('True')
plt.show()

# Raw counts
print("Confusion Matrix:")
print(cm)

# Part 3: Adjusting Decision Trees to Avoid Overfitting

### 3.1 Check if the model is overfit by checking accuracy on training set vs test set

In [None]:
# Accuracy of the tree on the training set first, then on the test set
for features_part, labels_part in [(X_train, y_train), (X_test, y_test)]:
    print("Accuracy: %0.3f" % tm.score(features_part, labels_part))

### 3.2 Demo: Control for overfitting in the decision model by adjusting the maximum number of questions (max_depth) or the minimum number of records in each final node (min_samples_leaf)

In [None]:
# Instantiate a constrained (non-default) tree to limit overfitting:
# max_depth=2 caps the depth of the tree and min_samples_leaf=5 requires
# at least 5 training records in every leaf
tm = tree.DecisionTreeClassifier(max_depth=2, min_samples_leaf=5)

# Train model on training set
tm.fit(X_train, y_train)

In [None]:
# Score the tree on both splits, then report train accuracy followed by test accuracy
train_accuracy = tm.score(X_train, y_train)
test_accuracy = tm.score(X_test, y_test)
print("Accuracy: %0.3f" % train_accuracy)
print("Accuracy: %0.3f" % test_accuracy)

# Part 4: Random Forests

### 4.1 Demo: Build a random forest model to predict the evergreeness of a website. 

In [None]:
from sklearn import ensemble

# Instantiate model: a forest of 20 trees.
# random_state is fixed (same seed as the train/test split) so the fitted
# forest, and therefore the reported accuracies, are reproducible between runs.
rf = ensemble.RandomForestClassifier(n_estimators=20, random_state=42)

# Train model on training set
rf.fit(X_train, y_train)

In [None]:
# Evaluate the random forest (rf, not the decision tree tm) on train set
print("Accuracy: %0.3f" % rf.score(X_train, y_train))

# Evaluate the random forest on test set
print("Accuracy: %0.3f" % rf.score(X_test, y_test))

### 4.2 Tune and update the model

In [None]:
# Set list of values to grid search over
n = [1, 2, 3, 10, 20, 30, 100, 200, 300]
params = {'n_estimators': n}

# Perform grid search using list of values.
# The forest is seeded so the search picks the same value on every run.
gs = grid_search.GridSearchCV(
    estimator=ensemble.RandomForestClassifier(random_state=42),
    param_grid=params)
gs.fit(X_train, y_train)

# Get best value to use
print("Best Params:")
print(gs.best_params_)

# Get improvement.
# best_score_ is the mean cross-validated score obtained on the training data,
# not a test-set accuracy, so it is labelled as such; it is only a rough guide
# next to the test accuracy of the current model.
print("Accuracy of current model: %0.3f" % rf.score(X_test, y_test))
print("Mean CV accuracy using best param: %0.3f" % gs.best_score_)

# Plot scores: element 1 of each grid_scores_ entry is its mean validation score
plt.plot(n, [s[1] for s in gs.grid_scores_])



# Show the model as currently configured, with its test accuracy
print(rf)
print("Accuracy of current model: %0.3f" % rf.score(X_test, y_test))

# Switch to the n_estimators value chosen by the grid search and refit
best_n_estimators = gs.best_params_['n_estimators']
rf.set_params(n_estimators=best_n_estimators)
rf.fit(X_train, y_train)

# Show the model after the update, with its new test accuracy
print(rf)
print("Accuracy of updated model: %0.3f" % rf.score(X_test, y_test))

### 4.3 Extract Feature Importances

In [None]:
# Importance score of every individual feature, largest first
features = X.columns
feature_importances = rf.feature_importances_

importance_table = {'Features': features, 'Importance Score': feature_importances}
features_df = pd.DataFrame(importance_table).sort_values(
    'Importance Score', ascending=False)

sns.barplot(x='Importance Score', y='Features', data=features_df)

In [None]:
# Function to combine dummy features importances
def combine_dummies(dummy_prefixes, feature_dict=None):
    """Collapse per-dummy importance scores into one entry per prefix.

    For every prefix, all keys of ``feature_dict`` that start with that prefix
    are removed and replaced by a single key (the prefix itself) whose value is
    the sum of the removed values. A prefix that matches no key is still added,
    with value 0.

    Parameters
    ----------
    dummy_prefixes : iterable of str
        Prefixes identifying groups of dummy columns, e.g. ``['alchemy_cat_']``.
    feature_dict : dict, optional
        Mapping of feature name -> importance score. It is modified in place.
        When omitted, the module-level ``feature_dict`` is used, which is what
        existing one-argument calls rely on.

    Returns
    -------
    dict
        The same (mutated) dictionary object.
    """
    if feature_dict is None:
        # Backward compatibility: fall back to the notebook-level variable
        feature_dict = globals()['feature_dict']
    for p in dummy_prefixes:
        # Materialise the matching keys in a list: they are iterated twice and
        # the dict is mutated in between, which a lazy filter() (Python 3)
        # would not survive.
        sub_keys = [k for k in feature_dict if k.startswith(p)]
        sub_keys_sum = sum(feature_dict[k] for k in sub_keys)
        for k in sub_keys:
            feature_dict.pop(k)
        feature_dict[p] = sub_keys_sum
    return feature_dict

In [None]:
# Importance scores again, with all 'alchemy_cat_' dummies merged into one bar
feature_names = X.columns
feature_importances = rf.feature_importances_
feature_dummy_prefixes = ['alchemy_cat_']

# combine_dummies reads the notebook-level feature_dict, so it must be
# assigned before the call
feature_dict = dict(zip(feature_names, feature_importances))
feature_dict = combine_dummies(feature_dummy_prefixes)

importance_rows = list(feature_dict.items())
features_df = pd.DataFrame(
    importance_rows, columns=['Features', 'Importance Score']
).sort_values('Importance Score', ascending=False)

sns.barplot(x='Importance Score', y='Features', data=features_df)

# Part 5: Improve Random Forest Model through Feature Engineering

### 5.1 Independent Practice: Improve model using additional (new and existing) features

1. Continue adding input variables to the model that you think may be relevant
2. For each feature:
  - Evaluate the model for improved predictive performance using cross-validation
  - Evaluate the _importance_ of the feature
3. **Bonus**: Just like the 'recipe' feature, add in similar text features and evaluate their performance.


In [None]:
# Create new feature to check if title mentions one of the years 2010-2013.
# A single regex alternation scans the column once instead of four times;
# na=False marks a missing title as False.
data['year_in_title'] = data['title'].str.contains('2010|2011|2012|2013', na=False)
            
# Create additional new features
### FILL IN ###

In [None]:
# Template cell: every `### FILL IN ###` marker below has to be replaced with
# real code before this cell can run.

# Set features to use
features = ### FILL IN ###

# Set target variable name
target = ### FILL IN ###

# Set X and y
X = ### FILL IN ###
y = ### FILL IN ###


# Instantiate model
rf = ### FILL IN ###

# Train model on training set
rf.### FILL IN ###



# NOTE(review): X_test and y_test are not assigned anywhere in this cell, so the
# two scores below use whatever split is currently in the kernel. Presumably a
# fresh train/test split of the new X and y is needed above - confirm.

# Evaluate accuracy of model on test set
print "Accuracy: %0.3f" % rf.score(X_test, y_test)

# Evaluate ROC AUC score of model on test set
print 'ROC AUC: %0.3f' % metrics.roc_auc_score(y_test, rf.predict_proba(X_test)[:,1])



# Plot importances with dummy features combined
feature_names = X.### FILL IN ###
feature_importances = rf.### FILL IN ###
feature_dummy_prefixes = ['alchemy_cat_']

# combine_dummies is called with the prefixes only, so feature_dict has to be
# assigned at notebook level before the call
feature_dict = dict(zip(feature_names, feature_importances))
feature_dict = combine_dummies(feature_dummy_prefixes)

features_df = pd.DataFrame(feature_dict.items(), columns=['Features', 'Importance Score'])
features_df.sort_values('Importance Score', inplace=True, ascending=False)

sns.barplot(y='Features', x='Importance Score', data=features_df)