In [8]:
! pip install numpy
# you can install packages with the exclamation point



In [1]:
# import all dependencies
import json

import numpy as np
import pandas as pd
from scipy.sparse import hstack
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

try:
    # scikit-learn >= 0.18 (the old cross_validation module was removed in 0.20)
    from sklearn.model_selection import cross_val_score
except ImportError:
    # fall back for very old scikit-learn installations
    from sklearn.cross_validation import cross_val_score

In [2]:
# put your functions in one cell, call when you need
def read_tsv(path):
    """Load a tab-separated file into a pandas DataFrame.

    Parameters
    ----------
    path : anything accepted by ``pd.read_csv`` (file path, URL, or buffer).

    Returns
    -------
    pandas.DataFrame parsed with a tab as the field separator.
    """
    frame = pd.read_csv(path, sep='\t')
    return frame
def get_json_value(col, key):
    """Return the value stored under ``key`` in a JSON-encoded object.

    Despite its name, ``col`` is a single JSON document (one entry of a
    column), not a whole column; apply this function to each row.

    Parameters
    ----------
    col : JSON text to parse, or a missing value (None / NaN).
    key : key to look up in the decoded JSON object.

    Returns
    -------
    The value found under ``key`` (which may be None when the JSON holds
    ``null``), or '' when the key is absent, ``col`` is missing, or the
    decoded JSON is not an object.

    Raises
    ------
    ValueError if ``col`` is text that is not valid JSON.
    """
    # Missing cells arrive as None or float NaN (NaN is the only value that
    # is not equal to itself); json.loads would raise TypeError on them.
    if col is None or (isinstance(col, float) and col != col):
        return ''
    parsed = json.loads(col)
    # Only JSON objects support key lookup; arrays/scalars have no ``get``.
    if not isinstance(parsed, dict):
        return ''
    return parsed.get(key, '')
def has_text(text, text_val):
    """Flag whether ``text_val`` occurs in ``text`` (case-insensitive).

    Parameters
    ----------
    text : a single text value to search in (one entry of a column);
        may be None / NaN for missing data.
    text_val : the word or phrase to look for.

    Returns
    -------
    1 if ``text_val`` is found inside ``text``, otherwise 0. Missing text,
    a missing/empty search term, or values that cannot be converted to a
    string all yield 0.
    """
    # Missing values would otherwise be stringified to 'none' / 'nan' and
    # could produce false matches (NaN is the only value unequal to itself).
    if text is None or (isinstance(text, float) and text != text):
        return 0
    if text_val is None:
        return 0
    try:
        needle = str(text_val).lower()
        haystack = str(text).lower()
    except Exception:
        # Best effort: a value that cannot be stringified counts as "no match".
        return 0
    # An empty needle is contained in every string; treat it as no match.
    if not needle:
        return 0
    return 1 if needle in haystack else 0

In [3]:
# Relative path to the StumbleUpon tab-separated dataset
path = "../../assets/dataset/stumbleupon.tsv"
# Load it into a dataframe with the read_tsv helper defined above
data = read_tsv(path)

In [4]:
# Show the dtype pandas inferred for each column
data.dtypes

url                                object
urlid                               int64
boilerplate                        object
alchemy_category                   object
alchemy_category_score             object
avglinksize                       float64
commonlinkratio_1                 float64
commonlinkratio_2                 float64
commonlinkratio_3                 float64
commonlinkratio_4                 float64
compression_ratio                 float64
embed_ratio                       float64
framebased                          int64
frameTagRatio                     float64
hasDomainLink                       int64
html_ratio                        float64
image_ratio                       float64
is_news                            object
lengthyLinkDomain                   int64
linkwordscore                       int64
news_front_page                    object
non_markup_alphanum_characters      int64
numberOfLinks                       int64
numwords_in_url                   

In [5]:
# Preview the first five values of the `label` column
data.label.head(5)

0    0
1    1
2    1
3    1
4    0
Name: label, dtype: int64

In [None]:
# get the title text from the JsonObject 
# note the JsonObject is stored in column boilerplate
# hint: look at function get_json_value above, it takes two inputs:
# one JSON string (a single value from a dataframe column) and the key
# to look up in the JSON dictionary, so apply it to every row of the column
# call the new dataframe column "title" 
# hint: the first input comes from the boilerplate column, 
# and the second input is 'title'

In [None]:
# get the body text from the JsonObject
# note the JsonObject is stored in column boilerplate
# hint: look at function get_json_value above, it takes two inputs:
# one JSON string (a single value from a dataframe column) and the key
# to look up in the JSON dictionary, so apply it to every row of the column
# call the new dataframe column "body" 
# hint: the first input comes from the boilerplate column, 
# and the second input is 'body'

In [None]:
# Now that you have two new columns in your dataframe, one which contains
# only the title text of the website, and the other only the body text,
# let's see whether having the word 'recipe' in the title is associated
# with a higher share of evergreen websites.

# hint: create a new column called 'has_recipe' using the function has_text,
# which has two inputs: a single text value (one entry of a column)
# and the specific word you are looking for
# no more hints!

In [None]:
# how many records had the word recipe in the title?

In [None]:
# how many records did not have the word recipe in the title?

In [None]:
# Interpret Results Above

In [None]:
# remove all null values for title column, by replacing NaN with empty ''
# maintain the 'title' column during the transformation

In [None]:
# Let's make every word in the title a feature;
# maybe other words besides 'recipe' contributed to
# more evergreen websites?

vectorizer = CountVectorizer(
                max_features = 1000, # keep at most the 1000 most frequent terms as features
                ngram_range=(1, 2), # use single words (unigrams) and two-word phrases (bigrams)
                stop_words='english', # drop common English stop words ("the", "and", ...) using the built-in list
                binary=True # record presence/absence (1/0) of each term instead of its count
            )

In [None]:
# Use `fit` to learn the vocabulary of the titles
# NOTE: assumes the `title` column was created, and its nulls replaced with '', in the exercise cells above
vectorizer.fit(data.title)

In [None]:
# Use `transform` to generate the sample-by-term matrix: one row per title, one column per feature (word or n-gram)
x_text_features = vectorizer.transform(data.title)

In [None]:
# Target values for the classifier: the `label` column of the dataset
labels = data.label

In [None]:
# Random forest classifier used for all experiments below
# n_estimators = number of decision trees in the forest
# random_state pins the random number generator so that re-running the
# notebook reproduces the same scores and feature importances
# other parameters such as max_depth, min_samples_split, max_features, etc. are available
model = RandomForestClassifier(n_estimators = 20, random_state = 42)

In [None]:
# Cross-validated ROC AUC of the model on the title features
# (no `cv` argument is passed, so the number of folds is whatever the installed scikit-learn defaults to)
scores = cross_val_score(
            model,  # your model
            x_text_features,  # your features in vector form
            labels,  # your labels (the response / target variable)
            scoring='roc_auc' # what metric do you want returned?
        )

In [None]:
# Report the per-fold AUC scores and their mean
print('CV AUC {}, Average AUC {}'.format(scores, scores.mean()))

In [None]:
# Identify the features you want from the original dataset
# Names of the extra (non-text) columns to use as features
other_features_columns = ['html_ratio', 'image_ratio']
# Select just those columns from the original dataframe
other_features = data[other_features_columns]

In [None]:
# Stack them horizontally together
# This takes all of the word/n-gram columns and appends two more columns for `html_ratio` and `image_ratio`
# `.toarray()` then converts the stacked sparse result into a dense array
features = hstack((x_text_features, other_features)).toarray()

# Re-run the same cross-validation, now on the combined feature matrix
scores = cross_val_score(model, features, labels, scoring='roc_auc')
print('CV AUC {}, Average AUC {}'.format(scores, scores.mean()))

In [None]:
# Which of these features are most important? Let's actually fit the model
# on the combined features so its importances can be inspected.
model.fit(features, labels)

In [None]:
# Collect the vocabulary learned by the vectorizer, in column order.
# Newer scikit-learn exposes `get_feature_names_out()` (returns an array) and
# removed `get_feature_names()` in 1.2; older versions only have the latter.
if hasattr(vectorizer, 'get_feature_names_out'):
    text_feature_names = list(vectorizer.get_feature_names_out())
else:
    text_feature_names = list(vectorizer.get_feature_names())

# Text features come first, followed by the extra numeric columns - the same
# order in which the matrices were stacked.
all_feature_names = text_feature_names + other_features_columns

In [None]:
# Pair each feature name with its importance score from the fitted model
feature_importances = pd.DataFrame(
    {'Features' : all_feature_names, 
     'Importance Score': model.feature_importances_
    }
)

# this makes a nice table of all features
# and their corresponding importance

In [None]:
# Show the ten most important features, highest score first
feature_importances.sort_values(
   'Importance Score', 
    ascending=False
).head(10) # sort_values is a DataFrame method; it sorts ascending by default, hence ascending=False

In [None]:
# Your model was built from features coming from the vocabulary
# of the title vs. the label. 
# Repeat the above process but with your features
# coming from the 'body' column.
# You must fit the vocabulary on your body column, and create a matrix with
# all possible features in n columns (dummy variables)
# hint: use vectorizer.transform(column_with_text)