In [1]:
# import json
import pandas as pd
import numpy as np
import warnings

# path = 'yelp_dataset/yelp_academic_dataset_review.json'
# records = [json.loads(line) for line in open(path, 'rb')]
# df = pd.DataFrame(records)
# df.head()
# data = df.sample(frac=0.0001)
# data.to_csv('yelp_academic_dataset_review_sample.csv', sep=';', encoding='latin-1')
data = pd.read_csv('yelp_academic_dataset_review_sample.csv', sep=';', encoding='latin-1')
data.head()

Unnamed: 0.1,Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id
0,4180367,riFzCvp77DMKDX-5GoTpqA,0,2016-07-26,0,T6iKuaerOK1dgFP6qIlzjA,2,We were gambling late and wanted something to ...,1,B0ENvYKQdNNr1Izd2r-BAA
1,5960159,NFMD_-wFrbfdhTcdHPDCZQ,15,2014-02-06,8,P9hsidWSqjjsz-XJO0KWGw,4,I have had this place bookmarked for some time...,18,UYcmGbelzRa0Q6JqzLoguw
2,3109126,RiO6Jcjya1z-9gAQzNU3LA,0,2014-05-12,3,Spb1EIt9SEYeVmnorWU6Jw,5,Decided to get out of Tremont and find some be...,2,QKj_fglwc_X1VKQP8oBHXQ
3,2899286,yB1oj0R5CSjQtC_1JilPrA,0,2016-12-12,1,BAbr9Gt_h4f8kkPk6zGDEw,2,Unfortunately I only gave main event 2 stars b...,1,PszdWf6OZl50W54UHvMaWg
4,4231667,W98fZXdlQIUBRjWah7U-ng,0,2015-06-16,0,-Kd88c6rXYAI4PzxOVeRIQ,2,This company is really difficult to do busines...,5,JrXaC8v3CcJ1Vp_KJ2epHw


In [2]:
data['useful'].value_counts()

0     327
1     112
2      70
3      39
5      12
6      10
4       8
9       4
12      4
8       3
7       2
15      2
10      1
37      1
16      1
17      1
18      1
33      1
42      1
Name: useful, dtype: int64

In [3]:
bin_edges = data['useful'][data['useful'] != 0].quantile([.25, .5, .75]).tolist()

def numbers_to_scores(x):
    switcher = {
        x==0   : 0,
        bin_edges[0]<=x<bin_edges[1]: 1,
        bin_edges[1]<=x<bin_edges[2]: 2, 
        bin_edges[2]<=x: 3
}[1]  
    return switcher

data['useful_score'] = data.loc[:, 'useful'].apply(lambda x: numbers_to_scores(x))

In [4]:
bin_edges

[1.0, 2.0, 3.0]

In [5]:
data[['useful', 'useful_score']].head()

Unnamed: 0,useful,useful_score
0,1,1
1,18,3
2,2,2
3,1,1
4,5,3


In [6]:
data['useful_score'].value_counts()

0    327
1    112
3     91
2     70
Name: useful_score, dtype: int64

In [7]:
target = 'useful_score'

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data.drop(target, 1), data.pop(target).tolist(), test_size= 0.3, random_state=42)

In [8]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler 
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
import nltk
from nltk import pos_tag
from nltk.tokenize import WordPunctTokenizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from IPython.display import display

stopWords = set(stopwords.words('english'))
num_attr_names = 'length'
text_attr_name = 'text'

class AttributeTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, text_attr_name = text_attr_name):
        self.unique_features = []
        self.text_attr_name = text_attr_name
        self.tokenizer = WordPunctTokenizer()
        self.stemmer = SnowballStemmer("english")
        
    def extract_features(self, X):
        # convert each review into lowercase and split into words
        X['list_of_features'] = X.loc[:, self.text_attr_name].apply(lambda x: x.lower())
        X['list_of_features'] = X.loc[:, 'list_of_features'].apply(lambda x: self.tokenizer.tokenize(x))
        # get the number of words
        X['length'] = X.loc[:, 'list_of_features'].apply(lambda x: len(x))
        # get the average word length
        X['avg_word_length'] = X.loc[:, 'list_of_features'].apply(lambda list_of_words: np.mean([len(word) for word in list_of_words]))
        # iterate over the list of words and stem them, i.e., remove morphological affixes from words, leaving only the word stem 
        X['list_of_features'] = X.loc[:, 'list_of_features'].apply(lambda list_of_words: [self.stemmer.stem(word) for word in list_of_words])        
        # iterate over the list of words and only leave them if they are not in the stopWords list 
        X['list_of_features'] = X.loc[:, 'list_of_features'].apply(lambda list_of_words: [word for word in list_of_words if word not in stopWords and len(word)>2 and pos_tag(word)[0][1] in ['RB', 'JJ']])
        return X
    
    def fit(self, X, y=None):
        # build the vocabulary
        X = self.extract_features(X)
        for features in np.array(X['list_of_features']):
            self.unique_features = np.union1d(self.unique_features, features)
        return self
    
    def transform(self, X):
        X = self.extract_features(X)
        # transform the list of features based on the vocabulary
        matrix_of_features = np.zeros((len(X), len(self.unique_features)), dtype = np.int8)
        for i, features in enumerate(np.array(X['list_of_features'])):
            matrix_of_features[i, np.where(np.isin(self.unique_features, features))] = 1 
        df_of_features = pd.DataFrame(matrix_of_features, 
                                      columns = self.unique_features, 
                                      index = X.index) 
        df_of_features[['length', 'avg_word_length', 'stars']] = X.loc[:, ['length', 'avg_word_length', 'stars']]
        return df_of_features

    
attr_pipeline = Pipeline([
    ('AttributeTransformer', AttributeTransformer()),
    ('VarianceThreshold', VarianceThreshold(0.1)),
    ('StandardScaler', StandardScaler()),
    ('PCA', PCA(.95))
])


X_train_prepared = attr_pipeline.fit_transform(X_train)
X_train_prepared

array([[ 0.40332385, -1.15306946, -1.14724521, ..., -0.46956791,
         0.46789351, -0.47749836],
       [ 4.80441898,  0.39840943, -1.35550594, ...,  0.3684575 ,
         0.62340657,  1.20935557],
       [-0.25259109, -0.45830427, -0.55574081, ..., -1.44330281,
        -0.5922493 , -0.43939962],
       ...,
       [ 0.29415003,  1.1297274 ,  1.42110631, ...,  0.15744071,
         1.12840241, -0.36121732],
       [-1.22921052, -0.81464973,  0.92745235, ...,  0.12204896,
        -0.82943129, -0.20163373],
       [-0.5224184 , -1.3606695 , -0.32518768, ...,  0.21692676,
         1.05896   , -0.20385857]])

In [9]:
X_train_prepared.shape

(420, 9)

In [10]:
X_test_prepared = attr_pipeline.transform(X_test)
X_test_prepared

array([[-0.75432482, -1.1181576 ,  1.3436041 , ...,  0.20672485,
        -1.24992812,  0.05256534],
       [-0.66533466, -1.33282488, -0.31302243, ...,  0.22269932,
         1.0755629 , -0.38715749],
       [-0.01341334, -0.80010558,  0.88985117, ..., -0.48239916,
         1.53321753, -0.87745127],
       ...,
       [-1.17680789, -0.82485942,  0.92299175, ...,  0.11993235,
        -0.83551902, -0.13442412],
       [ 1.18023284,  0.60110327,  0.35343491, ...,  1.01802174,
         2.52713441,  0.25075954],
       [ 0.01169613,  0.5719086 , -0.23302069, ..., -2.34045848,
         0.64109425, -0.7171158 ]])

In [11]:
# https://github.com/davidsbatista/machine-learning-notebooks/blob/master/hyperparameter-across-models.ipynb
from sklearn.model_selection import GridSearchCV
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_predict

class EstimatorSelectionHelper:
    
    def __init__(self, models, params):
        self.models = models
        self.params = params
        self.keys = models.keys()
        self.grid_searches = {}
     
    def fit(self, X, y, **grid_kwargs):
        for key in self.keys:
            print('Running GridSearchCV for %s.' % key)
            model = self.models[key]
            params = self.params[key]
            grid_search = GridSearchCV(model, params, **grid_kwargs)
            grid_search.fit(X, y)
            self.grid_searches[key] = grid_search
        print('Done.')
            
    def evaluate(self, sort_by='mean_test_score'):
        frames = []
        for name, grid_search in self.grid_searches.items():
            frame = pd.DataFrame(grid_search.cv_results_)[['params', 'mean_test_score', 'mean_train_score']]
            frame['estimator'] = len(frame)*[name]
            frames.append(frame)
        df = pd.concat(frames)
        
        df = df.sort_values([sort_by], ascending=False)
        df = df.reset_index()
        return df
    
    def predict(self, X, y): 
        df = pd.DataFrame()
        for key in self.keys:
            y_scores = cross_val_predict(self.models[key], X, y, cv = 2, 
                                              method = "predict_proba")
            df[key] = [np.argmax(y) for y in y_scores] 
        return df

In [12]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

models1 = { 
    'ExtraTreesClassifier': ExtraTreesClassifier(),
    'RandomForestClassifier': RandomForestClassifier(),
    'AdaBoostClassifier': AdaBoostClassifier(),
    'GradientBoostingClassifier': GradientBoostingClassifier()
}

# number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 5, stop = 15, num = 5)]

params1 = { 
    'ExtraTreesClassifier': { 'n_estimators': n_estimators },
    'RandomForestClassifier': { 'n_estimators': n_estimators},
    'AdaBoostClassifier':  { 'n_estimators': n_estimators },
    'GradientBoostingClassifier': { 'n_estimators': n_estimators, 'learning_rate': [0.3, 0.5, 0.7] }
}

In [13]:
helper1 = EstimatorSelectionHelper(models1, params1)
helper1.fit(X_train_prepared, y_train, scoring='accuracy', n_jobs=2)
helper1.evaluate()

Running GridSearchCV for ExtraTreesClassifier.
Running GridSearchCV for RandomForestClassifier.
Running GridSearchCV for AdaBoostClassifier.
Running GridSearchCV for GradientBoostingClassifier.
Done.




Unnamed: 0,index,params,mean_test_score,mean_train_score,estimator
0,0,{'n_estimators': 5},0.528571,0.553573,AdaBoostClassifier
1,1,{'n_estimators': 7},0.52619,0.546498,AdaBoostClassifier
2,3,{'n_estimators': 12},0.516667,0.544007,AdaBoostClassifier
3,2,{'n_estimators': 10},0.516667,0.553582,AdaBoostClassifier
4,4,{'n_estimators': 15},0.495238,0.55587,AdaBoostClassifier
5,0,"{'learning_rate': 0.3, 'n_estimators': 5}",0.47619,0.744116,GradientBoostingClassifier
6,1,"{'learning_rate': 0.3, 'n_estimators': 7}",0.47381,0.786915,GradientBoostingClassifier
7,2,"{'learning_rate': 0.3, 'n_estimators': 10}",0.469048,0.839332,GradientBoostingClassifier
8,4,"{'learning_rate': 0.3, 'n_estimators': 15}",0.464286,0.88338,GradientBoostingClassifier
9,3,"{'learning_rate': 0.3, 'n_estimators': 12}",0.447619,0.863167,GradientBoostingClassifier


In [14]:
pd.DataFrame(helper1.predict(attr_pipeline.transform(X_train), y_train), index = y_train)

Unnamed: 0,ExtraTreesClassifier,RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
0,1,1,1,1
3,0,0,1,0
0,1,1,1,1
1,3,0,0,3
1,3,0,0,3
1,3,0,0,3
1,3,0,0,3
0,1,1,1,1
0,1,1,1,1
0,1,1,1,1


In [15]:
pd.DataFrame(helper1.predict(attr_pipeline.transform(X_test), y_test), index = y_test)

Unnamed: 0,ExtraTreesClassifier,RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
3,0,0,0,0
0,1,0,1,1
2,0,1,0,0
3,0,0,0,0
0,1,0,1,1
0,1,0,1,1
0,1,0,1,1
1,0,0,0,0
0,1,0,1,1
0,1,0,1,1
