In [None]:
import pickle
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split


#### Load Data

In [None]:
# Load the reviews dataset from the pickle file under data/.
# NOTE: unpickling can execute arbitrary code, so only load this file when it
# comes from a trusted source.
# NOTE(review): `reviews` is presumably a pandas DataFrame (it is later indexed
# with a list of column names) - confirm against the feature-engineering step.
with open('data/reviews_post_feature_eng.pkl', 'rb') as f:
    reviews = pickle.load(f)
    
def select_fetures(dataset, features):
    """Return the part of ``dataset`` selected by indexing it with ``features``.

    Parameters
    ----------
    dataset : any object supporting ``dataset[features]``
        The container to select from.
    features : key or list of keys
        Passed unchanged to ``dataset.__getitem__``.

    Returns
    -------
    Whatever ``dataset[features]`` evaluates to.
    """
    selected = dataset[features]
    return selected

# Columns kept for modelling. 'HelpfullnessRank' is the target; every other
# column listed here is used as a predictor.
features = ['HelpfullnessRank', 'Score', 'Time', 'WordsCount', 'Total_Reviews_by_Reviewer',
        'ProductFrequency', 'WordCount', 'WordCountSummary', 'StopWords',
        'UpperCount', 'LowerCount', 'LowerCountSummary', 'DotCount',
        'CountPunctuation', 'CountDigits', 'Lexical', 'UpperLowerR',
        'UpperLowerSumR', 'DotCapitalR', 'DotCapitalSumR', 'CapitalsRatio',
        'neg', 'neu', 'pos', 'compound', 'ProductFreqlog',
        'ReviewsbyReviewerlog', 'WordCountlog', 'Month', 'Day']

reviews_features = select_fetures(reviews, features)

label = 'HelpfullnessRank'
X = reviews_features.drop(label, axis=1)
y = reviews_features[label]

# Fixed seed so that the split is identical on every Restart & Run All;
# without it each run trains and evaluates on different rows.
RANDOM_STATE = 42

# Split to train, test and validation datasets (60% / 20% / 20%).
# The 40% hold-out gets its own name before being halved, so no variable is
# both the input and the output of a split.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.4, random_state=RANDOM_STATE)
X_test, X_val, y_test, y_val = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=RANDOM_STATE)

X_train.shape, X_test.shape, X_val.shape

#### Random Forest - GridSearch

In [None]:
from time import time
from scipy.stats import randint as sp_randint

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier

# build a classifier
# A fixed random_state makes the forest (bootstrap samples and feature
# sub-sampling) reproducible, so search results can be compared across runs.
clf = RandomForestClassifier(random_state=42)


# Utility function to report best scores
def report(results, n_top=3):
    """Print the parameter settings ranked 1..``n_top`` in a search result.

    Parameters
    ----------
    results : mapping
        Must provide 'rank_test_score', 'mean_test_score', 'std_test_score'
        (array-like, one entry per candidate) and 'params' (one entry per
        candidate), e.g. the ``cv_results_`` of a fitted search object.
    n_top : int, default 3
        Highest rank to print.

    For every rank the rank, mean score, score standard deviation and
    parameters of each candidate holding that rank are printed, followed by
    an empty line.
    """
    for rank in range(1, n_top + 1):
        # Several candidates may share a rank; each of them is printed.
        tied = np.flatnonzero(results['rank_test_score'] == rank)
        for idx in tied:
            mean_score = results['mean_test_score'][idx]
            std_score = results['std_test_score'][idx]
            header = "Model with rank: {0}".format(rank)
            scores = "Mean validation score: {0:.3f} (std: {1:.3f})".format(
                mean_score, std_score)
            settings = "Parameters: {0}".format(results['params'][idx])
            # The trailing empty string yields the blank separator line.
            print(header, scores, settings, "", sep="\n")


# # specify parameters and distributions to sample from
# param_dist = {"max_depth": [3, None],
#               "max_features": sp_randint(1, 11),
#               "min_samples_split": sp_randint(2, 11),
#               "min_samples_leaf": sp_randint(1, 11),
#               "bootstrap": [True, False],
#               "criterion": ["gini", "entropy"]}

# # run randomized search
# n_iter_search = 20
# random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
#                                    n_iter=n_iter_search)

# start = time()
# random_search.fit(X, y)
# print("RandomizedSearchCV took %.2f seconds for %d candidates"
#       " parameter settings." % ((time() - start), n_iter_search))
# report(random_search.cv_results_)

# use a full grid over all parameters
# 'auto' is not listed for max_features: for classifiers it was only an alias
# of 'sqrt' (so it duplicated every candidate) and scikit-learn >= 1.3 rejects
# it. 'log2' is searched instead to keep two distinct options.
param_grid = {"n_estimators": [10, 20, 50],
              "max_depth": [10, 100, None],
              "max_features": ['sqrt', 'log2'],
              "min_samples_leaf": [1, 2, 3, 10],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# run grid search
grid_search = GridSearchCV(clf, param_grid=param_grid)
start = time()
# Tune on the training split only, so the test/validation rows stay unseen.
# `y_train` is already the label Series; indexing it with `label` again would
# look the string up in the row index and raise KeyError.
grid_search.fit(X_train, y_train)

print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(grid_search.cv_results_['params'])))
report(grid_search.cv_results_)


#### Grid Search SVM

In [None]:
from sklearn.svm import SVC
# Scaler, sklearn.cross_validation and sklearn.grid_search no longer exist;
# these are their current equivalents.
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV


# It is usually a good idea to scale the data for SVM training.
# The scaler is fitted on the training split only, so no statistics of the
# held-out test/validation rows leak into the search.
# NOTE(review): the scaler is still fitted once on the whole training split
# rather than inside each CV fold; wrap scaler + SVC in a Pipeline if that
# matters for the reported scores.

scaler = StandardScaler()

# Stored under a new name: overwriting X would make the cell non-idempotent
# and silently replace the feature frame used by the other cells.
X_train_scaled = scaler.fit_transform(X_train)

# For an initial search, a logarithmic grid with basis
# 10 is often helpful. Using a basis of 2, a finer
# tuning can be achieved but at a much higher cost.

C_range = 10. ** np.arange(-3, 8)
gamma_range = 10. ** np.arange(-5, 4)

param_grid = dict(gamma=gamma_range, C=C_range)

# StratifiedKFold now takes only the number of splits; the labels are passed
# to fit() below.
grid = GridSearchCV(SVC(), param_grid=param_grid, cv=StratifiedKFold(n_splits=5))

grid.fit(X_train_scaled, y_train)

# Persist the fitted grid search (all CV results and the best estimator):
with open('svm_grid1.pkl', 'wb') as f:
    pickle.dump(grid, f)


print("The best classifier is: ", grid.best_estimator_)

# plot the scores of the grid
# cv_results_['mean_test_score'] (successor of grid_scores_) holds one mean CV
# score per candidate. Candidates are enumerated with C as the outer and gamma
# as the inner loop, so the flat array reshapes to (len(C_range), len(gamma_range)).
scores = np.asarray(grid.cv_results_['mean_test_score'])
scores = scores.reshape(len(C_range), len(gamma_range))

# Make a nice figure (rows = C values, columns = gamma values)
plt.figure(figsize=(8, 6))
plt.subplots_adjust(left=0.15, right=0.95, bottom=0.15, top=0.95)
# 'nipy_spectral' is the current name of the removed 'spectral' colormap.
plt.imshow(scores, interpolation='nearest', cmap='nipy_spectral')
plt.xlabel('gamma')
plt.ylabel('C')
plt.colorbar()
plt.xticks(np.arange(len(gamma_range)), gamma_range, rotation=45)
plt.yticks(np.arange(len(C_range)), C_range)
plt.show()