In [1]:
%matplotlib inline 
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
import seaborn as sns
import warnings
import random
from datetime import datetime
random.seed(datetime.now())
warnings.filterwarnings('ignore')

from matplotlib import style
style.use("ggplot")
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn import linear_model
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from sklearn import cross_validation # used to test classifier
from sklearn.cross_validation import KFold, cross_val_score, train_test_split
from sklearn import metrics
%matplotlib inline
plt.rcParams['figure.figsize'] = (12,8)

from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.metrics import roc_curve # ROC Curves
from sklearn.metrics import auc # Calculating AUC for ROC's!
import warnings
warnings.filterwarnings('ignore')




In [None]:
df = pd.read_csv("Replaced.csv",encoding="ISO-8859-1")

#### Drop the columns which are not required and not useful for predictions

In [2]:
drop_cols = ['Unnamed: 0','brand','categories','categories','dateAdded','dateUpdated','keys','manufacturer','name','reviewsdate','dateSeen','sourceURLs','text','title','userCity','upc','userProvince']
df = df.drop(drop_cols,axis=1)

df.head()

Unnamed: 0,id,ean,manufacturerNumber,didPurchase,doRecommend,reviews_id,reviews.numHelpful,rating,username
0,AV13O1A8GV-KLJ3akUyj,603000000000.0,14331328,,,,0.0,5,Joshua
1,AV14LG0R-jtxr-f38QfS,73416000391.0,574764,True,True,100209113.0,,5,Dorothy W
2,AV14LG0R-jtxr-f38QfS,73416000391.0,574764,True,True,100209113.0,,5,Dorothy W
3,AV16khLE-jtxr-f38VFn,67981934427.0,67981934427,False,False,113026909.0,,1,Rebecca
4,AV16khLE-jtxr-f38VFn,67981934427.0,67981934427,False,False,171267657.0,,1,Walker557


#### Convert boolean values to binary values i.e. True to 1 and False to 0
#### Fill the NaNs with suitable values

In [3]:
df['didPurchase'].fillna(True, inplace=True)
df['doRecommend'].fillna(True, inplace=True)

df.didPurchase = (df.didPurchase)*1
df.doRecommend = (df.doRecommend)*1

df.fillna(0, inplace=True)

#### Convert string values to integer values by hashing the column values

In [4]:
def get_hash(x):
  return abs(hash(x)) % 10**9
df['username'] = df['username'].apply(get_hash)
df['id'] = df['id'].apply(get_hash)

df.head()

Unnamed: 0,id,ean,manufacturerNumber,didPurchase,doRecommend,reviews_id,reviews.numHelpful,rating,username
0,815016927,603000000000.0,14331328,1,1,0.0,0.0,5,743012946
1,953502281,73416000391.0,574764,1,1,100209113.0,0.0,5,892130448
2,953502281,73416000391.0,574764,1,1,100209113.0,0.0,5,892130448
3,48238311,67981934427.0,67981934427,0,0,113026909.0,0.0,1,775542376
4,48238311,67981934427.0,67981934427,0,0,171267657.0,0.0,1,585166793


#### Scale the column values

In [10]:
def scaled_df(df):
    scaled = pd.DataFrame()
    for item in df:
        if item in df.select_dtypes(include=[np.float]):
            scaled[item] = ((df[item] - df[item].min()) / 
            (df[item].max() - df[item].min()))
        else: 
            scaled[item] = df[item]
    return scaled
df_scaled = scaled_df(df)

#### Set predictor columns to determine the results

In [11]:
predictor_names=['id','didPurchase','username','rating']
predictor_names

['id', 'didPurchase', 'username', 'rating']

#### Find Rank for each of the predictor columns

In [12]:
def rank_predictors(dat,l,f='doRecommend'):
    rank={}
    max_vals=dat.max()
    median_vals=dat.groupby(f).median()  # We are using the median as the mean is sensitive to outliers
    for p in l:
        score=np.abs((median_vals[p][1]-median_vals[p][0])/max_vals[p])
        rank[p]=score
    return rank
cat_rank=rank_predictors(df,predictor_names) 
cat_rank

{'didPurchase': 1.0,
 'id': 0.04017787014062463,
 'rating': 0.6,
 'username': 0.002091840679263104}

#### Sort the predictors by rank

In [13]:
cat_rank=sorted(cat_rank.items(), key=lambda x: x[1])
cat_rank

[('username', 0.002091840679263104),
 ('id', 0.04017787014062463),
 ('rating', 0.6),
 ('didPurchase', 1.0)]

#### Take the top predictors based on median difference

In [14]:
ranked_predictors=[]
for f in cat_rank[1:]:
    ranked_predictors.append(f[0])
ranked_predictors

['id', 'rating', 'didPurchase']

In [15]:
X = df_scaled[predictor_names]
#setting target
y = df_scaled['doRecommend']

#### dividing data to have a training and a testing set

In [19]:
X_train_sub, X_validation_sub, y_train_sub, y_validation_sub = train_test_split(X, y, test_size= .4,random_state=0)

In [20]:
# import machine learning algorithms
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc

### train with Gradient Boosting algorithm
### compute the accuracy scores on train and validation sets when training with different learning rates

In [21]:
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    gb = GradientBoostingClassifier(n_estimators=20, learning_rate = learning_rate, max_features=2, max_depth = 2, random_state = 0)
    gb.fit(X_train_sub, y_train_sub)
    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(gb.score(X_train_sub, y_train_sub)))
    print("Accuracy score (validation): {0:.3f}".format(gb.score(X_validation_sub, y_validation_sub)))
    print()

Learning rate:  0.05
Accuracy score (training): 0.933
Accuracy score (validation): 0.930

Learning rate:  0.1
Accuracy score (training): 0.960
Accuracy score (validation): 0.959

Learning rate:  0.25
Accuracy score (training): 0.962
Accuracy score (validation): 0.960

Learning rate:  0.5
Accuracy score (training): 0.965
Accuracy score (validation): 0.964

Learning rate:  0.75
Accuracy score (training): 0.940
Accuracy score (validation): 0.939

Learning rate:  1
Accuracy score (training): 0.965
Accuracy score (validation): 0.964



### Changing hyper-parameter values changes the accuracy score of predictions with maximum accuracy of ~96.5%.