#### Load Data

In [1]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [2]:
# Column labels shared by all three feature matrices.
df_cols = np.load('data/df_cols.npy')

# Feature matrices: each .npy array is wrapped in a DataFrame labelled with df_cols.
X_train, X_val, X_test = (
    pd.DataFrame(np.load(path), columns=df_cols)
    for path in ('data/X_train.npy', 'data/X_val.npy', 'data/X_test.npy')
)

# Target arrays for the same three splits.
y_train, y_val, y_test = (
    np.load(path)
    for path in ('data/y_train.npy', 'data/y_val.npy', 'data/y_test.npy')
)


In [3]:
def select_fetures(X_train, X_test, X_val, features):
    """Restrict each split to the given columns.

    Parameters
    ----------
    X_train, X_test, X_val : objects supporting ``obj[features]`` indexing
        (DataFrames in this notebook).
    features : list of column labels to keep, in the desired order.

    Returns
    -------
    tuple
        ``(X_train[features], X_test[features], X_val[features])`` -- note the
        order is train, test, val.
    """
    return tuple(split[features] for split in (X_train, X_test, X_val))

# Columns used as model inputs; every split is narrowed to exactly this list.
features = [
    'Score', 'Time', 'WordsCount', 'Total_Reviews_by_Reviewer',
    'ProductFrequency', 'WordCount', 'WordCountSummary', 'StopWords',
    'UpperCount', 'LowerCount', 'LowerCountSummary', 'DotCount',
    'CountPunctuation', 'CountDigits', 'Lexical', 'UpperLowerR',
    'UpperLowerSumR', 'DotCapitalR', 'DotCapitalSumR', 'CapitalsRatio',
    'neg', 'neu', 'pos', 'compound', 'ProductFreqlog',
    'ReviewsbyReviewerlog', 'WordCountlog', 'Month', 'Day',
]

# select_fetures returns the splits in (train, test, val) order.
X_train, X_test, X_val = select_fetures(X_train, X_test, X_val, features)

# Display (rows, columns) of each split as the cell output.
tuple(split.shape for split in (X_train, X_test, X_val))

((53480, 29), (18183, 29), (17471, 29))

#### Random Forest

##### Get the Best Random Forest Parameters from the Grid Search

Parameters: {'bootstrap': False, 'max_depth': 200, 
             'max_features': 'sqrt', 'min_samples_leaf': 3,
             'n_estimators': 300}

In [4]:
# Random forest configured with the grid-search parameters listed above.
# random_state pins the per-split feature sampling (max_features='sqrt') so a
# re-run of this cell rebuilds the same forest and reproduces the scores.
clf = RandomForestClassifier(n_estimators=300, max_depth=200, min_samples_leaf=3,
                             max_features='sqrt', bootstrap=False, random_state=42)

clf.fit(X_train, y_train)
rnd_val_pred = clf.predict(X_val)
rnd_train_pred = clf.predict(X_train)

# accuracy_score is documented as (y_true, y_pred); the value is the same either
# way, but keeping the documented order avoids mistakes if the metric is swapped
# for an asymmetric one later.
print("Validation accuracy score: ", accuracy_score(y_val, rnd_val_pred))
print("Train accuracy score: ", accuracy_score(y_train, rnd_train_pred))


Validation accuracy score:  0.6622975216072349
Train accuracy score:  0.9972139117427076


In [5]:
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23. Prefer the standalone `joblib` package (a scikit-learn dependency) and
# fall back to the vendored copy only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the fitted random forest.
# NOTE(review): the extension is '.pk1' (digit one), probably a typo for '.pkl';
# the path is left unchanged so anything that already loads this file keeps working.
joblib.dump(clf, 'data/rnd_forest_best.pk1')

# Persist the validation-set predictions.
with open('data/rnd_val_pred.pkl', 'wb') as f:
    pickle.dump(rnd_val_pred, f)

# Persist the training-set predictions.
with open('data/rnd_train_pred.pkl', 'wb') as f:
    pickle.dump(rnd_train_pred, f)

In [6]:
# Class-probability estimates from the random forest for each split.
rfc_fe_train_proba = clf.predict_proba(X_train)
rfc_fe_val_proba = clf.predict_proba(X_val)
rfc_fe_test_proba = clf.predict_proba(X_test)

# Pickle each probability matrix to its own file under data/proba/.
for out_path, proba in (
    ('data/proba/rfc_fe_train_proba.pkl', rfc_fe_train_proba),
    ('data/proba/rfc_fe_val_proba.pkl', rfc_fe_val_proba),
    ('data/proba/rfc_fe_test_proba.pkl', rfc_fe_test_proba),
):
    with open(out_path, 'wb') as f:
        pickle.dump(proba, f)

#### SVM

In [4]:
# The scaler is fitted on the training split only and then applied unchanged to
# the validation and test splits.
scaler = StandardScaler()
# With probability=True, SVC shuffles the data internally to calibrate its
# probability estimates; random_state makes that step (and therefore the saved
# predict_proba outputs) reproducible across runs.
clf_svm = SVC(probability=True, gamma=0.33, C=10, random_state=42)

print("Transform...")
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

print("Fit...")
clf_svm.fit(X_train_scaled, y_train)

print("Predict...")
svm_val_pred = clf_svm.predict(X_val_scaled)
svm_train_pred = clf_svm.predict(X_train_scaled)

# accuracy_score is documented as (y_true, y_pred); keep that order.
print("Validation accuracy score: ", accuracy_score(y_val, svm_val_pred))
print("Train accuracy score: ", accuracy_score(y_train, svm_train_pred))

Transform...
Fit...
Predict...
Validation accuracy score:  0.6226317898231355
Train accuracy score:  0.9955684367988032


In [5]:
# Class-probability estimates from the SVM, computed on the scaled features.
svm_fe_train_proba = clf_svm.predict_proba(X_train_scaled)
svm_fe_val_proba = clf_svm.predict_proba(X_val_scaled)
svm_fe_test_proba = clf_svm.predict_proba(X_test_scaled)

# Pickle each probability matrix to its own file under data/proba/.
for out_path, proba in (
    ('data/proba/svm_fe_train_proba.pkl', svm_fe_train_proba),
    ('data/proba/svm_fe_val_proba.pkl', svm_fe_val_proba),
    ('data/proba/svm_fe_test_proba.pkl', svm_fe_test_proba),
):
    with open(out_path, 'wb') as f:
        pickle.dump(proba, f)

In [6]:
# Sanity check: display the shape of the SVM test-set probability matrix
# (one row per test sample, one column per class).
svm_fe_test_proba.shape

(18183, 11)

In [7]:
def validate_accuracy(proba, y, classes=None):
    """Accuracy of the most-probable class implied by a probability matrix.

    Parameters
    ----------
    proba : array of per-class probabilities, one row per sample (as returned
        by ``predict_proba``).
    y : true labels, one per row of ``proba``.
    classes : optional array-like giving the label for each column of
        ``proba`` (e.g. ``estimator.classes_``). When omitted, the column index
        itself is used as the predicted label, which is only correct if the
        labels are exactly 0..n_classes-1.

    Returns
    -------
    The value of ``accuracy_score(y, prediction)``.
    """
    column_idx = proba.argmax(axis=1)
    if classes is None:
        prediction = column_idx
    else:
        # Map column positions back to the actual class labels.
        prediction = np.asarray(classes)[column_idx]
    return accuracy_score(y, prediction)


# Train accuracy derived from the SVM probabilities. This can differ slightly
# from the accuracy of clf_svm.predict, because SVC's probability estimates are
# calibrated separately and may disagree with predict on some samples.
validate_accuracy(svm_fe_train_proba, y_train, classes=clf_svm.classes_)

0.9954562453253553

#### XGBOOST

In [None]:
from numpy import loadtxt
from xgboost import XGBClassifier

# Cast the feature frames to float64 before handing them to XGBoost.
X_train = X_train.astype('float64')
X_val = X_val.astype('float64')
X_test = X_test.astype('float64')


model = XGBClassifier(learning_rate=0.5, max_depth=10, n_estimators=200)
model.fit(X_train, y_train)

# Test split.
# The original assigned `y_pred` but then read the undefined `y_pred_test`,
# raising NameError; the name now matches the train/val variables below.
y_pred_test = model.predict(X_test)
y_pred = y_pred_test  # keep the old name available for any later cell using it
predictions = [round(value) for value in y_pred_test]

accuracy = accuracy_score(y_test, predictions)
print("Accuracy test: %.2f%%" % (accuracy * 100.0))

# Train split.
y_pred_train = model.predict(X_train)
predictions_train = [round(value) for value in y_pred_train]

accuracy = accuracy_score(y_train, predictions_train)
print("Accuracy train: %.2f%%" % (accuracy * 100.0))

# Validation split.
y_pred_val = model.predict(X_val)
predictions_val = [round(value) for value in y_pred_val]

accuracy = accuracy_score(y_val, predictions_val)
print("Accuracy val: %.2f%%" % (accuracy * 100.0))

In [None]:
# Class-probability estimates from the XGBoost model for train, val and test.
xgb_fe_train_proba, xgb_fe_val_proba, xgb_fe_test_proba = (
    model.predict_proba(split) for split in (X_train, X_val, X_test)
)

In [None]:
# Pickle each XGBoost probability matrix to its own file under data/proba/.
for out_path, proba in (
    ('data/proba/xgb_fe_train_proba.pkl', xgb_fe_train_proba),
    ('data/proba/xgb_fe_val_proba.pkl', xgb_fe_val_proba),
    ('data/proba/xgb_fe_test_proba.pkl', xgb_fe_test_proba),
):
    with open(out_path, 'wb') as f:
        pickle.dump(proba, f)