# Models
In this part, we are using 4 different models: Gradient Boosting Regression, Linear Regression, Neural Network, and Support Vector Regression.

In [10]:
import pandas as pd
import numpy as np
import sklearn
import os, sys

In [11]:
DATA_DIR = "../data/"
DATA_FILE = DATA_DIR + "dataX.csv"
TARGET_FILE = DATA_DIR + "dataY.csv"

# Check if data dir or file exist
if not os.path.exists(DATA_DIR) \
    or not os.path.exists(DATA_FILE)\
    or not os.path.exists(TARGET_FILE):
    print(
        "CANNOT find {}, {}, or {}".format(DATA_DIR, DATA_FILE, TARGET_FILE),
        file=sys.stderr
    )
    sys.exit()

# Load csv data
dfX = pd.read_csv(DATA_FILE)
dfY = pd.read_csv(TARGET_FILE)
X = dfX.as_matrix()
y = dfY.as_matrix(columns=['transaction_amount'])

Using ShuffleSplit to do Cross Validation index partition.

In [14]:
from sklearn.model_selection import ShuffleSplit
rs = ShuffleSplit(n_splits=10, test_size=.1, random_state=2018)
rs.get_n_splits(X)

10

In [15]:
from sklearn.metrics import explained_variance_score
from sklearn.metrics import r2_score

def evaluate(y_true, y_pred, model):
    r2score_reg = r2_score(y_true, y_pred)
    ev_score = explained_variance_score(y_true, y_pred)    
    print("R2 Score - Regular:\t{:.4f}\n"
          "Explained Variance Score:\t{:.4f}\n"
          .format(
              r2score_reg, 
              ev_score))

In [16]:
from sklearn.preprocessing import normalize
X_norm = normalize(X, axis=0, norm='max')
y_norm = normalize(y, axis=0, norm='max')

In [17]:
from sklearn.linear_model import LinearRegression

for train_index, test_index in rs.split(X):
    train_X, train_y = X[train_index], y[train_index]
    test_X, test_y = X[test_index], y[test_index]
    
    # Linear Regression
    lm = LinearRegression()
    lm.fit(train_X, train_y)
    y_pred = lm.predict(test_X)
    evaluate(test_y, y_pred, "Linear Regress")

R2 Score - Regular:	0.2844
Explained Variance Score:	0.2844

R2 Score - Regular:	0.3083
Explained Variance Score:	0.3083

R2 Score - Regular:	0.2581
Explained Variance Score:	0.2582

R2 Score - Regular:	0.2615
Explained Variance Score:	0.2616

R2 Score - Regular:	0.2502
Explained Variance Score:	0.2502

R2 Score - Regular:	0.2516
Explained Variance Score:	0.2516

R2 Score - Regular:	0.2726
Explained Variance Score:	0.2727

R2 Score - Regular:	0.2305
Explained Variance Score:	0.2305

R2 Score - Regular:	0.2649
Explained Variance Score:	0.2649

R2 Score - Regular:	0.3057
Explained Variance Score:	0.3057



In [18]:
from sklearn.ensemble import GradientBoostingRegressor
for train_index, test_index in rs.split(X):
    train_X, train_y = X[train_index], y[train_index]
    test_X, test_y = X[test_index], y[test_index]
    rbr = GradientBoostingRegressor(n_estimators=100, 
                                    learning_rate=0.1,
                                    max_depth=1, 
                                    random_state=0, 
                                    loss='ls')
    rbr.fit(train_X, train_y)
    evaluate(test_y, rbr.predict(test_X), "Gradient Tree Boosting")
    

  y = column_or_1d(y, warn=True)


R2 Score - Regular:	0.4081
Explained Variance Score:	0.4081

R2 Score - Regular:	0.3967
Explained Variance Score:	0.3967

R2 Score - Regular:	0.3836
Explained Variance Score:	0.3839

R2 Score - Regular:	0.3744
Explained Variance Score:	0.3744

R2 Score - Regular:	0.3669
Explained Variance Score:	0.3669

R2 Score - Regular:	0.3935
Explained Variance Score:	0.3935

R2 Score - Regular:	0.3895
Explained Variance Score:	0.3895

R2 Score - Regular:	0.3532
Explained Variance Score:	0.3532

R2 Score - Regular:	0.3945
Explained Variance Score:	0.3945

R2 Score - Regular:	0.4214
Explained Variance Score:	0.4214



In [19]:
# Neural Network Regressor
from sklearn.neural_network import MLPRegressor
for train_index, test_index in rs.split(X):
    train_X, train_y = X_norm[train_index], y_norm[train_index]
    test_X, test_y = X_norm[test_index], y_norm[test_index]

    nnreg = MLPRegressor(hidden_layer_sizes=(200, 50,5),
                 activation='tanh',
                 solver='sgd',
                 alpha=0.1,
                 learning_rate='adaptive',
                 learning_rate_init=0.001,
                 max_iter=500)
    nnreg.fit(train_X, train_y)
    y_pred = nnreg.predict(test_X)
    evaluate(test_y, y_pred, "NN")

  y = column_or_1d(y, warn=True)


R2 Score - Regular:	0.2333
Explained Variance Score:	0.2334

R2 Score - Regular:	0.2350
Explained Variance Score:	0.2351

R2 Score - Regular:	0.2455
Explained Variance Score:	0.2456

R2 Score - Regular:	0.2091
Explained Variance Score:	0.2091

R2 Score - Regular:	0.1872
Explained Variance Score:	0.1872

R2 Score - Regular:	0.2307
Explained Variance Score:	0.2307

R2 Score - Regular:	0.2332
Explained Variance Score:	0.2332

R2 Score - Regular:	0.2102
Explained Variance Score:	0.2102

R2 Score - Regular:	0.2213
Explained Variance Score:	0.2213

R2 Score - Regular:	0.2456
Explained Variance Score:	0.2456



In [20]:
# Support Vector Regressor
from sklearn import svm
for train_index, test_index in rs.split(X):
    train_X, train_y = X_norm[train_index], y_norm[train_index]
    test_X, test_y = X_norm[test_index], y_norm[test_index]
    svreg = svm.SVR()
    svreg.fit(train_X, train_y)
    evaluate(test_y, svreg.predict(test_X), "SVR")

  y = column_or_1d(y, warn=True)


R2 Score - Regular:	-2.7836
Explained Variance Score:	0.2274

R2 Score - Regular:	-3.0659
Explained Variance Score:	0.2452

R2 Score - Regular:	-2.6039
Explained Variance Score:	0.2265

R2 Score - Regular:	-3.0716
Explained Variance Score:	0.2198

R2 Score - Regular:	-2.8697
Explained Variance Score:	0.2213

R2 Score - Regular:	-3.0079
Explained Variance Score:	0.2269

R2 Score - Regular:	-2.8086
Explained Variance Score:	0.2226

R2 Score - Regular:	-2.9331
Explained Variance Score:	0.2075

R2 Score - Regular:	-2.7156
Explained Variance Score:	0.2315

R2 Score - Regular:	-3.1921
Explained Variance Score:	0.2463



In [21]:
# Neural Network Regressor Simplifed Version
from sklearn.neural_network import MLPRegressor
for train_index, test_index in rs.split(X):
    train_X, train_y = X_norm[train_index], y_norm[train_index]
    test_X, test_y = X_norm[test_index], y_norm[test_index]

    nnreg = MLPRegressor(hidden_layer_sizes=(20,5),
                 activation='tanh',
                 solver='sgd',
                 alpha=0.1,
                 learning_rate='adaptive',
                 learning_rate_init=0.001,
                 max_iter=500)
    nnreg.fit(train_X, train_y)
    y_pred = nnreg.predict(test_X)
    print(y_pred[:10])
    evaluate(test_y, y_pred, "NN")

  y = column_or_1d(y, warn=True)


[ 0.02779675  0.01449258  0.01802315  0.04026573  0.02039109  0.02621243
  0.04230546  0.08472288  0.02156243  0.02818379]
R2 Score - Regular:	-0.0589
Explained Variance Score:	-0.0588

[ 0.01985885  0.02124155  0.04099629  0.01270819  0.03153647  0.03759776
  0.06096505  0.05380119  0.04024489  0.02819383]
R2 Score - Regular:	-0.3560
Explained Variance Score:	-0.3556

[ 0.02031199  0.0061399   0.04436574  0.0308972   0.0159773   0.04929853
  0.02625813  0.01559733 -0.00578754  0.04145787]
R2 Score - Regular:	0.0370
Explained Variance Score:	0.0373

[ 0.04670062  0.02981889  0.02534115 -0.00133522  0.03872979  0.02633895
  0.06534957  0.00925886  0.0484656   0.04902986]
R2 Score - Regular:	-0.1721
Explained Variance Score:	-0.1720

[ 0.06899578  0.03950421  0.08701522 -0.04022355  0.01411301  0.25905389
  0.0466987   0.05710655 -0.01033755  0.03125264]
R2 Score - Regular:	-0.1360
Explained Variance Score:	-0.1360

[ 0.03234826  0.05295523  0.03971633  0.03166617  0.04148953  0.03864407