In [1]:
# import dataset
import pandas as pd
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "end_to_end_project"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

# visualization of training dataset
df = pd.read_excel(r'/home/tuedo/Downloads/ChallengeDataSet-WellsFargo.xlsx')
df1 = df.drop('XC', axis=1) # no XC column bc otherwise our array is an object array (not float)
X = df1.to_numpy()
print(df1)

            X0        X1        X2        X3        X4        X5        X6  \
0     0.985703  0.899527  2.332283  1.044273  1.141718 -1.656647 -1.409265   
1     0.539816  0.243602  1.636048 -0.562150 -0.235560 -1.316039  1.529069   
2    -1.424483 -0.470533  0.095100  0.131981  1.168902 -0.827530 -1.233111   
3    -1.034157 -1.054496  0.339467 -0.829151  0.012123  0.048887  0.573195   
4     0.177091  1.154216  1.004133 -0.724015 -0.508090 -0.478284 -0.524431   
...        ...       ...       ...       ...       ...       ...       ...   
2995  0.127214 -0.644439 -0.356251 -1.188522  0.646979 -1.018895 -0.153524   
2996  0.427700  1.460896 -1.260326  2.206359 -0.981929  0.045186  0.406928   
2997 -0.880540  0.678472  0.493897 -0.118869  0.005285 -0.201233 -0.601732   
2998  0.151291 -0.949928 -1.960683 -1.352557  2.221015 -0.775556  0.932388   
2999  1.351919 -0.353120 -0.918519 -0.417465 -0.688730 -0.157284 -1.233306   

            X7        X8        X9  ...       X21       X22    

In [2]:
from sklearn.model_selection import train_test_split

train_set, cv_set = train_test_split(X, test_size=0.2, random_state=42)

print(train_set.shape, "and", cv_set.shape)

(2400, 31) and (600, 31)


In [3]:
X_train, X_cv, y_train, y_cv = train_set[:, 0:30], cv_set[:, 0:30], train_set[:, 30], cv_set[:, 30]
print(X_train.shape, y_train.shape)

(2400, 30) (2400,)


In [4]:
from sklearn.linear_model import SGDClassifier

sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train, y_train)

SGDClassifier(random_state=42)

In [5]:
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone

def show_predicts(clf, X_train, y_train):

    skfolds = StratifiedKFold(n_splits=5, shuffle=True)
    s = 0;

    for train_index, cv_index in skfolds.split(X_train, y_train):
        clone_clf = clone(clf)
        X_train_folds = X_train[train_index]
        y_train_folds = (y_train[train_index])
        X_cv_fold = X_train[cv_index]
        y_cv_fold = (y_train[cv_index])
    
        clone_clf.fit(X_train_folds, y_train_folds)
        y_pred = clone_clf.predict(X_cv_fold)
        n_correct = sum(y_pred == y_cv_fold)
        print(n_correct / len(y_pred))
        s += n_correct / len(y_pred);
    print("mean:", s / 5) # n_splits
    print()
    
show_predicts(sgd_clf, X_train, y_train)

0.8854166666666666
0.8625
0.86875
0.8895833333333333
0.84375
mean: 0.8699999999999999



In [6]:
# Things to try: PCA, incorporating the 'XC' column (either with 5 different classifers or something else)

In [7]:
# 5 classifiers, one for each value in 'XC' column - tradeoff, our dataset is small for each
xc = df["XC"]
print("A", sum(xc == 'A'))
print("B", sum(xc == 'B'))
print("C", sum(xc == 'C'))
print("D", sum(xc == 'D'))
print("E", sum(xc == 'E'))

A 615
B 552
C 665
D 580
E 588


In [8]:
ind_A = np.where(df["XC"]=='A')
ind_A[0]

array([   7,    9,   10,   14,   16,   33,   35,   43,   51,   52,   56,
         61,   63,   64,   67,   74,   75,   88,   94,   96,  106,  113,
        134,  140,  146,  150,  153,  155,  165,  171,  187,  196,  203,
        205,  212,  217,  223,  227,  228,  229,  230,  238,  239,  244,
        249,  251,  254,  262,  264,  267,  271,  272,  277,  278,  281,
        294,  300,  305,  308,  314,  321,  322,  325,  326,  334,  344,
        349,  352,  356,  374,  380,  392,  397,  402,  403,  408,  409,
        410,  411,  412,  416,  434,  435,  442,  446,  448,  450,  451,
        462,  464,  465,  467,  469,  472,  476,  480,  481,  483,  488,
        490,  496,  498,  499,  502,  505,  535,  538,  545,  554,  555,
        562,  572,  580,  586,  588,  591,  596,  601,  602,  603,  606,
        615,  618,  626,  634,  640,  645,  646,  647,  655,  657,  659,
        667,  671,  672,  673,  676,  686,  704,  707,  712,  719,  725,
        731,  739,  749,  754,  768,  770,  772,  7

In [9]:
data_A = X[np.ma.masked_array(ind_A[0]), :]
data_A.shape

(615, 31)

In [10]:
X_A, y_A = data_A[:, 0:30], data_A[:, 30]
sgd_clf.fit(X_A, y_A)

SGDClassifier(random_state=42)

In [11]:
show_predicts(sgd_clf, X_A, y_A)

0.9349593495934959
0.975609756097561
0.975609756097561
0.983739837398374
0.975609756097561
mean: 0.9691056910569106



In [12]:
# looks pretty good, we basically copy-paste 4 times now.
ind_B = np.where(df["XC"]=='B')
data_B = X[np.ma.masked_array(ind_B[0]), :]
X_B, y_B = data_B[:, 0:30], data_B[:, 30]
sgd_clf.fit(X_B, y_B)

ind_C = np.where(df["XC"]=='C')
data_C = X[np.ma.masked_array(ind_C[0]), :]
X_C, y_C = data_C[:, 0:30], data_C[:, 30]
sgd_clf.fit(X_C, y_C)

ind_D = np.where(df["XC"]=='D')
data_D = X[np.ma.masked_array(ind_D[0]), :]
X_D, y_D = data_D[:, 0:30], data_D[:, 30]
sgd_clf.fit(X_D, y_D)

ind_E = np.where(df["XC"]=='E')
data_E = X[np.ma.masked_array(ind_E[0]), :]
X_E, y_E = data_E[:, 0:30], data_E[:, 30]
sgd_clf.fit(X_E, y_E)

SGDClassifier(random_state=42)

In [13]:
show_predicts(sgd_clf, X_A, y_A)
show_predicts(sgd_clf, X_B, y_B)
show_predicts(sgd_clf, X_C, y_C)
show_predicts(sgd_clf, X_D, y_D)
show_predicts(sgd_clf, X_E, y_E)

0.967479674796748
0.975609756097561
0.975609756097561
0.9512195121951219
0.975609756097561
mean: 0.9691056910569106

0.972972972972973
0.954954954954955
0.9545454545454546
0.9636363636363636
0.9818181818181818
mean: 0.9655855855855856

0.9849624060150376
0.9699248120300752
0.9323308270676691
0.9548872180451128
0.9774436090225563
mean: 0.9639097744360902

0.9913793103448276
0.9741379310344828
0.9396551724137931
0.9482758620689655
0.9827586206896551
mean: 0.9672413793103448

0.9491525423728814
0.9491525423728814
0.940677966101695
0.9658119658119658
0.9914529914529915
mean: 0.959249601622483



In [14]:
# SVM time
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

# preparing dataset
df = pd.read_excel(r'/home/tuedo/Downloads/ChallengeDataSet-WellsFargo.xlsx')
X = df.drop(columns=["XC", "y"], axis=1).to_numpy()
y = df["y"].to_numpy()
X

array([[ 0.98570251,  0.89952702,  2.33228337, ..., -0.2186419 ,
        -0.33659053,  0.09678116],
       [ 0.53981647,  0.24360233,  1.63604842, ...,  1.11227118,
         1.4925845 , -0.49701979],
       [-1.42448337, -0.47053311,  0.09509954, ...,  0.1134828 ,
         0.52159431,  1.26920802],
       ...,
       [-0.88054027,  0.67847241,  0.49389683, ..., -1.64121657,
         0.28702535,  0.59160709],
       [ 0.15129148, -0.94992761, -1.96068344, ...,  1.05720288,
        -1.33557874, -0.1089474 ],
       [ 1.35191918, -0.3531196 , -0.91851878, ...,  0.86379156,
         0.55110331, -1.43263492]])

In [15]:
svm_clf = Pipeline((
    ("scaler", StandardScaler()),
    ("linear_svc", LinearSVC(C=1, loss="hinge")),
))

svm_clf.fit(X, y)



Pipeline(steps=[('scaler', StandardScaler()),
                ('linear_svc', LinearSVC(C=1, loss='hinge'))])

In [16]:
# fails to converge? perhaps we need a nonlinear classifier instead
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVC

polynomial_svm_clf = Pipeline((
    ("poly_features", PolynomialFeatures(degree=3)),
    ("scaler", StandardScaler()),
    ("svm_clf", LinearSVC(C=10, loss="hinge"))
))

polynomial_svm_clf.fit(X, y)

Pipeline(steps=[('poly_features', PolynomialFeatures(degree=3)),
                ('scaler', StandardScaler()),
                ('svm_clf', LinearSVC(C=10, loss='hinge'))])

In [17]:
poly_kernel_svm_clf = Pipeline((
    ("scaler", StandardScaler()),
    ("svm_clf", SVC(kernel="poly", degree=3, coef0=1, C=5)),
))

poly_kernel_svm_clf.fit(X,y)

Pipeline(steps=[('scaler', StandardScaler()),
                ('svm_clf', SVC(C=5, coef0=1, kernel='poly'))])

In [18]:
show_predicts(svm_clf, X, y)
show_predicts(polynomial_svm_clf, X, y)
show_predicts(poly_kernel_svm_clf, X, y)



0.885
0.895
0.9016666666666666
0.8833333333333333
0.8766666666666667
mean: 0.8883333333333333

0.8033333333333333
0.8216666666666667
0.805
0.83
0.8116666666666666
mean: 0.8143333333333335

0.84
0.8583333333333333
0.865
0.83
0.85
mean: 0.8486666666666667



In [19]:
def test_params_linearSVC(X, y):
    C_list = [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30]
    
    for param in C_list:
        clf = Pipeline((
            ("scaler", StandardScaler()),
            ("linear_svc", LinearSVC(C=param, loss="hinge")),
        ))
        clf.fit(X, y)
        
        print(param)
        show_predicts(clf, X, y)

test_params_linearSVC(X, y)

0.01
0.9
0.88
0.8716666666666667
0.88
0.8883333333333333
mean: 0.884

0.03
0.8966666666666666
0.8666666666666667
0.895
0.8766666666666667
0.8933333333333333
mean: 0.8856666666666667

0.1
0.905
0.8883333333333333
0.8816666666666667
0.8816666666666667
0.8666666666666667
mean: 0.8846666666666667

0.3
0.9
0.88
0.8983333333333333
0.895
0.87
mean: 0.8886666666666667

1
0.88
0.9033333333333333




0.8833333333333333
0.88
0.8983333333333333
mean: 0.8889999999999999

3
0.8933333333333333
0.89
0.8866666666666667
0.8883333333333333
0.875
mean: 0.8866666666666667





10
0.88
0.8783333333333333
0.8983333333333333
0.8816666666666667




0.8866666666666667
mean: 0.8850000000000001

30
0.9




0.8716666666666667
0.8783333333333333
0.8733333333333333
0.8666666666666667
mean: 0.8780000000000001





In [20]:
def mean_predicts(clf, X, y):
    skfolds = StratifiedKFold(n_splits=5, shuffle=True)
    s = 0;

    for train_index, cv_index in skfolds.split(X_train, y_train):
        clone_clf = clone(clf)
        X_train_folds = X_train[train_index]
        y_train_folds = (y_train[train_index])
        X_cv_fold = X_train[cv_index]
        y_cv_fold = (y_train[cv_index])
    
        clone_clf.fit(X_train_folds, y_train_folds)
        y_pred = clone_clf.predict(X_cv_fold)
        n_correct = sum(y_pred == y_cv_fold)
        s += n_correct / len(y_pred);
    return s / 5 # n_splits

In [21]:
# testing polynomial_svm_clf
C_list = [0.01, 0.03, 0.1, 0.3, 1, 3]
deg_list = [1, 2, 3]

max_mean = 0;
max_C = 0;
max_deg = 0;

for C in C_list:
    for deg in deg_list:
        clf = Pipeline((
            ("poly_features", PolynomialFeatures(degree=deg)),
            ("scaler", StandardScaler()),
            ("svm_clf", LinearSVC(C=C, loss="hinge"))
        ))

        clf.fit(X, y)
        
        mean = mean_predicts(clf, X, y)
        if (mean > max_mean):
            max_C = C
            max_deg = deg
            max_mean = mean

print(max_C, max_deg)





0.03 1


In [22]:
# testing poly_kernel_svm_clf
C_list = [0.01, 0.03, 0.1, 0.3, 1, 3]
deg_list = [1, 2, 3]

max_mean = 0;
max_C = 0;
max_deg = 0;

for C in C_list:
    for deg in deg_list:
        poly_kernel_svm_clf = Pipeline((
            ("scaler", StandardScaler()),
            ("svm_clf", SVC(kernel="poly", degree=deg, coef0=1, C=C)),
        ))

        poly_kernel_svm_clf.fit(X,y)
        
        mean = mean_predicts(clf, X, y)
        if (mean > max_mean):
            max_C = C
            max_deg = deg
            max_mean = mean

print(max_C, max_deg, max_mean)

0.01 2 0.83375


In [23]:
# 0.3 1 seems like best C and deg for kernel, but its still not as good as our 5-part Schocastic Gradient Descent
# Time for 5-part LinearSVM

best_svm_clf = Pipeline((
    ("scaler", StandardScaler()),
    ("linear_svc", LinearSVC(C=0.3, loss="hinge")),
))

ind_A = np.where(df["XC"]=='A')
ind_A[0]

array([   7,    9,   10,   14,   16,   33,   35,   43,   51,   52,   56,
         61,   63,   64,   67,   74,   75,   88,   94,   96,  106,  113,
        134,  140,  146,  150,  153,  155,  165,  171,  187,  196,  203,
        205,  212,  217,  223,  227,  228,  229,  230,  238,  239,  244,
        249,  251,  254,  262,  264,  267,  271,  272,  277,  278,  281,
        294,  300,  305,  308,  314,  321,  322,  325,  326,  334,  344,
        349,  352,  356,  374,  380,  392,  397,  402,  403,  408,  409,
        410,  411,  412,  416,  434,  435,  442,  446,  448,  450,  451,
        462,  464,  465,  467,  469,  472,  476,  480,  481,  483,  488,
        490,  496,  498,  499,  502,  505,  535,  538,  545,  554,  555,
        562,  572,  580,  586,  588,  591,  596,  601,  602,  603,  606,
        615,  618,  626,  634,  640,  645,  646,  647,  655,  657,  659,
        667,  671,  672,  673,  676,  686,  704,  707,  712,  719,  725,
        731,  739,  749,  754,  768,  770,  772,  7

In [24]:
X_A, y_A = X[np.ma.masked_array(ind_A[0]), :], y[np.ma.masked_array(ind_A[0])]
X_A.shape
y_A.shape

(615,)

In [25]:
best_svm_clf.fit(X_A, y_A)
show_predicts(best_svm_clf, X_A, y_A)

0.983739837398374
0.975609756097561
0.975609756097561
0.9512195121951219
0.967479674796748
mean: 0.9707317073170731





In [26]:
ind_B = np.where(df["XC"]=='B')
X_B, y_B = X[np.ma.masked_array(ind_B[0]), :], y[np.ma.masked_array(ind_B[0])]
best_svm_clf.fit(X_B, y_B)
show_predicts(best_svm_clf, X_B, y_B)

ind_C = np.where(df["XC"]=='C')
X_C, y_C = X[np.ma.masked_array(ind_C[0]), :], y[np.ma.masked_array(ind_C[0])]
best_svm_clf.fit(X_C, y_C)
show_predicts(best_svm_clf, X_C, y_C)

ind_D = np.where(df["XC"]=='D')
X_D, y_D = X[np.ma.masked_array(ind_D[0]), :], y[np.ma.masked_array(ind_D[0])]
best_svm_clf.fit(X_D, y_D)
show_predicts(best_svm_clf, X_D, y_D)

ind_E = np.where(df["XC"]=='E')
X_E, y_E = X[np.ma.masked_array(ind_E[0]), :], y[np.ma.masked_array(ind_E[0])]
best_svm_clf.fit(X_E, y_E)
show_predicts(best_svm_clf, X_E, y_E)

0.9099099099099099
0.954954954954955
0.9545454545454546
0.9545454545454546
0.9454545454545454
mean: 0.9438820638820639

0.9774436090225563
0.9548872180451128
0.9548872180451128
0.9924812030075187
0.9548872180451128
mean: 0.9669172932330827

0.9396551724137931
0.9741379310344828
0.9913793103448276
0.9655172413793104
0.9568965517241379
mean: 0.9655172413793103

0.9491525423728814
0.940677966101695
0.9322033898305084
0.9487179487179487
0.9658119658119658
mean: 0.9473127625669999



