In [1]:
from sklearn.datasets import make_classification
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import RFE
from sklearn import linear_model
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, SelectFromModel, f_regression
import seaborn as sns
%matplotlib inline

## Importing and sampling the UCI data / basic EDA

### FIRST BATCH OF 2200

In [2]:
X = pd.read_pickle('./data/first_batch_X.p')

In [15]:
y = pd.read_pickle('./data/first_batch_labels.p')

In [16]:
X.head()

Unnamed: 0_level_0,feat_000,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,feat_009,...,feat_990,feat_991,feat_992,feat_993,feat_994,feat_995,feat_996,feat_997,feat_998,feat_999
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
68411,0.020855,0.578265,-0.13612,-0.410538,-1.772283,1.187936,0.402231,1.176466,-0.792536,-1.821266,...,-1.492082,-0.302283,-0.259279,-1.326535,-1.993,0.540589,-1.205038,-0.581775,-0.613431,0.365626
118486,-0.66898,0.411098,-0.818625,-0.68011,1.437646,0.638755,0.201362,0.467585,-0.347586,0.962761,...,0.337309,0.404857,0.594332,0.80181,0.108186,0.218923,-0.4331,-1.383996,1.760135,0.256725
26213,-0.255765,0.04038,1.29295,-0.478335,-0.688653,-0.094722,0.387218,-0.938971,-1.000622,1.521842,...,-0.475559,0.576874,2.402998,0.330567,1.089679,1.599995,0.173667,-0.705471,0.473086,-0.595255
121169,-0.118851,-1.572511,0.104159,1.283724,0.024256,-0.257493,0.374294,-0.893251,-0.098985,0.223367,...,0.102171,-0.256557,0.064411,-1.307146,-1.028803,-0.813137,1.492722,1.395631,1.233597,0.769766
87524,-0.269251,1.187003,-1.003767,1.106124,1.524958,0.18264,0.801729,-1.49161,-0.143916,0.399354,...,2.051286,-0.200767,-1.2924,-0.173536,0.93777,2.080136,-0.41223,2.234453,-0.906541,1.336438


In [17]:
y.head()

_id
68411     1
118486    1
26213     1
121169    0
87524     0
Name: target, dtype: int64

In [18]:
X.shape, y.shape

((2200, 1000), (2200,))

## Preparing train, test split for:

- logistic regression
- decision tree
- k nearest neighbors
- support vector classifier

"in order to do this, you will need to set a high C value in order to perform minimal regularization, in the case of logistic regression and support vector classifier."

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size = .3,
                                                    )

In [20]:
X_train.shape, y_train.shape

((1540, 1000), (1540,))

In [21]:
simple_pipe_1 = Pipeline([
    ('lnr',  LogisticRegression())
])

In [22]:
simple_param_1 = {
    'lnr__C': [10,25,50,75,100]
}

In [23]:
simple2_gs = GridSearchCV(simple_pipe_1, param_grid=simple_param_1, cv=2, n_jobs=-1, verbose=1)

In [24]:
simple2_gs.fit(X_train, y_train)

Fitting 2 folds for each of 5 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    2.4s finished


GridSearchCV(cv=2, error_score='raise',
       estimator=Pipeline(steps=[('lnr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'lnr__C': [10, 25, 50, 75, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=1)

In [25]:
pd.DataFrame(simple2_gs.cv_results_).sort_values(by='rank_test_score').T

Unnamed: 0,0,1,2,3,4
mean_fit_time,0.229013,0.223942,0.236809,0.254783,0.247419
mean_score_time,0.0014652,0.00144172,0.00145471,0.00142419,0.00141573
mean_test_score,0.537662,0.536364,0.536364,0.535714,0.534416
mean_train_score,1,1,1,1,1
param_lnr__C,10,25,50,75,100
params,{'lnr__C': 10},{'lnr__C': 25},{'lnr__C': 50},{'lnr__C': 75},{'lnr__C': 100}
rank_test_score,1,2,3,4,5
split0_test_score,0.52987,0.525974,0.524675,0.525974,0.523377
split0_train_score,1,1,1,1,1
split1_test_score,0.545455,0.546753,0.548052,0.545455,0.545455


### Benchmark scores for Linear Regression with high C values

In [26]:
simple2_gs.score(X_train, y_train)

1.0

In [27]:
simple2_gs.best_score_

0.53766233766233762

# DECISION TREE


In [30]:
simple_pipe_2 =  Pipeline([
    ('tree', DecisionTreeClassifier())
])

In [31]:
simple_pipe_2.fit(X_train, y_train)

Pipeline(steps=[('tree', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'))])

### Benchmark scores for Decision Tree 

In [32]:
simple_pipe_2.score(X_train, y_train)

1.0

In [33]:
simple_pipe_2.score(X_test, y_test)

0.56212121212121213

# KNN


In [36]:
neigh = KNeighborsClassifier(n_neighbors=3)

In [37]:
neigh.fit(X_train, y_train) 

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')

In [38]:
neigh.predict_proba(X_train)

array([[ 1.        ,  0.        ],
       [ 0.        ,  1.        ],
       [ 0.66666667,  0.33333333],
       ..., 
       [ 0.33333333,  0.66666667],
       [ 0.66666667,  0.33333333],
       [ 0.33333333,  0.66666667]])

### Benchmark scores for KNN


In [42]:
neigh.score(X_train, y_train, sample_weight=None)

0.80389610389610389

In [43]:
neigh.score(X_test, y_test, sample_weight=None)

0.58636363636363631

# SVC


In [44]:
simple_pipe_3 =  Pipeline([
    ('svm', svm.SVC())
])

In [45]:
simple_param_3 = {
    'svm__C': [10,25,50,75,100]
}

In [48]:
simple3_gs = GridSearchCV(simple_pipe_3, param_grid=simple_param_3, cv=2, n_jobs=-1, verbose=1)

In [49]:
simple3_gs.fit(X_test, y_test) 

Fitting 2 folds for each of 5 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    3.9s finished


GridSearchCV(cv=2, error_score='raise',
       estimator=Pipeline(steps=[('svm', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'svm__C': [10, 25, 50, 75, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=1)

### Benchmark scores for SVC with high C values

In [50]:
simple3_gs.score(X_test, y_test) 

1.0

In [51]:
simple3_gs.best_score_

0.55454545454545456

In [52]:
pd.DataFrame(simple3_gs.cv_results_).sort_values(by='rank_test_score').T

Unnamed: 0,0,1,2,3,4
mean_fit_time,0.135491,0.134297,0.135167,0.132544,0.132144
mean_score_time,0.130336,0.128884,0.126179,0.127201,0.125927
mean_test_score,0.554545,0.554545,0.554545,0.554545,0.554545
mean_train_score,1,1,1,1,1
param_svm__C,10,25,50,75,100
params,{'svm__C': 10},{'svm__C': 25},{'svm__C': 50},{'svm__C': 75},{'svm__C': 100}
rank_test_score,1,1,1,1,1
split0_test_score,0.548485,0.548485,0.548485,0.548485,0.548485
split0_train_score,1,1,1,1,1
split1_test_score,0.560606,0.560606,0.560606,0.560606,0.560606


### SECOND BATCH OF 2200


In [54]:
X_2 = pd.read_pickle('./data/second_batch_X.p')

In [55]:
y_2 = pd.read_pickle('./data/second_batch_labels.p')

In [56]:
X_2.head()

Unnamed: 0_level_0,feat_000,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,feat_009,...,feat_990,feat_991,feat_992,feat_993,feat_994,feat_995,feat_996,feat_997,feat_998,feat_999
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
58633,-0.414192,0.556678,-0.048062,-1.09321,0.32669,1.181827,1.322816,0.355778,-0.820547,-0.612402,...,1.014297,0.414293,0.320118,0.331135,0.998828,0.653802,-0.870826,-0.094518,0.261509,-0.609016
121974,0.909779,-0.117772,-1.633679,-0.365221,-1.126885,0.884262,-0.406014,-0.577241,0.543661,0.020514,...,-1.07537,-1.119594,0.336924,1.174172,-0.09944,0.326672,1.326339,0.786394,-1.855981,-0.794993
164272,-0.186508,0.138429,-0.732191,0.776423,-0.560702,-0.946012,-0.043432,-0.177078,-0.240173,-1.023106,...,0.454122,-0.171164,1.632371,-0.07914,-0.287585,-0.052406,-0.702572,-0.084256,0.171039,1.359428
142434,0.48713,0.798981,-0.742441,-1.094174,-1.566471,-0.211503,0.354208,0.429679,-0.249969,0.905306,...,0.975815,0.294657,-1.327187,-0.474763,1.495184,-0.055151,0.865955,-0.522299,-0.010232,-0.287562
35139,1.304757,0.638154,1.263587,-0.781058,0.804523,0.949374,0.72236,-0.147807,0.559629,-1.149253,...,0.022418,-1.448466,0.124061,1.144825,-0.22514,-0.291749,0.508428,0.630435,-0.807956,0.567494


In [57]:
y_2.head()

_id
58633     1
121974    0
164272    1
142434    1
35139     1
Name: target, dtype: int64

In [58]:
X_2.shape, y_2.shape

((2200, 1000), (2200,))

## Preparing train, test split for:

- logistic regression
- decision tree
- k nearest neighbors
- support vector classifier

"in order to do this, you will need to set a high C value in order to perform minimal regularization, in the case of logistic regression and support vector classifier."

In [59]:
X_train, X_test, y_train, y_test = train_test_split(X_2,
                                                    y_2,
                                                    test_size = .3,
                                                    )

In [60]:
X_train.shape, y_train.shape

((1540, 1000), (1540,))

In [61]:
simple_pipe_1 = Pipeline([
    ('lnr',  LogisticRegression())
])

In [62]:
simple_param_1 = {
    'lnr__C': [10,25,50,75,100]
}

In [63]:
simple2_gs = GridSearchCV(simple_pipe_1, param_grid=simple_param_1, cv=2, n_jobs=-1, verbose=1)

In [64]:
simple2_gs.fit(X_train, y_train)

Fitting 2 folds for each of 5 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    2.6s finished


GridSearchCV(cv=2, error_score='raise',
       estimator=Pipeline(steps=[('lnr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'lnr__C': [10, 25, 50, 75, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=1)

In [65]:
pd.DataFrame(simple2_gs.cv_results_).sort_values(by='rank_test_score').T

Unnamed: 0,1,0,3,4,2
mean_fit_time,0.241772,0.244029,0.268722,0.267987,0.25733
mean_score_time,0.00146413,0.00145745,0.00145447,0.00145257,0.00145745
mean_test_score,0.541558,0.53961,0.53961,0.538961,0.537662
mean_train_score,1,1,1,1,1
param_lnr__C,25,10,75,100,50
params,{'lnr__C': 25},{'lnr__C': 10},{'lnr__C': 75},{'lnr__C': 100},{'lnr__C': 50}
rank_test_score,1,2,2,4,5
split0_test_score,0.564202,0.560311,0.564202,0.562905,0.559014
split0_train_score,1,1,1,1,1
split1_test_score,0.518856,0.518856,0.514954,0.514954,0.516255


In [66]:
simple2_gs.score(X_train, y_train)

1.0

In [67]:
simple2_gs.best_score_

0.54155844155844157

# DECISION TREE


In [68]:
simple_pipe_2 =  Pipeline([
    ('tree', DecisionTreeClassifier())
])

In [69]:
simple_pipe_2.fit(X_train, y_train)

Pipeline(steps=[('tree', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'))])

### Benchmark scores for Decision Tree 

In [70]:
simple_pipe_2.score(X_train, y_train)

1.0

In [71]:
simple_pipe_2.score(X_test, y_test)

0.59999999999999998

# KNN


In [72]:
neigh = KNeighborsClassifier(n_neighbors=3)

In [73]:
neigh.fit(X_train, y_train) 

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')

In [74]:
neigh.predict_proba(X_train)

array([[ 0.66666667,  0.33333333],
       [ 0.66666667,  0.33333333],
       [ 0.66666667,  0.33333333],
       ..., 
       [ 0.33333333,  0.66666667],
       [ 0.66666667,  0.33333333],
       [ 0.33333333,  0.66666667]])

### Benchmark scores for KNN


In [75]:
neigh.score(X_train, y_train, sample_weight=None)

0.80194805194805197

In [76]:
neigh.score(X_test, y_test, sample_weight=None)

0.58939393939393936

# SVC


In [77]:
simple_pipe_3 =  Pipeline([
    ('svm', svm.SVC())
])

In [78]:
simple_param_3 = {
    'svm__C': [10,25,50,75,100]
}

In [79]:
simple3_gs = GridSearchCV(simple_pipe_3, param_grid=simple_param_3, cv=2, n_jobs=-1, verbose=1)

In [80]:
simple3_gs.fit(X_test, y_test) 

Fitting 2 folds for each of 5 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    4.0s finished


GridSearchCV(cv=2, error_score='raise',
       estimator=Pipeline(steps=[('svm', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'svm__C': [10, 25, 50, 75, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=1)

### Benchmark scores for SVC with high C values

In [81]:
simple3_gs.score(X_test, y_test) 

1.0

In [82]:
simple3_gs.best_score_

0.59090909090909105

In [83]:
pd.DataFrame(simple3_gs.cv_results_).sort_values(by='rank_test_score').T

Unnamed: 0,0,1,2,3,4
mean_fit_time,0.135547,0.135126,0.135311,0.135447,0.136195
mean_score_time,0.130765,0.129511,0.128968,0.12903,0.130018
mean_test_score,0.590909,0.590909,0.590909,0.590909,0.590909
mean_train_score,1,1,1,1,1
param_svm__C,10,25,50,75,100
params,{'svm__C': 10},{'svm__C': 25},{'svm__C': 50},{'svm__C': 75},{'svm__C': 100}
rank_test_score,1,1,1,1,1
split0_test_score,0.607251,0.607251,0.607251,0.607251,0.607251
split0_train_score,1,1,1,1,1
split1_test_score,0.574468,0.574468,0.574468,0.574468,0.574468


### THIRD BATCH OF 2200


In [84]:
X_3 = pd.read_pickle('./data/third_batch_X.p')

In [85]:
y_3 = pd.read_pickle('./data/third_batch_labels.p')

In [86]:
X_3.head()

Unnamed: 0_level_0,feat_000,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,feat_009,...,feat_990,feat_991,feat_992,feat_993,feat_994,feat_995,feat_996,feat_997,feat_998,feat_999
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
150087,1.733559,-0.761334,-1.240347,-1.415227,0.703603,-0.439027,-0.049728,0.745157,-1.155612,-0.737401,...,2.591919,-0.010986,0.71074,-0.737911,0.513733,1.053194,0.475458,0.783257,-0.851222,1.870741
154907,0.861287,0.237212,-1.297588,1.634156,-1.269498,-0.794096,-1.220777,-0.340135,-0.412629,-0.319816,...,-0.509825,-1.686618,0.165523,-0.935342,-0.406321,0.02049,-1.824565,-1.648503,-0.140154,-0.700938
106532,0.021128,1.186682,0.324742,0.755476,-0.816199,-0.759361,0.032196,-0.959081,-0.493622,-0.9937,...,-1.807779,-1.285744,-0.161095,1.228016,-0.050208,-0.033661,-0.440945,-1.543513,-1.336015,1.734729
47807,-0.517406,0.245016,0.521708,-1.459276,-1.23208,-1.328985,-0.50343,0.050424,-0.76541,-0.649276,...,-0.297998,-0.44344,0.404448,1.525297,-1.888619,0.604289,-1.572321,-1.411331,-0.714153,0.554626
194259,0.622187,0.179418,-0.447626,-0.522262,-0.26989,-0.016812,-1.618383,-1.311677,-1.007634,0.794815,...,-0.52565,0.654571,-0.534706,1.286618,1.72976,0.082108,1.324152,0.890771,0.294307,-1.57599


In [87]:
y_3.head()

_id
150087    1
154907    0
106532    1
47807     0
194259    1
Name: target, dtype: int64

In [88]:
X_3.shape, y_3.shape

((2200, 1000), (2200,))

## Preparing train, test split for:

- logistic regression
- decision tree
- k nearest neighbors
- support vector classifier

"in order to do this, you will need to set a high C value in order to perform minimal regularization, in the case of logistic regression and support vector classifier."

In [90]:
X_train, X_test, y_train, y_test = train_test_split(X_3,
                                                    y_3,
                                                    test_size = .3,
                                                    )

In [91]:
X_train.shape, y_train.shape

((1540, 1000), (1540,))

In [92]:
simple_pipe_1 = Pipeline([
    ('lnr',  LogisticRegression())
])

In [93]:
simple_param_1 = {
    'lnr__C': [10,25,50,75,100]
}

In [94]:
simple2_gs = GridSearchCV(simple_pipe_1, param_grid=simple_param_1, cv=2, n_jobs=-1, verbose=1)

In [95]:
simple2_gs.fit(X_train, y_train)

Fitting 2 folds for each of 5 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    2.5s finished


GridSearchCV(cv=2, error_score='raise',
       estimator=Pipeline(steps=[('lnr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'lnr__C': [10, 25, 50, 75, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=1)

In [96]:
pd.DataFrame(simple2_gs.cv_results_).sort_values(by='rank_test_score').T

Unnamed: 0,0,4,1,3,2
mean_fit_time,0.232878,0.25433,0.23029,0.258338,0.260358
mean_score_time,0.00168622,0.00146556,0.001436,0.00143814,0.00139368
mean_test_score,0.522727,0.519481,0.517532,0.517532,0.516234
mean_train_score,1,1,1,1,1
param_lnr__C,10,100,25,75,50
params,{'lnr__C': 10},{'lnr__C': 100},{'lnr__C': 25},{'lnr__C': 75},{'lnr__C': 50}
rank_test_score,1,2,3,3,5
split0_test_score,0.539559,0.534371,0.533074,0.533074,0.531777
split0_train_score,1,1,1,1,1
split1_test_score,0.505852,0.504551,0.501951,0.501951,0.50065


In [97]:
simple2_gs.score(X_train, y_train)

1.0

In [98]:
simple2_gs.best_score_

0.52272727272727271

# DECISION TREE


In [99]:
simple_pipe_2 =  Pipeline([
    ('tree', DecisionTreeClassifier())
])

In [100]:
simple_pipe_2.fit(X_train, y_train)

Pipeline(steps=[('tree', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'))])

### Benchmark scores for Decision Tree 

In [101]:
simple_pipe_2.score(X_train, y_train)

1.0

In [102]:
simple_pipe_2.score(X_test, y_test)

0.58636363636363631

# KNN


In [103]:
neigh = KNeighborsClassifier(n_neighbors=3)

In [104]:
neigh.fit(X_train, y_train) 

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')

In [105]:
neigh.predict_proba(X_train)

array([[ 0.66666667,  0.33333333],
       [ 1.        ,  0.        ],
       [ 0.        ,  1.        ],
       ..., 
       [ 1.        ,  0.        ],
       [ 0.33333333,  0.66666667],
       [ 0.66666667,  0.33333333]])

### Benchmark scores for KNN


In [106]:
neigh.score(X_train, y_train, sample_weight=None)

0.79805194805194801

In [107]:
neigh.score(X_test, y_test, sample_weight=None)

0.60606060606060608

# SVC


In [108]:
simple_pipe_3 =  Pipeline([
    ('svm', svm.SVC())
])

In [109]:
simple_param_3 = {
    'svm__C': [10,25,50,75,100]
}

In [110]:
simple3_gs = GridSearchCV(simple_pipe_3, param_grid=simple_param_3, cv=2, n_jobs=-1, verbose=1)

In [111]:
simple3_gs.fit(X_test, y_test) 

Fitting 2 folds for each of 5 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    3.9s finished


GridSearchCV(cv=2, error_score='raise',
       estimator=Pipeline(steps=[('svm', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'svm__C': [10, 25, 50, 75, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=1)

### Benchmark scores for SVC with high C values

In [112]:
simple3_gs.score(X_test, y_test) 

1.0

In [113]:
simple3_gs.best_score_

0.59999999999999998

In [114]:
pd.DataFrame(simple3_gs.cv_results_).sort_values(by='rank_test_score').T

Unnamed: 0,0,1,2,3,4
mean_fit_time,0.135085,0.134869,0.134425,0.13465,0.134994
mean_score_time,0.129717,0.129396,0.128925,0.129411,0.127337
mean_test_score,0.6,0.6,0.6,0.6,0.6
mean_train_score,1,1,1,1,1
param_svm__C,10,25,50,75,100
params,{'svm__C': 10},{'svm__C': 25},{'svm__C': 50},{'svm__C': 75},{'svm__C': 100}
rank_test_score,1,1,1,1,1
split0_test_score,0.6,0.6,0.6,0.6,0.6
split0_train_score,1,1,1,1,1
split1_test_score,0.6,0.6,0.6,0.6,0.6
