In [25]:
from sklearn.metrics import classification_report, confusion_matrix

In [26]:
import numpy as np

In [2]:
from sklearn.datasets import make_classification
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn import tree
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, SelectFromModel, f_regression
import seaborn as sns
%matplotlib inline

## Importing and sampling the UCI data / basic EDA

In [5]:
df_train_data = pd.read_csv('madelon_train.data.csv', delimiter=' ', header=None).drop(500, axis=1)

In [6]:
df_train_labels = pd.read_csv('madelon_train.labels.csv', delimiter=' ', header=None)

In [7]:
train_data_10pct_1 = df_train_data.sample(200)

In [8]:
y = df_train_labels[0]

In [9]:
y_10pct_1 = y.sample(200)

In [11]:
train_data_10pct_1.shape, y_10pct_1.shape

((200, 500), (200,))

In [12]:
df_train_data.insert(0, 'label', y)

In [13]:
train_data_10pct_1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
508,474,423,448,485,481,480,445,476,477,478,...,516,475,464,636,485,578,465,465,537,499
444,491,486,514,480,576,479,458,478,482,484,...,486,479,489,442,523,480,491,480,564,531
536,472,452,580,481,474,466,519,476,487,482,...,489,474,458,501,459,548,475,466,491,501
37,488,462,538,479,542,485,477,476,528,471,...,519,475,489,689,448,465,483,462,525,513
901,474,526,476,488,511,492,490,476,473,488,...,459,475,460,348,456,552,491,473,495,484


In [14]:
y_10pct_1.head()

684     1
301    -1
1908   -1
1360   -1
154    -1
Name: 0, dtype: int64

## Preparing train, test split for:

- logistic regression
- decision tree
- k nearest neighbors
- support vector classifier

"in order to do this, you will need to set a high C value in order to perform minimal regularization, in the case of logistic regression and support vector classifier."

In [15]:
X_train, X_test, y_train, y_test = train_test_split(train_data_10pct_1,
                                                    y_10pct_1,
                                                    test_size = .3,
                                                    )

In [16]:
X_train.shape, y_train.shape

((140, 500), (140,))

# LOGISTIC REGRESSION

In [17]:
simple_pipe_1 = Pipeline([
    ('lnr',  LogisticRegression())
])

In [18]:
simple_param_1 = {
    'lnr__C': [10,25,50,75,100]
}

In [19]:
simple2_gs = GridSearchCV(simple_pipe_1, param_grid=simple_param_1, cv=5, n_jobs=-1, verbose=1)

In [20]:
simple2_gs.fit(X_train, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:    0.3s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('lnr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'lnr__C': [10, 25, 50, 75, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=1)

In [21]:
pd.DataFrame(simple2_gs.cv_results_).sort_values(by='rank_test_score').T

Unnamed: 0,0,1,2,3,4
mean_fit_time,0.00849185,0.00811915,0.00875969,0.00921001,0.00860438
mean_score_time,0.000611401,0.000513744,0.000508165,0.000522184,0.000531101
mean_test_score,0.507143,0.507143,0.507143,0.507143,0.507143
mean_train_score,1,1,1,1,1
param_lnr__C,10,25,50,75,100
params,{'lnr__C': 10},{'lnr__C': 25},{'lnr__C': 50},{'lnr__C': 75},{'lnr__C': 100}
rank_test_score,1,1,1,1,1
split0_test_score,0.586207,0.586207,0.586207,0.586207,0.586207
split0_train_score,1,1,1,1,1
split1_test_score,0.482759,0.482759,0.482759,0.482759,0.482759


### Benchmark scores for Linear Regression with high C values

In [22]:
simple2_gs.score(X_train, y_train)

1.0

In [120]:
simple2_gs.best_score_

0.49285714285714288

In [23]:
#y predictions
y_test_pred = simple2_gs.predict(X_test)

In [27]:
print(classification_report(y_test, y_test_pred))

             precision    recall  f1-score   support

         -1       0.58      0.43      0.49        35
          1       0.41      0.56      0.47        25

avg / total       0.51      0.48      0.48        60



In [96]:
pd.DataFrame(confusion_matrix(y_test, y_test_pred), columns=['predicted -1', 'predicted 1'], index=['-1', '1'])

Unnamed: 0,predicted -1,predicted 1
-1,14,18
1,17,11


In [97]:
14+18+17+11

60

# DECISION TREE

In [28]:
simple_pipe_2 =  Pipeline([
    ('tree', tree.DecisionTreeClassifier())
])

In [29]:
simple_pipe_2.fit(X_train, y_train)

Pipeline(steps=[('tree', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'))])

### Benchmark scores for Decision Tree 

In [30]:
simple_pipe_2.score(X_train, y_train)

1.0

In [31]:
simple_pipe_2.score(X_test, y_test)

0.46666666666666667

# KNN


In [32]:
nbrs = NearestNeighbors(n_neighbors=2, algorithm='auto').fit(train_data_10pct_1)

In [33]:
nbrs

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=1, n_neighbors=2, p=2, radius=1.0)

In [34]:
neigh = KNeighborsClassifier(n_neighbors=3)

In [35]:
neigh.fit(X_train, y_train) 

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')

In [36]:
neigh.predict_proba(X_train)

array([[ 0.33333333,  0.66666667],
       [ 0.33333333,  0.66666667],
       [ 0.66666667,  0.33333333],
       [ 0.33333333,  0.66666667],
       [ 0.33333333,  0.66666667],
       [ 0.33333333,  0.66666667],
       [ 0.66666667,  0.33333333],
       [ 1.        ,  0.        ],
       [ 0.66666667,  0.33333333],
       [ 0.33333333,  0.66666667],
       [ 0.66666667,  0.33333333],
       [ 0.66666667,  0.33333333],
       [ 0.66666667,  0.33333333],
       [ 0.33333333,  0.66666667],
       [ 0.33333333,  0.66666667],
       [ 0.33333333,  0.66666667],
       [ 1.        ,  0.        ],
       [ 0.33333333,  0.66666667],
       [ 0.33333333,  0.66666667],
       [ 0.33333333,  0.66666667],
       [ 0.33333333,  0.66666667],
       [ 0.66666667,  0.33333333],
       [ 0.66666667,  0.33333333],
       [ 0.33333333,  0.66666667],
       [ 0.        ,  1.        ],
       [ 0.66666667,  0.33333333],
       [ 0.66666667,  0.33333333],
       [ 0.33333333,  0.66666667],
       [ 0.33333333,

### Benchmark scores for KNN

In [37]:
neigh.score(X_train, y_train, sample_weight=None)

0.7142857142857143

In [38]:
neigh.score(X_test, y_test, sample_weight=None)

0.51666666666666672

# SVC

In [39]:
simple_pipe_3 =  Pipeline([
    ('svm', svm.SVC())
])

In [40]:
simple_param_3 = {
    'svm__C': [10,25,50,75,100]
}

In [41]:
simple3_gs = GridSearchCV(simple_pipe_3, param_grid=simple_param_3, cv=5, n_jobs=-1, verbose=1)

In [42]:
simple3_gs.fit(X_test, y_test) 

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:    0.2s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('svm', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'svm__C': [10, 25, 50, 75, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=1)

### Benchmark scores for SVC with high C values

In [43]:
simple3_gs.score(X_test, y_test) 

1.0

In [44]:
simple3_gs.best_score_

0.58333333333333337

In [45]:
pd.DataFrame(simple3_gs.cv_results_).sort_values(by='rank_test_score').T

Unnamed: 0,0,1,2,3,4
mean_fit_time,0.00322428,0.00308552,0.00308661,0.00307021,0.00316405
mean_score_time,0.000713968,0.000794125,0.000695467,0.000690699,0.000705051
mean_test_score,0.583333,0.583333,0.583333,0.583333,0.583333
mean_train_score,1,1,1,1,1
param_svm__C,10,25,50,75,100
params,{'svm__C': 10},{'svm__C': 25},{'svm__C': 50},{'svm__C': 75},{'svm__C': 100}
rank_test_score,1,1,1,1,1
split0_test_score,0.583333,0.583333,0.583333,0.583333,0.583333
split0_train_score,1,1,1,1,1
split1_test_score,0.583333,0.583333,0.583333,0.583333,0.583333
