In [1]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#### Get the training data

In [2]:
# training data
df = pd.read_csv(
    'https://github.com/wintonw/ISE364/raw/master/Midterm/project_data.csv')

#### Exploratory Data Analysis

In [3]:
df.head()

Unnamed: 0,V0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,target
0,A,6.8,0.63,0.04,1.3,0.058,25.0,133.0,0.99271,3.17,0.39,10.2,0
1,A,7.2,0.19,0.31,6.3,0.034,17.0,103.0,0.99305,3.15,0.52,11.4,0
2,A,6.5,0.36,0.31,13.55,0.053,20.0,113.0,0.99544,3.2,0.56,11.0,1
3,A,6.0,0.39,0.26,2.7,0.038,39.0,187.0,0.99325,3.41,0.5,10.8,1
4,A,6.2,0.36,0.45,10.4,0.06,22.0,184.0,0.99711,3.31,0.56,9.8,1


In [4]:
df.describe()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,target
count,5497.0,5497.0,5497.0,5497.0,5497.0,5497.0,5497.0,5497.0,5497.0,5497.0,5497.0,5497.0
mean,7.219893,0.341168,0.318972,5.458959,0.056343,30.575678,116.016009,0.994745,3.218374,0.532176,10.47084,0.795707
std,1.292492,0.164918,0.146604,4.787672,0.035012,17.85284,56.923871,0.002995,0.160303,0.149103,1.185713,0.403221
min,3.8,0.08,0.0,0.6,0.009,1.0,6.0,0.98711,2.72,0.22,8.0,0.0
25%,6.4,0.23,0.25,1.8,0.038,17.0,77.0,0.9924,3.11,0.43,9.5,1.0
50%,7.0,0.29,0.31,3.0,0.047,29.0,119.0,0.99494,3.21,0.51,10.3,1.0
75%,7.7,0.41,0.4,8.1,0.066,41.0,156.0,0.997,3.32,0.6,11.3,1.0
max,15.9,1.33,1.66,65.8,0.611,289.0,440.0,1.03898,4.01,2.0,14.9,1.0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5497 entries, 0 to 5496
Data columns (total 13 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   V0      5497 non-null   object 
 1   V1      5497 non-null   float64
 2   V2      5497 non-null   float64
 3   V3      5497 non-null   float64
 4   V4      5497 non-null   float64
 5   V5      5497 non-null   float64
 6   V6      5497 non-null   float64
 7   V7      5497 non-null   float64
 8   V8      5497 non-null   float64
 9   V9      5497 non-null   float64
 10  V10     5497 non-null   float64
 11  V11     5497 non-null   float64
 12  target  5497 non-null   int64  
dtypes: float64(11), int64(1), object(1)
memory usage: 558.4+ KB


In [None]:
sns.pairplot(df)

In [8]:
df['target'].unique()

array([0, 1])

In [7]:
df['V0'].unique()

array(['A', 'B'], dtype=object)

**No nulls, but V0 will need to be transformed into numbers**

#### Clean up data

In [9]:
# V0 to binary
V0 = pd.get_dummies(df['V0'], drop_first=True)
df.drop(['V0'], axis=1, inplace=True)
df = pd.concat([df, V0], axis=1)

#### Train Test Split

In [10]:
X = df.drop('target', axis=1)
y = df.target
X_train, X_Test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=12)

### Train a Support Vector Machine Classifier Model (SVM), in grid

#### Set up train, with custom parameters 

In [11]:
param_grid = {'C':[0.1, 10, 100, 1000, 50], 'gamma':[1, 0.1, 0.01, 0.001, 0.0001, 0.005], 'kernel':['rbf']}
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=3)
grid.fit(X_train, y_train)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.795, total=   0.6s
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s remaining:    0.0s


[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.795, total=   0.6s
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.1s remaining:    0.0s


[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.795, total=   0.6s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.795, total=   0.5s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.795, total=   0.6s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.795, total=   0.6s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.795, total=   0.6s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.795, total=   0.6s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.795, total=   0.6s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] .

[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:  2.8min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.1, 10, 100, 1000, 50],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001, 0.005],
                         'kernel': ['rbf']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

#### Model Evaluation

In [13]:
print(grid.best_params_)
grid_predictions = grid.predict(X_Test)
print(confusion_matrix(y_test, grid_predictions))
print(classification_report(y_test, grid_predictions))

{'C': 10, 'gamma': 1, 'kernel': 'rbf'}
[[  78  255]
 [   6 1311]]
              precision    recall  f1-score   support

           0       0.93      0.23      0.37       333
           1       0.84      1.00      0.91      1317

    accuracy                           0.84      1650
   macro avg       0.88      0.61      0.64      1650
weighted avg       0.86      0.84      0.80      1650



**This is not prefect but it's not bad**

### Unbalance Cases

#### Treat it as underbalanced

In [14]:
from imblearn.under_sampling import RandomUnderSampler

In [15]:
rus = RandomUnderSampler(random_state=12)
X_resample, y_resample = rus.fit_sample(X_train, y_train)



In [17]:
param_grid = {'C':[0.1, 10, 100, 1000, 50], 'gamma':[1, 0.1, 0.01, 0.001, 0.0001, 0.005], 'kernel':['rbf']}
grid_under = GridSearchCV(SVC(), param_grid, refit=True, verbose=3)
grid_under.fit(X_resample, y_resample)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.582, total=   0.1s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.506, total=   0.1s
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.2s remaining:    0.0s


[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.585, total=   0.1s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.601, total=   0.1s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.513, total=   0.1s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.646, total=   0.1s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.566, total=   0.1s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.646, total=   0.1s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.538, total=   0.1s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] .

[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:   26.5s finished


GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.1, 10, 100, 1000, 50],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001, 0.005],
                         'kernel': ['rbf']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [18]:
print(grid_under.best_params_)
grid_under_predictions = grid_under.predict(X_Test)
print(confusion_matrix(y_test, grid_under_predictions))
print(classification_report(y_test, grid_under_predictions))

{'C': 1000, 'gamma': 0.0001, 'kernel': 'rbf'}
[[218 115]
 [373 944]]
              precision    recall  f1-score   support

           0       0.37      0.65      0.47       333
           1       0.89      0.72      0.79      1317

    accuracy                           0.70      1650
   macro avg       0.63      0.69      0.63      1650
weighted avg       0.79      0.70      0.73      1650



**Worse than the first model where there was no balancing**

#### Overbalanced

In [16]:
from imblearn.over_sampling import SMOTE

In [19]:
sm = SMOTE(random_state=12, sampling_strategy='auto')
X_over, y_over = sm.fit_sample(X_train, y_train)



In [20]:
param_grid = {'C':[0.1, 10, 100, 1000, 50], 'gamma':[1, 0.1, 0.01, 0.001, 0.0001, 0.005], 'kernel':['rbf']}
grid_over = GridSearchCV(SVC(), param_grid, refit=True, verbose=3)
grid_over.fit(X_over, y_over)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.590, total=   1.5s
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.5s remaining:    0.0s


[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.539, total=   1.4s
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.9s remaining:    0.0s


[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.518, total=   1.4s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.521, total=   1.4s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.532, total=   1.4s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.694, total=   1.7s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.724, total=   1.7s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.718, total=   1.7s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.696, total=   1.7s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] .

[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:  5.5min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.1, 10, 100, 1000, 50],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001, 0.005],
                         'kernel': ['rbf']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [21]:
print(grid_over.best_params_)
grid_over_predictions = grid_over.predict(X_Test)
print(confusion_matrix(y_test, grid_over_predictions))
print(classification_report(y_test, grid_over_predictions))

{'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
[[ 148  185]
 [ 209 1108]]
              precision    recall  f1-score   support

           0       0.41      0.44      0.43       333
           1       0.86      0.84      0.85      1317

    accuracy                           0.76      1650
   macro avg       0.64      0.64      0.64      1650
weighted avg       0.77      0.76      0.76      1650



**Worse than the first model where there was no balancing**

### Predict new observation data

#### Get the observation data

In [49]:
df_obs = pd.read_csv('https://github.com/wintonw/ISE364/raw/master/Midterm/new_obs.csv')

#### Check for abnormalities 

In [23]:
df_obs.head()

Unnamed: 0,V0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11
0,A,5.7,0.21,0.25,1.1,0.035,26.0,81.0,0.9902,3.31,0.52,11.4
1,B,8.7,0.78,0.51,1.7,0.415,12.0,66.0,0.99623,3.0,1.17,9.2
2,A,7.3,0.26,0.53,12.7,0.047,60.5,156.0,0.9984,3.06,0.45,9.1
3,A,6.9,0.4,0.37,8.9,0.053,36.0,148.0,0.996,3.16,0.5,9.3
4,A,7.0,0.22,0.26,2.8,0.036,44.0,132.0,0.99078,3.34,0.41,12.0


In [24]:
df_obs.describe()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,7.1901,0.33141,0.31677,5.3568,0.054337,30.2485,114.2525,0.994431,3.2192,0.52628,10.607023
std,1.318262,0.162915,0.138092,4.591515,0.03512,17.175724,54.262113,0.003007,0.163504,0.147138,1.224684
min,3.9,0.08,0.0,0.8,0.013,3.0,7.0,0.98713,2.84,0.23,8.4
25%,6.4,0.22,0.25,1.8,0.036,17.0,79.0,0.991888,3.1,0.42,9.5
50%,7.0,0.29,0.3,2.9,0.046,29.0,115.5,0.994525,3.21,0.5,10.5
75%,7.7,0.39,0.38,8.025,0.06,40.0,151.25,0.9968,3.3225,0.6,11.4
max,15.6,1.58,0.76,19.5,0.422,124.0,243.0,1.0032,3.9,1.36,14.2


In [37]:
df_obs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   V1      1000 non-null   float64
 1   V2      1000 non-null   float64
 2   V3      1000 non-null   float64
 3   V4      1000 non-null   float64
 4   V5      1000 non-null   float64
 5   V6      1000 non-null   float64
 6   V7      1000 non-null   float64
 7   V8      1000 non-null   float64
 8   V9      1000 non-null   float64
 9   V10     1000 non-null   float64
 10  V11     1000 non-null   float64
 11  V0      1000 non-null   uint8  
 12  0       1000 non-null   int64  
dtypes: float64(11), int64(1), uint8(1)
memory usage: 94.9 KB


#### Format obs data for predictions

In [50]:
V0_obs = pd.get_dummies(df_obs['V0'], drop_first=True)
df_obs.drop(['V0'], axis=1, inplace=True)
df_obs = pd.concat([df_obs, V0_obs], axis=1)
df_obs.rename(columns={"B": "V0"}, inplace=True)

#### Predict

In [34]:
obs_predictions = grid.predict(df_obs)

#### Save the prediction to the same obs csv

In [51]:
df_obs = pd.concat([df_obs, pd.Series(obs_predictions, name='target')], axis=1)

In [53]:
df_obs

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V0,target
0,5.7,0.21,0.25,1.1,0.035,26.0,81.0,0.99020,3.31,0.52,11.4,0,1
1,8.7,0.78,0.51,1.7,0.415,12.0,66.0,0.99623,3.00,1.17,9.2,1,1
2,7.3,0.26,0.53,12.7,0.047,60.5,156.0,0.99840,3.06,0.45,9.1,0,1
3,6.9,0.40,0.37,8.9,0.053,36.0,148.0,0.99600,3.16,0.50,9.3,0,1
4,7.0,0.22,0.26,2.8,0.036,44.0,132.0,0.99078,3.34,0.41,12.0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,5.7,0.21,0.24,2.3,0.047,60.0,189.0,0.99500,3.65,0.72,10.1,0,1
996,7.0,0.12,0.29,10.3,0.039,41.0,98.0,0.99564,3.19,0.38,9.8,0,1
997,11.6,0.41,0.54,1.5,0.095,22.0,41.0,0.99735,3.02,0.76,9.9,1,0
998,6.3,0.27,0.37,7.9,0.047,58.0,215.0,0.99542,3.19,0.48,9.5,0,1


In [54]:
df_obs.to_csv('predictions.csv')