In [1]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
data = np.load('/content/gdrive/MyDrive/AIL_project/data/data_pca_50_target.npz')
data.files

['arr_0', 'arr_1']

In [4]:
data.allow_pickle = True

In [5]:
X = data['arr_0'] # pca data with 50 components
y = data['arr_1'] # target or dependent variable

In [6]:
X.shape,y.shape

((3020, 50), (3020,))

Spliting Data

In [7]:
x_train,x_test, y_train, y_test = train_test_split(X,y,test_size=0.2,stratify=y)

print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(2416, 50) (604, 50) (2416,) (604,)


SMOTE

In [8]:
from imblearn.over_sampling import SMOTE
from collections import Counter
counter = Counter(y)
print('Before', counter)
smt = SMOTE()
X_train_sm , y_train_sm = smt.fit_resample(x_train, y_train)
counter = Counter(y_train_sm)
print('After: ', counter)



Before Counter({'low': 923, 'empty': 882, 'medium': 686, 'high': 377, 'jam': 152})
After:  Counter({'low': 738, 'high': 738, 'medium': 738, 'empty': 738, 'jam': 738})


## Model

**Support Vector Machine**

In [9]:
model_svc = SVC(probability=True)

param_grid = {'C':[0.5,1,10,20,30,50],
             'kernel':['rbf','poly'],
             'gamma':[0.1,0.05,0.01,0.001,0.002,0.005],
             'coef0':[0,1]}

In [10]:
model_grid = GridSearchCV(model_svc,
                          param_grid=param_grid,
                          scoring='accuracy',cv=3,verbose=2)

In [11]:
model_grid.fit(X_train_sm , y_train_sm)

Fitting 3 folds for each of 144 candidates, totalling 432 fits
[CV] END ..............C=0.5, coef0=0, gamma=0.1, kernel=rbf; total time=   2.5s
[CV] END ..............C=0.5, coef0=0, gamma=0.1, kernel=rbf; total time=   3.6s
[CV] END ..............C=0.5, coef0=0, gamma=0.1, kernel=rbf; total time=   2.8s
[CV] END .............C=0.5, coef0=0, gamma=0.1, kernel=poly; total time=   1.4s
[CV] END .............C=0.5, coef0=0, gamma=0.1, kernel=poly; total time=   1.5s
[CV] END .............C=0.5, coef0=0, gamma=0.1, kernel=poly; total time=   1.5s
[CV] END .............C=0.5, coef0=0, gamma=0.05, kernel=rbf; total time=   2.0s
[CV] END .............C=0.5, coef0=0, gamma=0.05, kernel=rbf; total time=   2.3s
[CV] END .............C=0.5, coef0=0, gamma=0.05, kernel=rbf; total time=   3.2s
[CV] END ............C=0.5, coef0=0, gamma=0.05, kernel=poly; total time=   1.5s
[CV] END ............C=0.5, coef0=0, gamma=0.05, kernel=poly; total time=   1.3s
[CV] END ............C=0.5, coef0=0, gamma=0.0

In [12]:
model_grid.best_params_

{'C': 50, 'coef0': 0, 'gamma': 0.1, 'kernel': 'rbf'}

In [13]:
model_final = model_grid.best_estimator_

In [14]:
model_final.get_params()

{'C': 50,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 0.1,
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': True,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [15]:
y_pred = model_final.predict(x_test)

**KNN**

In [16]:
scaler= StandardScaler()
knn = KNeighborsClassifier()

In [17]:
temp = [('scaler', scaler),('knn', knn)]

In [18]:
pipeline = Pipeline(temp)

In [19]:
from sklearn.model_selection import GridSearchCV
param_grid = {'knn__n_neighbors':[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                                              12, 13, 14, 15, 16, 17, 18, 19,
                                              20, 21, 22, 23, 24, 25, 26, 27,
                                              28, 29]}
grid_search = GridSearchCV(cv=5,estimator=pipeline, param_grid=param_grid, scoring='accuracy')
grid_search.fit(X_train_sm , y_train_sm)

In [20]:
print(grid_search)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('knn', KNeighborsClassifier())]),
             param_grid={'knn__n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                                              12, 13, 14, 15, 16, 17, 18, 19,
                                              20, 21, 22, 23, 24, 25, 26, 27,
                                              28, 29]},
             scoring='accuracy')


In [22]:
pipeline.get_params()

{'memory': None,
 'steps': [('scaler', StandardScaler()), ('knn', KNeighborsClassifier())],
 'verbose': False,
 'scaler': StandardScaler(),
 'knn': KNeighborsClassifier(),
 'scaler__copy': True,
 'scaler__with_mean': True,
 'scaler__with_std': True,
 'knn__algorithm': 'auto',
 'knn__leaf_size': 30,
 'knn__metric': 'minkowski',
 'knn__metric_params': None,
 'knn__n_jobs': None,
 'knn__n_neighbors': 5,
 'knn__p': 2,
 'knn__weights': 'uniform'}

### Evaluation

**SVM**

In [23]:
cr = metrics.classification_report(y_test,y_pred,output_dict=True)
pd.DataFrame(cr).T

Unnamed: 0,precision,recall,f1-score,support
empty,0.779487,0.863636,0.819407,176.0
high,0.610169,0.473684,0.533333,76.0
jam,0.607143,0.566667,0.586207,30.0
low,0.596685,0.583784,0.590164,185.0
medium,0.524823,0.540146,0.532374,137.0
accuracy,0.640728,0.640728,0.640728,0.640728
macro avg,0.623661,0.605583,0.612297,604.0
weighted avg,0.635868,0.640728,0.636508,604.0


**KNN**

In [24]:
best_estimator = grid_search.best_estimator_
y_pred = best_estimator.predict(x_test)

In [25]:
cr = metrics.classification_report(y_test,y_pred,output_dict=True)
pd.DataFrame(cr).T

Unnamed: 0,precision,recall,f1-score,support
empty,0.765306,0.852273,0.806452,176.0
high,0.569444,0.539474,0.554054,76.0
jam,0.578947,0.733333,0.647059,30.0
low,0.634146,0.562162,0.595989,185.0
medium,0.529851,0.518248,0.523985,137.0
accuracy,0.642384,0.642384,0.642384,0.642384
macro avg,0.615539,0.641098,0.625508,604.0
weighted avg,0.637826,0.642384,0.638244,604.0


- Sau khi sử dụng cả SMOTE và PCA thì khả năng phân loại của cả hai mô hình đều tăng lên. Và có các chỉ số đánh giá khá tốt.

-----
# **END**