# Code lab for Skripsi

Feature extraction part is done in `deep_feature_extractior.ipynb` files. This file is used to feature selection and train model

In [16]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.model_selection import train_test_split
from sklearn import metrics

import joblib

In [2]:
label = np.array([1]*1000+[0]*1000)

In [3]:
df = pd.read_parquet('data/orig_img_feature.parquet')
df.head()

Unnamed: 0,resnet_1,resnet_2,resnet_3,resnet_4,resnet_5,resnet_6,resnet_7,resnet_8,resnet_9,resnet_10,...,resnet_51191,resnet_51192,resnet_51193,resnet_51194,resnet_51195,resnet_51196,resnet_51197,resnet_51198,resnet_51199,resnet_51200
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.028299,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.585785,0.0,3.971417,1.891622,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.213993,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Feature Selection

Feature selection is done using `sklearn.feature_selection.SelectFromModel` with `sklearn.svm.SVC` as estimator. The result is saved in `data/selected_feature.csv` file

### Prefit Model

Fit the model first to get the feature coefficient

In [4]:
svm = SVC(kernel="linear", random_state=42)
svm.fit(df, label)

In [5]:
joblib.dump(svm, r'dumps\fitted_svm_full_features.joblib')

['dumps\\fitted_svm_full_features.joblib']

In [6]:
# svm = joblib.load(r'dumps\fitted_svm_full_features_splitted_data.joblib')

In [7]:
# save and inspect the coefficient
np.savetxt(r'data\svm_coef.txt', svm.coef_, delimiter='\n')
svm.coef_[0,:5]

array([-3.89170279e-04, -5.02684736e-04, -5.11681476e-04,  7.29777117e-05,
       -9.71169055e-05])

In [8]:
# manually calculate the mean
print(svm.coef_.mean())
print(abs(svm.coef_).mean())

6.137016942547516e-07
0.0004996106915457431


In [9]:
# manually get feature that are more than or equal to the mean
np.where(np.any(abs(svm.coef_) >= abs(svm.coef_).mean(), axis=0))[0].shape

(17675,)

### Run the selection

In [10]:
selector = SelectFromModel(svm, prefit=True)
selector.fit(df)

In [24]:
# selector = joblib.load('dumps\SelectFromModel_selector.joblib')

In [11]:
# threshold
selector.threshold_

0.0004996106915457431

In [12]:
selector.get_feature_names_out()

array(['resnet_2', 'resnet_3', 'resnet_14', ..., 'resnet_51198',
       'resnet_51199', 'resnet_51200'], dtype=object)

In [18]:
selected_features_df = pd.DataFrame(selector.transform(df),
                                    columns=selector.get_feature_names_out())
selected_features_df.to_parquet(r'data\selected_features.parquet')

## Classification

Classification is done using SVM with linear kernel.

### Hyperparameter tuning with Cross Validation

Find Hyperparameter C with cross validation and will be using k=10. Metrics being used is F1-Score, Recall, Precision, and Accuracy.

In [None]:
METRICS = ['f1', 'recall', 'precision', 'accuracy']
PARAM_GRID = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

svm = SVC(kernel="linear", random_state=42)
grid = GridSearchCV(svm, cv=10, param_grid=PARAM_GRID, scoring=METRICS, refit=False, verbose=3)
grid.fit(X=selected_feature_df, y=y_train)

In [None]:
grid.cv_results_

In [36]:
cv_result = pd.DataFrame(grid.cv_results_)
cv_result.to_csv(r'result\cv_result.csv')
cv_result

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,params,split0_test_f1,split1_test_f1,split2_test_f1,split3_test_f1,...,split3_test_accuracy,split4_test_accuracy,split5_test_accuracy,split6_test_accuracy,split7_test_accuracy,split8_test_accuracy,split9_test_accuracy,mean_test_accuracy,std_test_accuracy,rank_test_accuracy
0,127.653659,0.857349,10.43588,0.104752,0.001,{'C': 0.001},0.937853,0.972678,0.961326,0.955556,...,0.955556,0.916667,0.938889,0.933333,0.944444,0.911111,0.955556,0.942778,0.018266,1
1,127.563728,1.206513,10.450411,0.217847,0.01,{'C': 0.01},0.937853,0.972678,0.961326,0.955556,...,0.955556,0.916667,0.938889,0.933333,0.944444,0.911111,0.955556,0.942778,0.018266,1
2,125.3235,0.786658,10.257011,0.09781,0.1,{'C': 0.1},0.937853,0.972678,0.961326,0.955556,...,0.955556,0.916667,0.938889,0.933333,0.944444,0.911111,0.955556,0.942778,0.018266,1
3,125.276721,0.691191,10.255443,0.09846,1.0,{'C': 1},0.937853,0.972678,0.961326,0.955556,...,0.955556,0.916667,0.938889,0.933333,0.944444,0.911111,0.955556,0.942778,0.018266,1
4,125.043476,0.582035,10.231555,0.07118,10.0,{'C': 10},0.937853,0.972678,0.961326,0.955556,...,0.955556,0.916667,0.938889,0.933333,0.944444,0.911111,0.955556,0.942778,0.018266,1
5,125.141665,0.689061,10.255118,0.087312,100.0,{'C': 100},0.937853,0.972678,0.961326,0.955556,...,0.955556,0.916667,0.938889,0.933333,0.944444,0.911111,0.955556,0.942778,0.018266,1
6,125.15188,0.651139,10.323431,0.250689,1000.0,{'C': 1000},0.937853,0.972678,0.961326,0.955556,...,0.955556,0.916667,0.938889,0.933333,0.944444,0.911111,0.955556,0.942778,0.018266,1


In [37]:
cv_result[['param_C', 'mean_test_f1', 'mean_test_precision', 'mean_test_recall', 'mean_test_accuracy']]

Unnamed: 0,param_C,mean_test_f1,mean_test_precision,mean_test_recall,mean_test_accuracy
0,0.001,0.942419,0.945346,0.940049,0.942778
1,0.01,0.942419,0.945346,0.940049,0.942778
2,0.1,0.942419,0.945346,0.940049,0.942778
3,1.0,0.942419,0.945346,0.940049,0.942778
4,10.0,0.942419,0.945346,0.940049,0.942778
5,100.0,0.942419,0.945346,0.940049,0.942778
6,1000.0,0.942419,0.945346,0.940049,0.942778


### Build and run the model

Because all metrics are the same on all C, we train it using the C that provide the least fit time, that is C=10

In [15]:
best_param_svm = SVC(kernel="linear", C=1, random_state=42)

## Evaluate Model

Evaluate using cross validation

In [None]:
# selected_features_df = pd.read_parquet(r'data\selected_features.parquet')

In [19]:
scores = cross_validate(best_param_svm, selected_features_df, label,
                        cv=10, scoring=['f1', 'accuracy', 'recall', 'precision'],
                        verbose=3)

[CV] END  accuracy: (test=0.890) f1: (test=0.896) precision: (test=0.848) recall: (test=0.950) total time= 3.0min
[CV] END  accuracy: (test=0.935) f1: (test=0.932) precision: (test=0.978) recall: (test=0.890) total time= 3.0min
[CV] END  accuracy: (test=0.905) f1: (test=0.897) precision: (test=0.976) recall: (test=0.830) total time= 2.9min
[CV] END  accuracy: (test=0.870) f1: (test=0.856) precision: (test=0.963) recall: (test=0.770) total time= 3.0min
[CV] END  accuracy: (test=0.940) f1: (test=0.937) precision: (test=0.989) recall: (test=0.890) total time= 2.9min
[CV] END  accuracy: (test=0.970) f1: (test=0.970) precision: (test=0.980) recall: (test=0.960) total time= 3.0min
[CV] END  accuracy: (test=0.780) f1: (test=0.802) precision: (test=0.730) recall: (test=0.890) total time= 2.8min
[CV] END  accuracy: (test=0.960) f1: (test=0.961) precision: (test=0.934) recall: (test=0.990) total time= 3.1min
[CV] END  accuracy: (test=0.940) f1: (test=0.943) precision: (test=0.900) recall: (test=

In [21]:
cv_scores_df = pd.DataFrame(scores)
cv_scores_df.to_csv(r'result\cv_scores.csv')
cv_scores_df

Unnamed: 0,fit_time,score_time,test_f1,test_accuracy,test_recall,test_precision
0,165.425424,13.626449,0.896226,0.89,0.95,0.848214
1,167.296643,13.811072,0.931937,0.935,0.89,0.978022
2,163.531021,13.304779,0.897297,0.905,0.83,0.976471
3,165.811874,12.9338,0.855556,0.87,0.77,0.9625
4,162.199522,13.184747,0.936842,0.94,0.89,0.988889
5,168.695342,13.323353,0.969697,0.97,0.96,0.979592
6,157.792019,12.811998,0.801802,0.78,0.89,0.729508
7,169.70142,13.536185,0.961165,0.96,0.99,0.933962
8,164.03042,12.657279,0.942857,0.94,0.99,0.9
9,152.751681,12.349239,0.902913,0.9,0.93,0.877358
