# Code lab for Skripsi

Feature extraction part is done in `deep_feature_extractior.ipynb` files. This file is used to feature selection and train model

In [46]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate

import joblib

In [2]:
label = np.array([1]*1000+[0]*1000)
df = pd.read_parquet('data/orig_img_feature.parquet')
df.head()

Unnamed: 0,resnet_1,resnet_2,resnet_3,resnet_4,resnet_5,resnet_6,resnet_7,resnet_8,resnet_9,resnet_10,...,resnet_51191,resnet_51192,resnet_51193,resnet_51194,resnet_51195,resnet_51196,resnet_51197,resnet_51198,resnet_51199,resnet_51200
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.028299,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.585785,0.0,3.971417,1.891622,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.213993,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Feature Selection

Feature selection is done using `sklearn.feature_selection.SelectFromModel` with `sklearn.svm.SVC` as estimator. The result is saved in `data/selected_feature.csv` file

### Prefit Model

Fit the model first to get the feature coefficient

In [4]:
svm = SVC(kernel="linear", random_state=42)
svm.fit(df, label)

In [8]:
joblib.dump(svm, r'dumps\fitted_svm_full_features.joblib')

['dumps\\fitted_svm_full_features.joblib']

In [None]:
# svm = joblib.load(r'dumps\fitted_svm_full_features.joblib')

In [12]:
# save and inspect the coefficient
np.savetxt(r'data\svm_coef.txt', svm.coef_, delimiter='\n')
svm.coef_[0,:5]

array([-3.89170279e-04, -5.02684736e-04, -5.11681476e-04,  7.29777117e-05,
       -9.71169055e-05])

In [32]:
# manually calculate the mean
print(svm.coef_.mean())
print(abs(svm.coef_).mean())

6.137016942547516e-07
0.0004996106915457431


In [22]:
# manually get feature that are more than or equal to the mean
np.where(np.any(svm.coef_ >= svm.coef_.mean(), axis=0))[0].shape

(25762,)

### Run the selection

In [28]:
selector = SelectFromModel(svm, prefit=True)
selected_feature = selector.fit_transform(df)
selected_feature.shape

(2000, 17675)

In [30]:
# threshold
selector.threshold_

0.0004996106915457431

In [33]:
selector.get_feature_names_out()

array(['resnet_2', 'resnet_3', 'resnet_14', ..., 'resnet_51198',
       'resnet_51199', 'resnet_51200'], dtype=object)

In [41]:
selected_feature_df = pd.DataFrame(selected_feature, columns=selector.get_feature_names_out())
selected_feature_df.to_parquet('data/selected_feature.parquet', index=False)
selected_feature_df.head()

Unnamed: 0,resnet_2,resnet_3,resnet_14,resnet_17,resnet_21,resnet_27,resnet_30,resnet_33,resnet_39,resnet_40,...,resnet_51173,resnet_51177,resnet_51183,resnet_51185,resnet_51188,resnet_51190,resnet_51196,resnet_51198,resnet_51199,resnet_51200
0,0.0,0.0,0.0,0.355274,0.0,0.0,0.0,0.0,0.0,1.514804,...,0.0,0.0,0.0,0.0,0.0,0.0,2.028299,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.016129,...,0.0,0.0,0.0,0.0,0.0,2.296938,0.0,0.0,0.0,0.0
2,0.0,1.585785,0.0,5.645706,0.0,2.928657,0.0,3.938727,0.0,2.319195,...,0.0,0.0,0.0,0.0,0.0,0.349672,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.634,0.0,0.0,0.0,0.0,0.0,3.425371,...,0.0,6.813828,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,10.007561,0.0,0.0,0.0,5.586689,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Classification

Classification is done using SVM with linear kernel.

In [None]:
# selected_feature_df = pd.read_parquet('data/selected_feature.parquet')

In [42]:
selected_feature_svm = SVC(kernel="linear", random_state=42)
selected_feature_svm.fit(selected_feature_df, label)

In [45]:
joblib.dump(selected_feature_svm, 'dumps/fitted_svm_selected_feature.joblib')

['dumps/fitted_svm_selected_feature.joblib']

### Cross Validation

Cross validation will be using k=10. Metrics being used is F1-Score, Recall, Precision, and Accuracy.

In [None]:
METRICS = ['f1', 'recall', 'precision', 'accuracy']

svm = SVC(kernel="linear", random_state=42)
result = cross_validate(svm, X=selected_feature_df,
                        y=label, cv=10, scoring=METRICS)