In [65]:
import pandas as pd
import numpy as np
import sklearn.datasets as datasets

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor

from sklearn.svm import SVC, SVR
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, mean_squared_error

# Data loading

In [51]:
# --- Global configuration ---------------------------------------------------
SEED = 42  # seed reused as random_state throughout the notebook

# Training hyper-parameters. They are not referenced by any cell visible in
# this section -- presumably intended for a gradient-based model (TODO confirm).
EPOCH = 50
LR = 1e-3
BS = 16

# --- scikit-learn bundled datasets ------------------------------------------
# Used with classifiers in the cells below.
iris = datasets.load_iris()
digits = datasets.load_digits()
wine = datasets.load_wine()
cancer = datasets.load_breast_cancer()
# Used with regressors in the cells below.
diabetes = datasets.load_diabetes()

In [52]:
def seperate_x_y(df):
    """Split a DataFrame into its feature columns and its 'target' column.

    Returns a ``(features, labels)`` pair: ``features`` is ``df`` without the
    'target' column, ``labels`` is the 'target' column itself.
    """
    labels = df['target']
    features = df.drop(columns='target')
    return features, labels
def make_df(data):
    """Build a DataFrame from a dataset object.

    ``data.data`` supplies the values, ``data.feature_names`` the column
    labels, and ``data.target`` is added as a 'target' column.
    """
    features = pd.DataFrame(data.data, columns=data.feature_names)
    return features.assign(target=data.target)
def return_shape(df):
    """Return the first two entries of ``df.shape`` as a ``(rows, columns)`` pair."""
    dims = df.shape
    return dims[0], dims[1]

def check_nan(df):
    """Report whether any column of ``df`` contains at least one null value.

    Returns a plain ``bool``: True when some per-column null count is non-zero.
    """
    null_counts = df.isnull().sum().tolist()
    return any(count != 0 for count in null_counts)
#https://gibles-deepmind.tistory.com/m/138 -> whether variable is numeric or categorical
def column_type_check(df):
    """Partition the column names of ``df`` into numeric and non-numeric.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are classified.

    Returns
    -------
    tuple of (list, list)
        ``(numeric_col, categorical_col)``. Both lists follow the column
        order of ``df``.
    """
    # NOTE(review): _get_numeric_data is a private pandas accessor; it is kept
    # so the set of dtypes treated as numeric stays exactly as before.
    numeric_col = df._get_numeric_data().columns.tolist()
    numeric_names = set(numeric_col)
    # Filter df.columns instead of taking a set difference: a set difference
    # yields the names in hash-dependent order, which can differ between runs.
    categorical_col = [col for col in df.columns if col not in numeric_names]

    return numeric_col, categorical_col


# Iris data

In [53]:
# Build the iris DataFrame (feature columns plus a 'target' column) and show
# it as the cell's output.
iris_df = make_df(iris)
iris_df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


## EDA

In [54]:
# Gather basic facts about the iris frame: shape, null presence, column kinds.
row_cnt, col_cnt = return_shape(iris_df)
nan_flag = check_nan(iris_df)
numeric_col, categorical_col = column_type_check(iris_df)

# Assemble the report line by line, then emit it in a single print call.
summary_lines = [
    "DataFrame shape : (" + str(row_cnt) + ", " + str(col_cnt) + ")",
    "DataFrame has nan ? : " + str(nan_flag),
    "Numeric_col",
    str(numeric_col),
    "Categorical_col",
    str(categorical_col),
]
print("\n".join(summary_lines))

DataFrame shape : (150, 5)
DataFrame has nan ? : False
Numeric_col
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)', 'target']
Categorical_col
[]


## RandomForestClassifier

In [55]:
# Hold out a stratified 10% validation split of the iris data.
features, labels = seperate_x_y(iris_df)
train_x, val_x, train_y, val_y = train_test_split(
    features,
    labels,
    test_size=0.1,
    random_state=SEED,
    shuffle=True,
    stratify=labels,
)
print("Train size : " + str(train_x.shape))
print("Validation size: " + str(val_x.shape))

# Try forest sizes 10, 15, ..., 100 and keep the first size that reaches the
# highest macro-F1 on the validation split.
n_list = list(range(10, 105, 5))
best_n, best_f1 = 0, 0
for n in n_list:
    rf = RandomForestClassifier(n_estimators=n, criterion='entropy', random_state=SEED)
    pred = rf.fit(train_x, train_y).predict(val_x)
    f1 = f1_score(val_y, pred, average='macro')
    if f1 > best_f1:
        best_n, best_f1 = n, f1

# Refit with the selected size and report metrics on the same validation split.
rf_best = RandomForestClassifier(n_estimators=best_n, criterion='entropy', random_state=SEED)
pred = rf_best.fit(train_x, train_y).predict(val_x)

report_lines = [
    "best_n : " + str(best_n),
    "F1 score : " + str(f1_score(val_y, pred, average='macro')),
    "Accuracy score : " + str(accuracy_score(val_y, pred)),
    "confustion matrix",
    str(confusion_matrix(val_y, pred)),
]
print("\n".join(report_lines))



Train size : (135, 4)
Validation size: (15, 4)
best_n : 10
F1 score : 0.9326599326599326
Accuracy score : 0.9333333333333333
confustion matrix
[[5 0 0]
 [0 4 1]
 [0 0 5]]


## Support Vector Machine

In [86]:
# Hold out a stratified 10% validation split of the iris data.
train_x, train_y = seperate_x_y(iris_df)

train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size = 0.1, random_state=SEED, shuffle=True, stratify=train_y)
print("Train size : " +str(train_x.shape))
print("Validation size: " +str(val_x.shape))

# Fit the scaler on the training split only, then apply it to both splits.
scaler = MinMaxScaler()
train_x = scaler.fit_transform(train_x)
val_x = scaler.transform(val_x)


# Grid-search C for a linear SVC, scored by macro-F1 on the validation split.
# (gamma is not used by the linear kernel; it is kept as originally written.)
c_list = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
best_c = 0
best_f1 = float("-inf")  # the first candidate is always accepted, even at F1 == 0
for c in c_list:
    svm = SVC(C=c, gamma='auto', kernel='linear', random_state=SEED)
    svm.fit(train_x, train_y)
    pred = svm.predict(val_x)
    f1 = f1_score(val_y, pred, average='macro')
    if f1 > best_f1:
        best_c = c
        best_f1 = f1

# Refit with the selected C. The loop variable `c` still holds the last grid
# value here, so it must not be used for the final model.
svm_best =  SVC(C=best_c, gamma='auto', kernel='linear', random_state=SEED)
svm_best.fit(train_x, train_y)
pred = svm_best.predict(val_x)

print("best C : "+ str(best_c))
print("F1 score : " + str(f1_score(val_y, pred, average='macro')))
print("Accuracy score : " + str(accuracy_score(val_y, pred)))
print("confustion matrix")
print(confusion_matrix(val_y, pred))



Train size : (135, 4)
Validation size: (15, 4)
best C : 1000
F1 score : 1.0
Accuracy score : 1.0
confustion matrix
[[5 0 0]
 [0 5 0]
 [0 0 5]]


# Diabetes data

In [38]:
# Build the diabetes DataFrame (feature columns plus a 'target' column) and
# show it as the cell's output.
diabetes_df = make_df(diabetes)
diabetes_df

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068330,-0.092204,75.0
2,0.085299,0.050680,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.025930,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641,135.0
...,...,...,...,...,...,...,...,...,...,...,...
437,0.041708,0.050680,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207,178.0
438,-0.005515,0.050680,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018118,0.044485,104.0
439,0.041708,0.050680,-0.015906,0.017282,-0.037344,-0.013840,-0.024993,-0.011080,-0.046879,0.015491,132.0
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.026560,0.044528,-0.025930,220.0


## RandomForestRegressor

In [80]:
# Hold out a shuffled 10% validation split of the diabetes data.
features, labels = seperate_x_y(diabetes_df)
train_x, val_x, train_y, val_y = train_test_split(
    features,
    labels,
    test_size=0.1,
    random_state=SEED,
    shuffle=True,
)
print("Train size : " + str(train_x.shape))
print("Validation size: " + str(val_x.shape))

# Try forest sizes 10, 15, ..., 100 and keep the first size that reaches the
# lowest validation MSE.
n_list = list(range(10, 105, 5))
best_n, best_loss = 0, 1_000_000_000
for n in n_list:
    rf = RandomForestRegressor(n_estimators=n, random_state=SEED)
    pred = rf.fit(train_x, train_y).predict(val_x)
    loss = mean_squared_error(val_y, pred)
    if loss < best_loss:
        best_n, best_loss = n, loss

# Refit with the selected size and report the error on the same split.
rf_best = RandomForestRegressor(n_estimators=best_n, random_state=SEED)
pred = rf_best.fit(train_x, train_y).predict(val_x)

report_lines = [
    "best_n : " + str(best_n),
    "MSE : " + str(mean_squared_error(val_y, pred)),
]
print("\n".join(report_lines))



Train size : (397, 10)
Validation size: (45, 10)
best_n : 85
MSE : 2855.5568719723183


## Support Vector Regressor

In [79]:
# Hold out a shuffled 10% validation split of the diabetes data.
train_x, train_y = seperate_x_y(diabetes_df)

train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size = 0.1, random_state=SEED, shuffle=True)
print("Train size : " +str(train_x.shape))
print("Validation size: " +str(val_x.shape))

# Fit the scaler on the training split only, then apply it to both splits.
scaler = MinMaxScaler()
train_x = scaler.fit_transform(train_x)
val_x = scaler.transform(val_x)

# Grid-search C for an SVR, scored by MSE on the validation split.
c_list = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
best_c = 0
best_loss = float("inf")  # the first candidate is always accepted, whatever the target scale
for c in c_list:
    svm = SVR(C=c, gamma='auto')
    svm.fit(train_x, train_y)
    pred = svm.predict(val_x)
    loss = mean_squared_error(val_y, pred)
    if loss < best_loss:
        best_c = c
        best_loss = loss

# Refit with the selected C. The loop variable `c` still holds the last grid
# value here, so it must not be used for the final model.
svm_best = SVR(C=best_c, gamma='auto')
svm_best.fit(train_x, train_y)
pred = svm_best.predict(val_x)

print("best_c : "+ str(best_c))
print("MSE : " + str(mean_squared_error(val_y, pred)))


Train size : (397, 10)
Validation size: (45, 10)
best_c : 1000
MSE : 2440.116691339051


# Digits data

In [41]:
# Build the digits DataFrame (feature columns plus a 'target' column) and
# show it as the cell's output.
digits_df = make_df(digits)
digits_df

Unnamed: 0,pixel_0_0,pixel_0_1,pixel_0_2,pixel_0_3,pixel_0_4,pixel_0_5,pixel_0_6,pixel_0_7,pixel_1_0,pixel_1_1,...,pixel_6_7,pixel_7_0,pixel_7_1,pixel_7_2,pixel_7_3,pixel_7_4,pixel_7_5,pixel_7_6,pixel_7_7,target
0,0.0,0.0,5.0,13.0,9.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,6.0,13.0,10.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,12.0,13.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,11.0,16.0,10.0,0.0,0.0,1
2,0.0,0.0,0.0,4.0,15.0,12.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,11.0,16.0,9.0,0.0,2
3,0.0,0.0,7.0,15.0,13.0,1.0,0.0,0.0,0.0,8.0,...,0.0,0.0,0.0,7.0,13.0,13.0,9.0,0.0,0.0,3
4,0.0,0.0,0.0,1.0,11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,16.0,4.0,0.0,0.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1792,0.0,0.0,4.0,10.0,13.0,6.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,2.0,14.0,15.0,9.0,0.0,0.0,9
1793,0.0,0.0,6.0,16.0,13.0,11.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,6.0,16.0,14.0,6.0,0.0,0.0,0
1794,0.0,0.0,1.0,11.0,15.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2.0,9.0,13.0,6.0,0.0,0.0,8
1795,0.0,0.0,2.0,10.0,7.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,5.0,12.0,16.0,12.0,0.0,0.0,9


## RandomForestClassifier

In [42]:
# Hold out a stratified 10% validation split of the digits data.
features, labels = seperate_x_y(digits_df)
train_x, val_x, train_y, val_y = train_test_split(
    features,
    labels,
    test_size=0.1,
    random_state=SEED,
    shuffle=True,
    stratify=labels,
)
print("Train size : " + str(train_x.shape))
print("Validation size: " + str(val_x.shape))

# Try forest sizes 10, 15, ..., 100 and keep the first size that reaches the
# highest macro-F1 on the validation split.
n_list = list(range(10, 105, 5))
best_n, best_f1 = 0, 0
for n in n_list:
    rf = RandomForestClassifier(n_estimators=n, criterion='entropy', random_state=SEED)
    pred = rf.fit(train_x, train_y).predict(val_x)
    f1 = f1_score(val_y, pred, average='macro')
    if f1 > best_f1:
        best_n, best_f1 = n, f1

# Refit with the selected size and report metrics on the same validation split.
rf_best = RandomForestClassifier(n_estimators=best_n, criterion='entropy', random_state=SEED)
pred = rf_best.fit(train_x, train_y).predict(val_x)

report_lines = [
    "best_n : " + str(best_n),
    "F1 score : " + str(f1_score(val_y, pred, average='macro')),
    "Accuracy score : " + str(accuracy_score(val_y, pred)),
    "confustion matrix",
    str(confusion_matrix(val_y, pred)),
]
print("\n".join(report_lines))



Train size : (1617, 64)
Validation size: (180, 64)
best_n : 20
F1 score : 0.9780083509495274
Accuracy score : 0.9777777777777777
confustion matrix
[[18  0  0  0  0  0  0  0  0  0]
 [ 0 18  0  0  0  0  0  0  0  0]
 [ 0  0 18  0  0  0  0  0  0  0]
 [ 0  0  0 18  0  0  0  0  0  0]
 [ 0  1  0  0 17  0  0  0  0  0]
 [ 0  0  0  0  0 18  0  0  0  0]
 [ 0  0  0  0  0  0 18  0  0  0]
 [ 0  0  0  0  0  0  0 18  0  0]
 [ 0  2  0  0  0  0  0  0 16  0]
 [ 0  0  0  1  0  0  0  0  0 17]]


## Support Vector Machine

In [61]:
# Hold out a stratified 10% validation split of the digits data.
train_x, train_y = seperate_x_y(digits_df)

train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size = 0.1, random_state=SEED, shuffle=True, stratify=train_y)
print("Train size : " +str(train_x.shape))
print("Validation size: " +str(val_x.shape))

# Fit the scaler on the training split only, then apply it to both splits.
scaler = MinMaxScaler()
train_x = scaler.fit_transform(train_x)
val_x = scaler.transform(val_x)

# Grid-search C for a linear SVC, scored by macro-F1 on the validation split.
# (gamma is not used by the linear kernel; it is kept as originally written.)
c_list = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
best_c = 0
best_f1 = float("-inf")  # the first candidate is always accepted, even at F1 == 0
kernel = 'linear'
for c in c_list:
    svm = SVC(C=c, gamma='auto', kernel=kernel, random_state=SEED)
    svm.fit(train_x, train_y)
    pred = svm.predict(val_x)
    f1 = f1_score(val_y, pred, average='macro')
    if f1 > best_f1:
        best_c = c
        best_f1 = f1

# Refit with the selected C. The loop variable `c` still holds the last grid
# value here, so using it would report metrics for a model other than the
# one named by best_c.
svm_best =  SVC(C=best_c, gamma='auto',kernel=kernel, random_state=SEED)
svm_best.fit(train_x, train_y)
pred = svm_best.predict(val_x)

print("best C : "+ str(best_c))
print("F1 score : " + str(f1_score(val_y, pred, average='macro')))
print("Accuracy score : " + str(accuracy_score(val_y, pred)))
print("confustion matrix")
print(confusion_matrix(val_y, pred))



Train size : (1617, 64)
Validation size: (180, 64)
best C : 0.1
F1 score : 0.9832946436042412
Accuracy score : 0.9833333333333333
confustion matrix
[[18  0  0  0  0  0  0  0  0  0]
 [ 0 18  0  0  0  0  0  0  0  0]
 [ 0  0 18  0  0  0  0  0  0  0]
 [ 0  0  0 17  0  0  0  0  0  1]
 [ 0  0  0  0 18  0  0  0  0  0]
 [ 0  0  0  0  0 18  0  0  0  0]
 [ 0  0  0  0  0  0 18  0  0  0]
 [ 0  0  0  0  0  0  0 18  0  0]
 [ 0  2  0  0  0  0  0  0 16  0]
 [ 0  0  0  0  0  0  0  0  0 18]]


# Wine data

In [44]:
# Build the wine DataFrame (feature columns plus a 'target' column) and show
# it as the cell's output.
wine_df = make_df(wine)
wine_df

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.20,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.50,16.8,113.0,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740.0,2
174,13.40,3.91,2.48,23.0,102.0,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750.0,2
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835.0,2
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840.0,2


## RandomForestClassifier

In [45]:
# Hold out a stratified 10% validation split of the wine data.
features, labels = seperate_x_y(wine_df)
train_x, val_x, train_y, val_y = train_test_split(
    features,
    labels,
    test_size=0.1,
    random_state=SEED,
    shuffle=True,
    stratify=labels,
)
print("Train size : " + str(train_x.shape))
print("Validation size: " + str(val_x.shape))

# Try forest sizes 10, 15, ..., 100 and keep the first size that reaches the
# highest macro-F1 on the validation split.
n_list = list(range(10, 105, 5))
best_n, best_f1 = 0, 0
for n in n_list:
    rf = RandomForestClassifier(n_estimators=n, criterion='entropy', random_state=SEED)
    pred = rf.fit(train_x, train_y).predict(val_x)
    f1 = f1_score(val_y, pred, average='macro')
    if f1 > best_f1:
        best_n, best_f1 = n, f1

# Refit with the selected size and report metrics on the same validation split.
rf_best = RandomForestClassifier(n_estimators=best_n, criterion='entropy', random_state=SEED)
pred = rf_best.fit(train_x, train_y).predict(val_x)

report_lines = [
    "best_n : " + str(best_n),
    "F1 score : " + str(f1_score(val_y, pred, average='macro')),
    "Accuracy score : " + str(accuracy_score(val_y, pred)),
    "confustion matrix",
    str(confusion_matrix(val_y, pred)),
]
print("\n".join(report_lines))



Train size : (160, 13)
Validation size: (18, 13)
best_n : 10
F1 score : 1.0
Accuracy score : 1.0
confustion matrix
[[6 0 0]
 [0 7 0]
 [0 0 5]]


## Support Vector Machine

In [62]:
# Hold out a stratified 10% validation split of the wine data.
train_x, train_y = seperate_x_y(wine_df)

train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size = 0.1, random_state=SEED, shuffle=True, stratify=train_y)
print("Train size : " +str(train_x.shape))
print("Validation size: " +str(val_x.shape))

# Fit the scaler on the training split only, then apply it to both splits.
scaler = MinMaxScaler()
train_x = scaler.fit_transform(train_x)
val_x = scaler.transform(val_x)

# Grid-search C for a linear SVC, scored by macro-F1 on the validation split.
# (gamma is not used by the linear kernel; it is kept as originally written.)
c_list = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
kernel = 'linear'
best_c = 0
best_f1 = float("-inf")  # the first candidate is always accepted, even at F1 == 0
for c in c_list:
    svm = SVC(C=c, gamma='auto', kernel=kernel, random_state=SEED)
    svm.fit(train_x, train_y)
    pred = svm.predict(val_x)
    f1 = f1_score(val_y, pred, average='macro')
    if f1 > best_f1:
        best_c = c
        best_f1 = f1

# Refit with the selected C. The loop variable `c` still holds the last grid
# value here, so using it would report metrics for a model other than the
# one named by best_c.
svm_best = SVC(C=best_c, gamma='auto', kernel=kernel, random_state=SEED)
svm_best.fit(train_x, train_y)
pred = svm_best.predict(val_x)

print("best C : "+ str(best_c))
print("F1 score : " + str(f1_score(val_y, pred, average='macro')))
print("Accuracy score : " + str(accuracy_score(val_y, pred)))
print("confustion matrix")
print(confusion_matrix(val_y, pred))



Train size : (160, 13)
Validation size: (18, 13)
best C : 0.1
F1 score : 1.0
Accuracy score : 1.0
confustion matrix
[[6 0 0]
 [0 7 0]
 [0 0 5]]


# Breast cancer data

In [47]:
# Build the breast-cancer DataFrame (feature columns plus a 'target' column)
# and show it as the cell's output.
cancer_df = make_df(cancer)
cancer_df

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890,0
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902,0
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300,0
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115,0
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637,0
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820,0
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400,0


## RandomForestClassifier

In [48]:
# Hold out a stratified 10% validation split of the breast-cancer data.
train_x, train_y = seperate_x_y(cancer_df)

train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size = 0.1, random_state=SEED, shuffle=True, stratify=train_y)
print("Train size : " +str(train_x.shape))
print("Validation size: " +str(val_x.shape))


# Try forest sizes 10, 15, ..., 100 and keep the first size that reaches the
# highest macro-F1 on the validation split.
n_list = list(range(10, 105, 5))
best_n = 0
best_f1 = float("-inf")  # the first candidate is always accepted, even at F1 == 0
for n in n_list:
    rf = RandomForestClassifier(n_estimators = n, criterion = 'entropy', random_state=SEED)
    rf.fit(train_x, train_y)
    pred = rf.predict(val_x)
    f1 = f1_score(val_y, pred, average='macro')
    if f1 > best_f1:
        best_n = n
        best_f1 = f1

# Refit with the selected size and predict with that model. `rf` is the last
# forest left over from the loop (largest size in the grid), not the selected
# one, so it must not be used for the reported metrics.
rf_best = RandomForestClassifier(n_estimators = best_n, criterion = 'entropy', random_state=SEED)
rf_best.fit(train_x, train_y)
pred = rf_best.predict(val_x)

print("best_n : "+ str(best_n))
print("F1 score : " + str(f1_score(val_y, pred, average='macro')))
print("Accuracy score : " + str(accuracy_score(val_y, pred)))
print("confustion matrix")
print(confusion_matrix(val_y, pred))



Train size : (512, 30)
Validation size: (57, 30)
best_n : 15
F1 score : 0.9623015873015872
Accuracy score : 0.9649122807017544
confustion matrix
[[20  1]
 [ 1 35]]


## Support Vector Machine

In [76]:
# Hold out a stratified 10% validation split of the breast-cancer data.
train_x, train_y = seperate_x_y(cancer_df)

train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size = 0.1, random_state=SEED, shuffle=True, stratify=train_y)
print("Train size : " +str(train_x.shape))
print("Validation size: " +str(val_x.shape))

# Fit the scaler on the training split only, then apply it to both splits.
scaler = MinMaxScaler()
train_x = scaler.fit_transform(train_x)
val_x = scaler.transform(val_x)

# Grid-search C for an SVC (no kernel argument is passed, so the estimator's
# default kernel applies), scored by macro-F1 on the validation split.
c_list = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
best_c = 0
best_f1 = float("-inf")  # the first candidate is always accepted, even at F1 == 0
for c in c_list:
    svm = SVC(C=c, gamma='auto', random_state=SEED)
    svm.fit(train_x, train_y)
    pred = svm.predict(val_x)
    f1 = f1_score(val_y, pred, average='macro')
    if f1 > best_f1:
        best_c = c
        best_f1 = f1

# Refit with the selected C. The loop variable `c` still holds the last grid
# value here, so using it would report metrics for a model other than the
# one named by best_c.
svm_best = SVC(C=best_c, gamma='auto', random_state=SEED)
svm_best.fit(train_x, train_y)
pred = svm_best.predict(val_x)

print("best C : "+ str(best_c))
print("F1 score : " + str(f1_score(val_y, pred, average='macro')))
print("Accuracy score : " + str(accuracy_score(val_y, pred)))
print("confustion matrix")
print(confusion_matrix(val_y, pred))



Train size : (512, 30)
Validation size: (57, 30)
best C : 1
F1 score : 0.9439895185063871
Accuracy score : 0.9473684210526315
confustion matrix
[[20  1]
 [ 2 34]]
