In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.svm import SVC
from sklearn.metrics import f1_score,precision_score,recall_score,roc_auc_score
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler


In [6]:
# function to load and label datasets
def load_data(file_num):
    # File paths
    train_neg_path = f'./data/Hemo40D/data{file_num}/train/neg.fa_encod.csv'
    train_pos_path = f'./data/Hemo40D/data{file_num}/train/pos.fa_encod.csv'
    test_neg_path = f'./data/Hemo40D/data{file_num}/test/neg.fa_encod.csv'
    test_pos_path = f'./data/Hemo40D/data{file_num}/test/pos.fa_encod.csv'
    
    # Load the CSV files
    train_neg = pd.read_csv(train_neg_path)
    train_pos = pd.read_csv(train_pos_path)
    test_neg = pd.read_csv(test_neg_path)
    test_pos = pd.read_csv(test_pos_path)
    
    # Insert labels (0 for negative, 1 for positive)
    train_neg.insert(0, 'Label', 0)
    train_pos.insert(0, 'Label', 1)
    test_neg.insert(0, 'Label', 0)
    test_pos.insert(0, 'Label', 1)

    frames_trian = [train_neg,train_pos]
    frames_test  = [test_neg,test_pos]
    train_df     = pd.concat(frames_trian)
    test_df      = pd.concat(frames_test)

    return train_df, test_df

def pre_process(file_num):
    train_df, test_df = load_data(file_num)
    column_names = train_df.columns[3:].tolist()

    scaler        = StandardScaler()  # mean 0 and std 1
    undersampleer = RandomUnderSampler(random_state=42)

    train_df[column_names] = scaler.fit_transform(train_df[column_names])
    test_df[column_names]  = scaler.transform(test_df[column_names])
    
    x      = train_df[column_names].to_numpy()
    y      = train_df['Label'].to_numpy()
    x_test = test_df[column_names].to_numpy()
    y_test = test_df['Label'].to_numpy()

    X,y = undersampleer.fit_resample(x, y)
    return X,y,x_test,y_test


In [14]:
# Grid search function, note it output the prarmeters that give the
# highest validation accuracy and tend to overfit.
def grid_serach_cv(param_grid,kernel,file_num):
    X,y,_,_ = pre_process(file_num)
    skfold = StratifiedKFold(n_splits=5, random_state=42, shuffle= True)
    model = SVC(kernel=kernel)
    grid_search = GridSearchCV(model, param_grid, cv=skfold, n_jobs=-1, return_train_score=True,error_score='raise')
    grid_search.fit(X, y)

    best_index     = grid_search.best_index_
    best_train_acc = grid_search.cv_results_['mean_train_score'][best_index]
    best_val_acc   = grid_search.cv_results_['mean_test_score'][best_index]

    print(grid_search.best_params_)
    print(f"Training accuracy: {best_train_acc:.5f}")
    print(f"Validation accuracy: {best_val_acc:.5f}")

    return grid_search.cv_results_

# Function to manually search for parameters that give the best validation accuracy and do not overfit. 
# The function takes the grid search results and the maximum and minimum range of accuracy
def manual_search(grid_result,max=0.9,min=0.7):

    val_array = grid_result['mean_test_score'] > min  
    train_array = grid_result['mean_train_score'] < max
    true_list = train_array & val_array
    true_indices = np.where(true_list)[0]

    for i in true_indices:
        train_acc = grid_result['mean_train_score'][i]
        val_acc   = grid_result['mean_test_score'][i]
        params    = grid_result['params'][i] 
        print('Index: {} ,Train Acc: {:3f} , Val Acc: {:3f}, diff: {:3f},params {}'.format(i,train_acc,val_acc, (train_acc-val_acc)*100,params))


# Function to test the model on the test set
def test_metric(param,file_num):
    X,y,x_test,y_test = pre_process(file_num)
    model = SVC(**param)
    model.fit(X,y)
    y_pred         = model.predict(x_test)
    y_pred_prob    = model.decision_function(x_test)
    test_score     = model.score(x_test,y_test)

    print(f"Testing accuracy: {test_score:.5f}")
    print(f"Precision: {precision_score(y_test, y_pred, pos_label=1):.5f}")
    print(f"Recall: {recall_score(y_test, y_pred, pos_label=1):.5f}")
    print(f"F1: {f1_score(y_test, y_pred, pos_label=1):.5f}")
    print(f"ROC AUC: {roc_auc_score(y_test, y_pred_prob):.5f}")

# Grid search parameters
param_grid_linear = {'C': [0.0001,0.001,0.1, 1, 10, 100,1000]}
param_grid_rbf    = {'C': [0.0001,0.001,0.1, 1, 10, 100,1000],  
                    'gamma': [1, 0.1, 0.01, 0.001, 0.0001]}
param_grid_poly   = {'C': [0.0001,0.001,0.1, 1, 10, 100,1000],
                    'degree': [2,3,4]}

## Hemo1

In [18]:
hemo1_linear = grid_serach_cv(param_grid_linear,'linear',1)

{'C': 10}
Training accuracy: 0.96691
Validation accuracy: 0.94794


In [21]:
test_metric({'C': 10,'kernel':'linear'},1)

Testing accuracy: 0.94545
Precision: 0.96226
Recall: 0.92727
F1: 0.94444
ROC AUC: 0.98603


In [25]:
hemo1_rbf = grid_serach_cv(param_grid_rbf,'rbf',1)

{'C': 1000, 'gamma': 0.001}
Training accuracy: 0.98982
Validation accuracy: 0.96152


In [27]:
test_metric({'C': 1000, 'gamma': 0.001, 'kernel': 'rbf'},1)

Testing accuracy: 0.96364
Precision: 0.98113
Recall: 0.94545
F1: 0.96296
ROC AUC: 0.99364


In [28]:
hemo1_poly = grid_serach_cv(param_grid_poly,'poly',1)

{'C': 10, 'degree': 3}
Training accuracy: 0.99038
Validation accuracy: 0.94681


In [29]:
test_metric({'C': 10,'degree':3,'kernel':'poly'},1)

Testing accuracy: 0.95000
Precision: 0.96262
Recall: 0.93636
F1: 0.94931
ROC AUC: 0.97314


## Hemo2
 To reduce overfitting in grid search, we set a constraint where the difference 
 between training and validation accuracy must be less than 5%. As shown by calling 'manual_search' function.


In [30]:
hemo2_linear = grid_serach_cv(param_grid_linear,'linear',2)

{'C': 1000}
Training accuracy: 0.73204
Validation accuracy: 0.71410


In [32]:
test_metric({'C': 1000, 'kernel': 'linear'},2)

Testing accuracy: 0.72277
Precision: 0.73684
Recall: 0.76364
F1: 0.75000
ROC AUC: 0.78063


In [33]:
hemo2_rbf = grid_serach_cv(param_grid_rbf,'rbf',2)

{'C': 100, 'gamma': 0.01}
Training accuracy: 0.93835
Validation accuracy: 0.76017


In [34]:
manual_search(hemo2_rbf,0.8,0.6)

Index: 1 ,Train Acc: 0.672881 , Val Acc: 0.612153, diff: 6.072781,params {'C': 0.0001, 'gamma': 0.1}
Index: 6 ,Train Acc: 0.672881 , Val Acc: 0.612153, diff: 6.072781,params {'C': 0.001, 'gamma': 0.1}
Index: 11 ,Train Acc: 0.742932 , Val Acc: 0.654330, diff: 8.860170,params {'C': 0.1, 'gamma': 0.1}
Index: 12 ,Train Acc: 0.669708 , Val Acc: 0.636891, diff: 3.281736,params {'C': 0.1, 'gamma': 0.01}
Index: 13 ,Train Acc: 0.611452 , Val Acc: 0.615177, diff: -0.372542,params {'C': 0.1, 'gamma': 0.001}
Index: 17 ,Train Acc: 0.749653 , Val Acc: 0.709983, diff: 3.966982,params {'C': 1, 'gamma': 0.01}
Index: 18 ,Train Acc: 0.668020 , Val Acc: 0.638233, diff: 2.978660,params {'C': 1, 'gamma': 0.001}
Index: 19 ,Train Acc: 0.612807 , Val Acc: 0.611114, diff: 0.169319,params {'C': 1, 'gamma': 0.0001}
Index: 23 ,Train Acc: 0.731712 , Val Acc: 0.708669, diff: 2.304354,params {'C': 10, 'gamma': 0.001}
Index: 24 ,Train Acc: 0.669036 , Val Acc: 0.640954, diff: 2.808131,params {'C': 10, 'gamma': 0.0001}


In [37]:
test_metric({'C': 1000, 'gamma': 0.0001,'kernel': 'rbf'},2) # index 34

Testing accuracy: 0.72277
Precision: 0.74107
Recall: 0.75455
F1: 0.74775
ROC AUC: 0.78745


In [38]:
hemo2_poly = grid_serach_cv(param_grid_poly,'poly',2)

{'C': 10, 'degree': 3}
Training accuracy: 0.91056
Validation accuracy: 0.75060


In [50]:
manual_search(hemo2_poly,0.8,0.6)

Index: 7 ,Train Acc: 0.729352 , Val Acc: 0.666538, diff: 6.281418,params {'C': 0.1, 'degree': 3}
Index: 8 ,Train Acc: 0.682907 , Val Acc: 0.611160, diff: 7.174731,params {'C': 0.1, 'degree': 4}
Index: 9 ,Train Acc: 0.749321 , Val Acc: 0.662594, diff: 8.672695,params {'C': 1, 'degree': 2}
Index: 11 ,Train Acc: 0.788273 , Val Acc: 0.685678, diff: 10.259516,params {'C': 1, 'degree': 4}


In [40]:
test_metric({'C': 0.1, 'degree':3,'kernel': 'poly'},2) # index 7

Testing accuracy: 0.71782
Precision: 0.74766
Recall: 0.72727
F1: 0.73733
ROC AUC: 0.78498


## Hemo3

In [41]:
hemo3_linear = grid_serach_cv(param_grid_linear,'linear',3)

{'C': 10}
Training accuracy: 0.73578
Validation accuracy: 0.71307


In [43]:
test_metric({'C': 1000, 'kernel': 'linear'},3)

Testing accuracy: 0.75385
Precision: 0.72986
Recall: 0.87006
F1: 0.79381
ROC AUC: 0.78390


In [44]:
hemo3_rbf = grid_serach_cv(param_grid_rbf,'rbf',3)

{'C': 1, 'gamma': 0.1}
Training accuracy: 0.92509
Validation accuracy: 0.77590


In [46]:
manual_search(hemo3_rbf,0.9,0.7)

Index: 17 ,Train Acc: 0.758278 , Val Acc: 0.722441, diff: 3.583627,params {'C': 1, 'gamma': 0.01}
Index: 22 ,Train Acc: 0.837862 , Val Acc: 0.754659, diff: 8.320296,params {'C': 10, 'gamma': 0.01}
Index: 23 ,Train Acc: 0.732174 , Val Acc: 0.713924, diff: 1.825072,params {'C': 10, 'gamma': 0.001}
Index: 28 ,Train Acc: 0.773134 , Val Acc: 0.730047, diff: 4.308693,params {'C': 100, 'gamma': 0.001}
Index: 29 ,Train Acc: 0.723472 , Val Acc: 0.710519, diff: 1.295280,params {'C': 100, 'gamma': 0.0001}
Index: 33 ,Train Acc: 0.821944 , Val Acc: 0.740220, diff: 8.172426,params {'C': 1000, 'gamma': 0.001}
Index: 34 ,Train Acc: 0.736844 , Val Acc: 0.718168, diff: 1.867624,params {'C': 1000, 'gamma': 0.0001}


In [48]:
test_metric({'C': 10, 'gamma': 0.001, 'kernel': 'rbf'},3)

Testing accuracy: 0.75077
Precision: 0.72430
Recall: 0.87571
F1: 0.79284
ROC AUC: 0.79485


In [45]:
hemo3_poly = grid_serach_cv(param_grid_poly,'poly',3)

{'C': 100, 'degree': 3}
Training accuracy: 0.97008
Validation accuracy: 0.74451


In [51]:
manual_search(hemo3_poly,0.85,0.6)  

Index: 7 ,Train Acc: 0.646009 , Val Acc: 0.616336, diff: 2.967305,params {'C': 0.1, 'degree': 3}
Index: 9 ,Train Acc: 0.716468 , Val Acc: 0.644345, diff: 7.212272,params {'C': 1, 'degree': 2}
Index: 10 ,Train Acc: 0.803690 , Val Acc: 0.717342, diff: 8.634817,params {'C': 1, 'degree': 3}
Index: 11 ,Train Acc: 0.764853 , Val Acc: 0.667241, diff: 9.761185,params {'C': 1, 'degree': 4}
Index: 12 ,Train Acc: 0.805391 , Val Acc: 0.703758, diff: 10.163372,params {'C': 10, 'degree': 2}


In [52]:
test_metric({'C': 1, 'degree': 3,  'kernel': 'poly'},3)

Testing accuracy: 0.75692
Precision: 0.72897
Recall: 0.88136
F1: 0.79795
ROC AUC: 0.80115
