This notebook follows DS_1_Generate_Dataset_for_Analysis.py. It runs a cross-validated grid search (GridSearchCV) over the AdaBoost hyperparameters to determine the optimal AdaBoost parameters for each dataset, using accuracy as the metric.

In [1]:
import os
import pandas as pd

# preprocessing
from sklearn.model_selection import train_test_split

# Modelling algorithm
from sklearn.ensemble import AdaBoostClassifier

# Model selection
from sklearn.model_selection import RepeatedStratifiedKFold, GridSearchCV, cross_val_score

#metrics
from sklearn.metrics import roc_curve,auc,accuracy_score, roc_auc_score, f1_score
from sklearn.metrics import classification_report


In [2]:
def FindOptimalParm(X, y, random_state=1, n_jobs=1):
    """Grid-search AdaBoost hyperparameters with stratified 5-fold cross validation.

    The search is fitted on all of ``X``/``y`` (the caller passes unscaled raw
    data); no separate hold-out split is made, because the cross-validation
    folds already provide the out-of-sample estimate that is reported.
    Candidates are ranked by accuracy.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.
    y : array-like of shape (n_samples,)
        Class labels.
    random_state : int, default=1
        Seed used for both the CV splitter and the AdaBoost estimator so that
        repeated runs on the same data select the same parameters.
    n_jobs : int, default=1
        Number of parallel jobs handed to ``GridSearchCV``.

    Returns
    -------
    best_score : float
        Mean cross-validated accuracy of the best parameter combination.
    best_params : dict
        The ``n_estimators`` / ``learning_rate`` combination with that score.
    """
    # 6 x 5 = 30 candidate combinations.
    gs_parm_grid = {'n_estimators': [100, 200, 500, 800, 1000, 1500],
                    'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3]}

    # Seed the estimator as well as the splitter; otherwise the reported
    # best score can differ from run to run.
    ab_clf = AdaBoostClassifier(random_state=random_state)
    cv_model = RepeatedStratifiedKFold(n_splits=5, n_repeats=1,
                                       random_state=random_state)
    grid_search_model_raw = GridSearchCV(estimator=ab_clf,
                                         param_grid=gs_parm_grid,
                                         n_jobs=n_jobs,
                                         cv=cv_model,
                                         scoring='accuracy',
                                         verbose=2)

    # fit() returns the fitted GridSearchCV instance itself.
    grid_search_score_raw = grid_search_model_raw.fit(X, y)

    print(grid_search_score_raw)
    print("Best scores:")
    print("Best: %f using %s" % (grid_search_score_raw.best_score_, grid_search_score_raw.best_params_))
    return grid_search_score_raw.best_score_, grid_search_score_raw.best_params_

In [None]:

# Folder holding the model datasets (CSV files). os.path.join keeps the path
# portable and avoids the invalid "\M" escape of a hand-written Windows path.
root_path = os.path.join("Data", "Model Datasets")

# A single grid search is long-running (150 fits), so only the first
# MAX_DATASETS files are processed. Set to None to process every dataset.
MAX_DATASETS = 1

# Ordinal encoding of the class label; every other label is encoded as 4.
CLASS_CODES = {'Q1': 1, 'Q2': 2, 'Q3': 3}
OTHER_CLASS_CODE = 4

# Columns that are identifiers / labels rather than model features.
NON_FEATURE_COLUMNS = ['sound_file_name', 'sound_file_class', 'sound_file_class_num']

# Collect the CSV files in a deterministic order and close the directory
# iterator as soon as the listing is done.
with os.scandir(root_path) as dir_entries:
    dataset_paths = sorted(entry.path for entry in dir_entries
                           if entry.is_file() and entry.name.lower().endswith(".csv"))

summary_rows = []
for file_path in dataset_paths[:MAX_DATASETS]:
    print(file_path)
    dataset_df = pd.read_csv(file_path)

    # The dataset options are encoded as markers in the file name; a marker
    # starting with "_n" means the corresponding step was NOT applied.
    file_name = os.path.basename(file_path)

    if "_5_sec" in file_name:
        sample_dur = "5 seconds"
    elif "_1_sec" in file_name:
        sample_dur = "1 second"
    else:
        sample_dur = "30 seconds"

    # 1. Convert the sound_file_class label into an ordinal variable
    #    (vectorised; the input frame itself is left unmodified).
    class_num = (dataset_df['sound_file_class']
                 .map(CLASS_CODES)
                 .fillna(OTHER_CLASS_CODE)
                 .astype(int))

    # 2. Features are every column except the identifier / label columns.
    X = dataset_df.drop(columns=NON_FEATURE_COLUMNS, errors='ignore')
    y, unique_vals = pd.factorize(class_num, sort=True)

    # 3. Determine the optimal parameters
    best_score, best_parms = FindOptimalParm(X, y)

    summary_rows.append({
        'File Name': file_path,
        '# classes': dataset_df['sound_file_class'].nunique(dropna=False),
        'N': int(dataset_df.shape[0]),
        # Count only the columns actually fed to the model.
        '# features': int(X.shape[1]),
        'Hilbert Transformed?': "No" if "_nht" in file_name else "Yes",
        'MFCC included?': "No" if "_nMFCC" in file_name else "Yes",
        'Sample Duration': sample_dur,
        'Data Scaled?': "No" if "_ns" in file_name else "Yes",
        'Anomaly Treated?': "No" if "_nat" in file_name else "Yes",
        # Keep the grid-search outcome next to the dataset description.
        'Best CV Accuracy': best_score,
        **{'Best ' + parm_name: parm_value
           for parm_name, parm_value in best_parms.items()},
    })

# Build the summary table once instead of concatenating inside the loop.
final_df = pd.DataFrame(summary_rows)
final_df

Data\Model Datasets\DS_1_All_Feature_1_sec_nMFCC_nht_ns_nat.csv
Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV] END ...............learning_rate=0.01, n_estimators=100; total time= 1.8min
[CV] END ...............learning_rate=0.01, n_estimators=100; total time= 1.8min
[CV] END ...............learning_rate=0.01, n_estimators=100; total time= 1.8min
[CV] END ...............learning_rate=0.01, n_estimators=100; total time= 2.0min
[CV] END ...............learning_rate=0.01, n_estimators=100; total time= 1.8min
[CV] END ...............learning_rate=0.01, n_estimators=200; total time= 3.6min
[CV] END ...............learning_rate=0.01, n_estimators=200; total time= 3.7min
[CV] END ...............learning_rate=0.01, n_estimators=200; total time= 3.6min
[CV] END ...............learning_rate=0.01, n_estimators=200; total time= 3.5min
[CV] END ...............learning_rate=0.01, n_estimators=200; total time= 3.6min
[CV] END ...............learning_rate=0.01, n_estimators=500; to