In [None]:
from os import listdir
from os.path import isfile, join
# Names of every regular file (sub-directories are skipped) found directly
# inside the dataset folder, in sorted order.
all_files = sorted(
    entry
    for entry in listdir('classification_datasets')
    if isfile(join('classification_datasets', entry))
)

import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm
import copy
from sklearn.model_selection import train_test_split
import lightgbm as lgbm
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
import warnings
import csv
from datetime import date
from sklearn.utils.fixes import loguniform
import scipy.stats as stats

from sklearn.metrics import confusion_matrix
from sklearn.metrics import average_precision_score
from sklearn.ensemble import ExtraTreesClassifier

from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import precision_score
import time
from sklearn.preprocessing import label_binarize

from sklearn.metrics import accuracy_score
warnings.filterwarnings("ignore")

In [None]:
def load_data(path):
    """Load a classification dataset from CSV and make it fully numeric.

    The last column of the file is treated as the class label and is always
    the last column of the returned frame (callers read the target from
    ``df.columns[-1]``).

    * Text-valued *feature* columns are one-hot encoded with
      ``pd.get_dummies``.
    * A text-valued *label* column is replaced by integer codes from
      ``pd.factorize`` and renamed ``'new_class'`` (missing labels get the
      code ``-1``, as ``pd.factorize`` defines).
    * Remaining missing values are linearly interpolated, and whatever is
      still missing afterwards is filled with ``0.0``.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame
        Feature columns followed by the label column.
    """
    df = pd.read_csv(path)

    def _is_text(series):
        # Covers both classic ``object`` columns and dedicated string dtypes.
        return (pd.api.types.is_object_dtype(series)
                or pd.api.types.is_string_dtype(series))

    # Split positionally so the label can never be one-hot encoded and so it
    # can be re-attached as the final column after get_dummies reorders things.
    features = df.iloc[:, :-1]
    label = df.iloc[:, -1]

    text_cols = [col for col in features.columns if _is_text(features[col])]
    if text_cols:
        features = pd.get_dummies(features, columns=text_cols)

    if _is_text(label):
        label = pd.Series(pd.factorize(label)[0], index=df.index, name='new_class')

    df = pd.concat([features, label], axis=1)

    df = df.interpolate()
    df = df.fillna(value=0.0)
    return df

In [None]:
def binary_params(D, file_name):
    """Evaluate a tuned ExtraTreesClassifier on a binary dataset with 10-fold CV.

    For every fold the hyper-parameters are chosen on the training part via
    ``find_params``, the model is fitted, and a dictionary of metrics is
    recorded.

    Parameters
    ----------
    D : pandas.DataFrame
        Dataset whose last column is the target; the target is expected to
        be encoded as 0 / 1 (the confusion matrix is built with
        ``labels=[0, 1]`` and label 1 is treated as the positive class).
    file_name : str
        Dataset name stored in every result row.

    Returns
    -------
    list of dict
        One dictionary per fold with the keys 'Dataset Name',
        'Algorithm Name', 'Cross Validation', 'Hyper-Parameters Values',
        'Accuracy', 'TPR', 'FPR', 'Precision', 'AUC', 'PR-Curve',
        'Training Time' and 'Inference Time'.
    """
    Data_params = []

    X = D[D.columns[:-1]]
    y = D[D.columns[-1]]

    # 10-fold cross validation; seeded so the fold assignment is reproducible
    kf = KFold(n_splits=10, shuffle=True, random_state=0)
    for CV_num, (train_index, test_index) in enumerate(kf.split(X)):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        x = find_params(X_train, y_train)

        classifier = ExtraTreesClassifier(random_state=0, criterion=x['criterion'], n_estimators=x['n_estimators'])

        # time.process_time() measures CPU time
        start1 = time.process_time()
        classifier.fit(X_train, y_train)
        train_time = time.process_time() - start1

        # prediction
        start2 = time.process_time()
        y_pred = classifier.predict(X_test)
        predict_time = time.process_time() - start2

        # Probability of the positive class. predict_proba orders its columns
        # like classifier.classes_ (sorted), so column 1 belongs to label 1.
        y_score = classifier.predict_proba(X_test)[:, 1]

        # number of test rows
        r = X_test.shape[0]

        # Scale the measured prediction time to 1000 rows
        infer_time = predict_time*(1000/r)

        # Accuracy
        acc_score = accuracy_score(y_test, y_pred)

        # TPR, FPR
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0,1]).ravel()

        # Sensitivity, hit rate, recall, or true positive rate
        # (0.0 when the fold holds no positive samples)
        TPR = tp/(tp+fn) if (tp+fn) > 0 else 0.0

        # Fall out or false positive rate
        # (0.0 when the fold holds no negative samples)
        FPR = fp/(fp+tn) if (fp+tn) > 0 else 0.0

        # Precision or positive predictive value, from the hard predictions
        precision = precision_score(y_test, y_pred, average='binary')

        # ROC AUC needs the positive-class score, not the (n, 2) matrix
        score = roc_auc_score(y_test, y_score)

        # Area under the Precision-Recall curve, also from the scores
        average_precision = average_precision_score(y_test, y_score)

        best_ests, best_crit = x['n_estimators'], x['criterion']
        # save all calculations in a dictionary
        hyper = 'ests = '+str(best_ests)+'. criterion = '+str(best_crit)+"."

        params = {'Dataset Name': file_name,
                  'Algorithm Name': 'EXTRA TREES',
                  'Cross Validation': CV_num,
                  'Hyper-Parameters Values': hyper,
                  'Accuracy':acc_score,
                  'TPR':TPR,
                  'FPR':FPR,
                  'Precision':precision,
                  'AUC':score,
                  'PR-Curve':average_precision,
                  'Training Time':train_time,
                  'Inference Time':infer_time}

        Data_params.append(params)

    return Data_params


In [None]:
def fix_data(X, y):
    """Drop samples whose target is missing and re-encode the target as integers.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix, row-aligned with ``y``.
    y : pandas.Series
        Target values, possibly containing missing entries.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        If ``y`` has no missing values, ``X`` and ``y`` are returned
        unchanged. Otherwise the rows with a missing target are removed from
        both, and the remaining targets are replaced by the integer codes
        from ``pd.factorize`` (numbered in order of first appearance). The
        inputs are not modified in place.
    """
    if not y.isnull().values.any():
        return X, y

    # NaN never compares equal to itself, so build the mask with notna()
    # and apply it positionally to keep X and y row-aligned.
    keep = y.notna().to_numpy()
    X = X.loc[keep]
    y = y.loc[keep]

    # pd.factorize returns (codes, uniques); only the codes are the new target.
    y = pd.Series(pd.factorize(y)[0], index=y.index, name=y.name)

    return X, y

In [None]:
def find_params(X_train, y_train):
    """Pick ExtraTreesClassifier hyper-parameters with a randomized search.

    50 candidates are sampled (split criterion from 'gini' / 'entropy',
    number of trees from ``stats.randint(5, 100)``) and scored with 3-fold
    cross validation on the given training data, using two parallel jobs.

    Returns the ``best_params_`` dictionary of the fitted search, holding
    the keys 'criterion' and 'n_estimators'.
    """
    search_space = {
        'criterion': ['gini', 'entropy'],
        'n_estimators': stats.randint(5, 100),
    }
    base_model = ExtraTreesClassifier(random_state=0)
    searcher = RandomizedSearchCV(
        base_model,
        param_distributions=search_space,
        n_iter=50,
        cv=3,
        random_state=0,
        n_jobs=2,
    )
    searcher.fit(X_train, y_train)
    return searcher.best_params_

In [None]:
def fix_one_class(new_test):
    """Make sure every class column of a one-hot matrix has at least one hit.

    ``new_test`` is a sequence of equally long rows (one column per class).
    Column totals are computed once up front; for every column whose total
    is zero, a row counted from the end (last row first, then second to
    last, ...) is overwritten with the one-hot vector of that class.

    The input is modified in place and also returned.
    """
    n_classes = len(new_test[0])

    # Per-class totals, taken before any row is overwritten.
    totals = np.zeros(n_classes)
    for col in range(n_classes):
        totals[col] = sum([row[col] for row in new_test])

    offset = 1  # distance from the end of the next row to sacrifice
    for col, total in enumerate(totals):
        if total != 0:
            continue
        one_hot = np.zeros(n_classes)
        one_hot[col] = 1.0
        new_test[len(new_test) - offset] = one_hot
        offset += 1

    return new_test

In [None]:
def multi_params(D, file_name, num_uni):
    """Evaluate a tuned one-vs-rest ExtraTrees model on a multiclass dataset.

    Runs 10-fold cross validation. For every fold the hyper-parameters are
    chosen on the training part via ``find_params``, an
    ``OneVsRestClassifier`` wrapping an ``ExtraTreesClassifier`` is fitted,
    and a dictionary of macro-averaged metrics is recorded.

    Parameters
    ----------
    D : pandas.DataFrame
        Dataset whose last column is the target.
    file_name : str
        Dataset name stored in every result row.
    num_uni : int
        Number of distinct target values. Kept for backward compatibility;
        the class set is taken from the fitted model (``classes_``) so that
        the one-hot target always matches the ``predict_proba`` columns.

    Returns
    -------
    list of dict
        One dictionary per fold with the keys 'Dataset Name',
        'Algorithm Name', 'Cross Validation', 'Hyper-Parameters Values',
        'Accuracy', 'TPR', 'FPR', 'Precision', 'AUC', 'PR-Curve',
        'Training Time' and 'Inference Time'.
    """
    Data_params = []

    X = D[D.columns[:-1]]
    y = D[D.columns[-1]]

    X, y = fix_data(X, y)

    # 10-fold cross validation; seeded so the fold assignment is reproducible
    kf = KFold(n_splits=10, shuffle=True, random_state=0)
    for CV_num, (train_index, test_index) in enumerate(kf.split(X)):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        x = find_params(X_train, y_train)

        base = ExtraTreesClassifier(random_state=0, criterion=x['criterion'], n_estimators=x['n_estimators'])
        clf = OneVsRestClassifier(base)

        # time.process_time() measures CPU time
        start1 = time.process_time()
        clf.fit(X_train, y_train)
        train_time = time.process_time() - start1

        # prediction
        start2 = time.process_time()
        y_pred = clf.predict(X_test)
        predict_time = time.process_time() - start2

        # number of test rows
        r = X_test.shape[0]

        # Scale the measured prediction time to 1000 rows
        infer_time = predict_time*(1000/r)

        # Accuracy
        acc_score = accuracy_score(y_test, y_pred)

        # Class probabilities; columns are ordered like clf.classes_
        y_prob = clf.predict_proba(X_test)

        # One-hot encode the true labels with the very same class order, so
        # the indicator matrix and y_prob always have matching columns.
        label_y_test = label_binarize(y_test, classes=clf.classes_)

        # Macro one-vs-rest ROC AUC. A class's AUC is undefined when the test
        # fold contains none (or only) samples of it, so such classes are
        # left out of the average instead of aborting the whole run.
        n_rows = label_y_test.shape[0]
        per_class_auc = [
            roc_auc_score(label_y_test[:, k], y_prob[:, k])
            for k in range(label_y_test.shape[1])
            if 0 < label_y_test[:, k].sum() < n_rows
        ]
        macro_roc_auc_ovr = float(np.mean(per_class_auc)) if per_class_auc else float('nan')

        # Area under the Precision-Recall curve (average precision), macro
        # averaged over the binarized classes.
        average_precision = average_precision_score(label_y_test, y_prob, average='macro')

        # TPR, FPR from the confusion matrix; fixing `labels` guarantees one
        # row/column per model class even if a class is absent from the fold.
        cnf_matrix = confusion_matrix(y_test, y_pred, labels=clf.classes_)

        TP = np.diag(cnf_matrix).astype(float)
        FP = cnf_matrix.sum(axis=0).astype(float) - TP
        FN = cnf_matrix.sum(axis=1).astype(float) - TP
        TN = cnf_matrix.sum() - (FP + FN + TP)

        # Sensitivity, hit rate, recall, or true positive rate
        # (0 for classes without positive samples instead of NaN)
        TPR = np.divide(TP, TP + FN, out=np.zeros_like(TP), where=(TP + FN) > 0)
        # Fall out or false positive rate
        # (0 for classes without negative samples instead of NaN)
        FPR = np.divide(FP, FP + TN, out=np.zeros_like(FP), where=(FP + TN) > 0)

        # macro average over all model classes
        macro_TPR = float(np.mean(TPR))
        macro_FPR = float(np.mean(FPR))

        # Precision
        precision = precision_score(y_test, y_pred, average='macro')

        best_ests, best_crit = x['n_estimators'], x['criterion']
        # save all calculations in a dictionary
        hyper = 'ests = '+str(best_ests)+'. criterion = '+str(best_crit)+"."

        params = {'Dataset Name': file_name,
                  'Algorithm Name': 'EXTRA TREES',
                  'Cross Validation': CV_num,
                  'Hyper-Parameters Values': hyper,
                  'Accuracy':acc_score,
                  'TPR':macro_TPR,
                  'FPR':macro_FPR,
                  'Precision':precision,
                  'AUC':macro_roc_auc_ovr,
                  'PR-Curve':average_precision,
                  'Training Time':train_time,
                  'Inference Time':infer_time}

        Data_params.append(params)

    return Data_params
    

In [None]:
def fix_prob(y_prob, y_test_uni, classes):
    """Spread probability rows over the full list of classes.

    Each row of ``y_prob`` holds one value per entry of ``y_test_uni``.
    The value at position ``i`` is written to the slot that
    ``y_test_uni[i]`` occupies in ``classes`` (looked up with
    ``classes.index``); slots of classes not listed in ``y_test_uni`` stay
    0. Returns the widened rows stacked into a NumPy array.
    """
    def _widen(row):
        full = np.zeros(len(classes))
        for pos, label in enumerate(y_test_uni):
            full[classes.index(label)] = row[pos]
        return full

    return np.array([_widen(row) for row in y_prob])
            

In [None]:
# Run the evaluation for every dataset file and collect the per-fold result
# rows under the file's name.
all_accs = dict()
for file in tqdm(all_files):
    D = load_data('classification_datasets/'+file)

    # The target is the last column; count its distinct values once.
    n_classes = D.iloc[:, -1].nunique()

    if n_classes > 2:
        params = multi_params(D, file, n_classes)
    elif n_classes == 2:
        params = binary_params(D, file)
    else:
        # Fewer than two classes: nothing to classify. Skip explicitly so a
        # previous dataset's results are never stored under this file's name.
        print('Skipping ' + file + ': target has fewer than two classes')
        continue

    all_accs[file] = params

In [None]:
import csv
# Header of the results file. The names and their order match the keys of the
# per-fold dictionaries produced by binary_params and multi_params.
cols = ['Dataset Name',
        'Algorithm Name',
        'Cross Validation',
        'Hyper-Parameters Values',
        'Accuracy',
        'TPR',
        'FPR',
        'Precision',
        'AUC',
        'PR-Curve',
        'Training Time',
        'Inference Time']

with open('EXTRA.csv', 'w', newline='') as outfile:
    # DictWriter maps each value to its column by key, so the rows cannot
    # drift out of sync with the header.
    writer = csv.DictWriter(outfile, fieldnames=cols)

    writer.writeheader()
    for file in tqdm(all_accs):
        writer.writerows(all_accs[file])