In [95]:
from sklearn.svm import SVC
from timeit import default_timer as timer
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
import sklearn.feature_selection
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import VotingClassifier

In [36]:
df = pd.read_csv('data/real_data_classification_X.csv', index_col=0)
gt = pd.read_csv('data/real_data_classification_y.csv', index_col=0)
gt=gt['0'].values

In [38]:
dfNormStandard = df
dfNormMinMax = df
scalerStandard = StandardScaler()
scalerMinMax = MinMaxScaler()
# Normaliza also categorial data
for col in df.columns:
    scaledValuesStand = scalerStandard.fit_transform(df[col].values.reshape(-1,1))
    scaledValuesMinMax = scalerMinMax.fit_transform(df[col].values.reshape(-1,1))
    dfNormStandard[col] = scaledValuesStand
    dfNormMinMax[col] = scaledValuesMinMax
    
dfNormStandard.shape == dfNormMinMax.shape

True

In [103]:
scalerStandard

StandardScaler(copy=True, with_mean=True, with_std=True)

In [39]:
dfNormStandard= PCA(n_components=15).fit_transform(dfNormStandard)
dfNormMinMax= PCA(n_components=15).fit_transform(dfNormMinMax)

In [104]:
def k_fold_test(classifierName,datasetName,dataset, labels, classifier, k=3, shuffleDataset=True):
    kFold = KFold(n_splits=k, shuffle=shuffleDataset)
    kFoldResult = []
    kFoldIndex = 0
    for train_index, test_index in kFold.split(dataset):
        X_train = dataset[train_index]
        X_test = dataset[test_index]
        # X_train, X_test = dataset[train_index], dataset[test_index]
        Y_train = labels[train_index]
        Y_test = labels[test_index]
        print(X_train.shape)
        # print()
        # Y_train, Y_test = labels[train_index], labels[test_index]
        start = timer()
        classifier.fit(X_train, Y_train)
        fitTime = timer() - start
        
        start = timer()
        predictedLabels = classifier.predict(X_test)
        predictionTime = timer() - start

        predictionProbs = classifier.predict_proba(X_test)
        rocAucScore = roc_auc_score(Y_test, predictionProbs[:,1])
        
        score = float(sum([p[0] == p[1] for p in zip(predictedLabels, Y_test)])) / float(len(Y_test))
        f1Score = f1_score(Y_test, predictedLabels)

        kFoldResult.append({
            "Dataset": datasetName,
            "Classifier": classifierName,
            "KFoldIndex": kFoldIndex,
            "FitTime": fitTime,
            "PredictionTime": predictionTime,
            "Precision": score,
            "F1Score": f1Score,
            "RocAucScore": rocAucScore
          })
        kFoldIndex += 1
    print(f"Finished classification for {classifierName}-{datasetName}")
    return kFoldResult

In [57]:
results = []

In [105]:

decisionTreeClassifier = DecisionTreeClassifier(min_samples_leaf=80)
# results += k_fold_test("DecTree","NormMinMax",dfNormMinMax,gt,decisionTreeClassifier,2,False)
results += k_fold_test("DecTree","NormStandard",dfNormStandard,gt,decisionTreeClassifier,2,False)

(5000, 15)
(5000, 15)
Finished classification for DecTree-NormStandard


In [59]:
nnClassifier = MLPClassifier(hidden_layer_sizes=(5,5))
# results += k_fold_test("MLPC","NormMinMax",dfNormMinMax,gt,nnClassifier,2,False)
results += k_fold_test("MLPC","NormStandard",dfNormStandard,gt,nnClassifier,2,False)

Finished classification for MLPC-NormStandard


In [60]:
svmClassifier = SVC(C=1000, max_iter=2000, probability=True)
# results += k_fold_test("SVM","NormMinMax",dfNormMinMax,gt,svmClassifier,2,False)
results += k_fold_test("SVM","NormStandard",dfNormStandard,gt,svmClassifier,2,False)

Finished classification for SVM-NormStandard


In [61]:
resultsDf = pd.DataFrame(results)
resultsDf.sort_values(by=['F1Score'], ascending=False)

Unnamed: 0,Classifier,Dataset,F1Score,FitTime,KFoldIndex,Precision,PredictionTime,RocAucScore
5,SVM,NormStandard,0.952663,0.966659,1,0.9328,0.09112,0.968215
4,SVM,NormStandard,0.950986,0.995484,0,0.9304,0.090026,0.899407
2,MLPC,NormStandard,0.94973,1.567295,0,0.9292,0.001037,0.974033
1,DecTree,NormStandard,0.947804,0.042158,1,0.9268,0.000582,0.972905
3,MLPC,NormStandard,0.947052,1.735883,1,0.926,0.000885,0.972375
0,DecTree,NormStandard,0.942624,0.048246,0,0.9198,0.000946,0.971039


In [108]:
cross_val_score(nnClassifier, dfNormStandard, gt)

# kFold = KFold(n_splits=2, shuffle=True)
# X_tr=[]
# X_te=[]
# Y_tr=[]
# Y_te=[]
# for train_index, test_index in kFold.split(dfNormStandard):
#     X_tr, X_te = dfNormStandard[train_index], dfNormStandard[test_index]
#     Y_tr, Y_te = gt[train_index], gt[test_index]
    
#     print(X_tr)
#     print(X_te)
#     nnClassifier.fit(X_tr, Y_tr)
#     print(nnClassifier.score(X_te, Y_te))



array([0.9280144 , 0.92979298, 0.9279928 ])

In [83]:
classifiers = [decisionTreeClassifier, nnClassifier, svmClassifier]
for classifier in classifiers:
    bagginClassifier = BaggingClassifier(base_estimator=classifier, n_jobs=6, n_estimators=20)
    bagginClassifier.fit(X_tr, Y_tr)
    print(f"Classifier: {classifier} score: {bagginClassifier.score(X_te,Y_te)}")
    # bagginClassifier.estimators_

Classifier: DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=80, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best') score: 0.9172
Classifier: MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(5, 5), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False) score: 0.9308
Classifier: SVC(C=1000, cache_size=200, class

In [86]:
randomForest = RandomForestClassifier()
randomForest.fit(X_tr,Y_tr)
randomForest.score(X_te,Y_te)
k_fold_test("RandomForest","DfNumStand",dfNormStandard,gt,randomForest,2,False)

Finished classification for RandomForest-DfNumStand


[{'Dataset': 'DfNumStand',
  'Classifier': 'RandomForest',
  'KFoldIndex': 0,
  'FitTime': 0.06873220000034053,
  'PredictionTime': 0.0058328999998593645,
  'Precision': 0.93,
  'F1Score': 0.9491279069767442,
  'RocAucScore': 0.973139791128774},
 {'Dataset': 'DfNumStand',
  'Classifier': 'RandomForest',
  'KFoldIndex': 1,
  'FitTime': 0.07011600000032558,
  'PredictionTime': 0.004960199999914039,
  'Precision': 0.9236,
  'F1Score': 0.9442009932807479,
  'RocAucScore': 0.9701736710654588}]

In [101]:
adaBoostClf = AdaBoostClassifier(base_estimator=decisionTreeClassifier,n_estimators=100)
ada = k_fold_test("AdaBoostDecTree","NormStandard",dfNormStandard,gt,adaBoostClf,2,False)
resultsDf2 = pd.DataFrame(ada)
resultsDf2.sort_values(by=['F1Score'], ascending=False)

Finished classification for AdaBoostDecTree-NormStandard


Unnamed: 0,Classifier,Dataset,F1Score,FitTime,KFoldIndex,Precision,PredictionTime,RocAucScore
1,AdaBoostDecTree,NormStandard,0.947322,3.821253,1,0.9276,0.065747,0.97551
0,AdaBoostDecTree,NormStandard,0.947292,3.819608,0,0.927,0.065348,0.974852


In [100]:
votingClass = VotingClassifier(estimators=[('DecTree', decisionTreeClassifier), ('MPLC', nnClassifier)], n_jobs=6,voting='soft')
k_fold_test("AdaBoostDecTree","NormStandard",dfNormStandard,gt,votingClass,2,False)

Finished classification for AdaBoostDecTree-NormStandard


[{'Dataset': 'NormStandard',
  'Classifier': 'AdaBoostDecTree',
  'KFoldIndex': 0,
  'FitTime': 2.13050770000018,
  'PredictionTime': 0.0016116000001602515,
  'Precision': 0.9298,
  'F1Score': 0.9504167255262044,
  'RocAucScore': 0.9780806927028718},
 {'Dataset': 'NormStandard',
  'Classifier': 'AdaBoostDecTree',
  'KFoldIndex': 1,
  'FitTime': 2.0395406000002367,
  'PredictionTime': 0.0016670999998495972,
  'Precision': 0.9298,
  'F1Score': 0.9498213009292352,
  'RocAucScore': 0.9775288075644977}]