In [1]:
%matplotlib notebook

In [2]:
# general imports

import os

from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split #sklearn.cross_validation is deprecated

from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels

In [3]:
# kNN imports

from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

In [4]:
# RBF SVM imports

from sklearn.svm import SVC

In [5]:
# Gaussian process imports

from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

In [6]:
# Random forest imports

from sklearn.ensemble import RandomForestClassifier

In [7]:
# function for confusion matrix

def plot_confusion_matrix(y_true, y_pred ,classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """
    Print and plot the confusion matrix of `y_pred` against `y_true`.

    Parameters
    ----------
    y_true, y_pred : array-like of int
        True and predicted labels. The label values are used directly as
        indices into `classes`, so they must be valid integer indices.
    classes : np.ndarray
        Display names, indexed by label value.
    normalize : bool
        If True, every row is divided by its row sum, so each row shows
        fractions of that true class. A row whose sum is 0 (a label that
        occurs only in `y_pred`) is left as zeros.
    title : str or None
        Plot title; a default depending on `normalize` is used when empty.
    cmap : matplotlib colormap
        Colormap handed to `imshow`.

    Returns
    -------
    The matplotlib Axes the matrix was drawn on (a new 4x4 inch figure).
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    # Only use the labels that appear in the data
    classes = classes[unique_labels(y_true, y_pred)]
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Divide only where the row total is non-zero: an empty row would
        # otherwise produce 0/0 = NaN (and a RuntimeWarning).
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    fig, ax = plt.subplots(figsize=(4,4))
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    fmt = '.2f' if normalize else 'd'
    # Cells darker than half of the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    # Pin the y-range to the full extent of all rows (row 0 on top). The upper
    # bound follows the matrix size instead of being fixed for two classes;
    # for a 2x2 matrix this is still (1.5, -0.5).
    # NOTE(review): presumably a workaround for a matplotlib release that
    # clipped the outer rows of imshow plots - confirm it is still needed.
    ax.set_ylim(cm.shape[0] - 0.5, -0.5)
    return ax

In [8]:
# function for kNN

def run_kNN(X_features, Y_labels, outfilename_base,
            brem = "", npks = "", nplt = "", dataStyle = "",
            random_state = None):
    """
    Train a k-nearest-neighbours classifier on a 70/30 train/test split and
    save the normalized confusion matrix of the test set as a PNG.

    X_features       --> feature array, one row per event (row count goes into the title)
    Y_labels         --> integer class label per event; 0 is shown as '1e', 1 as 'b2b'
    outfilename_base --> "../classificationImgs/<CURR_DATE>/"
    brem             --> "N" or "Y"
    npks             --> str(number of pks); empty means uncategorized ("sans")
    nplt             --> str(number of plateaus)
    dataStyle        --> "preprocessed", "SFFS", "MDS" or "corrected"
    random_state     --> seed forwarded to train_test_split; None (default)
                         keeps the previous unseeded behaviour

    Returns None. Side effects: prints the matrix, writes
    <outfilename_base><category>_<dataStyle>_kNN.png and sets the global
    numpy print precision to 2.
    """

    # output formatting

    class_names = np.array(['1e','b2b'])
    # An empty npks marks a sample that was not split into categories.
    category = brem + npks + nplt if len(npks) >= 1 else "sans"
    cm_title_str = ("kNN, " + category + ",  " + dataStyle + "\n"
                    + str(X_features.shape[0]) + " b2b or 1e events")
    outfilename = outfilename_base + category + "_" + dataStyle + "_kNN.png"

    # procedure

    x_train, x_test, y_train, y_test = train_test_split(
        X_features, Y_labels, test_size = 0.3, random_state = random_state)

    knn = KNeighborsClassifier()
    knn.fit(x_train, y_train)
    label_pred = knn.predict(x_test)

    # generating output

    np.set_printoptions(precision=2)
    ax = plot_confusion_matrix(y_test, label_pred, classes=class_names, normalize=True,
                               title=cm_title_str)

    # savefig does not create missing directories, so make sure the target exists.
    out_dir = os.path.dirname(outfilename)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Save and close exactly the figure that was just drawn instead of relying
    # on pyplot's implicit "current figure".
    ax.figure.savefig(outfilename)
    plt.close(ax.figure)
    

In [9]:
# function for RBF SVM

def run_RBF_SVM(X_features, Y_labels, outfilename_base,
                brem = "", npks = "", nplt = "", dataStyle = "",
                random_state = None):
    """
    Train an SVC (default kernel, gamma='auto') on a 70/30 train/test split
    and save the normalized confusion matrix of the test set as a PNG.

    X_features       --> feature array, one row per event (row count goes into the title)
    Y_labels         --> integer class label per event; 0 is shown as '1e', 1 as 'b2b'
    outfilename_base --> "../classificationImgs/<CURR_DATE>/"
    brem             --> "N" or "Y"
    npks             --> str(number of pks); empty means uncategorized ("sans")
    nplt             --> str(number of plateaus)
    dataStyle        --> "preprocessed", "SFFS", "MDS" or "corrected"
    random_state     --> seed forwarded to train_test_split; None (default)
                         keeps the previous unseeded behaviour

    Returns None. Side effects: prints the matrix, writes
    <outfilename_base><category>_<dataStyle>_RBF_SVM.png and sets the global
    numpy print precision to 2.
    """

    # output formatting

    class_names = np.array(['1e','b2b'])
    # An empty npks marks a sample that was not split into categories.
    category = brem + npks + nplt if len(npks) >= 1 else "sans"
    cm_title_str = ("RBF_SVM, " + category + ",  " + dataStyle + "\n"
                    + str(X_features.shape[0]) + " b2b or 1e events")
    # The "_" separator is now applied to the "sans" case as well; it used to
    # be missing here (e.g. "sansMDS_RBF_SVM.png"), unlike the other classifiers.
    outfilename = outfilename_base + category + "_" + dataStyle + "_RBF_SVM.png"

    # procedure

    x_train, x_test, y_train, y_test = train_test_split(
        X_features, Y_labels, test_size = 0.3, random_state = random_state)

    rbf_svm = SVC(gamma='auto')
    rbf_svm.fit(x_train, y_train)
    label_pred = rbf_svm.predict(x_test)

    # generating output

    np.set_printoptions(precision=2)
    ax = plot_confusion_matrix(y_test, label_pred, classes=class_names, normalize=True,
                               title=cm_title_str)

    # savefig does not create missing directories, so make sure the target exists.
    out_dir = os.path.dirname(outfilename)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Save and close exactly the figure that was just drawn instead of relying
    # on pyplot's implicit "current figure".
    ax.figure.savefig(outfilename)
    plt.close(ax.figure)
    

In [10]:
# function for Gaussian process

def run_gaus_proc(X_features, Y_labels, outfilename_base,
                  brem = "", npks = "", nplt = "", dataStyle = "",
                  random_state = None):
    """
    Train a Gaussian process classifier (kernel 1.0 * RBF(1.0)) on a 70/30
    train/test split and save the normalized confusion matrix of the test
    set as a PNG.

    X_features       --> feature array, one row per event (row count goes into the title)
    Y_labels         --> integer class label per event; 0 is shown as '1e', 1 as 'b2b'
    outfilename_base --> "../classificationImgs/<CURR_DATE>/"
    brem             --> "N" or "Y"
    npks             --> str(number of pks); empty means uncategorized ("sans")
    nplt             --> str(number of plateaus)
    dataStyle        --> "preprocessed", "SFFS", "MDS" or "corrected"
    random_state     --> seed forwarded to train_test_split; None (default)
                         keeps the previous unseeded behaviour

    Returns None. Side effects: prints the matrix, writes
    <outfilename_base><category>_<dataStyle>_gausProc.png and sets the global
    numpy print precision to 2.
    """

    # output formatting

    class_names = np.array(['1e','b2b'])
    # An empty npks marks a sample that was not split into categories.
    category = brem + npks + nplt if len(npks) >= 1 else "sans"
    cm_title_str = ("gausProc, " + category + ",  " + dataStyle + "\n"
                    + str(X_features.shape[0]) + " b2b or 1e events")
    outfilename = outfilename_base + category + "_" + dataStyle + "_gausProc.png"

    # procedure

    x_train, x_test, y_train, y_test = train_test_split(
        X_features, Y_labels, test_size = 0.3, random_state = random_state)

    gpc = GaussianProcessClassifier(1.0 * RBF(1.0))
    gpc.fit(x_train, y_train)
    label_pred = gpc.predict(x_test)

    # generating output

    np.set_printoptions(precision=2)
    ax = plot_confusion_matrix(y_test, label_pred, classes=class_names, normalize=True,
                               title=cm_title_str)

    # savefig does not create missing directories, so make sure the target exists.
    out_dir = os.path.dirname(outfilename)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Save and close exactly the figure that was just drawn instead of relying
    # on pyplot's implicit "current figure".
    ax.figure.savefig(outfilename)
    plt.close(ax.figure)
   

In [11]:
# function for random forest

def run_rand_frst(X_features, Y_labels, outfilename_base,
                  brem = "", npks = "", nplt = "", dataStyle = "",
                  random_state = None):
    """
    Train a 100-tree random forest on a 70/30 train/test split and save the
    normalized confusion matrix of the test set as a PNG.

    X_features       --> feature array, one row per event (row count goes into the title)
    Y_labels         --> integer class label per event; 0 is shown as '1e', 1 as 'b2b'
    outfilename_base --> "../classificationImgs/<CURR_DATE>/"
    brem             --> "N" or "Y"
    npks             --> str(number of pks); empty means uncategorized ("sans")
    nplt             --> str(number of plateaus)
    dataStyle        --> "preprocessed", "SFFS", "MDS" or "corrected"
    random_state     --> seed forwarded to both train_test_split and the
                         forest; None (default) keeps the previous unseeded
                         behaviour

    Returns None. Side effects: prints the matrix, writes
    <outfilename_base><category>_<dataStyle>_randFrst.png and sets the global
    numpy print precision to 2.
    """

    # output formatting

    class_names = np.array(['1e','b2b'])
    # An empty npks marks a sample that was not split into categories.
    category = brem + npks + nplt if len(npks) >= 1 else "sans"
    cm_title_str = ("randFrst, " + category + ",  " + dataStyle + "\n"
                    + str(X_features.shape[0]) + " b2b or 1e events")
    outfilename = outfilename_base + category + "_" + dataStyle + "_randFrst.png"

    # procedure

    x_train, x_test, y_train, y_test = train_test_split(
        X_features, Y_labels, test_size = 0.3, random_state = random_state)

    # The forest draws its own random numbers, so it is seeded as well;
    # seeding only the split would not make the result repeatable.
    rand_frst = RandomForestClassifier(n_estimators=100, random_state=random_state)
    rand_frst.fit(x_train, y_train)
    label_pred = rand_frst.predict(x_test)

    # generating output

    np.set_printoptions(precision=2)
    ax = plot_confusion_matrix(y_test, label_pred, classes=class_names, normalize=True,
                               title=cm_title_str)

    # savefig does not create missing directories, so make sure the target exists.
    out_dir = os.path.dirname(outfilename)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Save and close exactly the figure that was just drawn instead of relying
    # on pyplot's implicit "current figure".
    ax.figure.savefig(outfilename)
    plt.close(ax.figure)
  

In [12]:
from os import listdir
from os.path import isfile, join
 
# Folder that is scanned for the input spreadsheets
directory_path = '../data'

# Keep plain files only; sub-directories are skipped
files = [entry for entry in listdir(directory_path)
         if isfile(join(directory_path, entry))]

# Split the listing by file-name prefix; sorting fixes the row order of the table below
files_b2b = sorted(name for name in files if name.startswith('b2b'))
files_one = sorted(name for name in files if name.startswith('one'))
files_both_list = [files_b2b, files_one]

# One row per position in the sorted lists: column 0 = b2b file, column 1 = 'one' file,
# each cell prefixed with the directory so it can be opened directly
files_both_DF = (directory_path + '/') + pd.DataFrame(files_both_list).transpose()

In [13]:
plt.ioff()

# Output location is the same for every file pair, so it is set once up front.
# savefig does not create missing directories, hence the makedirs call.
outfilename_base = "../classificationImgs/19_10_28/"
os.makedirs(outfilename_base, exist_ok=True)

# (name printed after the run, function) in the order the classifiers are executed
classifier_runs = [("kNN", run_kNN),
                   ("RBF SVM", run_RBF_SVM),
                   ("gausProc", run_gaus_proc),
                   ("randFrst", run_rand_frst)]

for i in range(files_both_DF.shape[0]) :

    # Column 0 holds the b2b file, column 1 the 'one' file of the same row.
    # NOTE(review): rows pair the two sorted listings purely by position -
    # confirm both listings contain matching files in matching order.
    b2b_path = files_both_DF.at[i,0]
    one_path = files_both_DF.at[i,1]

    b2b_D = pd.read_excel(b2b_path)
    one_D = pd.read_excel(one_path)

    # Tokenize the file name only (not the whole path), so an underscore in
    # the directory part cannot shift the field positions.
    name_parts = os.path.basename(b2b_path).split('_')

    brem = ""
    npks = ""
    nplt = ""
    if len(name_parts) >= 5 : # has categorization
        brem, npks, nplt = name_parts[1:4]

    dataStyle = name_parts[-1].split('.')[0] # "SFFS", "MDS", etc.

    # Stack both samples: b2b rows first, then 'one' rows
    X_features_D = pd.concat([b2b_D, one_D], ignore_index = True)
    X_features = X_features_D.to_numpy()

    # Labels follow the same row order as the stacked features
    b2b_labels = np.ones( (b2b_D.shape[0],), dtype=int) # 1 == found 0vbb
    one_labels = np.zeros((one_D.shape[0],), dtype=int) # 0 == found 1e
    Y_labels = np.concatenate((b2b_labels,one_labels), axis=None)

    print(b2b_path)
    for classifier_name, run_classifier in classifier_runs :
        run_classifier(X_features, Y_labels, outfilename_base, brem, npks, nplt, dataStyle)
        print("ran " + classifier_name)
    print(" ")
print("done")

../data/b2b_N_1_0_MDS.xlsx
Normalized confusion matrix
[[0.49 0.51]
 [0.23 0.77]]
ran kNN
Normalized confusion matrix
[[0.01 0.99]
 [0.   1.  ]]
ran RBF SVM
Normalized confusion matrix
[[0.56 0.44]
 [0.34 0.66]]
ran gausProc
Normalized confusion matrix
[[0.68 0.32]
 [0.18 0.82]]
ran randFrst
 
../data/b2b_N_1_0_MDS_corrected.xlsx
Normalized confusion matrix
[[0.6  0.4 ]
 [0.19 0.81]]
ran kNN
Normalized confusion matrix
[[0.63 0.37]
 [0.14 0.86]]
ran RBF SVM
Normalized confusion matrix
[[0.72 0.28]
 [0.17 0.83]]
ran gausProc
Normalized confusion matrix
[[0.7  0.3 ]
 [0.23 0.77]]
ran randFrst
 
../data/b2b_N_1_0_preprocessed.xlsx
Normalized confusion matrix
[[0.62 0.38]
 [0.18 0.82]]
ran kNN
Normalized confusion matrix
[[0.65 0.35]
 [0.09 0.91]]
ran RBF SVM
Normalized confusion matrix
[[0.69 0.31]
 [0.14 0.86]]
ran gausProc
Normalized confusion matrix
[[0.66 0.34]
 [0.15 0.85]]
ran randFrst
 
../data/b2b_N_1_0_preprocessed_SFFS.xlsx
Normalized confusion matrix
[[0.6  0.4 ]
 [0.24 0.76]]


In [27]:
# Show how the b2b path in row 16 breaks up on '_': first the token count, then the tokens
path_tokens = files_both_DF.at[16,0].split('_')
print(len(path_tokens))
print(path_tokens)

3
['../data/b2b', 'sans', 'MDS.xlsx']
