In [1]:
!activate PythonGPU
import numpy as np
from scipy.stats import skewnorm, skew
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, mean_squared_error, classification_report, accuracy_score

def simulate_data(classes, n_vars, n, max_mu, max_sigma, max_skew):
    #The multivariate skew normal number generator
    def rng(mu, sigma, skew, n=1):
        k = len(mu)
        if not (k == len(sigma) and k ==len(skew)): 
            raise Exception("Mu, Sigma and Skew should be same length")

        data = np.zeros((int(n),k))

        for i in range(k):
            data[:,i] = skewnorm.rvs(skew[i], loc=mu[i], scale=sigma[i], size=int(n)) 

        return data
    
    if(np.sum(classes) != 1):
        raise Exception("Classes dont sum up to 1")
        
    n_classes = len(classes)
#     sigma = np.random.randint(1,max_sigma,n_vars)
#     skew = np.random.randint(-max_skew,max_skew,n_vars)
#     mu =  np.random.randint(-max_mu, max_mu, (n_classes, n_vars))
    
    sigma = (max_sigma * np.random.rand(1,n_vars))[0]
    skew = ((2 * max_skew  * np.random.rand(1, n_vars)) - max_skew)[0]
    mu = (2 *  max_mu * np.random.rand(n_classes, n_vars)) - max_mu
    
    n_obs_class = np.round(np.dot(classes,n))
    
    data = np.zeros((int(np.sum(n_obs_class)),n_vars+1))
    for i in range(n_classes):
        #calculate indexes
        start = int(np.sum(n_obs_class[0:i]))
        end = int(np.sum(n_obs_class[0:i+1]))
        
        #set the data
        data[start:end,0] = i
        data[start:end,1:] = rng(mu[i,:], sigma, skew, n_obs_class[i])
        
    X = data[:,1:]
    y = data[:,0]
        
    X_train, X_test, y_train, y_test = train_test_split(
    X, 
    y, 
    test_size=0.33, 
    random_state=42,
    stratify=y)
    
    return X_train, X_test, y_train, y_test

In [2]:
#LDA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

def classify_lda(X_train, X_test, y_train, y_test, priors, plot=False):
    lda = LinearDiscriminantAnalysis(priors=priors)
    X_lda = lda.fit_transform(X_train, y_train)

    predictions = lda.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    print("LDA Test accuracy "+ str(accuracy))
    print(predictions)

    if plot:    
        plt.xlabel('LD1')
        plt.ylabel('LD2')
        plt.scatter(
            X_lda[:,0],
            X_lda[:,1],
            c=y_train,
            cmap='Accent',
        )
        
    return {"method": "LDA", 
            "accuracy": accuracy, 
            "predictions":predictions,
            "model": lda}

In [3]:
#Quadratic
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

def classify_qda(X_train, X_test, y_train, y_test, priors):
    qda = QuadraticDiscriminantAnalysis(priors=priors)
    X_qda = qda.fit(X_train, y_train)



    predictions = qda.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    print("QDA Test accuracy "+ str(accuracy))

    return {"method": "QDA", 
            "accuracy": accuracy, 
            "predictions":predictions,
            "model": qda}

In [4]:
from sklearn.linear_model import LogisticRegression

def classify_logit(X_train, X_test, y_train, y_test):
    clf = LogisticRegression(random_state=0, solver='lbfgs',
                             multi_class='multinomial').fit(X_train, y_train)

    predictions = clf.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    print("Logistic Test accuracy "+ str(accuracy))
    
    return {"method": "Logit", 
            "accuracy": accuracy, 
            "predictions":predictions,
            "model": clf}

In [5]:
#KNN
from sklearn.neighbors import KNeighborsClassifier

def classify_knn(X_train, X_test, y_train, y_test, n_neighbors):
    knn = KNeighborsClassifier(n_neighbors=n_neighbors, metric='euclidean')
    knn.fit(X_train, y_train)

    predictions = knn.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    
    print("KNN-"+str(n_neighbors)+" Test accuracy "+ str(accuracy))
    
    return {"method": "KNN-"+str(n_neighbors), 
            "accuracy": accuracy, 
            "predictions":predictions,
            "model": knn}

In [6]:
#Naive bayes
from sklearn.naive_bayes import GaussianNB

def classify_naivebayes(X_train, X_test, y_train, y_test, priors):
    NB = GaussianNB(priors)
    NB.fit(X_train, y_train)
    
    predictions = NB.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    
    print("Naive Bayes Test accuracy "+ str(accuracy))
    
    return {"method": "Naive Bayes", 
            "accuracy": accuracy, 
            "predictions":predictions,
            "model": NB}

In [7]:
#SVM
from sklearn.svm import LinearSVC

def classify_svm(X_train, X_test, y_train, y_test):
    svm = LinearSVC()
    svm.fit(X_train, y_train)
    
    predictions = svm.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    
    print("SVM Test accuracy "+ str(accuracy))
    
    return {"method": "SVM", 
            "accuracy": accuracy, 
            "predictions":predictions,
            "model": svm}

In [8]:
from tensorflow.keras import layers
from tensorflow import keras

def classify_neuralnet(X_train, X_test, y_train, y_test, n_vars, n_classes, depth=1, nodes=10, epochs=20):
    inputs = keras.Input(shape=(n_vars,), name='obs')
    x = layers.Dense(nodes, activation='relu')(inputs)
    
    if(depth>1):
        for i in range(depth-1):
            x = layers.Dense(nodes, activation='relu')(x)
            
    outputs = layers.Dense(n_classes, activation='softmax')(x)

    model = keras.Model(inputs=inputs, outputs=outputs, name='Dataset')

    display(model.summary())


    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer=keras.optimizers.RMSprop(),
                  metrics=['accuracy'])

    history = model.fit(X_train, y_train,
                        batch_size=64,
                        epochs=epochs,
                        validation_split=0.2)

    predictions = model.predict(X_test)
    predictions = np.argmax(predictions, axis=1)
    print(predictions)

    accuracy = accuracy_score(y_test, predictions)
    print("Neural Network Test accuracy "+ str(accuracy))
    
    return {"method": "Net "+"-".join([str(nodes) for i in range(depth)])+ " E"+str(epochs), 
            "accuracy": accuracy, 
            "predictions":predictions,
            "model": model}

In [None]:
from datetime import datetime
import pandas as pd
import time


np.random.seed(12345)

timelabel =  datetime.now().strftime("%H-%M-%S - %d-%m-%Y")


# configs = [{
#     "classes": [0.25, 0.25, 0.25, 0.25],
#     "n_vars": 2,
#     "n": 500,
#     "max_mu": 2,
#     "max_sigma": 2,
#     "max_skew": 1
# },
# {
#     "classes": [0.25, 0.25, 0.25, 0.25],
#     "n_vars": 5,
#     "n": 10000,
#     "max_mu": 5,
#     "max_sigma": 5,
#     "max_skew": 5
# },
# {
#     "classes": [0.05, 0.05, 0.05, 0.85],
#     "n_vars": 5,
#     "n": 10000,
#     "max_mu": 5,
#     "max_sigma": 5,
#     "max_skew": 5
# },
# {
#     "classes": [0.25, 0.25, 0.25, 0.25],
#     "n_vars": 50,
#     "n": 10000,
#     "max_mu": 5,
#     "max_sigma": 5,
#     "max_skew": 5
# },
# {
#     "classes": [0.25, 0.25, 0.25, 0.25],
#     "n_vars": 50,
#     "n": 100000,
#     "max_mu": 5,
#     "max_sigma": 5,
#     "max_skew": 5
# },
# {
#     "classes": [0.25, 0.25, 0.25, 0.25],
#     "n_vars": 5,
#     "n": 10000,
#     "max_mu": 1,
#     "max_sigma": 10,
#     "max_skew": 10
# },
# {
#     "classes": [0.25, 0.25, 0.25, 0.25],
#     "n_vars": 5,
#     "n": 100000,
#     "max_mu": 1,
#     "max_sigma": 10,
#     "max_skew": 10
# }
# configs = [{
#     "classes": [0.25, 0.25, 0.25, 0.25],
#     "n_vars": 2,
#     "n": 100000,
#     "max_mu": 1,
#     "max_sigma": 10,
#     "max_skew": 10
# },
# {
#     "classes": [0.125 for x in range(8)],
#     "n_vars": 5,
#     "n": 100000,
#     "max_mu": 1,
#     "max_sigma": 10,
#     "max_skew": 10
# }] 

configs = [{
    "classes": [0.5, 0.5],
    "n_vars": 2,
    "n": 100000,
    "max_mu": 1,
    "max_sigma": 10,
    "max_skew": 10
},
{
    "classes": [0.5, 0.5],
    "n_vars": 2,
    "n": 100000,
    "max_mu": 1,
    "max_sigma": 10,
    "max_skew": 5
},
{
    "classes": [0.5, 0.5],
    "n_vars": 2,
    "n": 1000000,
    "max_mu": 1,
    "max_sigma": 10,
    "max_skew": 5
}
] 


columns = ['method', 'accuracy','predictions', "model", "config"] + list(configs[0].keys())
results = pd.DataFrame(columns=columns)
results.style.format({
    'accuracy': '{:,.3%}'.format
})



#run analysis
for i, c in enumerate(configs):
    X_train, X_test, y_train, y_test = simulate_data(c["classes"], c["n_vars"], c["n"], c["max_mu"], c["max_sigma"], c["max_skew"])
    
    lda = classify_lda(X_train, X_test, y_train, y_test, c["classes"], False)                                     
    results = results.append({**lda, **c, "config":i+1},ignore_index=True)
    
    qda = classify_qda(X_train, X_test, y_train, y_test, c["classes"])                                     
    results = results.append({**qda, **c, "config":i+1},ignore_index=True)
    
    logit = classify_logit(X_train, X_test, y_train, y_test)                                     
    results = results.append({**logit, **c, "config":i+1},ignore_index=True)
    
    for k in [5,10,50,100]:
        knn = classify_knn(X_train, X_test, y_train, y_test, k)
        results = results.append({**knn, **c, "config":i+1},ignore_index=True)

    bayes = classify_naivebayes(X_train, X_test, y_train, y_test, c["classes"])
    results = results.append({**bayes, **c, "config":i+1},ignore_index=True)
    
    svm = classify_svm(X_train, X_test, y_train, y_test)
    results = results.append({**svm, **c, "config":i+1},ignore_index=True)
    
    for n in [{"d":1,"n":len(c["classes"]), "e":25}, {"d":4,"n":30, "e":10}, {"d":4,"n":50, "e":50}]:
        neuralnet = classify_neuralnet(X_train, X_test, y_train, y_test, c["n_vars"], len(c["classes"]),  depth=n["d"], nodes=n["n"], epochs=n["e"])                                 
        results = results.append({**neuralnet, **c, "config":i+1+7+2},ignore_index=True)
    
    print("Results after config "+str(i+1)+" of "+str(len(configs)))

    results.sort_values(by='accuracy', ascending=False, inplace=True)
        
    display(results.style.format({
    'accuracy': '{:,.3%}'.format
    }))
    

    #saving results to file
    results.drop(columns=['model']).to_pickle("./results/config "+str(i+1)+" of "+str(len(configs))+" "+timelabel+".pkl")

LDA Test accuracy 0.29303030303030303
[3. 3. 3. ... 1. 3. 0.]
QDA Test accuracy 0.2923030303030303
Logistic Test accuracy 0.29278787878787876
KNN-5 Test accuracy 0.2674848484848485
KNN-10 Test accuracy 0.2723939393939394


Example to load old results

In [101]:
import pandas as pd

old_results = pd.read_pickle("./results/config 7 of 7 21-02-19 - 02-12-2019.pkl")


# .style.format({'accuracy': '{:,.3%}'.format})

old_results['accuracy'] = pd.Series(["{0:.2f}%".format(val * 100) for val in old_results['accuracy']], index = old_results.index)


def bold(data):
    attr = 'font-weight: {}'.format("700")
    #remove % and cast to float
    data = data.replace('%','', regex=True).astype(float)
    is_max = data == data.max()
    return [attr if v else "" for v in is_max]
    
def boldlatex(data):
    #remove % and cast to float
    return list(map(lambda x: r"\\textbf{"+x+"}" if x==data.max() else x, data))
    
#     return [attr if v else "" for v in is_max]
    
    
table = old_results.pivot(index='method', columns='config', values='accuracy').sort_values(by='method')

display(table.style.apply(bold))


latex = table.apply(boldlatex).to_latex()
print(latex)

file = open("results.tex", "w")
file.write(latex)
file.close()


config,1,2,3,4,5,6,7
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
KNN-10,49.09%,94.88%,99.24%,100.00%,100.00%,38.85%,34.71%
KNN-100,47.27%,92.36%,98.64%,100.00%,100.00%,42.27%,37.74%
KNN-5,44.85%,95.12%,99.36%,100.00%,100.00%,35.73%,33.19%
KNN-50,53.94%,93.45%,98.88%,100.00%,100.00%,41.03%,37.41%
LDA,55.76%,96.82%,99.97%,100.00%,100.00%,45.24%,39.94%
Logit,56.36%,96.82%,99.88%,100.00%,100.00%,45.67%,40.03%
Naive Bayes,53.33%,96.85%,99.97%,100.00%,100.00%,45.06%,39.95%
Net 30-30-30-30 E10,52.12%,96.36%,99.70%,100.00%,100.00%,45.85%,41.18%
Net 4 E25,33.94%,95.52%,99.24%,100.00%,100.00%,44.73%,41.04%
Net 50-50-50-50 E50,52.12%,96.15%,99.97%,100.00%,100.00%,44.91%,41.23%


\begin{tabular}{llllllll}
\toprule
config &                 1 &                 2 &                 3 &                  4 &                  5 &                 6 &                 7 \\
method              &                   &                   &                   &                    &                    &                   &                   \\
\midrule
KNN-10              &            49.09\% &            94.88\% &            99.24\% &  \textbackslash \textbackslash textbf\{100.00\%\} &  \textbackslash \textbackslash textbf\{100.00\%\} &            38.85\% &            34.71\% \\
KNN-100             &            47.27\% &            92.36\% &            98.64\% &  \textbackslash \textbackslash textbf\{100.00\%\} &  \textbackslash \textbackslash textbf\{100.00\%\} &            42.27\% &            37.74\% \\
KNN-5               &            44.85\% &            95.12\% &            99.36\% &  \textbackslash \textbackslash textbf\{100.00\%\} &  \textbackslash \textbackslash textbf\