In [129]:
!activate PythonGPU
import numpy as np
from scipy.stats import skewnorm, skew
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, mean_squared_error, classification_report, accuracy_score

def simulate_data(classes, n_vars, n, max_mu, max_sigma, max_skew,
                  test_size=0.33, random_state=42):
    """Simulate a labelled skew-normal data set and split it into train/test.

    Every class shares one scale and one skewness value per variable; only the
    location differs between classes. All parameters are drawn with
    ``np.random.randint``, whose upper bound is exclusive, so e.g.
    ``max_sigma=2`` always yields a scale of 1 and ``max_sigma=1`` raises a
    ``ValueError`` inside numpy.

    Parameters
    ----------
    classes : sequence of float
        Class proportions; must sum to 1 (within floating point tolerance).
    n_vars : int
        Number of feature columns to generate.
    n : int
        Total number of observations requested. The generated total is the sum
        of the per-class counts after rounding ``classes * n``.
    max_mu, max_sigma, max_skew : int
        Bounds handed to ``np.random.randint`` for location (``[-max_mu, max_mu)``),
        scale (``[1, max_sigma)``) and skewness (``[-max_skew, max_skew)``).
    test_size : float, optional
        Forwarded to ``train_test_split``. Defaults to 0.33.
    random_state : int, optional
        Forwarded to ``train_test_split``. Defaults to 42.

    Returns
    -------
    X_train, X_test, y_train, y_test
        Stratified split of the features and the (float-coded) class labels.

    Raises
    ------
    ValueError
        If ``classes`` does not sum to 1, or if the parameter vectors handed to
        the internal generator differ in length.
    """
    # Per-variable independent skew-normal sampler (one column per variable).
    def rng(mu, sigma, skewness, n=1):
        k = len(mu)
        if not (k == len(sigma) and k == len(skewness)):
            raise ValueError("Mu, Sigma and Skew should be same length")

        n_rows = int(n)
        samples = np.zeros((n_rows, k))
        # Drawn column by column so the random stream is consumed in a fixed order.
        for col in range(k):
            samples[:, col] = skewnorm.rvs(
                skewness[col], loc=mu[col], scale=sigma[col], size=n_rows)
        return samples

    # Exact equality on a float sum rejects valid priors such as [0.7, 0.2, 0.1].
    if not np.isclose(np.sum(classes), 1.0):
        raise ValueError("Classes dont sum up to 1")

    n_classes = len(classes)

    # Draw order (sigma, skewness, mu) is kept so seeded runs stay reproducible.
    sigma = np.random.randint(1, max_sigma, n_vars)
    skewness = np.random.randint(-max_skew, max_skew, n_vars)
    mu = np.random.randint(-max_mu, max_mu, (n_classes, n_vars))

    # Observations per class, rounded to whole rows.
    n_obs_class = np.round(np.dot(classes, n)).astype(int)

    # Column 0 holds the class label, the remaining columns the features.
    data = np.zeros((int(np.sum(n_obs_class)), n_vars + 1))
    start = 0
    for label in range(n_classes):
        end = start + int(n_obs_class[label])
        data[start:end, 0] = label
        data[start:end, 1:] = rng(mu[label, :], sigma, skewness, n_obs_class[label])
        start = end

    X = data[:, 1:]
    y = data[:, 0]

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y)

    return X_train, X_test, y_train, y_test

In [130]:
#LDA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

def classify_lda(X_train, X_test, y_train, y_test, priors, plot=False):
    """Fit linear discriminant analysis and score it on the test split.

    Parameters
    ----------
    X_train, X_test : array-like
        Training and test features.
    y_train, y_test : array-like
        Training and test labels.
    priors : array-like
        Class priors passed to ``LinearDiscriminantAnalysis``.
    plot : bool, optional
        When true, scatter the training data in discriminant space, coloured
        by class label.

    Returns
    -------
    dict
        Keys ``"method"``, ``"accuracy"``, ``"predictions"`` and ``"model"``.
    """
    lda = LinearDiscriminantAnalysis(priors=priors)
    X_lda = lda.fit_transform(X_train, y_train)

    predictions = lda.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    print("LDA Test accuracy "+ str(accuracy))
    print(predictions)

    if plot:
        plt.xlabel('LD1')
        if X_lda.shape[1] >= 2:
            second_axis = X_lda[:, 1]
            plt.ylabel('LD2')
        else:
            # Only one discriminant exists (e.g. two classes or one feature):
            # indexing column 1 would raise IndexError, so plot on a flat line.
            second_axis = np.zeros(X_lda.shape[0])
        plt.scatter(
            X_lda[:, 0],
            second_axis,
            c=y_train,
            cmap='Accent',
        )

    return {"method": "LDA",
            "accuracy": accuracy,
            "predictions": predictions,
            "model": lda}

In [131]:
#Quadratic
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

def classify_qda(X_train, X_test, y_train, y_test, priors):
    """Fit quadratic discriminant analysis and score it on the test split.

    ``priors`` is passed to ``QuadraticDiscriminantAnalysis``. Returns a dict
    with the keys ``"method"``, ``"accuracy"``, ``"predictions"`` and
    ``"model"``.
    """
    model = QuadraticDiscriminantAnalysis(priors=priors)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    print("QDA Test accuracy {}".format(str(score)))

    return dict(method="QDA", accuracy=score, predictions=y_pred, model=model)

In [132]:
from sklearn.linear_model import LogisticRegression

def classify_logit(X_train, X_test, y_train, y_test):
    """Fit a multinomial logistic regression and score it on the test split.

    Returns a dict with the keys ``"method"``, ``"accuracy"``,
    ``"predictions"`` and ``"model"``.
    """
    estimator = LogisticRegression(
        random_state=0,
        solver='lbfgs',
        multi_class='multinomial',
    )
    estimator.fit(X_train, y_train)

    y_pred = estimator.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    print("Logistic Test accuracy {}".format(str(score)))

    return dict(method="Logit", accuracy=score, predictions=y_pred, model=estimator)

In [133]:
#KNN
from sklearn.neighbors import KNeighborsClassifier

def classify_knn(X_train, X_test, y_train, y_test, n_neighbors):
    """Fit a Euclidean k-nearest-neighbours classifier and score it on the test split.

    ``n_neighbors`` is both the model's neighbour count and part of the
    reported method label (``"KNN-<n_neighbors>"``). Returns a dict with the
    keys ``"method"``, ``"accuracy"``, ``"predictions"`` and ``"model"``.
    """
    label = "KNN-" + str(n_neighbors)

    model = KNeighborsClassifier(n_neighbors=n_neighbors, metric='euclidean')
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    print(label + " Test accuracy " + str(score))

    return dict(method=label, accuracy=score, predictions=y_pred, model=model)

In [134]:
#Naive bayes
from sklearn.naive_bayes import GaussianNB

def classify_naivebayes(X_train, X_test, y_train, y_test, priors):
    """Fit Gaussian naive Bayes and score it on the test split.

    Parameters
    ----------
    X_train, X_test : array-like
        Training and test features.
    y_train, y_test : array-like
        Training and test labels.
    priors : array-like
        Class priors passed to ``GaussianNB``.

    Returns
    -------
    dict
        Keys ``"method"``, ``"accuracy"``, ``"predictions"`` and ``"model"``.
    """
    # Passed by keyword: recent scikit-learn releases make estimator
    # constructor parameters keyword-only, so GaussianNB(priors) raises TypeError.
    NB = GaussianNB(priors=priors)
    NB.fit(X_train, y_train)

    predictions = NB.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)

    print("Naive Bayes Test accuracy "+ str(accuracy))

    return {"method": "Naive Bayes",
            "accuracy": accuracy,
            "predictions": predictions,
            "model": NB}

In [135]:
#SVM
from sklearn.svm import LinearSVC

def classify_svm(X_train, X_test, y_train, y_test):
    """Fit a default ``LinearSVC`` and score it on the test split.

    Returns a dict with the keys ``"method"``, ``"accuracy"``,
    ``"predictions"`` and ``"model"``.
    """
    model = LinearSVC()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    print("SVM Test accuracy {}".format(str(score)))

    return dict(method="SVM", accuracy=score, predictions=y_pred, model=model)

In [136]:
from tensorflow.keras import layers
from tensorflow import keras

def classify_neuralnet(X_train, X_test, y_train, y_test, n_vars, n_classes, depth=1, nodes=10, epochs=20):
    """Train a fully connected softmax classifier and score it on the test split.

    Parameters
    ----------
    X_train, X_test : array-like
        Training and test features; the input layer expects ``n_vars`` columns.
    y_train, y_test : array-like
        Integer-coded class labels (the loss is sparse categorical crossentropy).
    n_vars : int
        Number of input features.
    n_classes : int
        Width of the softmax output layer.
    depth : int, optional
        Number of hidden ReLU layers. At least one hidden layer is always built.
    nodes : int, optional
        Units in each hidden layer.
    epochs : int, optional
        Training epochs.

    Returns
    -------
    dict
        Keys ``"method"``, ``"accuracy"``, ``"predictions"`` and ``"model"``.
        The method label lists ``nodes`` once per ``depth`` followed by the
        epoch count.
    """
    inputs = keras.Input(shape=(n_vars,), name='obs')
    x = layers.Dense(nodes, activation='relu')(inputs)

    # range() is empty for depth <= 1, so no explicit guard is needed.
    for _ in range(depth - 1):
        x = layers.Dense(nodes, activation='relu')(x)

    outputs = layers.Dense(n_classes, activation='softmax')(x)

    model = keras.Model(inputs=inputs, outputs=outputs, name='Dataset')

    # summary() prints the table itself and returns None; wrapping it in
    # display() only rendered a stray "None" and tied the function to IPython.
    model.summary()

    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer=keras.optimizers.RMSprop(),
                  metrics=['accuracy'])

    # 20% of the training data is held out by Keras for per-epoch validation.
    history = model.fit(X_train, y_train,
                        batch_size=64,
                        epochs=epochs,
                        validation_split=0.2)

    # Convert class probabilities to the most likely class index.
    predictions = model.predict(X_test)
    predictions = np.argmax(predictions, axis=1)
    print(predictions)

    accuracy = accuracy_score(y_test, predictions)
    print("Neural Network Test accuracy "+ str(accuracy))

    return {"method": "Net "+"-".join([str(nodes) for i in range(depth)])+ " E"+str(epochs),
            "accuracy": accuracy,
            "predictions": predictions,
            "model": model}

In [147]:
from datetime import datetime
from pathlib import Path
import pandas as pd

# Seed numpy's global generator so the simulated data is repeatable.
np.random.seed(12345)

# Timestamp embedded in the result file names of this run.
timelabel =  datetime.now().strftime("%H-%M-%S - %d-%m-%Y")


# One dict per simulation scenario; keys match simulate_data's parameters.
configs = [{
    "classes": [0.25, 0.25, 0.25, 0.25],
    "n_vars": 5,
    "n": 500,
    "max_mu": 1,
    "max_sigma": 2,
    "max_skew": 1
}]

columns = ['method', 'accuracy','predictions', "model", "config"] + list(configs[0].keys())
accuracy_format = {'accuracy': '{:,.3%}'.format}

# to_pickle does not create missing directories, so make sure the target exists.
results_dir = Path("./results")
results_dir.mkdir(parents=True, exist_ok=True)

# Rows are collected as plain dicts and turned into a frame once per config;
# DataFrame.append was removed in pandas 2.0 and was quadratic in any case.
records = []
results = pd.DataFrame(columns=columns)

#run analysis
for i, c in enumerate(configs):
    X_train, X_test, y_train, y_test = simulate_data(c["classes"], c["n_vars"], c["n"], c["max_mu"], c["max_sigma"], c["max_skew"])

    # Config values plus the 1-based config number, attached to every result row.
    tag = {**c, "config": i+1}

    lda = classify_lda(X_train, X_test, y_train, y_test, c["classes"], False)
    records.append({**lda, **tag})

#     qda = classify_qda(X_train, X_test, y_train, y_test, c["classes"])
#     records.append({**qda, **tag})

#     logit = classify_logit(X_train, X_test, y_train, y_test)
#     records.append({**logit, **tag})

#     for k in [5,10,50,100]:
#         knn = classify_knn(X_train, X_test, y_train, y_test, k)
#         records.append({**knn, **tag})

    nb = classify_naivebayes(X_train, X_test, y_train, y_test, c["classes"])
    records.append({**nb, **tag})

    svm = classify_svm(X_train, X_test, y_train, y_test)
    records.append({**svm, **tag})

#     for n in [{"d":2,"n":20, "e":1}]:
#         neuralnet = classify_neuralnet(X_train, X_test, y_train, y_test, c["n_vars"], len(c["classes"]),  depth=n["d"], nodes=n["n"], epochs=n["e"])
#         records.append({**neuralnet, **tag})

    print("Results after config "+str(i+1))

    # Cumulative results so far, best accuracy first.
    results = pd.DataFrame(records, columns=columns).sort_values(by='accuracy', ascending=False)

    display(results.style.format(accuracy_format))

    #saving results to file
    results.to_pickle(str(results_dir / ("config "+str(i+1)+" of "+str(len(configs))+" "+timelabel+".pkl")))

LDA Test accuracy 0.5515151515151515
[3. 2. 1. 0. 2. 1. 1. 2. 0. 1. 3. 0. 1. 2. 2. 2. 3. 1. 2. 1. 3. 2. 2. 3.
 2. 3. 0. 2. 2. 3. 2. 1. 2. 3. 1. 1. 0. 2. 3. 2. 1. 1. 3. 1. 1. 1. 0. 3.
 3. 0. 3. 2. 0. 3. 1. 3. 3. 1. 0. 2. 0. 1. 0. 1. 1. 0. 2. 3. 1. 3. 1. 3.
 0. 1. 2. 1. 2. 1. 1. 1. 2. 3. 1. 2. 2. 0. 1. 3. 0. 0. 2. 3. 1. 3. 2. 1.
 1. 0. 1. 1. 3. 3. 0. 3. 2. 0. 0. 3. 0. 2. 2. 3. 2. 0. 2. 2. 3. 2. 2. 0.
 0. 2. 3. 1. 1. 3. 1. 2. 1. 2. 2. 0. 2. 1. 3. 2. 2. 1. 2. 2. 2. 1. 0. 3.
 1. 2. 0. 0. 1. 3. 3. 1. 1. 3. 0. 3. 3. 0. 3. 3. 2. 2. 1. 2. 2.]
Naive Bayes Test accuracy 0.5454545454545454
SVM Test accuracy 0.5636363636363636
Results after config 1


Unnamed: 0,method,accuracy,predictions,model,config,classes,n_vars,n,max_mu,max_sigma,max_skew
2,SVM,56.364%,[3. 2. 1. 0. 2. 1. 1. 2. 0. 1. 3. 0. 1. 2. 2. 3. 3. 1. 2. 1. 3. 2. 2. 3.  3. 3. 3. 2. 2. 3. 2. 1. 2. 3. 1. 1. 0. 2. 3. 2. 1. 1. 3. 1. 1. 1. 0. 3.  3. 0. 3. 2. 0. 3. 1. 3. 3. 1. 0. 2. 0. 1. 0. 1. 1. 0. 2. 3. 1. 0. 1. 3.  0. 1. 2. 1. 2. 1. 1. 1. 2. 3. 1. 2. 1. 0. 1. 3. 0. 0. 2. 3. 1. 3. 2. 1.  1. 0. 1. 1. 3. 3. 0. 3. 2. 0. 0. 3. 0. 2. 2. 3. 2. 0. 2. 2. 3. 2. 2. 0.  3. 2. 3. 1. 1. 1. 1. 2. 1. 2. 2. 0. 2. 1. 3. 2. 2. 1. 2. 2. 2. 1. 0. 3.  1. 2. 0. 0. 1. 3. 3. 1. 0. 3. 0. 3. 3. 0. 3. 3. 2. 2. 1. 2. 2.],"LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,  intercept_scaling=1, loss='squared_hinge', max_iter=1000,  multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,  verbose=0)",1,"[0.25, 0.25, 0.25, 0.25]",5,500,1,2,1
0,LDA,55.152%,[3. 2. 1. 0. 2. 1. 1. 2. 0. 1. 3. 0. 1. 2. 2. 2. 3. 1. 2. 1. 3. 2. 2. 3.  2. 3. 0. 2. 2. 3. 2. 1. 2. 3. 1. 1. 0. 2. 3. 2. 1. 1. 3. 1. 1. 1. 0. 3.  3. 0. 3. 2. 0. 3. 1. 3. 3. 1. 0. 2. 0. 1. 0. 1. 1. 0. 2. 3. 1. 3. 1. 3.  0. 1. 2. 1. 2. 1. 1. 1. 2. 3. 1. 2. 2. 0. 1. 3. 0. 0. 2. 3. 1. 3. 2. 1.  1. 0. 1. 1. 3. 3. 0. 3. 2. 0. 0. 3. 0. 2. 2. 3. 2. 0. 2. 2. 3. 2. 2. 0.  0. 2. 3. 1. 1. 3. 1. 2. 1. 2. 2. 0. 2. 1. 3. 2. 2. 1. 2. 2. 2. 1. 0. 3.  1. 2. 0. 0. 1. 3. 3. 1. 1. 3. 0. 3. 3. 0. 3. 3. 2. 2. 1. 2. 2.],"LinearDiscriminantAnalysis(n_components=None, priors=[0.25, 0.25, 0.25, 0.25],  shrinkage=None, solver='svd', store_covariance=False,  tol=0.0001)",1,"[0.25, 0.25, 0.25, 0.25]",5,500,1,2,1
1,Naive Bayes,54.545%,[3. 2. 1. 0. 2. 1. 2. 2. 0. 1. 3. 0. 1. 2. 2. 3. 3. 1. 2. 1. 3. 2. 2. 3.  3. 3. 0. 3. 1. 3. 2. 1. 2. 3. 1. 1. 0. 2. 3. 2. 2. 1. 3. 1. 1. 1. 0. 3.  3. 0. 3. 2. 0. 2. 1. 3. 3. 1. 0. 2. 0. 1. 0. 1. 1. 0. 2. 3. 1. 3. 2. 3.  0. 1. 2. 1. 2. 1. 1. 1. 2. 3. 1. 2. 2. 0. 1. 3. 0. 0. 2. 3. 1. 3. 2. 1.  1. 0. 1. 1. 3. 2. 0. 3. 2. 0. 0. 3. 0. 2. 1. 3. 2. 0. 2. 2. 3. 2. 2. 0.  0. 2. 3. 1. 1. 3. 1. 2. 1. 2. 2. 0. 2. 1. 3. 2. 2. 1. 2. 2. 2. 1. 0. 3.  1. 2. 0. 1. 1. 3. 3. 1. 0. 3. 0. 3. 3. 0. 3. 3. 2. 2. 1. 2. 2.],"GaussianNB(priors=[0.25, 0.25, 0.25, 0.25], var_smoothing=1e-09)",1,"[0.25, 0.25, 0.25, 0.25]",5,500,1,2,1


Example: loading previously saved results

In [2]:
import pandas as pd

# Pickle written by an earlier run of the analysis cell. Only unpickle files
# you produced yourself: loading a pickle can execute arbitrary code.
saved_results_path = "./results/config 1 of 1 19-58-47 - 02-12-2019.pkl"
old_results = pd.read_pickle(saved_results_path)

# Render the accuracy column as a percentage with three decimals.
percent_format = {'accuracy': '{:,.3%}'.format}
display(old_results.style.format(percent_format))

Unnamed: 0,method,accuracy,predictions,model,config,classes,n_vars,n,max_mu,max_sigma,max_skew
2,SVM,56.364%,[3. 2. 1. 0. 2. 1. 1. 2. 0. 1. 3. 0. 1. 2. 2. 3. 3. 1. 2. 1. 3. 2. 2. 3.  3. 3. 3. 2. 2. 3. 2. 1. 2. 3. 1. 1. 0. 2. 3. 2. 1. 1. 3. 1. 1. 1. 0. 3.  3. 0. 3. 2. 0. 3. 1. 3. 3. 1. 0. 2. 0. 1. 0. 1. 1. 0. 2. 3. 1. 0. 1. 3.  0. 1. 2. 1. 2. 1. 1. 1. 2. 3. 1. 2. 1. 0. 1. 3. 0. 0. 2. 3. 1. 3. 2. 1.  1. 0. 1. 1. 3. 3. 0. 3. 2. 0. 0. 3. 0. 2. 2. 3. 2. 0. 2. 2. 3. 2. 2. 0.  3. 2. 3. 1. 1. 1. 1. 2. 1. 2. 2. 0. 2. 1. 3. 2. 2. 1. 2. 2. 2. 1. 0. 3.  1. 2. 0. 0. 1. 3. 3. 1. 0. 3. 0. 3. 3. 0. 3. 3. 2. 2. 1. 2. 2.],"LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,  intercept_scaling=1, loss='squared_hinge', max_iter=1000,  multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,  verbose=0)",1,"[0.25, 0.25, 0.25, 0.25]",5,500,1,2,1
0,LDA,55.152%,[3. 2. 1. 0. 2. 1. 1. 2. 0. 1. 3. 0. 1. 2. 2. 2. 3. 1. 2. 1. 3. 2. 2. 3.  2. 3. 0. 2. 2. 3. 2. 1. 2. 3. 1. 1. 0. 2. 3. 2. 1. 1. 3. 1. 1. 1. 0. 3.  3. 0. 3. 2. 0. 3. 1. 3. 3. 1. 0. 2. 0. 1. 0. 1. 1. 0. 2. 3. 1. 3. 1. 3.  0. 1. 2. 1. 2. 1. 1. 1. 2. 3. 1. 2. 2. 0. 1. 3. 0. 0. 2. 3. 1. 3. 2. 1.  1. 0. 1. 1. 3. 3. 0. 3. 2. 0. 0. 3. 0. 2. 2. 3. 2. 0. 2. 2. 3. 2. 2. 0.  0. 2. 3. 1. 1. 3. 1. 2. 1. 2. 2. 0. 2. 1. 3. 2. 2. 1. 2. 2. 2. 1. 0. 3.  1. 2. 0. 0. 1. 3. 3. 1. 1. 3. 0. 3. 3. 0. 3. 3. 2. 2. 1. 2. 2.],"LinearDiscriminantAnalysis(n_components=None, priors=[0.25, 0.25, 0.25, 0.25],  shrinkage=None, solver='svd', store_covariance=False,  tol=0.0001)",1,"[0.25, 0.25, 0.25, 0.25]",5,500,1,2,1
1,Naive Bayes,54.545%,[3. 2. 1. 0. 2. 1. 2. 2. 0. 1. 3. 0. 1. 2. 2. 3. 3. 1. 2. 1. 3. 2. 2. 3.  3. 3. 0. 3. 1. 3. 2. 1. 2. 3. 1. 1. 0. 2. 3. 2. 2. 1. 3. 1. 1. 1. 0. 3.  3. 0. 3. 2. 0. 2. 1. 3. 3. 1. 0. 2. 0. 1. 0. 1. 1. 0. 2. 3. 1. 3. 2. 3.  0. 1. 2. 1. 2. 1. 1. 1. 2. 3. 1. 2. 2. 0. 1. 3. 0. 0. 2. 3. 1. 3. 2. 1.  1. 0. 1. 1. 3. 2. 0. 3. 2. 0. 0. 3. 0. 2. 1. 3. 2. 0. 2. 2. 3. 2. 2. 0.  0. 2. 3. 1. 1. 3. 1. 2. 1. 2. 2. 0. 2. 1. 3. 2. 2. 1. 2. 2. 2. 1. 0. 3.  1. 2. 0. 1. 1. 3. 3. 1. 0. 3. 0. 3. 3. 0. 3. 3. 2. 2. 1. 2. 2.],"GaussianNB(priors=[0.25, 0.25, 0.25, 0.25], var_smoothing=1e-09)",1,"[0.25, 0.25, 0.25, 0.25]",5,500,1,2,1
