In [2]:
!activate PythonGPU
import numpy as np
from scipy.stats import skewnorm, skew
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, mean_squared_error, classification_report, accuracy_score

def simulate_data(classes, n_vars, n, max_mu, max_sigma, max_skew):
    #The multivariate skew normal number generator
    def rng(mu, sigma, skew, n=1):
        k = len(mu)
        if not (k == len(sigma) and k ==len(skew)): 
            raise Exception("Mu, Sigma and Skew should be same length")

        data = np.zeros((int(n),k))

        for i in range(k):
            data[:,i] = skewnorm.rvs(skew[i], loc=mu[i], scale=sigma[i], size=int(n)) 

        return data
    
    if(np.sum(classes) != 1):
        raise Exception("Classes dont sum up to 1")
        
    n_classes = len(classes)
#     sigma = np.random.randint(1,max_sigma,n_vars)
#     skew = np.random.randint(-max_skew,max_skew,n_vars)
#     mu =  np.random.randint(-max_mu, max_mu, (n_classes, n_vars))
    
    sigma = (max_sigma * np.random.rand(1,n_vars))[0]
    skew = ((2 * max_skew  * np.random.rand(1, n_vars)) - max_skew)[0]
    mu = (2 *  max_mu * np.random.rand(n_classes, n_vars)) - max_mu
    
    n_obs_class = np.round(np.dot(classes,n))
    
    data = np.zeros((int(np.sum(n_obs_class)),n_vars+1))
    for i in range(n_classes):
        #calculate indexes
        start = int(np.sum(n_obs_class[0:i]))
        end = int(np.sum(n_obs_class[0:i+1]))
        
        #set the data
        data[start:end,0] = i
        data[start:end,1:] = rng(mu[i,:], sigma, skew, n_obs_class[i])
        
    X = data[:,1:]
    y = data[:,0]
        
    X_train, X_test, y_train, y_test = train_test_split(
    X, 
    y, 
    test_size=0.33, 
    random_state=42,
    stratify=y)
    
    return X_train, X_test, y_train, y_test

In [3]:
#LDA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

def classify_lda(X_train, X_test, y_train, y_test, priors, plot=False):
    lda = LinearDiscriminantAnalysis(priors=priors)
    X_lda = lda.fit_transform(X_train, y_train)

    predictions = lda.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    print("LDA Test accuracy "+ str(accuracy))
    print(predictions)

    if plot:    
        plt.xlabel('LD1')
        plt.ylabel('LD2')
        plt.scatter(
            X_lda[:,0],
            X_lda[:,1],
            c=y_train,
            cmap='Accent',
        )
        
    return {"method": "LDA", 
            "accuracy": accuracy, 
            "predictions":predictions,
            "model": lda}

In [4]:
#Quadratic
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

def classify_qda(X_train, X_test, y_train, y_test, priors):
    qda = QuadraticDiscriminantAnalysis(priors=priors)
    X_qda = qda.fit(X_train, y_train)



    predictions = qda.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    print("QDA Test accuracy "+ str(accuracy))

    return {"method": "QDA", 
            "accuracy": accuracy, 
            "predictions":predictions,
            "model": qda}

In [5]:
from sklearn.linear_model import LogisticRegression

def classify_logit(X_train, X_test, y_train, y_test):
    clf = LogisticRegression(random_state=0, solver='lbfgs',
                             multi_class='multinomial').fit(X_train, y_train)

    predictions = clf.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    print("Logistic Test accuracy "+ str(accuracy))
    
    return {"method": "Logit", 
            "accuracy": accuracy, 
            "predictions":predictions,
            "model": clf}

In [6]:
#KNN
from sklearn.neighbors import KNeighborsClassifier

def classify_knn(X_train, X_test, y_train, y_test, n_neighbors):
    knn = KNeighborsClassifier(n_neighbors=n_neighbors, metric='euclidean')
    knn.fit(X_train, y_train)

    predictions = knn.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    
    print("KNN-"+str(n_neighbors)+" Test accuracy "+ str(accuracy))
    
    return {"method": "KNN-"+str(n_neighbors), 
            "accuracy": accuracy, 
            "predictions":predictions,
            "model": knn}

In [7]:
#Naive bayes
from sklearn.naive_bayes import GaussianNB

def classify_naivebayes(X_train, X_test, y_train, y_test, priors):
    NB = GaussianNB(priors)
    NB.fit(X_train, y_train)
    
    predictions = NB.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    
    print("Naive Bayes Test accuracy "+ str(accuracy))
    
    return {"method": "Naive Bayes", 
            "accuracy": accuracy, 
            "predictions":predictions,
            "model": NB}

In [8]:
#SVM
from sklearn.svm import LinearSVC

def classify_svm(X_train, X_test, y_train, y_test):
    svm = LinearSVC()
    svm.fit(X_train, y_train)
    
    predictions = svm.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    
    print("SVM Test accuracy "+ str(accuracy))
    
    return {"method": "SVM", 
            "accuracy": accuracy, 
            "predictions":predictions,
            "model": svm}

In [9]:
from tensorflow.keras import layers
from tensorflow import keras

def classify_neuralnet(X_train, X_test, y_train, y_test, n_vars, n_classes, depth=1, nodes=10, epochs=20):
    inputs = keras.Input(shape=(n_vars,), name='obs')
    x = layers.Dense(nodes, activation='relu')(inputs)
    
    if(depth>1):
        for i in range(depth-1):
            x = layers.Dense(nodes, activation='relu')(x)
            
    outputs = layers.Dense(n_classes, activation='softmax')(x)

    model = keras.Model(inputs=inputs, outputs=outputs, name='Dataset')

    display(model.summary())


    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer=keras.optimizers.RMSprop(),
                  metrics=['accuracy'])

    history = model.fit(X_train, y_train,
                        batch_size=64,
                        epochs=epochs,
                        validation_split=0.2)

    predictions = model.predict(X_test)
    predictions = np.argmax(predictions, axis=1)
    print(predictions)

    accuracy = accuracy_score(y_test, predictions)
    print("Neural Network Test accuracy "+ str(accuracy))
    
    return {"method": "Net "+"-".join([str(nodes) for i in range(depth)])+ " E"+str(epochs), 
            "accuracy": accuracy, 
            "predictions":predictions,
            "model": model}

In [10]:
from datetime import datetime
import pandas as pd
import time


np.random.seed(12345)

timelabel =  datetime.now().strftime("%H-%M-%S - %d-%m-%Y")


configs = [{
    "classes": [0.25, 0.25, 0.25, 0.25],
    "n_vars": 2,
    "n": 500,
    "max_mu": 2,
    "max_sigma": 2,
    "max_skew": 1
},
{
    "classes": [0.25, 0.25, 0.25, 0.25],
    "n_vars": 5,
    "n": 10000,
    "max_mu": 5,
    "max_sigma": 5,
    "max_skew": 5
},
{
    "classes": [0.05, 0.05, 0.05, 0.85],
    "n_vars": 5,
    "n": 10000,
    "max_mu": 5,
    "max_sigma": 5,
    "max_skew": 5
},
{
    "classes": [0.25, 0.25, 0.25, 0.25],
    "n_vars": 50,
    "n": 10000,
    "max_mu": 5,
    "max_sigma": 5,
    "max_skew": 5
},
{
    "classes": [0.25, 0.25, 0.25, 0.25],
    "n_vars": 5,
    "n": 10000,
    "max_mu": 1,
    "max_sigma": 10,
    "max_skew": 10
},
{
    "classes": [0.25, 0.25, 0.25, 0.25],
    "n_vars": 5,
    "n": 100000,
    "max_mu": 1,
    "max_sigma": 10,
    "max_skew": 10
},
{
    "classes": [0.25, 0.25, 0.25, 0.25],
    "n_vars": 2,
    "n": 100000,
    "max_mu": 1,
    "max_sigma": 10,
    "max_skew": 10
},
{
    "classes": [0.125 for x in range(8)],
    "n_vars": 5,
    "n": 100000,
    "max_mu": 1,
    "max_sigma": 10,
    "max_skew": 10
},
{
    "classes": [0.5, 0.5],
    "n_vars": 2,
    "n": 100000,
    "max_mu": 1,
    "max_sigma": 10,
    "max_skew": 10
},
{
    "classes": [0.5, 0.5],
    "n_vars": 2,
    "n": 100000,
    "max_mu": 1,
    "max_sigma": 5,
    "max_skew": 5
},
{
    "classes": [0.5, 0.5],
    "n_vars": 2,
    "n": 1000000,
    "max_mu": 1,
    "max_sigma": 5,
    "max_skew": 5
}]



columns = ['method', 'accuracy','predictions', "model", "config"] + list(configs[0].keys())
results = pd.DataFrame(columns=columns)
results.style.format({
    'accuracy': '{:,.3%}'.format
})



#run analysis
for i, c in enumerate(configs):
    X_train, X_test, y_train, y_test = simulate_data(c["classes"], c["n_vars"], c["n"], c["max_mu"], c["max_sigma"], c["max_skew"])
    
    c_n = 7
    
    lda = classify_lda(X_train, X_test, y_train, y_test, c["classes"], False)                                     
    results = results.append({**lda, **c, "config":c_n},ignore_index=True)
    
    qda = classify_qda(X_train, X_test, y_train, y_test, c["classes"])                                     
    results = results.append({**qda, **c, "config":c_n},ignore_index=True)
    
    logit = classify_logit(X_train, X_test, y_train, y_test)                                     
    results = results.append({**logit, **c, "config":c_n},ignore_index=True)
    
    for k in [5,10,50,100]:
        knn = classify_knn(X_train, X_test, y_train, y_test, k)
        results = results.append({**knn, **c, "config":c_n},ignore_index=True)

    bayes = classify_naivebayes(X_train, X_test, y_train, y_test, c["classes"])
    results = results.append({**bayes, **c, "config":c_n},ignore_index=True)
    
    svm = classify_svm(X_train, X_test, y_train, y_test)
    results = results.append({**svm, **c, "config":c_n},ignore_index=True)
    
    for n in [{"d":1,"n":len(c["classes"]), "e":25}, {"d":4,"n":30, "e":10}, {"d":4,"n":50, "e":50}]:
        neuralnet = classify_neuralnet(X_train, X_test, y_train, y_test, c["n_vars"], len(c["classes"]),  depth=n["d"], nodes=n["n"], epochs=n["e"])                                 
        results = results.append({**neuralnet, **c, "config":c_n},ignore_index=True)
    
    print("Results after config "+str(i+1)+" of "+str(len(configs)))

    results.sort_values(by='accuracy', ascending=False, inplace=True)
        
    display(results.style.format({
    'accuracy': '{:,.3%}'.format
    }))
    

    #saving results to file
    results.drop(columns=['model']).to_pickle("./results/config "+str(i+1)+" of "+str(len(configs))+" "+timelabel+".pkl")

LDA Test accuracy 0.4674242424242424
[3. 0. 0. ... 1. 2. 2.]
QDA Test accuracy 0.4656060606060606




Logistic Test accuracy 0.46736363636363637
KNN-5 Test accuracy 0.42696969696969694
KNN-10 Test accuracy 0.44966666666666666
KNN-50 Test accuracy 0.4709090909090909
KNN-100 Test accuracy 0.47563636363636363
Naive Bayes Test accuracy 0.4661212121212121




SVM Test accuracy 0.45566666666666666
Model: "Dataset"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
obs (InputLayer)             [(None, 5)]               0         
_________________________________________________________________
dense (Dense)                (None, 4)                 24        
_________________________________________________________________
dense_1 (Dense)              (None, 4)                 20        
Total params: 44
Trainable params: 44
Non-trainable params: 0
_________________________________________________________________


None

Train on 53600 samples, validate on 13400 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
[3 0 0 ... 1 2 2]
Neural Network Test accuracy 0.49218181818181816
Model: "Dataset"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
obs (InputLayer)             [(None, 5)]               0         
_________________________________________________________________
dense_2 (Dense)              (None, 30)                180       
_________________________________________________________________
dense_3 (Dense)              (None, 30)                930       
_________________________________________________________________
dense_4 (Dense)              (None, 30)        

None

Train on 53600 samples, validate on 13400 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
[3 0 0 ... 1 2 1]
Neural Network Test accuracy 0.49657575757575756
Model: "Dataset"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
obs (InputLayer)             [(None, 5)]               0         
_________________________________________________________________
dense_7 (Dense)              (None, 50)                300       
_________________________________________________________________
dense_8 (Dense)              (None, 50)                2550      
_________________________________________________________________
dense_9 (Dense)              (None, 50)                2550      
_________________________________________________________________
dense_10 (Dense)             (None, 50)                2550      
_____________________________

None

Train on 53600 samples, validate on 13400 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
[3 0 0 ... 1 2 2]
Neural Network Test accuracy 0.49663636363636365
Results after config 1 of 1


Unnamed: 0,method,accuracy,predictions,model,config,classes,n_vars,n,max_mu,max_sigma,max_skew
11,Net 50-50-50-50 E50,49.664%,[3 0 0 ... 1 2 2],,7,"[0.25, 0.25, 0.25, 0.25]",5,100000,1,10,10
10,Net 30-30-30-30 E10,49.658%,[3 0 0 ... 1 2 1],,7,"[0.25, 0.25, 0.25, 0.25]",5,100000,1,10,10
9,Net 4 E25,49.218%,[3 0 0 ... 1 2 2],,7,"[0.25, 0.25, 0.25, 0.25]",5,100000,1,10,10
6,KNN-100,47.564%,[3. 0. 0. ... 1. 2. 1.],"KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',  metric_params=None, n_jobs=None, n_neighbors=100, p=2,  weights='uniform')",7,"[0.25, 0.25, 0.25, 0.25]",5,100000,1,10,10
5,KNN-50,47.091%,[3. 0. 0. ... 1. 3. 2.],"KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',  metric_params=None, n_jobs=None, n_neighbors=50, p=2,  weights='uniform')",7,"[0.25, 0.25, 0.25, 0.25]",5,100000,1,10,10
0,LDA,46.742%,[3. 0. 0. ... 1. 2. 2.],"LinearDiscriminantAnalysis(n_components=None, priors=[0.25, 0.25, 0.25, 0.25],  shrinkage=None, solver='svd', store_covariance=False,  tol=0.0001)",7,"[0.25, 0.25, 0.25, 0.25]",5,100000,1,10,10
2,Logit,46.736%,[3. 0. 0. ... 1. 2. 2.],"LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,  intercept_scaling=1, max_iter=100, multi_class='multinomial',  n_jobs=None, penalty='l2', random_state=0, solver='lbfgs',  tol=0.0001, verbose=0, warm_start=False)",7,"[0.25, 0.25, 0.25, 0.25]",5,100000,1,10,10
7,Naive Bayes,46.612%,[3. 0. 0. ... 1. 2. 2.],"GaussianNB(priors=[0.25, 0.25, 0.25, 0.25], var_smoothing=1e-09)",7,"[0.25, 0.25, 0.25, 0.25]",5,100000,1,10,10
1,QDA,46.561%,[3. 0. 0. ... 1. 2. 2.],"QuadraticDiscriminantAnalysis(priors=array([0.25, 0.25, 0.25, 0.25]),  reg_param=0.0, store_covariance=False,  store_covariances=None, tol=0.0001)",7,"[0.25, 0.25, 0.25, 0.25]",5,100000,1,10,10
8,SVM,45.567%,[3. 0. 0. ... 1. 0. 2.],"LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,  intercept_scaling=1, loss='squared_hinge', max_iter=1000,  multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,  verbose=0)",7,"[0.25, 0.25, 0.25, 0.25]",5,100000,1,10,10


Example to load old results

In [13]:
import pandas as pd

def bold(data):
    attr = 'font-weight: {}'.format("700")
    #remove % and cast to float
    data = data.replace('%','', regex=True).astype(float)
    is_max = data == data.max()
    return [attr if v else "" for v in is_max]
    
def boldlatex(data):
    #remove % and cast to float
    return list(map(lambda x: r"\\textbf{"+x+"}" if x==data.max() else x, data))

# configs = [{
#     "classes": [0.25, 0.25, 0.25, 0.25],
#     "n_vars": 2,
#     "n": 500,
#     "max_mu": 2,
#     "max_sigma": 2,
#     "max_skew": 1
# },
# {
#     "classes": [0.25, 0.25, 0.25, 0.25],
#     "n_vars": 5,
#     "n": 10000,
#     "max_mu": 5,
#     "max_sigma": 5,
#     "max_skew": 5
# },
# {
#     "classes": [0.05, 0.05, 0.05, 0.85],
#     "n_vars": 5,
#     "n": 10000,
#     "max_mu": 5,
#     "max_sigma": 5,
#     "max_skew": 5
# },
# {
#     "classes": [0.25, 0.25, 0.25, 0.25],
#     "n_vars": 50,
#     "n": 10000,
#     "max_mu": 5,
#     "max_sigma": 5,
#     "max_skew": 5
# },
# {
#     "classes": [0.25, 0.25, 0.25, 0.25],
#     "n_vars": 50,
#     "n": 100000,
#     "max_mu": 5,
#     "max_sigma": 5,
#     "max_skew": 5
# },
# {
#     "classes": [0.25, 0.25, 0.25, 0.25],
#     "n_vars": 5,
#     "n": 10000,
#     "max_mu": 1,
#     "max_sigma": 10,
#     "max_skew": 10
# },
# {
#     "classes": [0.25, 0.25, 0.25, 0.25],
#     "n_vars": 5,
#     "n": 100000,
#     "max_mu": 1,
#     "max_sigma": 10,
#     "max_skew": 10
# }
# configs = [{
#     "classes": [0.25, 0.25, 0.25, 0.25],
#     "n_vars": 2,
#     "n": 100000,
#     "max_mu": 1,
#     "max_sigma": 10,
#     "max_skew": 10
# },
# {
#     "classes": [0.125 for x in range(8)],
#     "n_vars": 5,
#     "n": 100000,
#     "max_mu": 1,
#     "max_sigma": 10,
#     "max_skew": 10
# }] 

# configs = [{
#     "classes": [0.5, 0.5],
#     "n_vars": 2,
#     "n": 100000,
#     "max_mu": 1,
#     "max_sigma": 10,
#     "max_skew": 10
# },
# {
#     "classes": [0.5, 0.5],
#     "n_vars": 2,
#     "n": 100000,
#     "max_mu": 1,
#     "max_sigma": 5,
#     "max_skew": 5
# },
# {
#     "classes": [0.5, 0.5],
#     "n_vars": 2,
#     "n": 1000000,
#     "max_mu": 1,
#     "max_sigma": 5,
#     "max_skew": 5
# }
# ]
   

    

old_results = pd.read_pickle("./results/config 7 of 7 21-02-19 - 02-12-2019.pkl")


# .style.format({'accuracy': '{:,.3%}'.format})



table = pd.concat([batch1])
table['accuracy'] = pd.Series(["{0:.2f}%".format(val * 100) for val in table['accuracy']], index = table.index)
table = table.pivot(index='method', columns='config', values='accuracy').sort_values(by='method')
display(table.style.apply(bold))
latex = table.apply(boldlatex).to_latex()
print(latex)

NameError: name 'batch1' is not defined