In [35]:
!activate PythonGPU
import numpy as np
from scipy.stats import skewnorm, skew
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, mean_squared_error, classification_report, accuracy_score

def simulate_data(classes, n_vars, n, max_mu, max_sigma, max_skew,
                  test_size=0.33, random_state=42):
    """Simulate a labelled data set of skew-normal features and split it.

    Every class gets its own randomly drawn integer location vector, while
    the scale and skewness vectors are drawn once and shared by all classes.
    Each feature is sampled independently from a univariate skew-normal.

    Parameters
    ----------
    classes : sequence of float
        Proportion of observations per class; must sum to 1 (up to
        floating-point tolerance).
    n_vars : int
        Number of features.
    n : int
        Target total number of observations. The per-class counts are
        rounded individually, so the realised total can differ slightly.
    max_mu : int
        Locations are drawn from ``randint(-max_mu, max_mu)``.
    max_sigma : int
        Scales are drawn from ``randint(1, max_sigma)``.
    max_skew : int
        Skewness values are drawn from ``randint(-max_skew, max_skew)``.
    test_size : float, optional
        Forwarded to ``train_test_split``.
    random_state : int, optional
        Forwarded to ``train_test_split``.

    Returns
    -------
    X_train, X_test, y_train, y_test
        Stratified train/test split of the simulated features and labels.

    Raises
    ------
    Exception
        If the class proportions do not sum to 1.
    """
    def rng(mu, sigma, skewness, n=1):
        """Draw ``n`` rows whose columns are independent skew-normal samples."""
        k = len(mu)
        if not (k == len(sigma) and k == len(skewness)):
            raise Exception("Mu, Sigma and Skew should be same length")

        n = int(n)
        sample = np.zeros((n, k))
        for col in range(k):
            sample[:, col] = skewnorm.rvs(skewness[col], loc=mu[col],
                                          scale=sigma[col], size=n)
        return sample

    # Exact equality on a float sum rejects valid inputs such as
    # [0.7, 0.2, 0.1], so compare with a tolerance instead.
    if not np.isclose(np.sum(classes), 1.0):
        raise Exception("Classes dont sum up to 1")

    n_classes = len(classes)

    # Keep this draw order: it determines the output for a given global seed.
    sigma = np.random.randint(1, max_sigma, n_vars)
    skewness = np.random.randint(-max_skew, max_skew, n_vars)
    mu = np.random.randint(-max_mu, max_mu, (n_classes, n_vars))

    n_obs_class = np.round(np.asarray(classes, dtype=float) * n).astype(int)
    # bounds[i]:bounds[i + 1] is the row range that belongs to class i.
    bounds = np.concatenate(([0], np.cumsum(n_obs_class)))

    # Column 0 holds the class label, the remaining columns hold the features.
    data = np.zeros((int(bounds[-1]), n_vars + 1))
    for i in range(n_classes):
        start, end = int(bounds[i]), int(bounds[i + 1])
        data[start:end, 0] = i
        data[start:end, 1:] = rng(mu[i, :], sigma, skewness, n_obs_class[i])

    X = data[:, 1:]
    y = data[:, 0]

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y)

    return X_train, X_test, y_train, y_test

In [36]:
#LDA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

def classify_lda(X_train, X_test, y_train, y_test, priors, plot=False):
    """Fit linear discriminant analysis and evaluate it on the test split.

    Parameters
    ----------
    X_train, y_train
        Training features and labels used to fit the model.
    X_test, y_test
        Held-out features and labels used for the accuracy and the report.
    priors
        Class priors forwarded to ``LinearDiscriminantAnalysis``.
    plot : bool, optional
        When true, scatter the training data in discriminant space,
        coloured by class.

    Returns
    -------
    dict
        Keys ``method``, ``accuracy``, ``predictions``, ``report`` and
        ``model``.
    """
    lda = LinearDiscriminantAnalysis(priors=priors)
    X_lda = lda.fit_transform(X_train, y_train)

    predictions = lda.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    print("LDA Test accuracy "+ str(accuracy))

    report = classification_report(y_test, predictions)
    print(report)

    if plot:
        ld1 = X_lda[:, 0]
        plt.xlabel('LD1')
        if X_lda.shape[1] > 1:
            ld2 = X_lda[:, 1]
            plt.ylabel('LD2')
        else:
            # The projection has a single column here (e.g. a two-class
            # problem), so there is no second axis: draw the points on a line
            # instead of raising IndexError.
            ld2 = np.zeros_like(ld1)
        plt.scatter(
            ld1,
            ld2,
            c=y_train,
            cmap='Accent',
        )

    return {"method": "LDA",
            "accuracy": accuracy,
            "predictions": predictions,
            "report": report,
            "model": lda}

In [37]:
#Quadratic
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

def classify_qda(X_train, X_test, y_train, y_test, priors):
    """Fit quadratic discriminant analysis and score it on the test split.

    ``priors`` is forwarded to ``QuadraticDiscriminantAnalysis``. The test
    accuracy and the classification report are printed, and everything is
    returned in a dict with the keys ``method``, ``accuracy``,
    ``predictions``, ``report`` and ``model``.
    """
    model = QuadraticDiscriminantAnalysis(priors=priors)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    print("QDA Test accuracy "+ str(acc))

    summary = classification_report(y_test, y_pred)
    print(summary)

    outcome = {"method": "QDA"}
    outcome["accuracy"] = acc
    outcome["predictions"] = y_pred
    outcome["report"] = summary
    outcome["model"] = model
    return outcome

In [38]:
from sklearn.linear_model import LogisticRegression

def classify_logit(X_train, X_test, y_train, y_test):
    """Fit a multinomial logistic regression and score it on the test split.

    The model uses the ``lbfgs`` solver with ``random_state=0``. The test
    accuracy and the classification report are printed, and everything is
    returned in a dict with the keys ``method``, ``accuracy``,
    ``predictions``, ``report`` and ``model``.
    """
    model = LogisticRegression(
        random_state=0,
        solver='lbfgs',
        multi_class='multinomial',
    )
    model = model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    print("Logistic Test accuracy "+ str(acc))

    summary = classification_report(y_test, y_pred)
    print(summary)

    outcome = {"method": "Logit"}
    outcome["accuracy"] = acc
    outcome["predictions"] = y_pred
    outcome["report"] = summary
    outcome["model"] = model
    return outcome

In [None]:
#KNN

In [None]:
#Naive bayes

In [None]:
#SVM

In [41]:
from tensorflow.keras import layers
from tensorflow import keras

def classify_neuralnet(X_train, X_test, y_train, y_test, n_vars, depth=1, nodes=10, epochs=20,
                       n_classes=None):
    """Train a dense feed-forward classifier and evaluate it on the test split.

    Parameters
    ----------
    X_train, y_train
        Training features and labels. Because the loss is
        ``sparse_categorical_crossentropy``, labels are assumed to be the
        integer class indices ``0 .. n_classes - 1``.
    X_test, y_test
        Held-out features and labels used for the accuracy and the report.
    n_vars : int
        Number of input features.
    depth : int, optional
        Number of hidden layers; at least one is always built.
    nodes : int, optional
        Units per hidden layer.
    epochs : int, optional
        Number of training epochs.
    n_classes : int, optional
        Width of the softmax output layer. When omitted it is inferred as
        ``max(y_train) + 1``.

    Returns
    -------
    dict
        Keys ``method``, ``accuracy``, ``predictions`` (predicted class
        indices), ``report`` and ``model``.
    """
    if n_classes is None:
        # Size the output layer from the labels rather than a fixed 10 units.
        n_classes = int(np.max(y_train)) + 1

    inputs = keras.Input(shape=(n_vars,), name='obs')
    x = inputs
    for _ in range(max(depth, 1)):
        x = layers.Dense(nodes, activation='relu')(x)

    outputs = layers.Dense(n_classes, activation='softmax')(x)

    model = keras.Model(inputs=inputs, outputs=outputs, name='Dataset')

    # summary() prints the table itself and returns None, so it is not displayed.
    model.summary()

    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer=keras.optimizers.RMSprop(),
                  metrics=['accuracy'])

    model.fit(X_train, y_train,
              batch_size=64,
              epochs=epochs,
              validation_split=0.2)

    # predict() takes features only; its second positional argument is the
    # batch size, so the labels must not be passed here.
    probabilities = model.predict(X_test, verbose=0)
    # Reduce the per-class probabilities to one predicted label per row so the
    # scikit-learn metrics can compare them with y_test.
    predictions = np.argmax(probabilities, axis=1)

    accuracy = accuracy_score(y_test, predictions)
    print("Neural Network Test accuracy "+ str(accuracy))

    report = classification_report(y_test, predictions)
    print(report)

    return {"method": "Neural Net",
            "accuracy": accuracy,
            "predictions": predictions,
            "report": report,
            "model": model}


# print('Test loss:', test_scores[0])
# print('Test accuracy:', test_scores[1])

# accuracies = accuracies.append({"method": "Neural Net", 
#                    "accuracy": test_scores[1], 
#                    "predictions":clf_pred}, 
#                     ignore_index=True)

In [42]:
# Seed the global NumPy generator so the simulated data is reproducible.
np.random.seed(12345)

# One entry per simulation scenario; the keys match simulate_data's parameters.
configs = [{
    "classes": [0.25, 0.25, 0.25, 0.25],
    "n_vars": 2,
    "n": 100000,
    "max_mu": 4,
    "max_sigma": 30,
    "max_skew": 10
}]

columns = ['method', 'accuracy','predictions', "report", "model", "config"] + list(configs[0].keys())

# DataFrame.append was removed in pandas 2.0 and re-copies the frame on every
# call, so rows are collected in a plain list and the frame is built from it.
records = []
results = pd.DataFrame(columns=columns)

#run analysis
for i, c in enumerate(configs):
    X_train, X_test, y_train, y_test = simulate_data(c["classes"], c["n_vars"], c["n"], c["max_mu"], c["max_sigma"], c["max_skew"])

    # Each record is stored right after its classifier finishes, so earlier
    # results survive in `records` if a later classifier raises.
    lda = classify_lda(X_train, X_test, y_train, y_test, c["classes"], False)
    records.append({**lda, **c, "config": i + 1})

    qda = classify_qda(X_train, X_test, y_train, y_test, c["classes"])
    records.append({**qda, **c, "config": i + 1})

    logit = classify_logit(X_train, X_test, y_train, y_test)
    records.append({**logit, **c, "config": i + 1})

    neuralnet = classify_neuralnet(X_train, X_test, y_train, y_test, c["n_vars"], depth=2, nodes=20, epochs=5)
    records.append({**neuralnet, **c, "config": i + 1})

    results = pd.DataFrame(records, columns=columns)

    print("Results after config "+str(i+1))
    display(results)

LDA Test accuracy 0.5007272727272727
              precision    recall  f1-score   support

         0.0       0.38      0.32      0.35      8250
         1.0       0.89      0.86      0.87      8250
         2.0       0.37      0.56      0.44      8250
         3.0       0.39      0.28      0.32      8250

   micro avg       0.50      0.50      0.50     33000
   macro avg       0.51      0.50      0.50     33000
weighted avg       0.51      0.50      0.50     33000

QDA Test accuracy 0.5025151515151515
              precision    recall  f1-score   support

         0.0       0.40      0.21      0.28      8250
         1.0       0.89      0.86      0.87      8250
         2.0       0.37      0.55      0.44      8250
         3.0       0.38      0.39      0.39      8250

   micro avg       0.50      0.50      0.50     33000
   macro avg       0.51      0.50      0.49     33000
weighted avg       0.51      0.50      0.49     33000

Logistic Test accuracy 0.49542424242424243
             

TypeError: '>' not supported between instances of 'module' and 'int'