In [None]:
import os
import csv
import yaml
import wandb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from src import read_nz_file, read_jg_file, update_meta_data, split_df, aggregate_files, add_moving_window
from sklearn.model_selection import train_test_split
from wandb.keras import WandbCallback

from sklearn import preprocessing
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import OneHotEncoder

from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import LSTM
from keras.layers.convolutional import Conv1D, Conv2D
from keras.layers.convolutional import MaxPooling1D
from tensorflow.keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
import keras

import seaborn as sns

In [None]:
def read_preprocessing(folder):
    """Load one preprocessed dataset variant from ``tmp/<folder>``.

    Reads the four parquet files ``X_train``, ``X_test``, ``y_train`` and
    ``y_test`` (in that order) and the ``metadata.yaml`` file stored next to
    them.

    Args:
        folder: Name of the sub-folder of ``tmp/`` that holds the files.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test, settings)``. The two
        feature frames are returned as read, the two targets are the ``'y'``
        column of their respective files, and ``settings`` is whatever
        ``yaml.full_load`` parses from ``metadata.yaml``.
    """
    frames = {
        part: pd.read_parquet(f'tmp/{folder}/{part}.parquet')
        for part in ('X_train', 'X_test', 'y_train', 'y_test')
    }

    # NOTE(review): full_load is only appropriate for trusted, locally
    # produced metadata files.
    with open(rf'./tmp/{folder}/metadata.yaml') as file:
        settings = yaml.full_load(file)

    return (
        frames['X_train'],
        frames['X_test'],
        frames['y_train']['y'],
        frames['y_test']['y'],
        settings,
    )

In [None]:
# Preprocessed dataset variant to load (a sub-folder of tmp/).
PREPROCESSING_FOLDER = 'basic_20hz_20sec'
X_train, X_test, y_train, y_test, settings = read_preprocessing(PREPROCESSING_FOLDER)

In [None]:
#label encoder
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree

## Decision Tree

In [None]:
#-----------------Experiment 1-----------------------------------------------------------------------------------------------------
#-----------------Decision tree max depth 3------------------------------------

#----------1.define hyperparameters here------------------------------
max_depth = 3
clf_criterion = 'entropy'
random_state = 2
wandb_name="DecisionTree"

# y label encoder: learn the class -> integer mapping from the training labels
# only and reuse that same mapping for the test labels. Re-fitting on y_test
# can assign different integers to the same class whenever the test split does
# not contain exactly the same set of classes. transform() raises ValueError
# for a label that never occurred in y_train.
le = LabelEncoder()
y_le_train = le.fit_transform(y_train)
y_le_test = le.transform(y_test)

# Class names ordered by their encoded integer (le.classes_ is sorted), so
# labels[i] names encoded class i in the W&B plots and in the tree export.
# y_train.unique() would list them in order of first appearance instead.
labels = le.classes_.tolist()

#----------2.wandb logging code /config here----------------------------
config = {
    "architecture": 'Decision tree',
    "moving_window_size": settings['MOVING_WINDOW_SIZE'],
    "hz": settings['HZ'],
    "step_size": settings['STEP_SIZE'],
    "test_proportion": settings['TEST_PROPORTION'],
    "aggregation": settings['AGGREGATION'],
    "preprocessing": settings['PREPROCESSING'],
    "features": settings['FEATURES'],
    "max_depth": max_depth,
    "clf_criterion": clf_criterion,
    "random_state": random_state,
    "labels": labels,
}
run = wandb.init(entity='cdl1',project='CDL1',name = wandb_name, config = config)
# From here on hyperparameters are read back from the run's config.
config = run.config

#-----------3.model training-----------------------------------------
#initialize model
clf = tree.DecisionTreeClassifier(criterion=config.clf_criterion,
                                  max_depth = config.max_depth,
                                  random_state=config.random_state)

# Fit the tree on the training set
clf = clf.fit(X_train,
              y_le_train
              )

# predict on X_test and X_train
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)
y_probas = clf.predict_proba(X_test)

#---------model performance on train and test dataset-----------------------
test_accuracy = metrics.accuracy_score(y_le_test, y_pred_test)
train_accuracy = metrics.accuracy_score(y_le_train, y_pred_train)

print("Accuracy on train dataset:{:.2f}".format(train_accuracy))
print("Accuracy on test dataset:{:.2f}".format(test_accuracy))


wandb.log({"accuracy": train_accuracy, "val_acc": test_accuracy})

#confusion matrix on test dataset
cm=confusion_matrix(y_le_test, y_pred_test)
print("Confusion Matrix: \n", cm)
print(classification_report(y_le_test, y_pred_test ))

wandb.sklearn.plot_confusion_matrix(y_le_test, y_pred_test, config.labels)
wandb.sklearn.plot_summary_metrics(clf, X_train, y_le_train, X_test, y_le_test)
wandb.sklearn.plot_classifier(clf, X_train, X_test, y_le_train, y_le_test, y_pred_test,y_probas,labels,model_name='Decision Tree', feature_names=None)

run.finish()

In [None]:
import graphviz

features = X_train.columns

# export_graphviz expects class_names in ascending order of the class ids the
# tree was trained on. Those ids come from a LabelEncoder fitted on y_train,
# i.e. the sorted unique training labels, so the names are built the same way
# here instead of relying on order of first appearance.
class_names = [str(name) for name in np.unique(y_train)]

dot_data = tree.export_graphviz(clf, out_file = None, filled=True, rounded=True, feature_names=list(features), class_names = class_names)
graph=graphviz.Source(dot_data)
graph

In [None]:
print(clf.feature_importances_)

# Look up the most important feature by its computed position rather than a
# hard-coded column index, so the answer follows the fitted tree.
top_feature_idx = np.argmax(clf.feature_importances_)
X_train.columns[top_feature_idx]

In [None]:
#-----------------Experiment 2-----------------------------------------------------------------------------------------------------
#-------------------Decision Tree max depth 4--------------------------------------------------------------

#----------1.define hyperparameters here------------------------------
max_depth = 4
clf_criterion = 'entropy'
random_state = 41
wandb_name = "Decision Tree"

# y label encoder: learn the class -> integer mapping from the training labels
# only and reuse that same mapping for the test labels. Re-fitting on y_test
# can assign different integers to the same class whenever the test split does
# not contain exactly the same set of classes. transform() raises ValueError
# for a label that never occurred in y_train.
le = LabelEncoder()
y_le_train = le.fit_transform(y_train)
y_le_test = le.transform(y_test)

# Class names ordered by their encoded integer (le.classes_ is sorted), so
# labels[i] names encoded class i in the W&B plots and in the tree export.
# y_train.unique() would list them in order of first appearance instead.
labels = le.classes_.tolist()

#----------2.wandb logging code /config here----------------------------
config = {
    "architecture": 'Decision tree',
    "moving_window_size": settings['MOVING_WINDOW_SIZE'],
    "hz": settings['HZ'],
    "step_size": settings['STEP_SIZE'],
    "test_proportion": settings['TEST_PROPORTION'],
    "aggregation": settings['AGGREGATION'],
    "preprocessing": settings['PREPROCESSING'],
    "features": settings['FEATURES'],
    "max_depth": max_depth,
    "clf_criterion": clf_criterion,
    "random_state": random_state,
    "labels": labels,
}
run = wandb.init(entity='cdl1',project='CDL1',name = wandb_name, config = config)
# From here on hyperparameters are read back from the run's config.
config = run.config


#-----------3.model training-----------------------------------------
#initialize model
clf = tree.DecisionTreeClassifier(criterion=config.clf_criterion,
                                    max_depth = config.max_depth,
                                    random_state=config.random_state)

# Fit the tree on the training set
clf = clf.fit(X_train,
                y_le_train
                )

# predict on X_test and X_train
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)

#---------model performance on train and test dataset-----------------------
test_accuracy = metrics.accuracy_score(y_le_test, y_pred_test)
train_accuracy = metrics.accuracy_score(y_le_train, y_pred_train)

wandb.log({"accuracy": train_accuracy, "val_acc": test_accuracy})

#confusion matrix on test dataset
cm=confusion_matrix(y_le_test, y_pred_test)
print("Confusion Matrix: \n", cm)
print(classification_report(y_le_test, y_pred_test ))
print("Accuracy on train dataset:{:.2f}".format(train_accuracy))
print("Accuracy on test dataset:{:.2f}".format(test_accuracy))

wandb.sklearn.plot_confusion_matrix(y_le_test, y_pred_test, config.labels)

run.finish()

In [None]:
import graphviz

features = X_train.columns

# export_graphviz expects class_names in ascending order of the class ids the
# tree was trained on. Those ids come from a LabelEncoder fitted on y_train,
# i.e. the sorted unique training labels, so the names are built the same way
# here instead of relying on order of first appearance.
class_names = [str(name) for name in np.unique(y_train)]

dot_data = tree.export_graphviz(clf, out_file = None, filled=True, rounded=True, feature_names=list(features), class_names = class_names)
graph=graphviz.Source(dot_data)
graph

## Random Forest

In [None]:
#-----------------Experiment 1-----------------------------------------------------------------------------------------------------
#-----------------Random Forest classification Model------------------------------------

#----------1.define hyperparameters here------------------------------
random_state = 45
wandb_name = "Random Forest"
n_estimators = 1000                  #number of trees in the forest
clf_criterion = 'entropy'           #function to measure quality of split /possible options- gini, log_loss, entropy
n_jobs = 10                       #number of jobs to run in parallel

# y label encoder: learn the class -> integer mapping from the training labels
# only and reuse that same mapping for the test labels. Re-fitting on y_test
# can assign different integers to the same class whenever the test split does
# not contain exactly the same set of classes. transform() raises ValueError
# for a label that never occurred in y_train.
le = LabelEncoder()
y_le_train = le.fit_transform(y_train) #label encoded ytrain
y_le_test = le.transform(y_test)  #label encoded ytest

# Class names ordered by their encoded integer (le.classes_ is sorted), so
# labels[i] names encoded class i in the W&B plots.
# y_train.unique() would list them in order of first appearance instead.
labels = le.classes_.tolist()

#----------2.wandb logging code /config here----------------------------

config = {
    "architecture": 'Random Forest',
    "moving_window_size": settings['MOVING_WINDOW_SIZE'],
    "hz": settings['HZ'],
    "step_size": settings['STEP_SIZE'],
    "test_proportion": settings['TEST_PROPORTION'],
    "aggregation": settings['AGGREGATION'],
    "preprocessing": settings['PREPROCESSING'],
    "features": settings['FEATURES'],
    "clf_criterion": clf_criterion,
    "random_state": random_state,
    "labels": labels,
    "n_jobs": n_jobs,
    "n_estimators": n_estimators,
}
run = wandb.init(entity = 'cdl1',project='CDL1',name = wandb_name, config = config)
# From here on hyperparameters are read back from the run's config.
config = run.config

#-----------3.model training-----------------------------------------
#model
# Every hyperparameter comes from the run config, including random_state:
# it was logged but previously never handed to the forest, so the logged seed
# did not actually make the run reproducible.
rf_mdl = RandomForestClassifier(n_estimators=config.n_estimators,
                                n_jobs=config.n_jobs,
                                criterion=config.clf_criterion,
                                random_state=config.random_state)
rf_mdl.fit(X_train, y_le_train)

#predict
y_pred = rf_mdl.predict(X_test)
y_probs = rf_mdl.predict_proba(X_test)

#accuracy
test_accuracy = metrics.accuracy_score(y_le_test, y_pred)

wandb.log({ "val_acc": test_accuracy})

#confusion matrix on test dataset
cm=confusion_matrix(y_le_test, y_pred)
print("Confusion Matrix: \n", cm)
print(classification_report(y_le_test, y_pred ))

print("Accuracy on test dataset:{:.2f}".format(test_accuracy))

#wand plots
wandb.sklearn.plot_confusion_matrix(y_le_test, y_pred, config.labels)
#wandb.sklearn.plot_class_proportions(y_le_train, y_le_test, labels) #plots the distribution of target classes in training & test sets. 
#wandb.sklearn.plot_roc(y_le_test, y_probs, labels) #plots true positive rate(y-axis) vs false positive rate(x-axis)
#wandb.sklearn.plot_precision_recall(y_le_test, y_probs, labels) #tradeoff between precision and recall for different thresholds.

run.finish()
