In [None]:
"""
Note: The output of this notebook has been cleared to reduce storage space while emailing.
"""

In [None]:
#Import dependencies
import tensorflow as tf
import keras
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_auc_score, make_scorer
from sklearn.utils import class_weight
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import mutual_info_classif
import matplotlib.pyplot as plt
import seaborn as sns
import itertools

#Config tensorflow
# Build a TF1-compatibility session configuration object.
config = tf.compat.v1.ConfigProto()
# allow_growth asks TensorFlow to claim GPU memory as it is needed rather than up front.
config.gpu_options.allow_growth = True
# Open a compat.v1 session with that configuration.
# NOTE(review): `session` is not referenced again in the cells visible here - confirm
# whether creating it is still required for the GPU setting to take effect.
session = tf.compat.v1.Session(config=config)



In [None]:
# Read the raw training data from disk. The bare name on the last line makes
# the notebook render the frame as this cell's output.
dataset = pd.read_csv(r"./data/training.csv", sep=",")
dataset

In [None]:
# Print the distinct values held by each categorical column, to decide how
# each one should be converted to numbers later on.
categorical = [
    'DataSource1_Feature1',
    'DataSource1_Feature2',
    'DataSource1_Feature3',
    'DataSource3_Feature3',
    'DataSource4_Feature6',
]
for column_name in categorical:
    print(dataset[column_name].unique())

In [None]:
#Firstly generate feature combination graphs to look for immediately interesting features
# (matplotlib, seaborn and itertools are already imported in the import cell at the
# top of the notebook, so they are not imported again here.)

# Columns that are never plotted:
#   - ID and Target: identifier / label (Target is used as the hue instead)
#   - DataSource1_Feature3: noted by the author as holding the same value for all entries
excluded_columns = {'ID', 'Target', 'DataSource1_Feature3'}
plot_columns = [column for column in dataset.columns if column not in excluded_columns]

# Filtering the columns up front yields the same pairs, in the same order, as
# filtering every pair of the full column list.
for x_name, y_name in itertools.combinations(plot_columns, 2):
    fig, ax = plt.subplots()
    sns.scatterplot(x=x_name, y=y_name, data=dataset, hue='Target', ax=ax)
    ax.set_title(f"{x_name} vs {y_name}")
    plt.show()
    # One figure is created per pair; close each so they do not pile up in memory.
    plt.close(fig)

In [None]:
# Based on the graphs above, the following columns were selected as candidates
# for valuable features:
#   DataSource1_Feature1, DataSource2_Feature1, DataSource3_Feature2,
#   DataSource3_Feature3, DataSource4_Feature6, DataSource4_Feature5

# Build a separate dataframe containing just those candidates plus the target.
candidate_columns = [
    'DataSource1_Feature1',
    'DataSource2_Feature1',
    'DataSource3_Feature2',
    'DataSource3_Feature3',
    'DataSource4_Feature6',
    'DataSource4_Feature5',
    'Target',
]
isolated_features = dataset[candidate_columns]
isolated_features

In [None]:
# The dataframe summary above shows missing values that need attention;
# count them per column.
isolated_features.isnull().sum()

In [None]:
# DataSource3 seems to be a very sparse data source, so its columns are left
# out for the time being.
dense_columns = [
    'DataSource1_Feature1',
    'DataSource2_Feature1',
    'DataSource4_Feature6',
    'DataSource4_Feature5',
    'Target',
]
isolated_features = isolated_features.loc[:, dense_columns]

In [None]:
# Display the current isolated_features frame as this cell's output.
isolated_features

In [None]:
# One-hot encode the two categorical columns; the indicator columns are
# stored as floats.
one_hot_columns = ['DataSource1_Feature1', 'DataSource4_Feature6']
isolated_features = pd.get_dummies(
    isolated_features,
    columns=one_hot_columns,
    dtype=float,
)

In [None]:
#We still have to tend to the missing values in DataSource2_Feature1, we'll use a KNN method to deal with those.
# Only the feature columns are handed to the imputer. If Target were included it
# would take part in the nearest-neighbour distance, leaking the label into the
# imputed feature values (and the label is not available at prediction time).
# NOTE(review): this assumes Target itself has no missing values - confirm.
feature_columns = [column for column in isolated_features.columns if column != 'Target']

imputer = KNNImputer(n_neighbors=5)
imputed_features = isolated_features.copy()
imputed_features[feature_columns] = imputer.fit_transform(isolated_features[feature_columns])

# Keep the all-float dtypes that rebuilding the frame from the imputer output produced.
isolated_features = imputed_features.astype(float)

#and make sure everything looks good
isolated_features.isna().sum()


In [None]:
# Scale / normalize the data as well, using the Min-Max method.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(isolated_features)
isolated_features = pd.DataFrame(scaled_values, columns=isolated_features.columns)


In [None]:
# Display the current isolated_features frame as this cell's output.
isolated_features

In [None]:
#Converting the pandas objects to numpy arrays
# The label column is read rather than popped, so isolated_features is left
# untouched and this cell can be re-run without a KeyError on 'Target'.
target = isolated_features['Target']
X = isolated_features.drop(columns='Target').to_numpy()
Y = target.to_numpy()

In [None]:
#Using K-Folds cross-validation and an 80:20 Train/Validation split, run a simple logistic regression
# The logistic regression is a single sigmoid unit. Within every fold, 20% of the
# training portion is held back as validation data for early stopping.
n_features = X.shape[1]   # taken from the data instead of hard-coding the column count
fold_scores = []          # [loss, AUC] on the held-out fold, one entry per fold

# random_state makes the fold assignment repeatable between runs.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
for fold, (train, test) in enumerate(kfold.split(X, Y), start=1):
    callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)

    # create model
    model = Sequential()
    model.add(Dense(1, input_dim=n_features, activation='sigmoid'))

    # Compile model
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['AUC'])

    # summary() prints the table itself and returns None, so it is not wrapped in print().
    model.summary()

    # Fit the model
    history = model.fit(X[train], Y[train], epochs=100, batch_size=64, verbose=1, validation_split=0.2, callbacks=[callback])

    # evaluate the model on the held-out fold and remember the result
    scores = model.evaluate(X[test], Y[test], verbose=1)
    fold_scores.append(scores)

    #Plot the Training / Validation Loss
    # val_loss comes from the validation split, not from the held-out test fold.
    plt.plot(history.history['loss'], label='train')
    plt.plot(history.history['val_loss'], label='validation')
    plt.title(f"Fold {fold}: training vs validation loss")
    plt.xlabel('epoch')
    plt.ylabel('binary cross-entropy')
    plt.legend()
    plt.show()

# Summarise the cross-validation instead of discarding all but the last fold.
mean_loss, mean_auc = np.mean(fold_scores, axis=0)
print(f"Mean over {len(fold_scores)} folds - loss: {mean_loss:.4f}, AUC: {mean_auc:.4f}")
    


In [None]:
#Generate a confusion matrix to get some insight into the performance
# `model` and `test` are the ones left over from the last cross-validation fold.
y_pred = model.predict(X[test])
# Threshold the sigmoid output at 0.5 and flatten the (n, 1) column to 1-D.
y_pred = np.where(y_pred > 0.5, 1, 0).ravel()
# scikit-learn expects (y_true, y_pred): rows are true labels, columns are predictions.
# With the arguments swapped, the plot's "True label" / "Predicted label" axes are transposed.
cm = confusion_matrix(Y[test], y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.show()

In [None]:
#Try adding back and cleaning up the sparse columns and see if that causes any improvement.
dataset = pd.read_csv(r"./data/training.csv", delimiter=",")
isolated_features = dataset[['DataSource1_Feature1','DataSource2_Feature1','DataSource3_Feature2','DataSource3_Feature3','DataSource4_Feature6','DataSource4_Feature5','Target']]

isolated_features = pd.get_dummies(isolated_features,
                                   columns=['DataSource1_Feature1','DataSource4_Feature6','DataSource3_Feature3'],
                                   dtype=float)

#Split the labels off BEFORE imputing / scaling. If Target stays in the frame it becomes
#one of the dimensions the KNN imputer measures distance in, leaking the label into the
#imputed feature values.
target = isolated_features.pop('Target')
feature_columns = isolated_features.columns

#Using the KNN to replace ALL of the missing feature data
imputer = KNNImputer(n_neighbors=5)
isolated_features = pd.DataFrame(imputer.fit_transform(isolated_features),
                                 columns = feature_columns)

#Scaling the data again
scaler = MinMaxScaler()
isolated_features = pd.DataFrame(scaler.fit_transform(isolated_features),
                                 columns = feature_columns)

#re-defining our training variables
X = isolated_features.to_numpy()
Y = target.to_numpy(dtype=float)
# The labels are no longer min-max scaled or imputed, so verify they already are clean 0/1.
assert set(np.unique(Y)) <= {0.0, 1.0}, "Target is expected to contain only 0/1 labels with no missing values"

In [None]:
#Using K-Folds cross-validation and an 80:20 Train/Validation split, re-run the logistic regression with the new data
# Within every fold, 20% of the training portion is held back as validation data
# for early stopping.
n_features = X.shape[1]   # taken from the data instead of hard-coding the column count
fold_scores = []          # [loss, AUC] on the held-out fold, one entry per fold

# random_state makes the fold assignment repeatable between runs.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
for fold, (train, test) in enumerate(kfold.split(X, Y), start=1):

    callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)

    # create model
    model = Sequential()
    model.add(Dense(1, input_dim=n_features, activation='sigmoid'))

    # Compile model
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['AUC'])

    # summary() prints the table itself and returns None, so it is not wrapped in print().
    model.summary()

    # Fit the model
    history = model.fit(X[train],
                        Y[train],
                        epochs=100,
                        batch_size=64,
                        verbose=1,
                        validation_split=0.2,
                        callbacks=[callback])

    # evaluate the model on the held-out fold and remember the result
    scores = model.evaluate(X[test], Y[test], verbose=1)
    fold_scores.append(scores)

    #Plot the Training / Validation Loss
    # val_loss comes from the validation split, not from the held-out test fold.
    plt.plot(history.history['loss'], label='train')
    plt.plot(history.history['val_loss'], label='validation')
    plt.title(f"Fold {fold}: training vs validation loss")
    plt.xlabel('epoch')
    plt.ylabel('binary cross-entropy')
    plt.legend()
    plt.show()

#Save the model to the disk
# Saving inside the loop overwrote the file on every fold; only the last fold's
# model ever survived, so it is written once here instead.
model.save("./model.h5")

# Summarise the cross-validation instead of discarding all but the last fold.
mean_loss, mean_auc = np.mean(fold_scores, axis=0)
print(f"Mean over {len(fold_scores)} folds - loss: {mean_loss:.4f}, AUC: {mean_auc:.4f}")

In [None]:
#Make a new confusion matrix
# `model` and `test` are the ones left over from the last cross-validation fold.
y_pred = model.predict(X[test])
# Threshold the sigmoid output at 0.5 and flatten the (n, 1) column to 1-D.
y_pred = np.where(y_pred > 0.5, 1, 0).ravel()

# scikit-learn expects (y_true, y_pred): rows are true labels, columns are predictions.
# With the arguments swapped, the plot's "True label" / "Predicted label" axes are transposed.
cm = confusion_matrix(Y[test], y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.show()

In [None]:
#Marginally better, perhaps adding back additional features will further increase our performance
dataset = pd.read_csv(r"./data/training.csv", delimiter=",")
isolated_features = dataset[['DataSource1_Feature1','DataSource2_Feature1','DataSource3_Feature2','DataSource3_Feature3','DataSource4_Feature6','DataSource4_Feature5','DataSource3_Feature1','DataSource2_Feature9','Target']]

isolated_features = pd.get_dummies(isolated_features,
                                   columns=['DataSource1_Feature1','DataSource4_Feature6','DataSource3_Feature3'],
                                   dtype=float)

#Split the labels off BEFORE imputing / scaling. If Target stays in the frame it becomes
#one of the dimensions the KNN imputer measures distance in, leaking the label into the
#imputed feature values.
target = isolated_features.pop('Target')
feature_columns = isolated_features.columns

#Using the KNN to replace ALL of the missing feature data
imputer = KNNImputer(n_neighbors=5)
isolated_features = pd.DataFrame(imputer.fit_transform(isolated_features),
                                 columns = feature_columns)

#Scaling the data again
scaler = MinMaxScaler()
isolated_features = pd.DataFrame(scaler.fit_transform(isolated_features),
                                 columns = feature_columns)

#re-defining our training variables
X = isolated_features.to_numpy()
Y = target.to_numpy(dtype=float)
# The labels are no longer min-max scaled or imputed, so verify they already are clean 0/1.
assert set(np.unique(Y)) <= {0.0, 1.0}, "Target is expected to contain only 0/1 labels with no missing values"

In [None]:
#Using K-Folds cross-validation and an 80:20 Train/Validation split, re-run the logistic regression with the new data
# Within every fold, 20% of the training portion is held back as validation data
# for early stopping.
n_features = X.shape[1]   # taken from the data instead of hard-coding the column count
fold_scores = []          # [loss, AUC] on the held-out fold, one entry per fold

# random_state makes the fold assignment repeatable between runs.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
for fold, (train, test) in enumerate(kfold.split(X, Y), start=1):

    callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)

    # create model
    model = Sequential()
    model.add(Dense(1, input_dim=n_features, activation='sigmoid'))

    # Compile model
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['AUC'])

    # summary() prints the table itself and returns None, so it is not wrapped in print().
    model.summary()

    # Fit the model
    history = model.fit(X[train],
                        Y[train],
                        epochs=100,
                        batch_size=64,
                        verbose=1,
                        validation_split=0.2,
                        callbacks=[callback])

    # evaluate the model on the held-out fold and remember the result
    scores = model.evaluate(X[test], Y[test], verbose=1)
    fold_scores.append(scores)

    #Plot the Training / Validation Loss
    # val_loss comes from the validation split, not from the held-out test fold.
    plt.plot(history.history['loss'], label='train')
    plt.plot(history.history['val_loss'], label='validation')
    plt.title(f"Fold {fold}: training vs validation loss")
    plt.xlabel('epoch')
    plt.ylabel('binary cross-entropy')
    plt.legend()
    plt.show()

# Summarise the cross-validation instead of discarding all but the last fold.
mean_loss, mean_auc = np.mean(fold_scores, axis=0)
print(f"Mean over {len(fold_scores)} folds - loss: {mean_loss:.4f}, AUC: {mean_auc:.4f}")

In [None]:
#Make a new confusion matrix
# `model` and `test` are the ones left over from the last cross-validation fold.
y_pred = model.predict(X[test])
# Threshold the sigmoid output at 0.5 and flatten the (n, 1) column to 1-D.
y_pred = np.where(y_pred > 0.5, 1, 0).ravel()
# scikit-learn expects (y_true, y_pred): rows are true labels, columns are predictions.
# With the arguments swapped, the plot's "True label" / "Predicted label" axes are transposed.
cm = confusion_matrix(Y[test], y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.show()

In [None]:
#It appears to perform worse now, perhaps a different model approach will put us in the right direction
#We'll re-use the setup from the second logistic regression, as that seemed to be the highest performer.

dataset = pd.read_csv(r"./data/training.csv", delimiter=",")
isolated_features = dataset[['DataSource1_Feature1','DataSource2_Feature1','DataSource3_Feature2','DataSource3_Feature3','DataSource4_Feature6','DataSource4_Feature5','Target']]
isolated_features = pd.get_dummies(isolated_features,
                                   columns=['DataSource1_Feature1','DataSource4_Feature6','DataSource3_Feature3'],
                                   dtype=float)

#Split the labels off BEFORE imputing / scaling. If Target stays in the frame it becomes
#one of the dimensions the KNN imputer measures distance in, leaking the label into the
#imputed feature values.
target = isolated_features.pop('Target')
feature_columns = isolated_features.columns

#Using the KNN to replace ALL of the missing feature data
imputer = KNNImputer(n_neighbors=5)
isolated_features = pd.DataFrame(imputer.fit_transform(isolated_features),
                                 columns = feature_columns)

#Scaling the data again
scaler = MinMaxScaler()
isolated_features = pd.DataFrame(scaler.fit_transform(isolated_features),
                                 columns = feature_columns)

#re-defining our training variables
X = isolated_features.to_numpy()
Y = target.to_numpy(dtype=float)
# The labels are no longer min-max scaled or imputed, so verify they already are clean 0/1.
assert set(np.unique(Y)) <= {0.0, 1.0}, "Target is expected to contain only 0/1 labels with no missing values"

In [None]:
#Trying a Random Forest model for our next approach
# The data is imbalanced, so the 80:20 split is stratified on Y to keep the class
# ratio the same in both parts. The fixed seeds make the split and the forest
# repeatable between runs.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
                                                    test_size = 0.20,
                                                    stratify = Y,
                                                    random_state = 42)

# define the model
model = RandomForestClassifier(random_state = 42)

# fit the model on the training part (it is evaluated in the next cell)
model.fit(X_train, Y_train)


In [None]:
# ROC AUC measures how well the model ranks examples, so it needs a continuous
# score: the predicted probability of the positive class (second column of
# predict_proba for a binary problem). Feeding it hard 0/1 predictions collapses
# the curve to a single operating point.
y_score = model.predict_proba(X_test)[:, 1]

# report performance
print(roc_auc_score(Y_test, y_score))

# Hard class predictions for the confusion matrix.
y_pred = model.predict(X_test)
y_pred = np.where(y_pred > 0.5, 1, 0)
# scikit-learn expects (y_true, y_pred): rows are true labels, columns are predictions.
cm = confusion_matrix(Y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.show()

In [None]:
#The random forest model performed about the same as the logistic regression, even after some additional tweaking / tuning
#(Omitted for the sake of brevity)
#The similar performance regardless of tuning, class weighting, feature engineering, etc could suggest that the positive 
#classifications might not be distinct enough for patterns to effectively emerge in the data.

In [None]:
#Before proceeding with one of our previously detailed models, we'll pull out all the stops and try a neural network
#with some additional feature selection techniques.

#Reload the data
dataset = pd.read_csv(r"./data/training.csv", delimiter=",")

#Remove DataSource1_Feature3 from the dataframe due to it containing the same value for all examples
dataset = dataset.drop(columns='DataSource1_Feature3')

#Similarly, remove it from our running list of categorical features.
#It is removed by name rather than by position (pop(2)), so re-running this cell
#cannot strip a further, unrelated entry from the list.
categorical = [name for name in categorical if name != 'DataSource1_Feature3']

In [None]:
#We can also drop the "ID" column, as it doesn't provide us with any useful information
# errors='ignore' keeps the cell re-runnable once the column is already gone.
dataset = dataset.drop(columns='ID', errors='ignore')

In [None]:
# One-hot encode every column still listed in `categorical`, producing float
# indicator columns (ignoring the NaN values in these columns for now).
encoding_options = {'columns': categorical, 'dtype': float}
numeric_dataset = pd.get_dummies(dataset, **encoding_options)


In [None]:
# Show the rows of the dataset that contain at least one NaN value.
rows_with_nan = numeric_dataset.isna().any(axis='columns')
null_data = numeric_dataset.loc[rows_with_nan]
null_data

In [None]:
# The majority of the entries contain at least one NaN value, so it is worth
# taking a more granular look rather than calling something like
# numeric_dataset.dropna(), which would discard every one of those rows.

# Count the missing values in each feature.
numeric_dataset.isnull().sum(axis='index')

In [None]:
#Based on that function, there are NaN values for the majority of DataSource3_Feature1 & DataSource3_Feature2,
#so we will simply remove those features and see where we stand in terms of remaining NaN values

# A non-in-place drop with errors='ignore' keeps the cell re-runnable once the
# columns are already gone.
numeric_dataset = numeric_dataset.drop(columns=['DataSource3_Feature1', 'DataSource3_Feature2'],
                                       errors='ignore')
null_data = numeric_dataset[numeric_dataset.isnull().any(axis=1)]
null_data

In [None]:
# This is a lot better: the number of rows with missing values is now a
# reasonable amount to simply remove without sacrificing the statistical
# significance of the sample too harshly - but we can likely do better.
# List the NaN count of every row, largest first.
nan_count_per_row = numeric_dataset.isnull().sum(axis='columns')
nan_count_per_row.sort_values(ascending=False)

In [None]:
# The listing above shows a handful of rows with a high proportion of null
# values, so those are removed and the situation reassessed.
# Of the multiple values tried, 7 appeared to be a sweet spot for the
# rejection threshold.
missing_per_row = numeric_dataset.isna().sum(axis='columns')
clean_data = numeric_dataset.loc[missing_per_row < 7]

# Rows that survive the threshold but still contain at least one NaN.
clean_data.loc[clean_data.isna().any(axis='columns')]


In [None]:
#For the remaining missing values, we'll use a Nearest Neighbors implementation to fill them out,
#This requires us to normalize our data first, which we were going to do anyway.

#Normalize the data using the Min-Max Method

scaler = MinMaxScaler()
clean_data = pd.DataFrame(scaler.fit_transform(clean_data),
                          columns = clean_data.columns)

#Use KNN on the scaled data to fill in the missing values
# Only the feature columns are handed to the imputer. If Target were included it
# would take part in the nearest-neighbour distance, leaking the label into the
# imputed feature values.
# NOTE(review): this assumes Target itself has no missing values - confirm.
feature_columns = [column for column in clean_data.columns if column != 'Target']

imputer = KNNImputer(n_neighbors=5)
clean_data[feature_columns] = imputer.fit_transform(clean_data[feature_columns])

In [None]:
# Display the current clean_data frame as this cell's output.
clean_data

In [None]:
# DataSource2_Feature2 and DataSource2_Feature3 appear to be identical; verify
# that by listing every row where the two columns differ.
columns_differ = clean_data['DataSource2_Feature2'] != clean_data['DataSource2_Feature3']
clean_data[columns_differ]

In [None]:
#And, since they are, in fact, identical, we'll remove one of them
# errors='ignore' keeps the cell re-runnable once the column is already gone.
clean_data = clean_data.drop(columns='DataSource2_Feature3', errors='ignore')

In [None]:
#While we're at it, we'll pop out the target column
# The guard keeps the cell re-runnable: once Target has been popped, a second run
# leaves the previously extracted `target` in place instead of raising KeyError.
if 'Target' in clean_data.columns:
    target = clean_data.pop('Target')

In [None]:
# Turn the feature frame and the label series into numpy arrays for the models.
X, Y = clean_data.to_numpy(), target.to_numpy()


In [None]:
#Use the Mutual Information algorithm to reduce dimensionality, determining the most useful features
# The estimator is randomised, so a fixed random_state keeps the ranking (and the
# features selected from it) the same from run to run.
importances = mutual_info_classif(X, Y, random_state=42)
feat_importances = pd.Series(importances, index=clean_data.columns)

ax = feat_importances.plot(kind='barh', color='red')
ax.set_title('Mutual information between each feature and Target')
ax.set_xlabel('estimated mutual information')
plt.show()


In [None]:
#Keep only the most useful 13 features (a sweet spot found through experimentation)
# `importances` is never recomputed while columns are removed, so
# np.argmin(importances) is a fixed position. Popping at that position repeatedly
# removes whichever columns happen to slide into it, not the lowest-ranked
# features. Rank once and keep the top scorers instead.
N_SELECTED_FEATURES = 13

# The guard makes a re-run a no-op once the frame has already been reduced.
if len(clean_data.columns) > N_SELECTED_FEATURES:
    ranked_features = pd.Series(importances, index=clean_data.columns)
    selected = ranked_features.nlargest(N_SELECTED_FEATURES).index
    # The boolean mask keeps the surviving columns in their original order.
    clean_data = clean_data.loc[:, clean_data.columns.isin(selected)]
        

In [None]:
#Re-Create the training set with the modified features
# Only X is rebuilt from clean_data here; Y keeps the value assigned earlier.
X = clean_data.to_numpy()

In [None]:
#Set up a stratified K-Folds Cross-Validation, with k=10
# Within every fold, 10% of the training portion is held back as validation data
# for early stopping.
n_features = X.shape[1]                   # taken from the data instead of hard-coding 13
hidden_widths = (26, 52, 104, 52, 26, 13) # same funnel-shaped stack of ReLU layers as before
fold_scores = []                          # [loss, AUC] on the held-out fold, one entry per fold

# random_state makes the fold assignment repeatable between runs.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
for fold, (train, test) in enumerate(kfold.split(X, Y), start=1):

    callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)

    # create model
    # Only the first layer needs the input size; later layers infer it from the
    # layer before them.
    model = Sequential()
    for layer_index, width in enumerate(hidden_widths):
        if layer_index == 0:
            model.add(Dense(width, input_dim=n_features, activation='relu'))
        else:
            model.add(Dense(width, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    # Compile model
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['AUC'])

    # summary() prints the table itself and returns None, so it is not wrapped in print().
    model.summary()

    # Fit the model
    history = model.fit(X[train],
                        Y[train],
                        epochs=750,
                        batch_size=100,
                        verbose=1,
                        validation_split=0.1,
                        callbacks=[callback])

    # evaluate the model on the held-out fold and remember the result
    scores = model.evaluate(X[test], Y[test], verbose=1)
    fold_scores.append(scores)

    #Plot the Training / Validation Loss
    # val_loss comes from the validation split, not from the held-out test fold.
    plt.plot(history.history['loss'], label='train')
    plt.plot(history.history['val_loss'], label='validation')
    plt.title(f"Fold {fold}: training vs validation loss")
    plt.xlabel('epoch')
    plt.ylabel('binary cross-entropy')
    plt.legend()
    plt.show()

# Summarise the cross-validation instead of discarding all but the last fold.
mean_loss, mean_auc = np.mean(fold_scores, axis=0)
print(f"Mean over {len(fold_scores)} folds - loss: {mean_loss:.4f}, AUC: {mean_auc:.4f}")
    

In [None]:
#Make a new confusion matrix
# `model` and `test` are the ones left over from the last cross-validation fold.
y_pred = model.predict(X[test])
# Threshold the sigmoid output at 0.5 and flatten the (n, 1) column to 1-D.
y_pred = np.where(y_pred > 0.5, 1, 0).ravel()

# scikit-learn expects (y_true, y_pred): rows are true labels, columns are predictions.
# With the arguments swapped, the plot's "True label" / "Predicted label" axes are transposed.
cm = confusion_matrix(Y[test], y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.show()

In [None]:
#The Neural Network appears to only be predicting 0. It is not entirely surprising that one of the models would end up doing
#that, given the imbalance of the data (again, these models were additionally tuned with methods for helping with imbalanced
#data, such as class weighting, SMOTE, etc., to no avail).

#My hypothesis is then that the positive classifications are not unique enough in their features to effectively stand out
#in the data. 

#As such, I'll save the second logistic regression, as it had a slight performance gain over the other models and will be 
#one of the more efficient options in terms of storage / speed. 