In [35]:
import numpy as np
import matplotlib.pyplot as plt
import scikitplot as skplt
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

import pandas as pd
from sklearn.preprocessing import StandardScaler # stanardization
from sklearn.preprocessing import LabelEncoder # Label --> Number
from sklearn.preprocessing import minmax_scale

from sklearn.model_selection import train_test_split, cross_val_predict # Training/Test split

from sklearn.naive_bayes import GaussianNB # Naive Bayes
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB

from sklearn.linear_model import LogisticRegression #LR
import statsmodels.api as sm

from sklearn.svm import LinearSVC, SVC #SVM

from sklearn.tree import DecisionTreeClassifier,plot_tree # Decision Tree
from sklearn.ensemble import RandomForestClassifier

from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer
from sklearn.metrics import roc_curve

In [36]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif # Feature selection for NB
from sklearn.feature_selection import RFE # Feature selection for Logistic Regression and SVM
from sklearn.model_selection import GridSearchCV, StratifiedKFold

## Upload the dataset

In [37]:
df = pd.read_csv('stat473_train.csv')
df.head()

Unnamed: 0,Target,A,B,C,D,E,F,G,H,I,...,K,L,M,N,O,P,Q,R,S,T
0,0,-4.9809,8.4634,1.0394,0.14357,0.82451,4.5875,-7.1183,14.929,-14.51,...,-11.933,-9.0616,-4.0255,-12.887,-16.284,-3.0556,7.5917,-11.645,8.5839,5.974
1,0,-0.063791,-1.9291,-13.095,-4.1099,17.682,-9.9642,22.235,-0.080373,-5.7496,...,0.071424,4.6136,1.9185,7.2199,1.6206,0.16157,1.6537,-13.717,-5.5687,0.64624
2,0,13.42,-6.0815,5.9716,3.0206,-10.104,-12.584,-20.754,-8.8231,1.9093,...,5.5086,14.335,-4.976,0.20882,-7.594,9.2471,-6.1822,4.2594,0.24844,-29.997
3,0,1.0779,8.3899,-3.7983,-8.1731,18.522,-1.9751,-1.7991,5.5804,-16.189,...,-25.112,-3.3708,16.5,-1.8379,-14.232,5.6912,3.1978,1.8329,0.80195,18.829
4,0,4.4603,2.0857,-3.7736,-6.5797,0.91993,-3.4404,8.7778,-4.0243,-5.2369,...,-0.45084,6.0084,-1.3952,12.029,-34.903,-19.334,0.54948,-1.2913,4.5786,-0.73324


## Split Data into Input and Output

In [38]:
X = df.drop('Target', axis=1)
y = df['Target']

## Split Data into Training and Testing

### Note: Split training and testing data at the begining ensures that any decisions about the model (including feature selection) are made based on the training set only.

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Naive Bayes

In [40]:
# Construct the Naive Bayes Model based on selected features
gnb_NB = GaussianNB()
gnb_NB.fit(X_train, y_train)
y_pred_NB = gnb_NB.predict(X_test)


# Construct the confusion matrix
confmat_NB = pd.DataFrame(confusion_matrix(y_test,y_pred_NB),
                      index=['True[0]','True[1]'],
                      columns=['Predict[0]', 'Predict[1]'])
print(confmat_NB)
print('accuracy: %.3f' % accuracy_score(y_test, y_pred_NB))
print('precision: %.3f' % precision_score(y_true=y_test, y_pred=y_pred_NB))
print('recall: %.3f' % recall_score(y_true=y_test, y_pred=y_pred_NB))
print('F1: %.3f' % f1_score(y_true=y_test, y_pred=y_pred_NB))

         Predict[0]  Predict[1]
True[0]        1294          12
True[1]         237          57
accuracy: 0.844
precision: 0.826
recall: 0.194
F1: 0.314


In [41]:
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
y_train_smote.value_counts()

Target
0    5222
1    5222
Name: count, dtype: int64

In [42]:
# Naive Bayes after SMOTE
gnb_smote = GaussianNB()
gnb_smote.fit(X_train_smote, y_train_smote)
y_pred_NB_smote = gnb_smote.predict(X_test)
confmat_NB_smote = pd.DataFrame(confusion_matrix(y_test, y_pred_NB_smote),
                      index=['True[0]','True[1]'],
                      columns=['Predict[0]', 'Predict[1]'])
print(confmat_NB_smote)
print('accuracy: %.3f' % accuracy_score(y_test, y_pred_NB_smote))
print('precision: %.3f' % precision_score(y_true=y_test, y_pred=y_pred_NB_smote))
print('recall: %.3f' % recall_score(y_true=y_test, y_pred=y_pred_NB_smote))
print('F1: %.3f' % f1_score(y_true=y_test, y_pred=y_pred_NB_smote))

         Predict[0]  Predict[1]
True[0]         922         384
True[1]         101         193
accuracy: 0.697
precision: 0.334
recall: 0.656
F1: 0.443


In [43]:
# Select the top k features using mutual information
selector = SelectKBest(mutual_info_classif, k=14)
X_train_NB_selected = selector.fit_transform(X_train, y_train)
X_test_NB_selected = selector.transform(X_test)

print("Selected Features:", X.columns[selector.get_support(indices=True)])

Selected Features: Index(['A', 'B', 'C', 'E', 'F', 'H', 'J', 'L', 'M', 'N', 'Q', 'R', 'S', 'T'], dtype='object')


In [44]:
# Construct the Naive Bayes Model based on selected features
gnb_NB_selected = GaussianNB()
gnb_NB_selected.fit(X_train_NB_selected, y_train)
y_pred_NB_selected = gnb_NB_selected.predict(X_test_NB_selected)


# Construct the confusion matrix
confmat_NB_selected = pd.DataFrame(confusion_matrix(y_test,y_pred_NB_selected),
                      index=['True[0]','True[1]'],
                      columns=['Predict[0]', 'Predict[1]'])
print(confmat_NB_selected)
print('accuracy: %.3f' % accuracy_score(y_test, y_pred_NB_selected))
print('precision: %.3f' % precision_score(y_true=y_test, y_pred=y_pred_NB_selected))
print('recall: %.3f' % recall_score(y_true=y_test, y_pred=y_pred_NB_selected))
print('F1: %.3f' % f1_score(y_true=y_test, y_pred=y_pred_NB_selected))

         Predict[0]  Predict[1]
True[0]        1293          13
True[1]         238          56
accuracy: 0.843
precision: 0.812
recall: 0.190
F1: 0.309


In [45]:
# Doing SMOTE again
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_NB_selected, y_train)
y_train_smote.value_counts()

Target
0    5222
1    5222
Name: count, dtype: int64

In [46]:
# Construct the Naive Bayes Model based on selected features and smote
gnb_NB_selected = GaussianNB()
gnb_NB_selected.fit(X_train_smote, y_train_smote)
y_pred_NB_selected = gnb_NB_selected.predict(X_test_NB_selected)


# Construct the confusion matrix
confmat_NB_selected = pd.DataFrame(confusion_matrix(y_test,y_pred_NB_selected),
                      index=['True[0]','True[1]'],
                      columns=['Predict[0]', 'Predict[1]'])
print(confmat_NB_selected)
print('accuracy: %.3f' % accuracy_score(y_test, y_pred_NB_selected))
print('precision: %.3f' % precision_score(y_true=y_test, y_pred=y_pred_NB_selected))
print('recall: %.3f' % recall_score(y_true=y_test, y_pred=y_pred_NB_selected))
print('F1: %.3f' % f1_score(y_true=y_test, y_pred=y_pred_NB_selected))

         Predict[0]  Predict[1]
True[0]         913         393
True[1]          95         199
accuracy: 0.695
precision: 0.336
recall: 0.677
F1: 0.449


## Logistic Regression

In [47]:
# Processing Logistic Regression using default option
logistic = LogisticRegression()
# Obtain predictions using 5-fold cross-validation
y_pred_LR = cross_val_predict(logistic, X, y, cv=5)

# Split training and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

logistic.fit(X_train, y_train)
y_pred_LR = logistic.predict(X_test)

# Construct the confusion matrix
confmat_LR = pd.DataFrame(confusion_matrix(y_test, y_pred_LR),
                      index=['True[0]','True[1]'],
                      columns=['Predict[0]', 'Predict[1]'])
print(confmat_LR)
print('accuracy: %.3f' % accuracy_score(y_test, y_pred_LR))
print('precision: %.3f' % precision_score(y_true=y_test, y_pred=y_pred_LR))
print('recall: %.3f' % recall_score(y_true=y_test, y_pred=y_pred_LR))
print('F1: %.3f' % f1_score(y_true=y_test, y_pred=y_pred_LR))

         Predict[0]  Predict[1]
True[0]        1277          29
True[1]         231          63
accuracy: 0.838
precision: 0.685
recall: 0.214
F1: 0.326


In [48]:
# Use SMOTE
smote = SMOTE(random_state=1)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
y_train_smote.value_counts()

Target
0    5222
1    5222
Name: count, dtype: int64

In [49]:
# Logistic Regression after SMOTE
logistic_smote = LogisticRegression()
logistic_smote.fit(X_train_smote, y_train_smote)
y_pred_LR_smote = logistic_smote.predict(X_test)
confmat_LR_smote = pd.DataFrame(confusion_matrix(y_test, y_pred_LR_smote),
                      index=['True[0]','True[1]'],
                      columns=['Predict[0]', 'Predict[1]'])
print(confmat_LR_smote)
print('accuracy: %.3f' % accuracy_score(y_test, y_pred_LR_smote))
print('precision: %.3f' % precision_score(y_true=y_test, y_pred=y_pred_LR_smote))
print('recall: %.3f' % recall_score(y_true=y_test, y_pred=y_pred_LR_smote))
print('F1: %.3f' % f1_score(y_true=y_test, y_pred=y_pred_LR_smote))

         Predict[0]  Predict[1]
True[0]         923         383
True[1]          85         209
accuracy: 0.708
precision: 0.353
recall: 0.711
F1: 0.472


## Decision Tree

In [50]:
# Split training and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Processing Decision Tree using default option
DT = DecisionTreeClassifier()
DT.fit(X_train, y_train)
y_pred_DT = DT.predict(X_test)

# Construct the confusion matrix
confmat_DT = pd.DataFrame(confusion_matrix(y_test, y_pred_DT),
                      index=['True[0]','True[1]'],
                      columns=['Predict[0]', 'Predict[1]'])
print(confmat_DT)
print('accuracy: %.3f' % accuracy_score(y_test, y_pred_DT))
print('precision: %.3f' % precision_score(y_true=y_test, y_pred=y_pred_DT))
print('recall: %.3f' % recall_score(y_true=y_test, y_pred=y_pred_DT))
print('F1: %.3f' % f1_score(y_true=y_test, y_pred=y_pred_DT))

         Predict[0]  Predict[1]
True[0]        1199         107
True[1]         148         146
accuracy: 0.841
precision: 0.577
recall: 0.497
F1: 0.534


In [51]:
# Use SMOTE
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
y_train_smote.value_counts()

Target
0    5222
1    5222
Name: count, dtype: int64

In [52]:
# Processing Decision Tree using default option
DT.fit(X_train_smote, y_train_smote)
y_pred_DT_smote = DT.predict(X_test)

# Construct the confusion matrix
confmat_DT_smote = pd.DataFrame(confusion_matrix(y_test, y_pred_DT_smote),
                      index=['True[0]','True[1]'],
                      columns=['Predict[0]', 'Predict[1]'])
print(confmat_DT_smote)
print('accuracy: %.3f' % accuracy_score(y_test, y_pred_DT_smote))
print('precision: %.3f' % precision_score(y_true=y_test, y_pred=y_pred_DT_smote))
print('recall: %.3f' % recall_score(y_true=y_test, y_pred=y_pred_DT_smote))
print('F1: %.3f' % f1_score(y_true=y_test, y_pred=y_pred_DT_smote))

         Predict[0]  Predict[1]
True[0]        1088         218
True[1]         118         176
accuracy: 0.790
precision: 0.447
recall: 0.599
F1: 0.512


## SVM

## Random Forest

## ADABoost

## Gradient Boost

## XG Boost

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

XGBM = XGBClassifier()
XGBM.fit(X_train,y_train)

In [46]:
y_pred_XG = XGBM.predict(X_test)
confmat_XG = pd.DataFrame(confusion_matrix(y_test, y_pred_XG),
                      index=['True[0]','True[1]'],
                      columns=['Predict[0]', 'Predict[1]'])
print(confmat_XG)
print('accuracy: %.3f' % accuracy_score(y_test, y_pred_XG))
print('precision: %.3f' % precision_score(y_true=y_test, y_pred=y_pred_XG))
print('recall: %.3f' % recall_score(y_true=y_test, y_pred=y_pred_XG))
print('F1: %.3f' % f1_score(y_true=y_test, y_pred=y_pred_XG))

         Predict[0]  Predict[1]
True[0]        1280          26
True[1]         118         176
accuracy: 0.910
precision: 0.871
recall: 0.599
F1: 0.710


In [47]:
# Use SMOTE
smote = SMOTE(random_state=1)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
y_train_smote.value_counts()

Target
0    5222
1    5222
Name: count, dtype: int64

In [48]:
XGBM_smote = XGBClassifier()
XGBM_smote.fit(X_train_smote,y_train_smote)

y_pred_XG = XGBM_smote.predict(X_test)
confmat_XG = pd.DataFrame(confusion_matrix(y_test, y_pred_XG),
                      index=['True[0]','True[1]'],
                      columns=['Predict[0]', 'Predict[1]'])
print(confmat_XG)
print('accuracy: %.3f' % accuracy_score(y_test, y_pred_XG))
print('precision: %.3f' % precision_score(y_true=y_test, y_pred=y_pred_XG))
print('recall: %.3f' % recall_score(y_true=y_test, y_pred=y_pred_XG))
print('F1: %.3f' % f1_score(y_true=y_test, y_pred=y_pred_XG))

         Predict[0]  Predict[1]
True[0]        1215          91
True[1]          74         220
accuracy: 0.897
precision: 0.707
recall: 0.748
F1: 0.727


In [24]:
# Define the pipeline steps
pipeline = Pipeline([
    ('smote', SMOTE(random_state=42)),
    ('classifier', XGBClassifier(use_label_encoder=False, eval_metric='logloss'))
])

# Define the parameter grid to search
param_grid = {
    'classifier__n_estimators': [1250,1300,1350],
    'classifier__max_depth': [3,4,5],
    'classifier__learning_rate': [0.19, 0.20, 0.21],
    'classifier__subsample': [0.90, 0.91]
}

# Setting up StratifiedKFold
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Setting up GridSearchCV
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=cv, scoring='f1', verbose=1, n_jobs=-1)

# Fit grid search
grid_search.fit(X, y)

# Best model after grid search
best_model = grid_search.best_estimator_

print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# Lists to store metrics and confusion matrices
metrics = []

# Perform cross-validation
for train_idx, test_idx in cv.split(X, y):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    
    # Fit the best_model on the training data
    best_model.fit(X_train, y_train)
    
    # Predict on the test data
    y_pred = best_model.predict(X_test)
    
    # Calculate metrics
    cm = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    # Store metrics
    metrics.append({
        'Confusion Matrix': cm,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    })

# Output the results
for index, result in enumerate(metrics):
    print(f"Results for Fold {index+1}:")
    print("Confusion Matrix:\n", result['Confusion Matrix'])
    print(f"Accuracy: {result['Accuracy']:.3f}")
    print(f"Precision: {result['Precision']:.3f}")
    print(f"Recall: {result['Recall']:.3f}")
    print(f"F1 Score: {result['F1 Score']:.3f}\n")

Fitting 5 folds for each of 54 candidates, totalling 270 fits
Best Parameters: {'classifier__learning_rate': 0.2, 'classifier__max_depth': 3, 'classifier__n_estimators': 1350, 'classifier__subsample': 0.9}
Best Score: 0.810612995086035
Results for Fold 1:
Confusion Matrix:
 [[1254   52]
 [  46  248]]
Accuracy: 0.939
Precision: 0.827
Recall: 0.844
F1 Score: 0.835

Results for Fold 2:
Confusion Matrix:
 [[1223   83]
 [  39  255]]
Accuracy: 0.924
Precision: 0.754
Recall: 0.867
F1 Score: 0.807

Results for Fold 3:
Confusion Matrix:
 [[1236   70]
 [  48  246]]
Accuracy: 0.926
Precision: 0.778
Recall: 0.837
F1 Score: 0.807

Results for Fold 4:
Confusion Matrix:
 [[1229   76]
 [  46  249]]
Accuracy: 0.924
Precision: 0.766
Recall: 0.844
F1 Score: 0.803

Results for Fold 5:
Confusion Matrix:
 [[1232   73]
 [  49  246]]
Accuracy: 0.924
Precision: 0.771
Recall: 0.834
F1 Score: 0.801



In [21]:
# Lists to store metrics
accuracies = []
precisions = []
recalls = []
f1_scores = []

# Perform cross-validation
for train_idx, test_idx in cv.split(X, y):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    
    # Fit the pipeline on the training data
    best_model.fit(X_train, y_train)
    
    # Predict on the test data
    y_pred = best_model.predict(X_test)
    
    # Calculate metrics
    accuracies.append(accuracy_score(y_test, y_pred))
    precisions.append(precision_score(y_test, y_pred))
    recalls.append(recall_score(y_test, y_pred))
    f1_scores.append(f1_score(y_test, y_pred))

# Print average of the recorded metrics
print("Average Accuracy:", np.mean(accuracies))
print("Average Precision:", np.mean(precisions))
print("Average Recall:", np.mean(recalls))
print("Average F1 Score:", np.mean(f1_scores))

Average Accuracy: 0.9272499999999999
Average Precision: 0.7793798539818753
Average Recall: 0.8451170298627926
Average F1 Score: 0.810612995086035


## Test Data

In [65]:
f1_scores = []

# Load test dataset
test_data = pd.read_csv('test_data.csv')
    
# Separate features and target
X_test = test_data.drop('Target', axis=1)
y_test = test_data['Target']
    
# Make predictions
y_pred = best_model.predict(X_test)
    
# Calculate confusion matrix and then F1 score
cm = confusion_matrix(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
f1_scores.append(f1)
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
    
    
print(cm)
print('accuracy: %.3f' % f1)
print('precision: %.3f' % accuracy)
print('recall: %.3f' % recall)
print('F1: %.3f' % precision)

[[832   9]
 [  9 150]]
accuracy: 0.943
precision: 0.982
recall: 0.943
F1: 0.943
