In [2]:
import pandas as pd
import warnings

warnings.filterwarnings(action="ignore", message=r'.*Use subset.*of np.ndarray is not recommended')
warnings.filterwarnings('always')  # "error", "ignore", "always", "default", "module" or "once"
warnings.filterwarnings("ignore")

from imblearn.over_sampling import SMOTE, BorderlineSMOTE, ADASYN, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, roc_auc_score, precision_score, recall_score, average_precision_score, classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from tabulate import tabulate

In [3]:
df = pd.read_csv('input/mbti_data_clean.csv')

df.head()

Unnamed: 0,type,clean_posts,E_I,S_N,T_F,J_P
0,INFJ,enfp and intj moments sportscenter not top te...,0,0,0,1
1,ENTP,i m finding the lack of me in these posts ver...,1,0,1,0
2,INTP,good one of course to which i say i know that...,0,0,1,0
3,INTJ,dear intp i enjoyed our conversation the othe...,0,0,1,1
4,ENTJ,you re fired that s another silly misconcepti...,1,0,1,1


In [4]:
# Build the vocabulary from 1500 words that are not common words or MBTI personalities
vectorizer = CountVectorizer(stop_words = ['and','the','to','of','infj','entp','intp','intj','entj','enfj','infp','enfp','isfp','istp','isfj','istj','estp','esfp','estj','esfj','infjs','entps','intps','intjs','entjs','enfjs','infps','enfps','isfps','istps','isfjs','istjs','estps','esfps','estjs','esfjs'], max_features=1500, analyzer='word', max_df=0.8, min_df=0.1)

corpus = df['clean_posts'].values.reshape(1,-1).tolist()[0]

vectorizer.fit(corpus)

X_cnt = vectorizer.fit_transform(corpus)

X_cnt

<8675x886 sparse matrix of type '<class 'numpy.int64'>'
	with 1997107 stored elements in Compressed Sparse Row format>

In [5]:
# Transform the count matrix to a tf-idf representation
tfizer = TfidfTransformer()
tfizer.fit(X_cnt)

X = tfizer.fit_transform(X_cnt).toarray()

all_words = vectorizer.get_feature_names()

n_words = len(all_words)

n_words

886

In [6]:
X_df = pd.DataFrame.from_dict({w: X[:, i] for i, w in enumerate(all_words)})

X_df

Unnamed: 0,ability,able,above,absolutely,accept,accurate,across,act,action,actual,...,xd,yeah,year,years,yes,yesterday,yet,young,younger,yourself
0,0.000000,0.000000,0.000000,0.00000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.049223,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
1,0.000000,0.042993,0.000000,0.00000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.039917,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.039886
2,0.139811,0.048495,0.000000,0.11671,0.0,0.069987,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.045025,0.148002,0.033104,0.0,0.000000,0.000000,0.000000,0.000000
3,0.000000,0.079378,0.000000,0.00000,0.0,0.000000,0.000000,0.0,0.062033,0.061423,...,0.000000,0.032595,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
4,0.000000,0.000000,0.000000,0.00000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,...,0.061151,0.000000,0.038393,0.000000,0.056455,0.0,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8670,0.000000,0.000000,0.058614,0.00000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,...,0.065136,0.000000,0.000000,0.000000,0.000000,0.0,0.042929,0.000000,0.059659,0.000000
8671,0.000000,0.000000,0.000000,0.00000,0.0,0.000000,0.094753,0.0,0.000000,0.000000,...,0.000000,0.056037,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.031651
8672,0.000000,0.000000,0.000000,0.00000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.137912,0.061694,0.0,0.000000,0.000000,0.000000,0.041923
8673,0.000000,0.059896,0.000000,0.00000,0.0,0.000000,0.000000,0.0,0.000000,0.046348,...,0.000000,0.073784,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000


In [7]:
def balance_random_oversample(X_train, y_train):
    oversampler = RandomOverSampler(sampling_strategy=0.9, random_state=0)
    
    X_resampled, y_resampled = oversampler.fit_resample(X_train, y_train)
    
    return X_resampled, y_resampled

def balance_smote_oversample(X_train, y_train):
    oversampler = SMOTE(random_state=0)
    
    X_resampled, y_resampled = oversampler.fit_resample(X_train, y_train)
    
    return X_resampled, y_resampled

def balance_smote_oversample_random_undersample(X_train, y_train):
    oversampler = SMOTE(sampling_strategy=0.9, random_state=0) 
    undersampler = RandomUnderSampler(sampling_strategy=0.9, random_state=0)

    # Applying oversampler to oversample the minority class
    X_resampled, y_resampled = oversampler.fit_resample(X_train, y_train)

    # Applying undersampler to reduce the majority class
    X_resampled, y_resampled = undersampler.fit_resample(X_resampled, y_resampled)

    return X_resampled, y_resampled

def balance_borderlinesmote_oversample(X_train, y_train):
    oversampler = BorderlineSMOTE(random_state=0)
    
    X_resampled, y_resampled = oversampler.fit_resample(X_train, y_train)
    
    return X_resampled, y_resampled

def balance_smoteenn_oversample_random_undersample(X_train, y_train):
    oversampler = SMOTEENN(sampling_strategy='auto', random_state=42)
    undersampler = RandomUnderSampler(sampling_strategy=0.7, random_state=0)
    
    # Apply SMOTEENN (combination of SMOTE and ENN)
    X_resampled, y_resampled = oversampler.fit_resample(X_train, y_train)

    # Apply random undersampling
    X_resampled, y_resampled = undersampler.fit_resample(X_resampled, y_resampled)

    return X_resampled, y_resampled

def balance_smoteen_adasyn_oversample_random_undersample(X_train, y_train):
    oversampler_1 = SMOTEENN(sampling_strategy='auto', random_state=42)
    oversampler_2 = ADASYN(sampling_strategy=0.6, random_state=0, n_neighbors=5)
    undersampler = RandomUnderSampler(sampling_strategy=0.7, random_state=0)

    # Balance the dataset using a combination of SMOTE and ENN
    X_resampled, y_resampled = oversampler_1.fit_resample(X_train, y_train)

    # Apply ADASYN for additional oversampling
    X_resampled, y_resampled = oversampler_2.fit_resample(X_resampled, y_resampled)

    # Apply additional undersampling to reduce the majority class size
    X_resampled, y_resampled = undersampler.fit_resample(X_resampled, y_resampled)

    return X_resampled, y_resampled

In [8]:
# Define the features and target variables
y_EI = df['E_I']
y_SN = df['S_N']
y_TF = df['T_F']
y_JP = df['J_P']

In [42]:
# Check class distribution
print(y_EI.value_counts())

0    6676
1    1999
Name: E_I, dtype: int64


In [43]:
print(y_SN.value_counts())

0    7478
1    1197
Name: S_N, dtype: int64


In [44]:
print(y_TF.value_counts())

0    4694
1    3981
Name: T_F, dtype: int64


In [45]:
print(y_JP.value_counts())

0    5241
1    3434
Name: J_P, dtype: int64


In [9]:
classifiers = {
    'Dummy': lambda: DummyClassifier(strategy='most_frequent', random_state=42),
    'SVM': lambda: SVC(probability=True, random_state=42),
    'LGBM': lambda: LGBMClassifier(random_state=42),
    'KNeighbors': lambda: KNeighborsClassifier(),    
    'DecisionTree': lambda: DecisionTreeClassifier(random_state=42),
    'RandomForest': lambda: RandomForestClassifier(random_state=42),
    'AdaBoost': lambda: AdaBoostClassifier(),
    'GradientBoosting': lambda: GradientBoostingClassifier(),
    'GaussianNB': lambda: GaussianNB(),
    'LogisticRegression': lambda: LogisticRegression(random_state=42),
    'XGB': lambda: XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    'VotingClassifier': lambda: VotingClassifier(estimators=[
        ('SVM', SVC(probability=True, random_state=42)),
        ('LGBM', LGBMClassifier(random_state=42)),
        ('RandomForest', RandomForestClassifier(random_state=42)),
        ('LogisticRegression', LogisticRegression(random_state=42)),
        ('XGB', XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)),
    ], voting='soft')
}

In [10]:
resamplers = {
  'RandomOverSampler': 'balance_random_oversample',
  'SMOTE': 'balance_smote_oversample',
  'SMOTE + RandomUnderSampler': 'balance_smote_oversample_random_undersample',
  'BorderlineSMOTE': 'balance_borderlinesmote_oversample',
  'SMOTEENN + RandomUnderSampler': 'balance_smoteenn_oversample_random_undersample',
  'SMOTEENN & ADASYN + RandomUnderSampler': 'balance_smoteen_adasyn_oversample_random_undersample'
}

In [11]:
def train_test_split_data(class_name, resampler=None):
  y_df = df[class_name]
  
  if resampler == None:
    X_cl = X_df
    y_cl = y_df
  else:
    rs = resamplers[resampler]

    X_cl, y_cl = globals()[rs](X_df, y_df)

  X_cl_train, X_cl_test, y_cl_train, y_cl_test = train_test_split(X_cl, y_cl, test_size=0.2, random_state=42, stratify=y_cl)

  return X_cl_train, X_cl_test, y_cl_train, y_cl_test

In [12]:
def classifier_report(classifier_name, y_test, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    avg_pre = average_precision_score(y_test, y_pred)
    
    report = {
        'Classifier': classifier_name,
        'TP': tp,
        'TN': tn,
        'FP': fp,
        'FN': fn,
        'Correct': tp + tn,
        'Incorrect': fp + fn,
        'Accuracy': round(accuracy, 3),
        'Precision': round(precision, 3),
        'Recall': round(recall, 3),
        'F1': round(f1, 3),
        'ROC AUC': round(roc_auc, 3),
        'Avg Precision': round(avg_pre, 3)
    }
    return report

In [13]:
# Display the class classificators report
def class_report(X_train, X_test, y_train, y_test, resampler=None):
    reports = []
    
    for clf_name, clf in classifiers.items():

        classifier = clf()

        if resampler == None:
            X_tr = X_train
            y_tr = y_train
        else:
            X_tr, y_tr = globals()[resampler](X_train, y_train)

        classifier.fit(X_tr, y_tr)
        y_pred = classifier.predict(X_test)

        reports.append(classifier_report(clf_name, y_test, y_pred))
    
    df_reports = pd.DataFrame(reports)
    
    return df_reports

In [14]:
# Display class classifiers report
def class_classifiers_report(class_name, resampler=None):
    reports = []

    X_train, X_test, y_train, y_test = train_test_split_data(class_name, resampler)
    
    for clf_name, clf in classifiers.items():

        classifier = clf()

        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)

        reports.append(classifier_report(clf_name, y_test, y_pred))
    
    df_reports = pd.DataFrame(reports)
    
    return df_reports

In [15]:
# Display none resampled class classifiers report
def noneresampled_class_classifiers_report(class_name):    
    cl_clf_report = class_classifiers_report(class_name)
    
    print('{:s} Classifiers Report'.format(class_name))

    print(tabulate(cl_clf_report, headers=cl_clf_report.columns, tablefmt="grid"))

In [16]:
# Display resampled class classifiers report
def resampled_class_classifiers_report(class_name, resampler):    
    cl_clf_report = class_classifiers_report(class_name, resampler)
    
    print('{:s} Classifiers Report with resampling: {:s}'.format(class_name, resampler))

    print(tabulate(cl_clf_report, headers=cl_clf_report.columns, tablefmt="grid"))

In [121]:
##############
# E_I ########
##############

In [32]:
# E_I Classifiers Test Report: Random Oversample
resampled_class_classifiers_report('E_I', 'RandomOverSampler')

E_I Classifiers Report: RandomOverSampler
+----+--------------------+------+------+------+------+-----------+-------------+------------+-------------+----------+-------+-----------+-----------------+
|    | Classifier         |   TP |   TN |   FP |   FN |   Correct |   Incorrect |   Accuracy |   Precision |   Recall |    F1 |   ROC AUC |   Avg Precision |
|  0 | Dummy              |    0 | 1335 |    0 | 1202 |      1335 |        1202 |      0.526 |       0     |    0     | 0     |     0.5   |           0.474 |
+----+--------------------+------+------+------+------+-----------+-------------+------------+-------------+----------+-------+-----------+-----------------+
|  1 | SVM                | 1080 | 1162 |  173 |  122 |      2242 |         295 |      0.884 |       0.862 |    0.899 | 0.88  |     0.884 |           0.823 |
+----+--------------------+------+------+------+------+-----------+-------------+------------+-------------+----------+-------+-----------+-----------------+
|  2 | LGB

In [51]:
# E_I Classifiers Test Report: SMOTE
resampled_class_classifiers_report('E_I', 'SMOTE')

E_I Classifiers Report with resampling: SMOTE
+----+--------------------+------+------+------+------+-----------+-------------+------------+-------------+----------+-------+-----------+-----------------+
|    | Classifier         |   TP |   TN |   FP |   FN |   Correct |   Incorrect |   Accuracy |   Precision |   Recall |    F1 |   ROC AUC |   Avg Precision |
|  0 | Dummy              | 1335 |    0 | 1336 |    0 |      1335 |        1336 |      0.5   |       0.5   |    1     | 0.667 |     0.5   |           0.5   |
+----+--------------------+------+------+------+------+-----------+-------------+------------+-------------+----------+-------+-----------+-----------------+
|  1 | SVM                | 1085 | 1279 |   57 |  250 |      2364 |         307 |      0.885 |       0.95  |    0.813 | 0.876 |     0.885 |           0.866 |
+----+--------------------+------+------+------+------+-----------+-------------+------------+-------------+----------+-------+-----------+-----------------+
|  2 |

In [52]:
# E_I Classifiers Test Report: SMOTE Oversample + Random Undersample
resampled_class_classifiers_report('E_I', 'SMOTE + RandomUnderSampler')

E_I Classifiers Report with resampling: SMOTE + RandomUnderSampler
+----+--------------------+------+------+------+------+-----------+-------------+------------+-------------+----------+-------+-----------+-----------------+
|    | Classifier         |   TP |   TN |   FP |   FN |   Correct |   Incorrect |   Accuracy |   Precision |   Recall |    F1 |   ROC AUC |   Avg Precision |
|  0 | Dummy              |    0 | 1335 |    0 | 1202 |      1335 |        1202 |      0.526 |       0     |    0     | 0     |     0.5   |           0.474 |
+----+--------------------+------+------+------+------+-----------+-------------+------------+-------------+----------+-------+-----------+-----------------+
|  1 | SVM                |  920 | 1286 |   49 |  282 |      2206 |         331 |      0.87  |       0.949 |    0.765 | 0.848 |     0.864 |           0.838 |
+----+--------------------+------+------+------+------+-----------+-------------+------------+-------------+----------+-------+-----------+----

In [53]:
# E_I Classifiers Test Report: SMOTE Oversample + Random Undersample
resampled_class_classifiers_report('E_I', 'BorderlineSMOTE')  

E_I Classifiers Report with resampling: BorderlineSMOTE
+----+--------------------+------+------+------+------+-----------+-------------+------------+-------------+----------+-------+-----------+-----------------+
|    | Classifier         |   TP |   TN |   FP |   FN |   Correct |   Incorrect |   Accuracy |   Precision |   Recall |    F1 |   ROC AUC |   Avg Precision |
|  0 | Dummy              | 1335 |    0 | 1336 |    0 |      1335 |        1336 |      0.5   |       0.5   |    1     | 0.667 |     0.5   |           0.5   |
+----+--------------------+------+------+------+------+-----------+-------------+------------+-------------+----------+-------+-----------+-----------------+
|  1 | SVM                | 1098 | 1276 |   60 |  237 |      2374 |         297 |      0.889 |       0.948 |    0.822 | 0.881 |     0.889 |           0.869 |
+----+--------------------+------+------+------+------+-----------+-------------+------------+-------------+----------+-------+-----------+---------------

In [54]:
# E_I Classifiers Test Report: SMOTE and ENN Oversample + Random Undersample
resampled_class_classifiers_report('E_I', 'SMOTEENN + RandomUnderSampler')    

E_I Classifiers Report with resampling: SMOTEENN + RandomUnderSampler
+----+--------------------+------+------+------+------+-----------+-------------+------------+-------------+----------+-------+-----------+-----------------+
|    | Classifier         |   TP |   TN |   FP |   FN |   Correct |   Incorrect |   Accuracy |   Precision |   Recall |    F1 |   ROC AUC |   Avg Precision |
|  0 | Dummy              |   11 |    0 |    7 |    0 |        11 |           7 |      0.611 |       0.611 |    1     | 0.759 |     0.5   |           0.611 |
+----+--------------------+------+------+------+------+-----------+-------------+------------+-------------+----------+-------+-----------+-----------------+
|  1 | SVM                |   11 |    5 |    2 |    0 |        16 |           2 |      0.889 |       0.846 |    1     | 0.917 |     0.857 |           0.846 |
+----+--------------------+------+------+------+------+-----------+-------------+------------+-------------+----------+-------+-----------+-

In [55]:
# E_I Classifiers Test Report: SMOTE and ENN and additional ADASYN Oversample + Random Undersample
resampled_class_classifiers_report('E_I', 'SMOTEENN & ADASYN + RandomUnderSampler')

E_I Classifiers Report with resampling: SMOTEENN & ADASYN + RandomUnderSampler
+----+--------------------+------+------+------+------+-----------+-------------+------------+-------------+----------+-------+-----------+-----------------+
|    | Classifier         |   TP |   TN |   FP |   FN |   Correct |   Incorrect |   Accuracy |   Precision |   Recall |    F1 |   ROC AUC |   Avg Precision |
|  0 | Dummy              | 1101 |    0 |  771 |    0 |      1101 |         771 |      0.588 |       0.588 |    1     | 0.741 |     0.5   |           0.588 |
+----+--------------------+------+------+------+------+-----------+-------------+------------+-------------+----------+-------+-----------+-----------------+
|  1 | SVM                | 1101 |  771 |    0 |    0 |      1872 |           0 |      1     |       1     |    1     | 1     |     1     |           1     |
+----+--------------------+------+------+------+------+-----------+-------------+------------+-------------+----------+-------+----

In [None]:
##############
# S_N ########
##############

In [56]:
# S_N Classifiers Test Report: Random Oversample
resampled_class_classifiers_report('S_N', 'RandomOverSampler')

S_N Classifiers Report with resampling: RandomOverSampler
+----+--------------------+------+------+------+------+-----------+-------------+------------+-------------+----------+-------+-----------+-----------------+
|    | Classifier         |   TP |   TN |   FP |   FN |   Correct |   Incorrect |   Accuracy |   Precision |   Recall |    F1 |   ROC AUC |   Avg Precision |
|  0 | Dummy              |    0 | 1496 |    0 | 1346 |      1496 |        1346 |      0.526 |       0     |    0     | 0     |     0.5   |           0.474 |
+----+--------------------+------+------+------+------+-----------+-------------+------------+-------------+----------+-------+-----------+-----------------+
|  1 | SVM                | 1321 | 1437 |   59 |   25 |      2758 |          84 |      0.97  |       0.957 |    0.981 | 0.969 |     0.971 |           0.948 |
+----+--------------------+------+------+------+------+-----------+-------------+------------+-------------+----------+-------+-----------+-------------

In [57]:
# S_N Classifiers Test Report: SMOTE
resampled_class_classifiers_report('S_N', 'SMOTE')

S_N Classifiers Report with resampling: SMOTE
+----+--------------------+------+------+------+------+-----------+-------------+------------+-------------+----------+-------+-----------+-----------------+
|    | Classifier         |   TP |   TN |   FP |   FN |   Correct |   Incorrect |   Accuracy |   Precision |   Recall |    F1 |   ROC AUC |   Avg Precision |
|  0 | Dummy              |    0 | 1496 |    0 | 1496 |      1496 |        1496 |      0.5   |       0     |    0     | 0     |     0.5   |           0.5   |
+----+--------------------+------+------+------+------+-----------+-------------+------------+-------------+----------+-------+-----------+-----------------+
|  1 | SVM                | 1332 | 1492 |    4 |  164 |      2824 |         168 |      0.944 |       0.997 |    0.89  | 0.941 |     0.944 |           0.943 |
+----+--------------------+------+------+------+------+-----------+-------------+------------+-------------+----------+-------+-----------+-----------------+
|  2 |

In [58]:
# S_N Classifiers Test Report: SMOTE Oversample + Random Undersample
resampled_class_classifiers_report('S_N', 'SMOTE + RandomUnderSampler')

S_N Classifiers Report with resampling: SMOTE + RandomUnderSampler
+----+--------------------+------+------+------+------+-----------+-------------+------------+-------------+----------+-------+-----------+-----------------+
|    | Classifier         |   TP |   TN |   FP |   FN |   Correct |   Incorrect |   Accuracy |   Precision |   Recall |    F1 |   ROC AUC |   Avg Precision |
|  0 | Dummy              |    0 | 1496 |    0 | 1346 |      1496 |        1346 |      0.526 |       0     |    0     | 0     |     0.5   |           0.474 |
+----+--------------------+------+------+------+------+-----------+-------------+------------+-------------+----------+-------+-----------+-----------------+
|  1 | SVM                | 1192 | 1493 |    3 |  154 |      2685 |         157 |      0.945 |       0.997 |    0.886 | 0.938 |     0.942 |           0.938 |
+----+--------------------+------+------+------+------+-----------+-------------+------------+-------------+----------+-------+-----------+----

In [59]:
# S_N Classifiers Test Report: SMOTE Oversample + Random Undersample
resampled_class_classifiers_report('S_N', 'BorderlineSMOTE')  

S_N Classifiers Report with resampling: BorderlineSMOTE
+----+--------------------+------+------+------+------+-----------+-------------+------------+-------------+----------+-------+-----------+-----------------+
|    | Classifier         |   TP |   TN |   FP |   FN |   Correct |   Incorrect |   Accuracy |   Precision |   Recall |    F1 |   ROC AUC |   Avg Precision |
|  0 | Dummy              |    0 | 1496 |    0 | 1496 |      1496 |        1496 |      0.5   |       0     |    0     | 0     |     0.5   |           0.5   |
+----+--------------------+------+------+------+------+-----------+-------------+------------+-------------+----------+-------+-----------+-----------------+
|  1 | SVM                | 1353 | 1490 |    6 |  143 |      2843 |         149 |      0.95  |       0.996 |    0.904 | 0.948 |     0.95  |           0.948 |
+----+--------------------+------+------+------+------+-----------+-------------+------------+-------------+----------+-------+-----------+---------------

In [60]:
# S_N Classifiers Test Report: SMOTE and ENN Oversample + Random Undersample
resampled_class_classifiers_report('S_N', 'SMOTEENN + RandomUnderSampler')    

S_N Classifiers Report with resampling: SMOTEENN + RandomUnderSampler
+----+--------------------+------+------+------+------+-----------+-------------+------------+-------------+----------+-------+-----------+-----------------+
|    | Classifier         |   TP |   TN |   FP |   FN |   Correct |   Incorrect |   Accuracy |   Precision |   Recall |    F1 |   ROC AUC |   Avg Precision |
|  0 | Dummy              |    5 |    0 |    3 |    0 |         5 |           3 |      0.625 |       0.625 |      1   | 0.769 |     0.5   |           0.625 |
+----+--------------------+------+------+------+------+-----------+-------------+------------+-------------+----------+-------+-----------+-----------------+
|  1 | SVM                |    4 |    3 |    0 |    1 |         7 |           1 |      0.875 |       1     |      0.8 | 0.889 |     0.9   |           0.925 |
+----+--------------------+------+------+------+------+-----------+-------------+------------+-------------+----------+-------+-----------+-

In [61]:
# S_N Classifiers Test Report: SMOTE and ENN and additional ADASYN Oversample + Random Undersample
resampled_class_classifiers_report('S_N', 'SMOTEENN & ADASYN + RandomUnderSampler')

S_N Classifiers Report with resampling: SMOTEENN & ADASYN + RandomUnderSampler
+----+--------------------+------+------+------+------+-----------+-------------+------------+-------------+----------+-------+-----------+-----------------+
|    | Classifier         |   TP |   TN |   FP |   FN |   Correct |   Incorrect |   Accuracy |   Precision |   Recall |    F1 |   ROC AUC |   Avg Precision |
|  0 | Dummy              | 1275 |    0 |  892 |    0 |      1275 |         892 |      0.588 |       0.588 |    1     | 0.741 |     0.5   |           0.588 |
+----+--------------------+------+------+------+------+-----------+-------------+------------+-------------+----------+-------+-----------+-----------------+
|  1 | SVM                | 1275 |  892 |    0 |    0 |      2167 |           0 |      1     |       1     |    1     | 1     |     1     |           1     |
+----+--------------------+------+------+------+------+-----------+-------------+------------+-------------+----------+-------+----

In [None]:
##############
# T_F ########
##############

In [39]:
# T_F Classifiers Test Report: Random Oversample
noneresampled_class_classifiers_report('T_F')

T_F Classifiers Report
+----+--------------------+------+------+------+------+-----------+-------------+------------+-------------+----------+-------+-----------+-----------------+
|    | Classifier         |   TP |   TN |   FP |   FN |   Correct |   Incorrect |   Accuracy |   Precision |   Recall |    F1 |   ROC AUC |   Avg Precision |
|  0 | Dummy              |    0 |  939 |    0 |  796 |       939 |         796 |      0.541 |       0     |    0     | 0     |     0.5   |           0.459 |
+----+--------------------+------+------+------+------+-----------+-------------+------------+-------------+----------+-------+-----------+-----------------+
|  1 | SVM                |  588 |  760 |  179 |  208 |      1348 |         387 |      0.777 |       0.767 |    0.739 | 0.752 |     0.774 |           0.686 |
+----+--------------------+------+------+------+------+-----------+-------------+------------+-------------+----------+-------+-----------+-----------------+
|  2 | LGBM               |  

In [None]:
##############
# J_P ########
##############

In [62]:
# J_P Classifiers Test Report: Random Oversample
resampled_class_classifiers_report('J_P', 'RandomOverSampler')

J_P Classifiers Report with resampling: RandomOverSampler
+----+--------------------+------+------+------+------+-----------+-------------+------------+-------------+----------+-------+-----------+-----------------+
|    | Classifier         |   TP |   TN |   FP |   FN |   Correct |   Incorrect |   Accuracy |   Precision |   Recall |    F1 |   ROC AUC |   Avg Precision |
|  0 | Dummy              |    0 | 1049 |    0 |  943 |      1049 |         943 |      0.527 |       0     |    0     | 0     |     0.5   |           0.473 |
+----+--------------------+------+------+------+------+-----------+-------------+------------+-------------+----------+-------+-----------+-----------------+
|  1 | SVM                |  622 |  811 |  238 |  321 |      1433 |         559 |      0.719 |       0.723 |    0.66  | 0.69  |     0.716 |           0.638 |
+----+--------------------+------+------+------+------+-----------+-------------+------------+-------------+----------+-------+-----------+-------------

In [63]:
# J_P Classifiers Test Report: SMOTE
resampled_class_classifiers_report('J_P', 'SMOTE')

J_P Classifiers Report with resampling: SMOTE
+----+--------------------+------+------+------+------+-----------+-------------+------------+-------------+----------+-------+-----------+-----------------+
|    | Classifier         |   TP |   TN |   FP |   FN |   Correct |   Incorrect |   Accuracy |   Precision |   Recall |    F1 |   ROC AUC |   Avg Precision |
|  0 | Dummy              | 1048 |    0 | 1049 |    0 |      1048 |        1049 |      0.5   |       0.5   |    1     | 0.666 |     0.5   |           0.5   |
+----+--------------------+------+------+------+------+-----------+-------------+------------+-------------+----------+-------+-----------+-----------------+
|  1 | SVM                |  697 |  867 |  182 |  351 |      1564 |         533 |      0.746 |       0.793 |    0.665 | 0.723 |     0.746 |           0.695 |
+----+--------------------+------+------+------+------+-----------+-------------+------------+-------------+----------+-------+-----------+-----------------+
|  2 |

In [64]:
# J_P Classifiers Test Report: SMOTE Oversample + Random Undersample
resampled_class_classifiers_report('J_P', 'SMOTE + RandomUnderSampler')

J_P Classifiers Report with resampling: SMOTE + RandomUnderSampler
+----+--------------------+------+------+------+------+-----------+-------------+------------+-------------+----------+-------+-----------+-----------------+
|    | Classifier         |   TP |   TN |   FP |   FN |   Correct |   Incorrect |   Accuracy |   Precision |   Recall |    F1 |   ROC AUC |   Avg Precision |
|  0 | Dummy              |    0 | 1048 |    0 |  944 |      1048 |         944 |      0.526 |       0     |    0     | 0     |     0.5   |           0.474 |
+----+--------------------+------+------+------+------+-----------+-------------+------------+-------------+----------+-------+-----------+-----------------+
|  1 | SVM                |  586 |  869 |  179 |  358 |      1455 |         537 |      0.73  |       0.766 |    0.621 | 0.686 |     0.725 |           0.655 |
+----+--------------------+------+------+------+------+-----------+-------------+------------+-------------+----------+-------+-----------+----

In [65]:
# J_P Classifiers Test Report: SMOTE Oversample + Random Undersample
resampled_class_classifiers_report('J_P', 'BorderlineSMOTE')  

J_P Classifiers Report with resampling: BorderlineSMOTE
+----+--------------------+------+------+------+------+-----------+-------------+------------+-------------+----------+-------+-----------+-----------------+
|    | Classifier         |   TP |   TN |   FP |   FN |   Correct |   Incorrect |   Accuracy |   Precision |   Recall |    F1 |   ROC AUC |   Avg Precision |
|  0 | Dummy              | 1048 |    0 | 1049 |    0 |      1048 |        1049 |      0.5   |       0.5   |    1     | 0.666 |     0.5   |           0.5   |
+----+--------------------+------+------+------+------+-----------+-------------+------------+-------------+----------+-------+-----------+-----------------+
|  1 | SVM                |  697 |  864 |  185 |  351 |      1561 |         536 |      0.744 |       0.79  |    0.665 | 0.722 |     0.744 |           0.693 |
+----+--------------------+------+------+------+------+-----------+-------------+------------+-------------+----------+-------+-----------+---------------

In [66]:
# J_P Classifiers Test Report: SMOTE and ENN Oversample + Random Undersample
resampled_class_classifiers_report('J_P', 'SMOTEENN + RandomUnderSampler')    

J_P Classifiers Report with resampling: SMOTEENN + RandomUnderSampler
+----+--------------------+------+------+------+------+-----------+-------------+------------+-------------+----------+-------+-----------+-----------------+
|    | Classifier         |   TP |   TN |   FP |   FN |   Correct |   Incorrect |   Accuracy |   Precision |   Recall |    F1 |   ROC AUC |   Avg Precision |
|  0 | Dummy              |    4 |    0 |    2 |    0 |         4 |           2 |      0.667 |       0.667 |     1    | 0.8   |     0.5   |           0.667 |
+----+--------------------+------+------+------+------+-----------+-------------+------------+-------------+----------+-------+-----------+-----------------+
|  1 | SVM                |    4 |    1 |    1 |    0 |         5 |           1 |      0.833 |       0.8   |     1    | 0.889 |     0.75  |           0.8   |
+----+--------------------+------+------+------+------+-----------+-------------+------------+-------------+----------+-------+-----------+-

In [67]:
# J_P Classifiers Test Report: SMOTE and ENN and additional ADASYN Oversample + Random Undersample
resampled_class_classifiers_report('J_P', 'SMOTEENN & ADASYN + RandomUnderSampler')

J_P Classifiers Report with resampling: SMOTEENN & ADASYN + RandomUnderSampler
+----+--------------------+------+------+------+------+-----------+-------------+------------+-------------+----------+-------+-----------+-----------------+
|    | Classifier         |   TP |   TN |   FP |   FN |   Correct |   Incorrect |   Accuracy |   Precision |   Recall |    F1 |   ROC AUC |   Avg Precision |
|  0 | Dummy              |  705 |    0 |  493 |    0 |       705 |         493 |      0.588 |       0.588 |    1     | 0.741 |     0.5   |           0.588 |
+----+--------------------+------+------+------+------+-----------+-------------+------------+-------------+----------+-------+-----------+-----------------+
|  1 | SVM                |  705 |  493 |    0 |    0 |      1198 |           0 |      1     |       1     |    1     | 1     |     1     |           1     |
+----+--------------------+------+------+------+------+-----------+-------------+------------+-------------+----------+-------+----

In [18]:
#####################################
# Class Model Performance Tests #####
#####################################

def class_classifier_model_test(class_name, classifier, resampler=None):
    X_train, X_test, y_train, y_test = train_test_split_data(class_name, resampler)
    
    model_classifier = classifiers[classifier]()

    if resampler:
      print('{:s} Class Classifier: {:s} Model Test Report With Resampling: {:s}'.format(class_name, classifier, resampler), end="\n\n")
    else:
      print('{:s} Class Classifier: {:s} Model Test Report'.format(class_name, classifier), end="\n\n")

    # Cross-Validation and Performance Stability
    ei_cv_scores = cross_val_score(model_classifier, X_train, y_train, cv=5, scoring='f1')
    
    print('{:s} F1 Cross-Validation Scores: {:s}'.format(classifier, str(ei_cv_scores)))
    print('{:s} F1 Mean: {:n}'.format(classifier, ei_cv_scores.mean()), end="\n\n")

    # Performance on Test Data
    model_classifier.fit(X_train, y_train)

    y_pred = model_classifier.predict(X_test)

    model_report = classification_report(y_test, y_pred, output_dict=True)

    model_report_headers = ["Class", "Precision", "Recall", "F1-Score", "Support"]
    model_report_rows = [
        ["0", model_report["0"]["precision"], model_report["0"]["recall"], model_report["0"]["f1-score"], model_report["0"]["support"]],
        ["1", model_report["1"]["precision"], model_report["1"]["recall"], model_report["1"]["f1-score"], model_report["1"]["support"]],
        ["Accuracy", "", "", model_report["accuracy"], ""],
        ["Macro Avg", model_report["macro avg"]["precision"], model_report["macro avg"]["recall"], model_report["macro avg"]["f1-score"], model_report["macro avg"]["support"]],
        ["Weighted Avg", model_report["weighted avg"]["precision"], model_report["weighted avg"]["recall"], model_report["weighted avg"]["f1-score"], model_report["weighted avg"]["support"]],
    ]

    print('')
    print(tabulate(model_report_rows, headers=model_report_headers, tablefmt="grid"))

In [89]:
# E_I class model tests
class_classifier_model_test('E_I', 'SVM', 'BorderlineSMOTE')

E_I Class Classifier: SVM Model Test Report With Resampling: BorderlineSMOTE

SVM F1 Cross-Validation Scores: [0.86052496 0.87315334 0.86717557 0.8428645  0.85390947]
SVM F1 Mean: 0.859526


+--------------+--------------------+--------------------+------------+-----------+
| Class        | Precision          | Recall             |   F1-Score | Support   |
| 0            | 0.8433575677461996 | 0.9550898203592815 |   0.895753 | 1336      |
+--------------+--------------------+--------------------+------------+-----------+
| 1            | 0.9481865284974094 | 0.8224719101123595 |   0.880866 | 1335      |
+--------------+--------------------+--------------------+------------+-----------+
| Accuracy     |                    |                    |   0.888806 |           |
+--------------+--------------------+--------------------+------------+-----------+
| Macro Avg    | 0.8957720481218046 | 0.8887808652358204 |   0.88831  | 2671      |
+--------------+--------------------+----------------

In [90]:
class_classifier_model_test('E_I', 'VotingClassifier', 'BorderlineSMOTE')

E_I Class Classifier: VotingClassifier Model Test Report With Resampling: BorderlineSMOTE

VotingClassifier F1 Cross-Validation Scores: [0.86163203 0.86839482 0.86995074 0.84580153 0.86453577]
VotingClassifier F1 Mean: 0.862063


+--------------+--------------------+--------------------+------------+-----------+
| Class        | Precision          | Recall             |   F1-Score | Support   |
| 0            | 0.8334430546412114 | 0.9476047904191617 |   0.886865 | 1336      |
+--------------+--------------------+--------------------+------------+-----------+
| 1            | 0.9392361111111112 | 0.8104868913857678 |   0.870125 | 1335      |
+--------------+--------------------+--------------------+------------+-----------+
| Accuracy     |                    |                    |   0.879072 |           |
+--------------+--------------------+--------------------+------------+-----------+
| Macro Avg    | 0.8863395828761613 | 0.8790458409024647 |   0.878495 | 2671      |
+-------------

In [91]:
class_classifier_model_test('E_I', 'RandomForest', 'RandomOverSampler')

E_I Class Classifier: RandomForest Model Test Report With Resampling: RandomOverSampler

RandomForest F1 Cross-Validation Scores: [0.9209508  0.909699   0.91349862 0.91396648 0.91214326]
RandomForest F1 Mean: 0.914052


+--------------+--------------------+--------------------+------------+-----------+
| Class        | Precision          | Recall             |   F1-Score | Support   |
| 0            | 0.9341359773371105 | 0.9880149812734083 |   0.96032  | 1335      |
+--------------+--------------------+--------------------+------------+-----------+
| 1            | 0.9857777777777778 | 0.9226289517470881 |   0.953159 | 1202      |
+--------------+--------------------+--------------------+------------+-----------+
| Accuracy     |                    |                    |   0.957036 |           |
+--------------+--------------------+--------------------+------------+-----------+
| Macro Avg    | 0.9599568775574441 | 0.9553219665102481 |   0.956739 | 2537      |
+--------------+--------

In [92]:
# S_N class model tests
class_classifier_model_test('S_N', 'SVM', 'BorderlineSMOTE')

S_N Class Classifier: SVM Model Test Report With Resampling: BorderlineSMOTE

SVM F1 Cross-Validation Scores: [0.94236692 0.94820018 0.94371152 0.93998235 0.93605684]
SVM F1 Mean: 0.942064


+--------------+--------------------+--------------------+------------+-----------+
| Class        | Precision          | Recall             |   F1-Score | Support   |
| 0            | 0.9124311083894673 | 0.9959893048128342 |   0.952381 | 1496      |
+--------------+--------------------+--------------------+------------+-----------+
| 1            | 0.9955849889624724 | 0.9044117647058824 |   0.947811 | 1496      |
+--------------+--------------------+--------------------+------------+-----------+
| Accuracy     |                    |                    |   0.950201 |           |
+--------------+--------------------+--------------------+------------+-----------+
| Macro Avg    | 0.9540080486759699 | 0.9502005347593583 |   0.950096 | 2992      |
+--------------+--------------------+----------------

In [93]:
class_classifier_model_test('S_N', 'VotingClassifier', 'RandomOverSampler')

S_N Class Classifier: VotingClassifier Model Test Report With Resampling: RandomOverSampler

VotingClassifier F1 Cross-Validation Scores: [0.9656121  0.97204101 0.96357769 0.97322253 0.96244784]
VotingClassifier F1 Mean: 0.96738


+--------------+--------------------+--------------------+------------+-----------+
| Class        | Precision          | Recall             |   F1-Score | Support   |
| 0            | 0.9911383776414451 | 0.9719251336898396 |   0.981438 | 1496      |
+--------------+--------------------+--------------------+------------+-----------+
| 1            | 0.9694545454545455 | 0.9903417533432393 |   0.979787 | 1346      |
+--------------+--------------------+--------------------+------------+-----------+
| Accuracy     |                    |                    |   0.980647 |           |
+--------------+--------------------+--------------------+------------+-----------+
| Macro Avg    | 0.9802964615479952 | 0.9811334435165394 |   0.980612 | 2842      |
+------------

In [94]:
class_classifier_model_test('S_N', 'RandomForest', 'RandomOverSampler')

S_N Class Classifier: RandomForest Model Test Report With Resampling: RandomOverSampler

RandomForest F1 Cross-Validation Scores: [0.98455779 0.98780488 0.98403756 0.98876404 0.98541176]
RandomForest F1 Mean: 0.986115


+--------------+--------------------+--------------------+------------+-----------+
| Class        | Precision          | Recall             |   F1-Score | Support   |
| 0            | 0.9946702198534311 | 0.9979946524064172 |   0.99633  | 1496      |
+--------------+--------------------+--------------------+------------+-----------+
| 1            | 0.9977628635346756 | 0.9940564635958395 |   0.995906 | 1346      |
+--------------+--------------------+--------------------+------------+-----------+
| Accuracy     |                    |                    |   0.996129 |           |
+--------------+--------------------+--------------------+------------+-----------+
| Macro Avg    | 0.9962165416940534 | 0.9960255580011284 |   0.996118 | 2842      |
+--------------+--------

In [100]:
# T_F class model tests
class_classifier_model_test('T_F', 'SVM')

T_F Class Classifier: SVM Model Test Report

SVM F1 Cross-Validation Scores: [0.72859451 0.74718196 0.73413655 0.73642173 0.76212833]
SVM F1 Mean: 0.741693


+--------------+--------------------+--------------------+------------+-----------+
| Class        | Precision          | Recall             |   F1-Score | Support   |
| 0            | 0.7851239669421488 | 0.8093716719914803 |   0.797063 | 939       |
+--------------+--------------------+--------------------+------------+-----------+
| 1            | 0.7666232073011734 | 0.7386934673366834 |   0.752399 | 796       |
+--------------+--------------------+--------------------+------------+-----------+
| Accuracy     |                    |                    |   0.776945 |           |
+--------------+--------------------+--------------------+------------+-----------+
| Macro Avg    | 0.7758735871216611 | 0.7740325696640818 |   0.774731 | 1735      |
+--------------+--------------------+--------------------+------------+-----------+
| 

In [101]:
class_classifier_model_test('T_F', 'VotingClassifier')

T_F Class Classifier: VotingClassifier Model Test Report

VotingClassifier F1 Cross-Validation Scores: [0.72904801 0.74796748 0.72168285 0.728      0.74418605]
VotingClassifier F1 Mean: 0.734177


+--------------+--------------------+--------------------+------------+-----------+
| Class        | Precision          | Recall             |   F1-Score | Support   |
| 0            | 0.778118609406953  | 0.8104366347177849 |   0.793949 | 939       |
+--------------+--------------------+--------------------+------------+-----------+
| 1            | 0.7648612945838837 | 0.7273869346733668 |   0.745654 | 796       |
+--------------+--------------------+--------------------+------------+-----------+
| Accuracy     |                    |                    |   0.772334 |           |
+--------------+--------------------+--------------------+------------+-----------+
| Macro Avg    | 0.7714899519954184 | 0.7689117846955759 |   0.769801 | 1735      |
+--------------+--------------------+----------

In [102]:
# J_P class model tests
class_classifier_model_test('J_P', 'SVM', 'SMOTE')

J_P Class Classifier: SVM Model Test Report With Resampling: SMOTE

SVM F1 Cross-Validation Scores: [0.69457162 0.70437018 0.7073329  0.70519481 0.71824629]
SVM F1 Mean: 0.705943


+--------------+--------------------+--------------------+------------+-----------+
| Class        | Precision          | Recall             |   F1-Score | Support   |
| 0            | 0.7118226600985221 | 0.8265014299332698 |   0.764888 | 1049      |
+--------------+--------------------+--------------------+------------+-----------+
| 1            | 0.7929465301478953 | 0.6650763358778626 |   0.723404 | 1048      |
+--------------+--------------------+--------------------+------------+-----------+
| Accuracy     |                    |                    |   0.745827 |           |
+--------------+--------------------+--------------------+------------+-----------+
| Macro Avg    | 0.7523845951232087 | 0.7457888829055662 |   0.744146 | 2097      |
+--------------+--------------------+--------------------+-----

In [103]:
class_classifier_model_test('J_P', 'VotingClassifier', 'SMOTE')

J_P Class Classifier: VotingClassifier Model Test Report With Resampling: SMOTE

VotingClassifier F1 Cross-Validation Scores: [0.69827034 0.73142857 0.71212121 0.71245186 0.72646873]
VotingClassifier F1 Mean: 0.716148


+--------------+--------------------+--------------------+------------+-----------+
| Class        | Precision          | Recall             |   F1-Score | Support   |
| 0            | 0.7179916317991631 | 0.8179218303145853 |   0.764706 | 1049      |
+--------------+--------------------+--------------------+------------+-----------+
| 1            | 0.7882483370288248 | 0.6784351145038168 |   0.729231 | 1048      |
+--------------+--------------------+--------------------+------------+-----------+
| Accuracy     |                    |                    |   0.748212 |           |
+--------------+--------------------+--------------------+------------+-----------+
| Macro Avg    | 0.753119984413994  | 0.7481784724092011 |   0.746968 | 2097      |
+--------------+--------

In [19]:
class_classifier_model_test('J_P', 'VotingClassifier', 'BorderlineSMOTE')

J_P Class Classifier: VotingClassifier Model Test Report With Resampling: BorderlineSMOTE

VotingClassifier F1 Cross-Validation Scores: [0.70655633 0.72267846 0.70580808 0.72437938 0.73241206]
VotingClassifier F1 Mean: 0.718367


+--------------+--------------------+--------------------+------------+-----------+
| Class        | Precision          | Recall             |   F1-Score | Support   |
| 0            | 0.7148972602739726 | 0.7959961868446139 |   0.75327  | 1049      |
+--------------+--------------------+--------------------+------------+-----------+
| 1            | 0.7696447793326158 | 0.6822519083969466 |   0.723318 | 1048      |
+--------------+--------------------+--------------------+------------+-----------+
| Accuracy     |                    |                    |   0.739151 |           |
+--------------+--------------------+--------------------+------------+-----------+
| Macro Avg    | 0.7422710198032942 | 0.7391240476207803 |   0.738294 | 2097      |
+-------------