In [46]:
import os
import numpy as np
import pandas as pd

from imblearn.pipeline import make_pipeline as imb_make_pipeline
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN, BorderlineSMOTE
# from imblearn.combine import SMOTEENN, SMOTETomek # Tried these, didn't work well
from imblearn.ensemble import BalancedBaggingClassifier, BalancedRandomForestClassifier, EasyEnsembleClassifier, RUSBoostClassifier
from imblearn.metrics import classification_report_imbalanced, make_index_balanced_accuracy

from sklearn.pipeline import make_pipeline

from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer #, SimpleImputer KNNImputer

# from sklearn.decomposition import PCA

from sklearn.model_selection import train_test_split, cross_validate, cross_val_score

from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.linear_model import LogisticRegression, RidgeClassifier #, LinearRegression, MultiTaskLassoCV
from sklearn.tree import DecisionTreeClassifier #, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier #, RandomForestRegressor
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import classification_report, make_scorer, accuracy_score, precision_score, recall_score

import warnings
warnings.filterwarnings('ignore')

In [47]:
file_path = os.path.join("clean_student_data.csv")

df = pd.read_csv(file_path, index_col="STU_ID")

# EDA

In [3]:
df.dtypes

BYSEX       int64
BYRACE      int64
BYSTLANG    int64
BYPARED     int64
BYINCOME    int64
BYURBAN     int64
BYREGION    int64
BYRISKFC    int64
BYS34A      int64
BYS34B      int64
BYWRKHRS    int64
BYS42       int64
BYS43       int64
BYTVVIGM    int64
BYS46B      int64
BYS44C      int64
BYS20E      int64
BYS87C      int64
BYS20D      int64
BYS23C      int64
BYS37       int64
BYS27I      int64
BYS90D      int64
BYS38A      int64
BYS20J      int64
BYS24C      int64
BYS24D      int64
BYS54I      int64
BYS84D      int64
BYS84I      int64
BYS85A      int64
F2HSSTAT    int64
F2EVERDO    int64
F1RGPP2     int64
dtype: object

In [4]:
for col in df.columns:
    print(df[col].value_counts())

 0    7095
 1    6994
-1     707
Name: BYSEX, dtype: int64
 5    8058
 4    2008
 3    1779
 2    1346
-1     815
 0     673
 1     117
Name: BYRACE, dtype: int64
 1    11638
 0     2343
-1      815
Name: BYSTLANG, dtype: int64
 6    3193
 2    2813
 7    1629
 5    1597
 3    1539
 4    1477
 8     959
 1     841
-1     748
Name: BYPARED, dtype: int64
5    3049
4    2757
3    2642
6    2001
7    1679
2    1330
1     741
8     523
0      74
Name: BYINCOME, dtype: int64
2    7238
1    4877
3    2681
Name: BYURBAN, dtype: int64
3    5364
2    3749
4    3143
1    2540
Name: BYREGION, dtype: int64
 0    4695
-1    3719
 1    3536
 2    1740
 3     750
 4     293
 5      63
Name: BYRISKFC, dtype: int64
 1    8992
 2    2359
-1    1310
 0     872
 5     506
 3     487
 4     270
Name: BYS34A, dtype: int64
 1    7527
 2    2901
 3    1244
-1    1197
 0     895
 4     552
 5     245
 6     235
Name: BYS34B, dtype: int64
 0    7396
-1    3208
 2    1041
 1     905
 4     748
 3     630
 5     3

# Create X and y arrays

In [5]:
X = df[[column for column in df.columns if column not in ["F2HSSTAT", "F2EVERDO", "F1RGPP2"]]]

y_graduate = df["F2HSSTAT"]
y_dropout = df["F2EVERDO"]
y_gpa = df["F1RGPP2"]

all_y = {
    "y_graduate": y_graduate,
    "y_dropout": y_dropout,
    "y_gpa": y_gpa
}

# Create the pipeline for categorical data

In [6]:
categorical_features = [column for column in df.columns if column not in ["F2HSSTAT", "F2EVERDO", "F1RGPP2"]]
categorical_transformer = make_pipeline(
    IterativeImputer(estimator=RandomForestClassifier(), missing_values=-1),
    OneHotEncoder(drop="first", sparse=False)
)

# Set Up Train-Test-Split Variables and Imbalanced Learn Models

In [40]:
X_train, X_test, y_graduate_train, y_graduate_test = train_test_split(X, y_graduate, test_size=0.2, random_state=37, stratify=y_graduate)
X_train, X_test, y_dropout_train, y_dropout_test = train_test_split(X, y_dropout, test_size=0.2, random_state=37, stratify=y_dropout)
X_train, X_test, y_gpa_train, y_gpa_test = train_test_split(X, y_gpa, test_size=0.2, random_state=37, stratify=y_gpa)

imb_models = {
    "RandomUnderSampler": RandomUnderSampler(),
    "RandomOverSampler": RandomOverSampler(),
    "SMOTE": SMOTE(),
    "ADASYN": ADASYN()
}

## Functions to fit and predict models and return report

In [48]:
def train_and_score_model_tts(clf, X_train, X_test, y_train, y_test):

    clf.fit(X_train, y_train)
    
    y_pred = clf.predict(X_test)
    report = classification_report_imbalanced(y_test, y_pred)
    
    return report


def print_report(model, all_y, X=None):
    
    for y in all_y:
        print("=" * 55, "\n", y)
        report = train_and_score_model_tts(model, X_train, X_test, eval(y + "_train"), eval(y + "_test"))
        print(report)
    
    print("=" * 55)
    
    
def print_imb_report(imb_model, clf, all_y, reporter=print_report, X=None):
    print("=" * 55)
    print(imb_model)
    reporter(clf, all_y, X)

## Predict using RandomForestClassifier without imbalanced learn

In [49]:
clf = make_pipeline(
    StandardScaler(),
    RandomForestClassifier()
)

print_report(clf, all_y)

 y_graduate
                   pre       rec       spe        f1       geo       iba       sup

          0       0.00      0.00      1.00      0.00      0.00      0.00       182
          1       0.94      1.00      0.00      0.97      0.00      0.00      2778

avg / total       0.88      0.94      0.06      0.91      0.00      0.00      2960

 y_dropout
                   pre       rec       spe        f1       geo       iba       sup

          0       0.90      1.00      0.00      0.95      0.00      0.00      2658
          1       0.00      0.00      1.00      0.00      0.00      0.00       302

avg / total       0.81      0.90      0.10      0.85      0.00      0.00      2960

 y_gpa
                   pre       rec       spe        f1       geo       iba       sup

          0       0.29      0.02      1.00      0.04      0.15      0.02       208
          1       0.54      0.63      0.76      0.58      0.69      0.47       925
          2       0.57      0.66      0.61      0.

## Predict using RandomForestClassifier with imbalanced learn

In [50]:
for imb_model in imb_models:
    
    rus_clf = imb_make_pipeline(
        StandardScaler(),
        imb_models[imb_model],
        RandomForestClassifier()
    )

    print_imb_report(imb_model, rus_clf, all_y)

RandomUnderSampler
 y_graduate
                   pre       rec       spe        f1       geo       iba       sup

          0       0.06      0.54      0.48      0.11      0.51      0.26       182
          1       0.94      0.48      0.54      0.64      0.51      0.26      2778

avg / total       0.89      0.48      0.53      0.60      0.51      0.26      2960

 y_dropout
                   pre       rec       spe        f1       geo       iba       sup

          0       0.90      0.52      0.49      0.66      0.50      0.25      2658
          1       0.10      0.49      0.52      0.17      0.50      0.25       302

avg / total       0.82      0.51      0.49      0.61      0.50      0.25      2960

 y_gpa
                   pre       rec       spe        f1       geo       iba       sup

          0       0.21      0.63      0.82      0.31      0.72      0.51       208
          1       0.49      0.38      0.82      0.43      0.56      0.30       925
          2       0.59      0.3

## TRAIN-TEST-SPLIT RESULTS: Does not do well.

# Cross Validation

In [33]:
def cv_report(model, all_y, X):
    
    cv_accuracy_lst = []
    cv_precision_lst = []
    cv_recall_lst = []
        
    for y in all_y:
        
        cv_dict = cross_validate(model, X, all_y[y], scoring={
            "Accuracy": "accuracy",
            "Weighted_Precision": make_scorer(precision_score, average="weighted", zero_division=0),
            "Weighted_Recall": make_scorer(recall_score, average="weighted", zero_division=0)
        })

        cv_accuracy = cv_dict["test_Accuracy"].mean()
        cv_accuracy_lst.append(cv_accuracy)
        
        cv_precision = cv_dict["test_Weighted_Precision"].mean()
        cv_precision_lst.append(cv_precision)
        
        cv_recall = cv_dict["test_Weighted_Recall"].mean()
        cv_recall_lst.append(cv_recall)
    
    return pd.DataFrame(data={"parameter": list(all_y.keys()), "accuracy": cv_accuracy_lst, "weighted_precision": cv_precision_lst, "weighted_recall": cv_recall_lst}).set_index("parameter")

## Models Not Utilizing Imbalanced Learn

### Null Model

In [34]:
dc = make_pipeline(
    StandardScaler(),
    DummyClassifier(strategy="prior")
)



cv_report(dc, all_y, X)

Unnamed: 0_level_0,accuracy,weighted_precision,weighted_recall
parameter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
y_graduate,0.938564,0.880903,0.938564
y_dropout,0.897945,0.806306,0.897945
y_gpa,0.437618,0.19151,0.437618


### Logistic Regression

In [13]:
lr = make_pipeline(
    StandardScaler(),
    LogisticRegression()
)

cv_report(lr, all_y, X)

Unnamed: 0_level_0,accuracy,weighted_precision,weighted_recall
parameter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
y_graduate,0.937483,0.887798,0.937483
y_dropout,0.895309,0.864977,0.895309
y_gpa,0.538657,0.537834,0.538657


### Decision Tree

In [14]:
dt = make_pipeline(
    StandardScaler(),
    DecisionTreeClassifier()
)

cv_report(dt, all_y, X)

Unnamed: 0_level_0,accuracy,weighted_precision,weighted_recall
parameter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
y_graduate,0.859487,0.894235,0.859487
y_dropout,0.796497,0.837608,0.796497
y_gpa,0.428898,0.433853,0.428898


### Random Forest

In [15]:
rf = make_pipeline(
    StandardScaler(),
    RandomForestClassifier()
)

cv_report(rf, all_y, X)

Unnamed: 0_level_0,accuracy,weighted_precision,weighted_recall
parameter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
y_graduate,0.920112,0.884553,0.920112
y_dropout,0.875303,0.85292,0.875303
y_gpa,0.545551,0.525191,0.545551


### Multi-layer Perceptron Classifier

In [16]:
mlp = make_pipeline(
    StandardScaler(),
    MLPClassifier()
)

cv_report(mlp, all_y, X)

Unnamed: 0_level_0,accuracy,weighted_precision,weighted_recall
parameter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
y_graduate,0.901458,0.898288,0.901458
y_dropout,0.842252,0.847651,0.842252
y_gpa,0.506825,0.50473,0.506825


## Models Utilizing Imbalanced Learn

### Null Model

In [17]:
dc = imb_make_pipeline(
    StandardScaler(),
    SMOTE(),
    DummyClassifier(strategy="prior")
)

print("=" * 55)
print("SMOTE")
cv_report(dc, all_y, X)

SMOTE


Unnamed: 0_level_0,accuracy,weighted_precision,weighted_recall
parameter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
y_graduate,0.061436,0.003774,0.061436
y_dropout,0.897945,0.806306,0.897945
y_gpa,0.070154,0.004922,0.070154


### Logistic Regression

In [18]:
for imb_model in imb_models:
    
    lr = imb_make_pipeline(
        StandardScaler(),
        imb_models[imb_model],
        LogisticRegression()
    )

    print("=" * 57, "\n", imb_model)
    print(cv_report(lr, all_y, X))

 RandomUnderSampler
            accuracy  weighted_precision  weighted_recall
parameter                                                
y_graduate  0.653213            0.920688         0.653213
y_dropout   0.674975            0.882582         0.674975
y_gpa       0.411395            0.490394         0.411395
 RandomOverSampler
            accuracy  weighted_precision  weighted_recall
parameter                                                
y_graduate  0.657404            0.921156         0.657404
y_dropout   0.673624            0.882638         0.673624
y_gpa       0.420046            0.497278         0.420046
 SMOTE
            accuracy  weighted_precision  weighted_recall
parameter                                                
y_graduate  0.667135            0.920996         0.667135
y_dropout   0.675246            0.882801         0.675246
y_gpa       0.423357            0.498874         0.423357
 ADASYN
            accuracy  weighted_precision  weighted_recall
parameter         

### Decision Tree

In [19]:
for imb_model in imb_models:
    
    dt = imb_make_pipeline(
        StandardScaler(),
        imb_models[imb_model],
        DecisionTreeClassifier()
    )

    print("=" * 57, "\n", imb_model)
    print(cv_report(dt, all_y, X))

 RandomUnderSampler
            accuracy  weighted_precision  weighted_recall
parameter                                                
y_graduate  0.597051            0.907008         0.597051
y_dropout   0.593468            0.851847         0.593468
y_gpa       0.379495            0.436668         0.379495
 RandomOverSampler
            accuracy  weighted_precision  weighted_recall
parameter                                                
y_graduate  0.859284            0.889869         0.859284
y_dropout   0.794806            0.832651         0.794806
y_gpa       0.425790            0.433195         0.425790
 SMOTE
            accuracy  weighted_precision  weighted_recall
parameter                                                
y_graduate  0.857730            0.893696         0.857730
y_dropout   0.791157            0.837618         0.791157
y_gpa       0.428831            0.437887         0.428831
 ADASYN
            accuracy  weighted_precision  weighted_recall
parameter         

### Random Forest

In [20]:
for imb_model in imb_models:
    
    rf = imb_make_pipeline(
        StandardScaler(),
        imb_models[imb_model],
        RandomForestClassifier()
    )

    print("=" * 57, "\n", imb_model)
    print(cv_report(rf, all_y, X))

 RandomUnderSampler
            accuracy  weighted_precision  weighted_recall
parameter                                                
y_graduate  0.652335            0.923621         0.652335
y_dropout   0.675855            0.881411         0.675855
y_gpa       0.427208            0.499547         0.427208
 RandomOverSampler
            accuracy  weighted_precision  weighted_recall
parameter                                                
y_graduate  0.911934            0.889602         0.911934
y_dropout   0.859893            0.834445         0.859893
y_gpa       0.534400            0.526896         0.534400
 SMOTE
            accuracy  weighted_precision  weighted_recall
parameter                                                
y_graduate  0.922140            0.892193         0.922140
y_dropout   0.872734            0.839342         0.872734
y_gpa       0.537035            0.528496         0.537035
 ADASYN
            accuracy  weighted_precision  weighted_recall
parameter         

### Balanced Bagging Classifier

In [21]:
rf = imb_make_pipeline(
    StandardScaler(),
    BalancedBaggingClassifier()
)

print(cv_report(rf, all_y, X))

            accuracy  weighted_precision  weighted_recall
parameter                                                
y_graduate  0.681397            0.916794         0.681397
y_dropout   0.750537            0.860599         0.750537
y_gpa       0.440523            0.492922         0.440523


### Balanced Random Forest

In [22]:
rf = imb_make_pipeline(
    StandardScaler(),
    BalancedRandomForestClassifier()
)

print(cv_report(rf, all_y, X))

            accuracy  weighted_precision  weighted_recall
parameter                                                
y_graduate  0.666393            0.926488         0.666393
y_dropout   0.661391            0.883749         0.661391
y_gpa       0.430656            0.510755         0.430656


### Easy Ensemble Classifier

In [23]:
rf = imb_make_pipeline(
    StandardScaler(),
    EasyEnsembleClassifier()
)

print(cv_report(rf, all_y, X))

            accuracy  weighted_precision  weighted_recall
parameter                                                
y_graduate  0.639222            0.927076         0.639222
y_dropout   0.653480            0.883229         0.653480
y_gpa       0.430927            0.512429         0.430927


### RUS Boost Classifier

In [24]:
rf = imb_make_pipeline(
    StandardScaler(),
    RUSBoostClassifier()
)

print(cv_report(rf, all_y, X))

            accuracy  weighted_precision  weighted_recall
parameter                                                
y_graduate  0.649563            0.922578         0.649563
y_dropout   0.651925            0.879158         0.651925
y_gpa       0.420652            0.506383         0.420652


### Multi-layer Perceptron Classifier with Oversampling

In [25]:
for imb_model in imb_models:

    mlp = imb_make_pipeline(
        StandardScaler(),
        imb_models[imb_model],
        MLPClassifier()
    )

    print("=" * 57, "\n", imb_model)
    print(cv_report(mlp, all_y, X))

 RandomUnderSampler
            accuracy  weighted_precision  weighted_recall
parameter                                                
y_graduate  0.638279            0.915792         0.638279
y_dropout   0.641249            0.869309         0.641249
y_gpa       0.399972            0.470423         0.399972
 RandomOverSampler
            accuracy  weighted_precision  weighted_recall
parameter                                                
y_graduate  0.825761            0.898413         0.825761
y_dropout   0.770541            0.846965         0.770541
y_gpa       0.456540            0.493162         0.456540
 SMOTE
            accuracy  weighted_precision  weighted_recall
parameter                                                
y_graduate  0.841238            0.895525         0.841238
y_dropout   0.783181            0.841437         0.783181
y_gpa       0.473505            0.502082         0.473505
 ADASYN
            accuracy  weighted_precision  weighted_recall
parameter         

In [26]:
for imb_model in imb_models:

    mlp = imb_make_pipeline(
        StandardScaler(),
        imb_models[imb_model],
        MLPClassifier()
    )

    print("=" * 57, "\n", imb_model)
    print(cv_report(mlp, all_y, X))

 RandomUnderSampler
            accuracy  weighted_precision  weighted_recall
parameter                                                
y_graduate  0.635640            0.917057         0.635640
y_dropout   0.631245            0.871489         0.631245
y_gpa       0.407813            0.470723         0.407813
 RandomOverSampler
            accuracy  weighted_precision  weighted_recall
parameter                                                
y_graduate  0.834073            0.898887         0.834073
y_dropout   0.769934            0.849259         0.769934
y_gpa       0.460461            0.490548         0.460461
 SMOTE
            accuracy  weighted_precision  weighted_recall
parameter                                                
y_graduate  0.845428            0.896582         0.845428
y_dropout   0.773177            0.837016         0.773177
y_gpa       0.463636            0.494678         0.463636
 ADASYN
            accuracy  weighted_precision  weighted_recall
parameter         

# Best Model