In [1]:
import os
import numpy as np
import pandas as pd

from random import randint

from imblearn.pipeline import make_pipeline as imb_make_pipeline
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.ensemble import BalancedBaggingClassifier, BalancedRandomForestClassifier, EasyEnsembleClassifier, RUSBoostClassifier
from imblearn.metrics import classification_report_imbalanced

from sklearn.pipeline import make_pipeline

from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from sklearn.model_selection import train_test_split, cross_validate

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import make_scorer, precision_score, recall_score, roc_auc_score

from joblib import dump

import warnings
warnings.filterwarnings('ignore')

In [2]:
file_path = os.path.join("clean_student_data.csv")

df = pd.read_csv(file_path, index_col="STU_ID")

# EDA

In [3]:
df.dtypes

BYSEX       int64
BYRACE      int64
BYSTLANG    int64
BYPARED     int64
BYINCOME    int64
BYURBAN     int64
BYREGION    int64
BYRISKFC    int64
BYS34A      int64
BYS34B      int64
BYWRKHRS    int64
BYS42       int64
BYS43       int64
BYTVVIGM    int64
BYS46B      int64
BYS44C      int64
BYS20E      int64
BYS87C      int64
BYS20D      int64
BYS23C      int64
BYS37       int64
BYS27I      int64
BYS90D      int64
BYS38A      int64
BYS20J      int64
BYS24C      int64
BYS24D      int64
BYS54I      int64
BYS84D      int64
BYS84I      int64
BYS85A      int64
F2HSSTAT    int64
F2EVERDO    int64
F1RGPP2     int64
dtype: object

In [4]:
for col in df.columns:
    print(df[col].value_counts())

 0    7095
 1    6994
-1     707
Name: BYSEX, dtype: int64
 1    8058
 0    5923
-1     815
Name: BYRACE, dtype: int64
 1    11638
 0     2343
-1      815
Name: BYSTLANG, dtype: int64
 1    13207
 0      841
-1      748
Name: BYPARED, dtype: int64
1    11760
0     3036
Name: BYINCOME, dtype: int64
0    12115
1     2681
Name: BYURBAN, dtype: int64
3    5364
2    3749
4    3143
1    2540
Name: BYREGION, dtype: int64
 1    6382
 0    4695
-1    3719
Name: BYRISKFC, dtype: int64
 1    11351
-1     1310
 2     1263
 0      872
Name: BYS34A, dtype: int64
 1    10428
 2     2276
-1     1197
 0      895
Name: BYS34B, dtype: int64
 0    7396
 1    4192
-1    3208
Name: BYWRKHRS, dtype: int64
 1    6367
 0    4724
 2    2212
-1    1493
Name: BYS42, dtype: int64
 1    9137
 0    3692
-1    1393
 2     574
Name: BYS43, dtype: int64
 0    7892
 1    3590
-1    3314
Name: BYTVVIGM, dtype: int64
 0    10321
 1     2600
-1     1875
Name: BYS46B, dtype: int64
 1    8836
 2    2669
-1    1718
 3    1348

# Create X and y arrays

In [5]:
X = df[[column for column in df.columns if column not in ["F2HSSTAT", "F2EVERDO", "F1RGPP2"]]]

y_graduate = df["F2HSSTAT"]
y_dropout = df["F2EVERDO"]
y_gpa = df["F1RGPP2"]

all_y = {
    "y_graduate": y_graduate,
    "y_dropout": y_dropout,
    "y_gpa": y_gpa
}

# Create the pipeline for categorical data

In [6]:
categorical_features = [column for column in df.columns if column not in ["F2HSSTAT", "F2EVERDO", "F1RGPP2"]]
categorical_transformer = make_pipeline(
    IterativeImputer(estimator=RandomForestClassifier(), missing_values=-1),
    OneHotEncoder(drop="first", sparse=False)
)

# Set Up Train-Test-Split Variables and Imbalanced Learn Models

In [7]:
random_int = randint(0, 1000000)

X_train, X_test, y_graduate_train, y_graduate_test = train_test_split(X, y_graduate, test_size=0.2, random_state=random_int, stratify=y_graduate)
X_train, X_test, y_dropout_train, y_dropout_test = train_test_split(X, y_dropout, test_size=0.2, random_state=random_int, stratify=y_dropout)
X_train, X_test, y_gpa_train, y_gpa_test = train_test_split(X, y_gpa, test_size=0.2, random_state=random_int, stratify=y_gpa)

imb_models = {
    "RandomUnderSampler": RandomUnderSampler(),
    "RandomOverSampler": RandomOverSampler(),
    "SMOTE": SMOTE(),
    "ADASYN": ADASYN()
}

## Functions to fit and predict models and return report

In [8]:
bar_length = 82

def train_and_score_model_tts(clf, X_train, X_test, y_train, y_test):

    clf.fit(X_train, y_train)
    
    y_pred = clf.predict(X_test)
    report = classification_report_imbalanced(y_test, y_pred)
    
    return report


def print_report(model, all_y, X=None):
    
    for y in all_y:
        print("=" * bar_length, "\n", y)
        report = train_and_score_model_tts(model, X_train, X_test, eval(y + "_train"), eval(y + "_test"))
        print(report)
        
        model.fit(X_train, eval(y + "_train"))
        y_prob = model.predict_proba(X_test)
        if y != "y_gpa":
            y_prob = [prob[1] for prob in y_prob]

        weighted_roc_auc_ovr = roc_auc_score(eval(y + "_test"), y_prob, multi_class="ovr", average="weighted")
        print("One-vs-Rest ROC AUC score: {:.6f} (weighted by prevalence)".format(weighted_roc_auc_ovr))
        
    print("=" * bar_length)
    
    
def print_imb_report(imb_model, clf, all_y, reporter=print_report, X=None):
    print("=" * bar_length)
    print(imb_model)
    reporter(clf, all_y, X)

### Null Model

In [9]:
dc = make_pipeline(
    StandardScaler(),
    DummyClassifier(strategy="prior")
)

print_report(dc, all_y)

 y_graduate
                   pre       rec       spe        f1       geo       iba       sup

          0       0.00      0.00      1.00      0.00      0.00      0.00       182
          1       0.94      1.00      0.00      0.97      0.00      0.00      2778

avg / total       0.88      0.94      0.06      0.91      0.00      0.00      2960

One-vs-Rest ROC AUC score: 0.500000 (weighted by prevalence)
 y_dropout
                   pre       rec       spe        f1       geo       iba       sup

          0       0.90      1.00      0.00      0.95      0.00      0.00      2658
          1       0.00      0.00      1.00      0.00      0.00      0.00       302

avg / total       0.81      0.90      0.10      0.85      0.00      0.00      2960

One-vs-Rest ROC AUC score: 0.500000 (weighted by prevalence)
 y_gpa
                   pre       rec       spe        f1       geo       iba       sup

          0       0.00      0.00      1.00      0.00      0.00      0.00       208
          1

## Predict using RandomForestClassifier without imbalanced learn

In [10]:
clf = make_pipeline(
    StandardScaler(),
    RandomForestClassifier()
)

print_report(clf, all_y)

 y_graduate
                   pre       rec       spe        f1       geo       iba       sup

          0       0.00      0.00      1.00      0.00      0.00      0.00       182
          1       0.94      1.00      0.00      0.97      0.00      0.00      2778

avg / total       0.88      0.94      0.06      0.91      0.00      0.00      2960

One-vs-Rest ROC AUC score: 0.467142 (weighted by prevalence)
 y_dropout
                   pre       rec       spe        f1       geo       iba       sup

          0       0.90      1.00      0.00      0.95      0.00      0.00      2658
          1       0.00      0.00      1.00      0.00      0.00      0.00       302

avg / total       0.81      0.90      0.10      0.85      0.00      0.00      2960

One-vs-Rest ROC AUC score: 0.486422 (weighted by prevalence)
 y_gpa
                   pre       rec       spe        f1       geo       iba       sup

          0       0.33      0.02      1.00      0.04      0.15      0.02       208
          1

## Predict using RandomForestClassifier with imbalanced learn

In [11]:
for imb_model in imb_models:
    
    rus_clf = imb_make_pipeline(
        StandardScaler(),
        imb_models[imb_model],
        RandomForestClassifier()
    )

    print_imb_report(imb_model, rus_clf, all_y)

RandomUnderSampler
 y_graduate
                   pre       rec       spe        f1       geo       iba       sup

          0       0.06      0.51      0.48      0.11      0.49      0.24       182
          1       0.94      0.48      0.51      0.63      0.49      0.24      2778

avg / total       0.88      0.48      0.50      0.60      0.49      0.24      2960

One-vs-Rest ROC AUC score: 0.483500 (weighted by prevalence)
 y_dropout
                   pre       rec       spe        f1       geo       iba       sup

          0       0.90      0.53      0.46      0.67      0.49      0.25      2658
          1       0.10      0.46      0.53      0.16      0.49      0.24       302

avg / total       0.82      0.53      0.46      0.62      0.49      0.25      2960

One-vs-Rest ROC AUC score: 0.494605 (weighted by prevalence)
 y_gpa
                   pre       rec       spe        f1       geo       iba       sup

          0       0.20      0.58      0.82      0.29      0.69      0.46   

## Predict using BalancedRandomForestClassifier

In [12]:
brf = imb_make_pipeline(
    StandardScaler(),
    BalancedRandomForestClassifier()
)

print_report(brf, all_y)

 y_graduate
                   pre       rec       spe        f1       geo       iba       sup

          0       0.06      0.49      0.49      0.11      0.49      0.24       182
          1       0.94      0.49      0.49      0.64      0.49      0.24      2778

avg / total       0.88      0.49      0.49      0.61      0.49      0.24      2960

One-vs-Rest ROC AUC score: 0.487893 (weighted by prevalence)
 y_dropout
                   pre       rec       spe        f1       geo       iba       sup

          0       0.90      0.52      0.47      0.66      0.49      0.24      2658
          1       0.10      0.47      0.52      0.16      0.49      0.24       302

avg / total       0.81      0.51      0.47      0.61      0.49      0.24      2960

One-vs-Rest ROC AUC score: 0.479164 (weighted by prevalence)
 y_gpa
                   pre       rec       spe        f1       geo       iba       sup

          0       0.20      0.68      0.79      0.31      0.73      0.53       208
          1

## TRAIN-TEST-SPLIT RESULTS: No models do very well, and they all yield relatively similar results.
### The models do not do well at predicting successful high school graduation or high school dropouts.
### However, RandomForestClassifier with RandomOverSampler imbalanced learn appears to do reasonably better than the null model. Therefore, we will select this model for our app.
## Export model

In [13]:
rus_clf = imb_make_pipeline(
    StandardScaler(),
    imb_models["RandomOverSampler"],
    RandomForestClassifier()
)

rus_clf.fit(X, y_gpa)
dump(rus_clf, "rus_clf.joblib")

['rus_clf.joblib']

# Cross Validation
### This is mostly to confirm that these models don't do any better then train-test-split.
### Note: This takes a long time to run...

In [14]:
def cv_report(model, all_y, X):
    
    cv_accuracy_lst = []
    cv_precision_lst = []
    cv_recall_lst = []
        
    for y in all_y:
        
        cv_dict = cross_validate(model, X, all_y[y], scoring={
            "Accuracy": "accuracy",
            "Weighted_Precision": make_scorer(precision_score, average="weighted", zero_division=0),
            "Weighted_Recall": make_scorer(recall_score, average="weighted", zero_division=0)
        })

        cv_accuracy = cv_dict["test_Accuracy"].mean()
        cv_accuracy_lst.append(cv_accuracy)
        
        cv_precision = cv_dict["test_Weighted_Precision"].mean()
        cv_precision_lst.append(cv_precision)
        
        cv_recall = cv_dict["test_Weighted_Recall"].mean()
        cv_recall_lst.append(cv_recall)
    
    return pd.DataFrame(data={"parameter": list(all_y.keys()), "accuracy": cv_accuracy_lst, "weighted_precision": cv_precision_lst, "weighted_recall": cv_recall_lst}).set_index("parameter")

## Models Not Utilizing Imbalanced Learn

### Null Model

In [15]:
dc = make_pipeline(
    StandardScaler(),
    DummyClassifier(strategy="prior")
)

cv_report(dc, all_y, X)

Unnamed: 0_level_0,accuracy,weighted_precision,weighted_recall
parameter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
y_graduate,0.938564,0.880903,0.938564
y_dropout,0.897945,0.806306,0.897945
y_gpa,0.437618,0.19151,0.437618


### Logistic Regression

In [16]:
lr = make_pipeline(
    StandardScaler(),
    LogisticRegression()
)

cv_report(lr, all_y, X)

Unnamed: 0_level_0,accuracy,weighted_precision,weighted_recall
parameter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
y_graduate,0.937686,0.903458,0.937686
y_dropout,0.895309,0.862453,0.895309
y_gpa,0.532304,0.528464,0.532304


### Decision Tree

In [17]:
dt = make_pipeline(
    StandardScaler(),
    DecisionTreeClassifier()
)

cv_report(dt, all_y, X)

Unnamed: 0_level_0,accuracy,weighted_precision,weighted_recall
parameter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
y_graduate,0.855094,0.896132,0.855094
y_dropout,0.796497,0.835026,0.796497
y_gpa,0.431603,0.433513,0.431603


### Random Forest

In [18]:
rf = make_pipeline(
    StandardScaler(),
    RandomForestClassifier()
)

cv_report(rf, all_y, X)

Unnamed: 0_level_0,accuracy,weighted_precision,weighted_recall
parameter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
y_graduate,0.920045,0.890276,0.920045
y_dropout,0.878277,0.853759,0.878277
y_gpa,0.526019,0.510385,0.526019


### Multi-layer Perceptron Classifier

In [19]:
mlp = make_pipeline(
    StandardScaler(),
    MLPClassifier()
)

cv_report(mlp, all_y, X)

Unnamed: 0_level_0,accuracy,weighted_precision,weighted_recall
parameter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
y_graduate,0.895713,0.89734,0.895713
y_dropout,0.851106,0.844953,0.851106
y_gpa,0.496686,0.49145,0.496686


## Models Utilizing Imbalanced Learn

### Null Model

In [20]:
dc = imb_make_pipeline(
    StandardScaler(),
    SMOTE(),
    DummyClassifier(strategy="prior")
)

print("=" * 57)
print("SMOTE")
cv_report(dc, all_y, X)

SMOTE


Unnamed: 0_level_0,accuracy,weighted_precision,weighted_recall
parameter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
y_graduate,0.061436,0.003774,0.061436
y_dropout,0.897945,0.806306,0.897945
y_gpa,0.070154,0.004922,0.070154


### Logistic Regression

In [21]:
for imb_model in imb_models:
    
    lr = imb_make_pipeline(
        StandardScaler(),
        imb_models[imb_model],
        LogisticRegression()
    )

    print("=" * 57, "\n", imb_model)
    print(cv_report(lr, all_y, X))

 RandomUnderSampler
            accuracy  weighted_precision  weighted_recall
parameter                                                
y_graduate  0.634760            0.917581         0.634760
y_dropout   0.654293            0.878241         0.654293
y_gpa       0.403960            0.478258         0.403960
 RandomOverSampler
            accuracy  weighted_precision  weighted_recall
parameter                                                
y_graduate  0.661187            0.920408         0.661187
y_dropout   0.660038            0.876653         0.660038
y_gpa       0.413760            0.488430         0.413760
 SMOTE
            accuracy  weighted_precision  weighted_recall
parameter                                                
y_graduate  0.655847            0.919342         0.655847
y_dropout   0.663552            0.875608         0.663552
y_gpa       0.413151            0.485682         0.413151
 ADASYN
            accuracy  weighted_precision  weighted_recall
parameter         

### Decision Tree

In [22]:
for imb_model in imb_models:
    
    dt = imb_make_pipeline(
        StandardScaler(),
        imb_models[imb_model],
        DecisionTreeClassifier()
    )

    print("=" * 57, "\n", imb_model)
    print(cv_report(dt, all_y, X))

 RandomUnderSampler
            accuracy  weighted_precision  weighted_recall
parameter                                                
y_graduate  0.586030            0.904525         0.586030
y_dropout   0.595494            0.848936         0.595494
y_gpa       0.375303            0.431798         0.375303
 RandomOverSampler
            accuracy  weighted_precision  weighted_recall
parameter                                                
y_graduate  0.835358            0.892366         0.835358
y_dropout   0.783991            0.830577         0.783991
y_gpa       0.417410            0.430188         0.417410
 SMOTE
            accuracy  weighted_precision  weighted_recall
parameter                                                
y_graduate  0.835021            0.893072         0.835021
y_dropout   0.767299            0.834409         0.767299
y_gpa       0.412069            0.426259         0.412069
 ADASYN
            accuracy  weighted_precision  weighted_recall
parameter         

### Random Forest

In [23]:
for imb_model in imb_models:
    
    rf = imb_make_pipeline(
        StandardScaler(),
        imb_models[imb_model],
        RandomForestClassifier()
    )

    print("=" * 57, "\n", imb_model)
    print(cv_report(rf, all_y, X))

 RandomUnderSampler
            accuracy  weighted_precision  weighted_recall
parameter                                                
y_graduate  0.652064            0.921879         0.652064
y_dropout   0.649359            0.874063         0.649359
y_gpa       0.412814            0.484842         0.412814
 RandomOverSampler
            accuracy  weighted_precision  weighted_recall
parameter                                                
y_graduate  0.888211            0.890341         0.888211
y_dropout   0.849213            0.838490         0.849213
y_gpa       0.513651            0.509581         0.513651
 SMOTE
            accuracy  weighted_precision  weighted_recall
parameter                                                
y_graduate  0.906730            0.888611         0.906730
y_dropout   0.855703            0.830993         0.855703
y_gpa       0.516895            0.511985         0.516895
 ADASYN
            accuracy  weighted_precision  weighted_recall
parameter         

### Balanced Bagging Classifier

In [24]:
rf = imb_make_pipeline(
    StandardScaler(),
    BalancedBaggingClassifier()
)

print(cv_report(rf, all_y, X))

            accuracy  weighted_precision  weighted_recall
parameter                                                
y_graduate  0.685721            0.916144         0.685721
y_dropout   0.727893            0.857751         0.727893
y_gpa       0.431601            0.486642         0.431601


### Balanced Random Forest

In [25]:
brf = imb_make_pipeline(
    StandardScaler(),
    BalancedRandomForestClassifier()
)

print(cv_report(brf, all_y, X))

            accuracy  weighted_precision  weighted_recall
parameter                                                
y_graduate  0.649561            0.925562         0.649561
y_dropout   0.646587            0.878757         0.646587
y_gpa       0.424843            0.513769         0.424843


### Easy Ensemble Classifier

In [26]:
ee = imb_make_pipeline(
    StandardScaler(),
    EasyEnsembleClassifier()
)

print(cv_report(ee, all_y, X))

            accuracy  weighted_precision  weighted_recall
parameter                                                
y_graduate  0.631247            0.924579         0.631247
y_dropout   0.645368            0.878623         0.645368
y_gpa       0.425653            0.507715         0.425653


### RUS Boost Classifier

In [27]:
rusb = imb_make_pipeline(
    StandardScaler(),
    RUSBoostClassifier()
)

print(cv_report(rusb, all_y, X))

            accuracy  weighted_precision  weighted_recall
parameter                                                
y_graduate  0.638817            0.924787         0.638817
y_dropout   0.652194            0.875079         0.652194
y_gpa       0.420923            0.490273         0.420923


### Multi-layer Perceptron Classifier with Oversampling

In [28]:
for imb_model in imb_models:

    mlp = imb_make_pipeline(
        StandardScaler(),
        imb_models[imb_model],
        MLPClassifier()
    )

    print("=" * 57, "\n", imb_model)
    print(cv_report(mlp, all_y, X))

 RandomUnderSampler
            accuracy  weighted_precision  weighted_recall
parameter                                                
y_graduate  0.620028            0.914458         0.620028
y_dropout   0.628947            0.867923         0.628947
y_gpa       0.397202            0.466381         0.397202
 RandomOverSampler
            accuracy  weighted_precision  weighted_recall
parameter                                                
y_graduate  0.838130            0.897708         0.838130
y_dropout   0.755944            0.843811         0.755944
y_gpa       0.452486            0.488242         0.452486
 SMOTE
            accuracy  weighted_precision  weighted_recall
parameter                                                
y_graduate  0.821098            0.896456         0.821098
y_dropout   0.780206            0.843605         0.780206
y_gpa       0.459177            0.485374         0.459177
 ADASYN
            accuracy  weighted_precision  weighted_recall
parameter         

## CROSS VALIDATION RESULTS: No models do very well.