In [13]:
import os
import numpy as np
import pandas as pd

from random import randint

from imblearn.pipeline import make_pipeline as imb_make_pipeline
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN #, BorderlineSMOTE
# from imblearn.combine import SMOTEENN, SMOTETomek # Tried these, didn't work well
from imblearn.ensemble import BalancedBaggingClassifier, BalancedRandomForestClassifier, EasyEnsembleClassifier, RUSBoostClassifier
from imblearn.metrics import classification_report_imbalanced

from sklearn.pipeline import make_pipeline

from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer #, SimpleImputer KNNImputer

# from sklearn.decomposition import PCA

from sklearn.model_selection import train_test_split, cross_validate #, cross_val_score

from sklearn.dummy import DummyClassifier #, DummyRegressor
from sklearn.linear_model import LogisticRegression #, RidgeClassifier, LinearRegression, MultiTaskLassoCV
from sklearn.tree import DecisionTreeClassifier #, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier #, RandomForestRegressor
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import make_scorer, precision_score, recall_score, roc_auc_score #, classification_report, accuracy_score

from joblib import dump

import warnings
warnings.filterwarnings('ignore')

In [2]:
file_path = os.path.join("clean_student_data.csv")

df = pd.read_csv(file_path, index_col="STU_ID")

# EDA

In [3]:
df.dtypes

BYSEX       int64
BYRACE      int64
BYSTLANG    int64
BYPARED     int64
BYINCOME    int64
BYURBAN     int64
BYREGION    int64
BYRISKFC    int64
BYS34A      int64
BYS34B      int64
BYWRKHRS    int64
BYS42       int64
BYS43       int64
BYTVVIGM    int64
BYS46B      int64
BYS44C      int64
BYS20E      int64
BYS87C      int64
BYS20D      int64
BYS23C      int64
BYS37       int64
BYS27I      int64
BYS90D      int64
BYS38A      int64
BYS20J      int64
BYS24C      int64
BYS24D      int64
BYS54I      int64
BYS84D      int64
BYS84I      int64
BYS85A      int64
F2HSSTAT    int64
F2EVERDO    int64
F1RGPP2     int64
dtype: object

In [4]:
for col in df.columns:
    print(df[col].value_counts())

 0    7095
 1    6994
-1     707
Name: BYSEX, dtype: int64
 1    8058
 0    5923
-1     815
Name: BYRACE, dtype: int64
 1    11638
 0     2343
-1      815
Name: BYSTLANG, dtype: int64
 1    13207
 0      841
-1      748
Name: BYPARED, dtype: int64
1    11760
0     3036
Name: BYINCOME, dtype: int64
0    12115
1     2681
Name: BYURBAN, dtype: int64
3    5364
2    3749
4    3143
1    2540
Name: BYREGION, dtype: int64
 1    6382
 0    4695
-1    3719
Name: BYRISKFC, dtype: int64
 1    11351
-1     1310
 2     1263
 0      872
Name: BYS34A, dtype: int64
 1    10428
 2     2276
-1     1197
 0      895
Name: BYS34B, dtype: int64
 0    7396
 1    4192
-1    3208
Name: BYWRKHRS, dtype: int64
 1    6367
 0    4724
 2    2212
-1    1493
Name: BYS42, dtype: int64
 1    9137
 0    3692
-1    1393
 2     574
Name: BYS43, dtype: int64
 0    7892
 1    3590
-1    3314
Name: BYTVVIGM, dtype: int64
 0    10321
 1     2600
-1     1875
Name: BYS46B, dtype: int64
 1    8836
 2    2669
-1    1718
 3    1348

# Create X and y arrays

In [5]:
X = df[[column for column in df.columns if column not in ["F2HSSTAT", "F2EVERDO", "F1RGPP2"]]]

y_graduate = df["F2HSSTAT"]
y_dropout = df["F2EVERDO"]
y_gpa = df["F1RGPP2"]

all_y = {
    "y_graduate": y_graduate,
    "y_dropout": y_dropout,
    "y_gpa": y_gpa
}

# Create the pipeline for categorical data

In [6]:
categorical_features = [column for column in df.columns if column not in ["F2HSSTAT", "F2EVERDO", "F1RGPP2"]]
categorical_transformer = make_pipeline(
    IterativeImputer(estimator=RandomForestClassifier(), missing_values=-1),
    OneHotEncoder(drop="first", sparse=False)
)

# Set Up Train-Test-Split Variables and Imbalanced Learn Models

In [7]:
random_int = randint(0, 1000000)

X_train, X_test, y_graduate_train, y_graduate_test = train_test_split(X, y_graduate, test_size=0.2, random_state=random_int, stratify=y_graduate)
X_train, X_test, y_dropout_train, y_dropout_test = train_test_split(X, y_dropout, test_size=0.2, random_state=random_int, stratify=y_dropout)
X_train, X_test, y_gpa_train, y_gpa_test = train_test_split(X, y_gpa, test_size=0.2, random_state=random_int, stratify=y_gpa)

imb_models = {
    "RandomUnderSampler": RandomUnderSampler(),
    "RandomOverSampler": RandomOverSampler(),
    "SMOTE": SMOTE(),
    "ADASYN": ADASYN()
}

## Functions to fit and predict models and return report

In [8]:
bar_length = 82

def train_and_score_model_tts(clf, X_train, X_test, y_train, y_test):

    clf.fit(X_train, y_train)
    
    y_pred = clf.predict(X_test)
    report = classification_report_imbalanced(y_test, y_pred)
    
    return report


def print_report(model, all_y, X=None):
    
    for y in all_y:
        print("=" * bar_length, "\n", y)
        report = train_and_score_model_tts(model, X_train, X_test, eval(y + "_train"), eval(y + "_test"))
        print(report)
        
        model.fit(X_train, eval(y + "_train"))
        y_prob = model.predict_proba(X_test)
        if y != "y_gpa":
            y_prob = [prob[1] for prob in y_prob]

        weighted_roc_auc_ovr = roc_auc_score(eval(y + "_test"), y_prob, multi_class="ovr", average="weighted")
        print("One-vs-Rest ROC AUC score: {:.6f} (weighted by prevalence)".format(weighted_roc_auc_ovr))
        
    print("=" * bar_length)
    
    
def print_imb_report(imb_model, clf, all_y, reporter=print_report, X=None):
    print("=" * bar_length)
    print(imb_model)
    reporter(clf, all_y, X)

### Null Model

In [9]:
dc = make_pipeline(
    StandardScaler(),
    DummyClassifier(strategy="prior")
)

print_report(dc, all_y)

 y_graduate
                   pre       rec       spe        f1       geo       iba       sup

          0       0.00      0.00      1.00      0.00      0.00      0.00       182
          1       0.94      1.00      0.00      0.97      0.00      0.00      2778

avg / total       0.88      0.94      0.06      0.91      0.00      0.00      2960

One-vs-Rest ROC AUC score: 0.500000 (weighted by prevalence)
 y_dropout
                   pre       rec       spe        f1       geo       iba       sup

          0       0.90      1.00      0.00      0.95      0.00      0.00      2658
          1       0.00      0.00      1.00      0.00      0.00      0.00       302

avg / total       0.81      0.90      0.10      0.85      0.00      0.00      2960

One-vs-Rest ROC AUC score: 0.500000 (weighted by prevalence)
 y_gpa
                   pre       rec       spe        f1       geo       iba       sup

          0       0.00      0.00      1.00      0.00      0.00      0.00       208
          1

## Predict using RandomForestClassifier without imbalanced learn

In [10]:
clf = make_pipeline(
    StandardScaler(),
    RandomForestClassifier()
)

print_report(clf, all_y)

 y_graduate
                   pre       rec       spe        f1       geo       iba       sup

          0       0.00      0.00      1.00      0.00      0.00      0.00       182
          1       0.94      1.00      0.00      0.97      0.00      0.00      2778

avg / total       0.88      0.94      0.06      0.91      0.00      0.00      2960

One-vs-Rest ROC AUC score: 0.506183 (weighted by prevalence)
 y_dropout
                   pre       rec       spe        f1       geo       iba       sup

          0       0.90      1.00      0.00      0.95      0.06      0.00      2658
          1       1.00      0.00      1.00      0.01      0.06      0.00       302

avg / total       0.91      0.90      0.11      0.85      0.06      0.00      2960

One-vs-Rest ROC AUC score: 0.526134 (weighted by prevalence)
 y_gpa
                   pre       rec       spe        f1       geo       iba       sup

          0       0.47      0.04      1.00      0.08      0.21      0.04       208
          1

## Predict using RandomForestClassifier with imbalanced learn

In [11]:
for imb_model in imb_models:
    
    rus_clf = imb_make_pipeline(
        StandardScaler(),
        imb_models[imb_model],
        RandomForestClassifier()
    )

    print_imb_report(imb_model, rus_clf, all_y)

RandomUnderSampler
 y_graduate
                   pre       rec       spe        f1       geo       iba       sup

          0       0.06      0.52      0.47      0.11      0.49      0.25       182
          1       0.94      0.47      0.52      0.63      0.49      0.24      2778

avg / total       0.88      0.47      0.52      0.59      0.49      0.24      2960

One-vs-Rest ROC AUC score: 0.510614 (weighted by prevalence)
 y_dropout
                   pre       rec       spe        f1       geo       iba       sup

          0       0.90      0.55      0.44      0.68      0.49      0.24      2658
          1       0.10      0.44      0.55      0.16      0.49      0.24       302

avg / total       0.81      0.54      0.45      0.63      0.49      0.24      2960

One-vs-Rest ROC AUC score: 0.505278 (weighted by prevalence)
 y_gpa
                   pre       rec       spe        f1       geo       iba       sup

          0       0.18      0.62      0.79      0.28      0.70      0.48   

## Predict using BalancedRandomForestClassifier

In [12]:
brf = imb_make_pipeline(
    StandardScaler(),
    BalancedRandomForestClassifier()
)

print_report(brf, all_y)

 y_graduate
                   pre       rec       spe        f1       geo       iba       sup

          0       0.06      0.51      0.49      0.11      0.50      0.25       182
          1       0.94      0.49      0.51      0.65      0.50      0.25      2778

avg / total       0.88      0.49      0.50      0.61      0.50      0.25      2960

One-vs-Rest ROC AUC score: 0.503269 (weighted by prevalence)
 y_dropout
                   pre       rec       spe        f1       geo       iba       sup

          0       0.89      0.50      0.48      0.65      0.49      0.24      2658
          1       0.10      0.48      0.50      0.16      0.49      0.24       302

avg / total       0.81      0.50      0.48      0.60      0.49      0.24      2960

One-vs-Rest ROC AUC score: 0.490363 (weighted by prevalence)
 y_gpa
                   pre       rec       spe        f1       geo       iba       sup

          0       0.19      0.66      0.78      0.29      0.72      0.51       208
          1

## TRAIN-TEST-SPLIT RESULTS: No models do very well, and they all yield relatively similar results.
### The models do not do well at predicting successful high school graduation or high school dropouts.
### However, RandomForestClassifier with RandomOverSampler imbalanced learn appears to do reasonably better than the null model. Therefore, we will select this model for our app.
## Export model

In [16]:
rus_clf = imb_make_pipeline(
    StandardScaler(),
    imb_models["RandomOverSampler"],
    RandomForestClassifier()
)

rus_clf.fit(X, y_gpa)
dump(rus_clf, "rus_clf.joblib")

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('randomoversampler',
                 RandomOverSampler(random_state=None,
                                   sampling_strategy='auto')),
                ('randomforestclassifier',
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=100, n_jobs=None,
           

# Cross Validation
### This is mostly to confirm that these models don't do any better then train-test-split.

In [13]:
def cv_report(model, all_y, X):
    
    cv_accuracy_lst = []
    cv_precision_lst = []
    cv_recall_lst = []
        
    for y in all_y:
        
        cv_dict = cross_validate(model, X, all_y[y], scoring={
            "Accuracy": "accuracy",
            "Weighted_Precision": make_scorer(precision_score, average="weighted", zero_division=0),
            "Weighted_Recall": make_scorer(recall_score, average="weighted", zero_division=0)
        })

        cv_accuracy = cv_dict["test_Accuracy"].mean()
        cv_accuracy_lst.append(cv_accuracy)
        
        cv_precision = cv_dict["test_Weighted_Precision"].mean()
        cv_precision_lst.append(cv_precision)
        
        cv_recall = cv_dict["test_Weighted_Recall"].mean()
        cv_recall_lst.append(cv_recall)
    
    return pd.DataFrame(data={"parameter": list(all_y.keys()), "accuracy": cv_accuracy_lst, "weighted_precision": cv_precision_lst, "weighted_recall": cv_recall_lst}).set_index("parameter")

## Models Not Utilizing Imbalanced Learn

### Null Model

In [14]:
dc = make_pipeline(
    StandardScaler(),
    DummyClassifier(strategy="prior")
)

cv_report(dc, all_y, X)

Unnamed: 0_level_0,accuracy,weighted_precision,weighted_recall
parameter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
y_graduate,0.938564,0.880903,0.938564
y_dropout,0.897945,0.806306,0.897945
y_gpa,0.437618,0.19151,0.437618


### Logistic Regression

In [15]:
lr = make_pipeline(
    StandardScaler(),
    LogisticRegression()
)

cv_report(lr, all_y, X)

Unnamed: 0_level_0,accuracy,weighted_precision,weighted_recall
parameter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
y_graduate,0.937551,0.913372,0.937551
y_dropout,0.895242,0.863801,0.895242
y_gpa,0.530345,0.528015,0.530345


### Decision Tree

In [16]:
dt = make_pipeline(
    StandardScaler(),
    DecisionTreeClassifier()
)

cv_report(dt, all_y, X)

Unnamed: 0_level_0,accuracy,weighted_precision,weighted_recall
parameter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
y_graduate,0.858271,0.895953,0.858271
y_dropout,0.798592,0.837173,0.798592
y_gpa,0.42944,0.431559,0.42944


### Random Forest

In [17]:
rf = make_pipeline(
    StandardScaler(),
    RandomForestClassifier()
)

cv_report(rf, all_y, X)

Unnamed: 0_level_0,accuracy,weighted_precision,weighted_recall
parameter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
y_graduate,0.92214,0.893293,0.92214
y_dropout,0.878006,0.848592,0.878006
y_gpa,0.529398,0.521803,0.529398


### Multi-layer Perceptron Classifier

In [18]:
mlp = make_pipeline(
    StandardScaler(),
    MLPClassifier()
)

cv_report(mlp, all_y, X)

Unnamed: 0_level_0,accuracy,weighted_precision,weighted_recall
parameter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
y_graduate,0.903148,0.89915,0.903148
y_dropout,0.848267,0.846133,0.848267
y_gpa,0.49493,0.492193,0.49493


## Models Utilizing Imbalanced Learn

### Null Model

In [19]:
dc = imb_make_pipeline(
    StandardScaler(),
    SMOTE(),
    DummyClassifier(strategy="prior")
)

print("=" * 57)
print("SMOTE")
cv_report(dc, all_y, X)

SMOTE


Unnamed: 0_level_0,accuracy,weighted_precision,weighted_recall
parameter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
y_graduate,0.061436,0.003774,0.061436
y_dropout,0.897945,0.806306,0.897945
y_gpa,0.070154,0.004922,0.070154


### Logistic Regression

In [20]:
for imb_model in imb_models:
    
    lr = imb_make_pipeline(
        StandardScaler(),
        imb_models[imb_model],
        LogisticRegression()
    )

    print("=" * 57, "\n", imb_model)
    print(cv_report(lr, all_y, X))

 RandomUnderSampler
            accuracy  weighted_precision  weighted_recall
parameter                                                
y_graduate  0.629828             0.92092         0.629828
y_dropout   0.656523             0.87670         0.656523
y_gpa       0.409502             0.48568         0.409502
 RandomOverSampler
            accuracy  weighted_precision  weighted_recall
parameter                                                
y_graduate  0.661863            0.919989         0.661863
y_dropout   0.661255            0.877401         0.661255
y_gpa       0.413760            0.488891         0.413760
 SMOTE
            accuracy  weighted_precision  weighted_recall
parameter                                                
y_graduate  0.653616            0.920350         0.653616
y_dropout   0.662065            0.876889         0.662065
y_gpa       0.416125            0.487007         0.416125
 ADASYN
            accuracy  weighted_precision  weighted_recall
parameter         

### Decision Tree

In [21]:
for imb_model in imb_models:
    
    dt = imb_make_pipeline(
        StandardScaler(),
        imb_models[imb_model],
        DecisionTreeClassifier()
    )

    print("=" * 57, "\n", imb_model)
    print(cv_report(dt, all_y, X))

 RandomUnderSampler
            accuracy  weighted_precision  weighted_recall
parameter                                                
y_graduate  0.604485            0.904072         0.604485
y_dropout   0.597658            0.849969         0.597658
y_gpa       0.370641            0.430456         0.370641
 RandomOverSampler
            accuracy  weighted_precision  weighted_recall
parameter                                                
y_graduate  0.839616            0.893207         0.839616
y_dropout   0.787979            0.834319         0.787979
y_gpa       0.423019            0.435638         0.423019
 SMOTE
            accuracy  weighted_precision  weighted_recall
parameter                                                
y_graduate  0.838198            0.893676         0.838198
y_dropout   0.773855            0.833994         0.773855
y_gpa       0.417950            0.428296         0.417950
 ADASYN
            accuracy  weighted_precision  weighted_recall
parameter         

### Random Forest

In [22]:
for imb_model in imb_models:
    
    rf = imb_make_pipeline(
        StandardScaler(),
        imb_models[imb_model],
        RandomForestClassifier()
    )

    print("=" * 57, "\n", imb_model)
    print(cv_report(rf, all_y, X))

 RandomUnderSampler
            accuracy  weighted_precision  weighted_recall
parameter                                                
y_graduate  0.641316            0.922192         0.641316
y_dropout   0.650777            0.875937         0.650777
y_gpa       0.422545            0.491187         0.422545
 RandomOverSampler
            accuracy  weighted_precision  weighted_recall
parameter                                                
y_graduate  0.889292            0.890071         0.889292
y_dropout   0.850091            0.837082         0.850091
y_gpa       0.514799            0.512483         0.514799
 SMOTE
            accuracy  weighted_precision  weighted_recall
parameter                                                
y_graduate  0.908623            0.887883         0.908623
y_dropout   0.854756            0.830864         0.854756
y_gpa       0.515543            0.511436         0.515543
 ADASYN
            accuracy  weighted_precision  weighted_recall
parameter         

### Balanced Bagging Classifier

In [23]:
rf = imb_make_pipeline(
    StandardScaler(),
    BalancedBaggingClassifier()
)

print(cv_report(rf, all_y, X))

            accuracy  weighted_precision  weighted_recall
parameter                                                
y_graduate  0.665513            0.917057         0.665513
y_dropout   0.731882            0.858087         0.731882
y_gpa       0.421667            0.473842         0.421667


### Balanced Random Forest

In [24]:
brf = imb_make_pipeline(
    StandardScaler(),
    BalancedRandomForestClassifier()
)

print(cv_report(brf, all_y, X))

            accuracy  weighted_precision  weighted_recall
parameter                                                
y_graduate  0.650711            0.923709         0.650711
y_dropout   0.648614            0.879325         0.648614
y_gpa       0.426128            0.513193         0.426128


### Easy Ensemble Classifier

In [25]:
ee = imb_make_pipeline(
    StandardScaler(),
    EasyEnsembleClassifier()
)

print(cv_report(ee, all_y, X))

            accuracy  weighted_precision  weighted_recall
parameter                                                
y_graduate  0.638479            0.925238         0.638479
y_dropout   0.643678            0.878618         0.643678
y_gpa       0.422951            0.505593         0.422951


### RUS Boost Classifier

In [26]:
rusb = imb_make_pipeline(
    StandardScaler(),
    RUSBoostClassifier()
)

print(cv_report(rusb, all_y, X))

            accuracy  weighted_precision  weighted_recall
parameter                                                
y_graduate  0.631720            0.923010         0.631720
y_dropout   0.635095            0.876139         0.635095
y_gpa       0.410651            0.487840         0.410651


### Multi-layer Perceptron Classifier with Oversampling

In [27]:
for imb_model in imb_models:

    mlp = imb_make_pipeline(
        StandardScaler(),
        imb_models[imb_model],
        MLPClassifier()
    )

    print("=" * 57, "\n", imb_model)
    print(cv_report(mlp, all_y, X))

 RandomUnderSampler
            accuracy  weighted_precision  weighted_recall
parameter                                                
y_graduate  0.626653            0.915761         0.626653
y_dropout   0.631785            0.867463         0.631785
y_gpa       0.410110            0.469621         0.410110
 RandomOverSampler
            accuracy  weighted_precision  weighted_recall
parameter                                                
y_graduate  0.830490            0.895551         0.830490
y_dropout   0.760404            0.844539         0.760404
y_gpa       0.452351            0.490339         0.452351
 SMOTE
            accuracy  weighted_precision  weighted_recall
parameter                                                
y_graduate  0.847320            0.895315         0.847320
y_dropout   0.781963            0.841450         0.781963
y_gpa       0.451608            0.482103         0.451608
 ADASYN
            accuracy  weighted_precision  weighted_recall
parameter         

## CROSS VALIDATION RESULTS: No models do very well.