### No first year data

In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# This notebook aims to predict the probability of student dropout at different stages
# by solving a binary classification problem (dropout: 1 / 0).
# The models we will use:
# 1. LogisticRegression
# 2. RandomForestClassifier
# 3. XGBClassifier (XGBoost)

In [2]:
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the raw student records; the bare `.shape` below displays (rows, columns).
data = pd.read_csv(filepath_or_buffer='data/raw_dataset.csv')
data.shape

(4424, 35)

In [4]:
# Start with the students whose 'Target' is not 'Enrolled'; 'Enrolled' rows are dropped.
data = data.loc[data['Target'].ne('Enrolled')]
data.shape

(3630, 35)

In [5]:
# Share of each remaining 'Target' value, rounded to two decimals (not sorted by frequency).
data['Target'].value_counts(normalize=True, sort=False).round(decimals=2)

Target
Dropout     0.39
Graduate    0.61
Name: proportion, dtype: float64

### Prepare data for modeling

1. Handle features with high cardinality
2. Change mapping to ordinal values (when it is possible and/or a good idea)
3. Fix potential data leakage / weird data values
4. Align data types

In [6]:
# Name of the outcome column.
label = 'Target'

# Every non-curricular column used as a model input
# (none of the per-semester 'Curricular units ...' fields appear here).
individual_features = [
    'Marital status',
    'Application mode',
    'Application order',
    'Course',
    'Daytime/evening attendance',
    'Previous qualification',
    'Nacionality',
    "Mother's qualification",
    "Father's qualification",
    "Mother's occupation",
    "Father's occupation",
    'Displaced',
    'Educational special needs',
    'Debtor',
    'Tuition fees up to date',
    'Gender',
    'Scholarship holder',
    'Age at enrollment',
    'International',
    'Unemployment rate',
    'Inflation rate',
    'GDP',
]

# Curricular-unit columns for the first semester.
first_sem_features = [
    'Curricular units 1st sem (credited)',
    'Curricular units 1st sem (enrolled)',
    'Curricular units 1st sem (evaluations)',
    'Curricular units 1st sem (approved)',
    'Curricular units 1st sem (grade)',
    'Curricular units 1st sem (without evaluations)',
]

# Curricular-unit columns for the second semester.
second_sem_features = [
    'Curricular units 2nd sem (credited)',
    'Curricular units 2nd sem (enrolled)',
    'Curricular units 2nd sem (evaluations)',
    'Curricular units 2nd sem (approved)',
    'Curricular units 2nd sem (grade)',
    'Curricular units 2nd sem (without evaluations)',
]

In [7]:
# First-stage frame: the individual features plus the outcome column, as an independent copy.
data_first_stage = data.loc[:, [*individual_features, label]].copy()
data_first_stage.shape

(3630, 23)

In [8]:
# Every individual feature is treated as categorical; no column is handled as numeric.
# NOTE(review): this also turns numeric-looking fields such as 'Age at enrollment' and
# 'GDP' into categories - confirm that is intended.
categorical_cols = individual_features
numeric_cols = []

# Cast all selected columns in one pass instead of column by column.
data_first_stage = data_first_stage.astype(
    {feature: "category" for feature in categorical_cols}
)

In [9]:
# Binary target: 1 where the outcome is 'Dropout', 0 otherwise; the text label is then removed.
data_first_stage = (
    data_first_stage
    .assign(y=data_first_stage[label].eq('Dropout').astype(int))
    .drop(columns=[label])
)

In [10]:
from sklearn.model_selection import train_test_split

# Separate the inputs from the binary target.
X = data_first_stage.drop(columns=['y'])
y = data_first_stage['y']

# 80/20 split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# One-hot encode every column listed in `categorical_cols` into a dense matrix.
# The numeric branch, ('num', StandardScaler(), numeric_cols), is left commented out.
preprocess = ColumnTransformer(
    transformers=[
        (
            'cat',
            OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
            categorical_cols,
        ),
    ],
)

In [12]:
# Encode the whole first-stage frame once, only to see how wide the one-hot matrix gets.
# NOTE(review): this fits the shared `preprocess` object on every row (train and test
# alike); the pipelines defined later wrap the same object and fit it again on X_train.
transformed_data = preprocess.fit_transform(data_first_stage)
transformed_data.shape

(3630, 309)

In [13]:
# import pandas as pd
# from sklearn.preprocessing import OneHotEncoder
#
# encoder = OneHotEncoder(sparse_output=False)
#
# one_hot_encoded = encoder.fit_transform(data_first_stage[categorical_cols])
#
# one_hot_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out(categorical_cols))
# df_sklearn_encoded = pd.concat([data_first_stage.drop(categorical_cols, axis=1), one_hot_df], axis=1)
#
# print(f"One-Hot Encoded Data using Scikit-Learn:\n{df_sklearn_encoded}\n")

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier


# Linear baseline; the iteration cap is raised to 2000.
log_reg = LogisticRegression(max_iter=2000)

# Random forest: 300 trees, unlimited depth, fixed seed.
rf = RandomForestClassifier(
    random_state=42,
    n_estimators=300,
    min_samples_split=4,
    max_depth=None,
)

# Gradient-boosted trees with row and column subsampling.
xgb = XGBClassifier(
    eval_metric='logloss',
    n_estimators=400,
    max_depth=5,
    learning_rate=0.05,
    colsample_bytree=0.8,
    subsample=0.8,
)

In [15]:
from sklearn.pipeline import Pipeline

# One preprocessing + model pipeline per estimator.
# NOTE: the same `preprocess` object is passed to all three pipelines.
pipe_log, pipe_rf, pipe_xgb = (
    Pipeline([('prep', preprocess), ('model', estimator)])
    for estimator in (log_reg, rf, xgb)
)

In [16]:
# Train each pipeline on the training split only.
pipe_log.fit(X_train, y_train)
pipe_rf.fit(X_train, y_train)
# Last bare expression in the cell: its return value is what the notebook displays below.
pipe_xgb.fit(X_train, y_train)

0,1,2
,steps,"[('prep', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [17]:
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix

def evaluate(model, name):
    """Print accuracy, ROC AUC and the confusion matrix of a fitted model.

    The model is scored against the notebook-level `X_test` / `y_test` split.

    Parameters
    ----------
    model : fitted estimator exposing `predict` and `predict_proba`.
    name : str
        Title printed above the metrics.
    """
    hard_labels = model.predict(X_test)
    # Column 1 of predict_proba holds the score for class 1.
    positive_scores = model.predict_proba(X_test)[:, 1]

    print(f"\n{name} Results:")
    accuracy = accuracy_score(y_test, hard_labels)
    print("Accuracy:", round(accuracy, 2))
    auc = roc_auc_score(y_test, positive_scores)
    print("AUC:", round(auc, 2))
    matrix = confusion_matrix(y_test, hard_labels)
    print("Confusion Matrix:\n", matrix)


evaluate(pipe_log, name="Logistic Regression")
evaluate(pipe_rf, name="Random Forest")
evaluate(pipe_xgb, name="XGBoost")


Logistic Regression Results:
Accuracy: 0.77
AUC: 0.85
Confusion Matrix:
 [[374  68]
 [ 98 186]]

Random Forest Results:
Accuracy: 0.78
AUC: 0.86
Confusion Matrix:
 [[384  58]
 [ 99 185]]

XGBoost Results:
Accuracy: 0.78
AUC: 0.86
Confusion Matrix:
 [[375  67]
 [ 94 190]]


In [20]:
from sklearn.metrics import precision_score, recall_score, confusion_matrix

def eval_at_05(model, X_test, y_test, name):
    """Print precision, recall and the confusion matrix at a fixed 0.5 threshold.

    Parameters
    ----------
    model : fitted estimator exposing `predict_proba`.
    X_test, y_test : held-out inputs and their true binary labels.
    name : str
        Title printed above the metrics.
    """
    positive_scores = model.predict_proba(X_test)[:, 1]
    # A row is predicted positive when its class-1 score reaches 0.5.
    flagged = (positive_scores >= 0.5).astype(int)

    print("\n", name)
    precision = precision_score(y_test, flagged)
    print("Precision:", round(precision, 3))
    recall = recall_score(y_test, flagged)
    print("Recall:   ", round(recall, 3))
    matrix = confusion_matrix(y_test, flagged)
    print("CM:\n", matrix)


eval_at_05(pipe_log, X_test, y_test, name="Logistic")
eval_at_05(pipe_rf, X_test, y_test, name="RF")
eval_at_05(pipe_xgb, X_test, y_test, name="XGB")


 Logistic
Precision: 0.732
Recall:    0.655
CM:
 [[374  68]
 [ 98 186]]

 RF
Precision: 0.761
Recall:    0.651
CM:
 [[384  58]
 [ 99 185]]

 XGB
Precision: 0.739
Recall:    0.669
CM:
 [[375  67]
 [ 94 190]]


In [26]:
from sklearn.metrics import roc_auc_score

# `clone` is used below but is not imported by any of the cells above this one,
# so on a fresh kernel this cell would otherwise fail with NameError.
from sklearn.base import clone


def auc_neutralize_cols(model, X_train, X_test, y_train, y_test, cols):
    """Refit a copy of `model` with some columns blanked out and return the test ROC AUC.

    Each column in `cols` that exists in `X_train` is replaced by the constant
    string "NEUTRAL" in copies of both frames, so it carries no information.
    The caller's frames and the passed-in `model` are left untouched.

    Parameters
    ----------
    model : estimator or pipeline supporting `fit` and `predict_proba`.
    X_train, X_test : pandas.DataFrame
        Training and held-out inputs.
    y_train, y_test : array-like
        Matching binary targets.
    cols : iterable of column labels to neutralize; labels missing from
        `X_train` are skipped.

    Returns
    -------
    float
        ROC AUC of the refitted clone on the neutralized test frame.
    """
    Xtr = X_train.copy()
    Xte = X_test.copy()

    for c in cols:
        if c in Xtr.columns:
            # Assigning a scalar replaces the whole column, so casting it to str first
            # (as the previous version did) had no effect on the result.
            Xtr[c] = "NEUTRAL"
            Xte[c] = "NEUTRAL"

    # Fit an unfitted copy so the already-trained `model` is not overwritten.
    m = clone(model)
    m.fit(Xtr, y_train)
    p = m.predict_proba(Xte)[:, 1]
    return roc_auc_score(y_test, p)


suspicious = ["Tuition fees up to date", "Debtor"]
print("AUC after neutralizing:", auc_neutralize_cols(pipe_xgb, X_train, X_test, y_train, y_test, suspicious))

AUC after neutralizing: 0.7857131476642661


In [30]:
from sklearn.base import clone
from sklearn.metrics import roc_auc_score

def auc_neutralize_cols(model, X_train, X_test, y_train, y_test, cols):
    """Return the test ROC AUC of a fresh clone of `model` trained without `cols`.

    Columns named in `cols` that are present in `X_train` are overwritten with
    the constant "NEUTRAL" in copies of the train and test frames; the clone is
    then fitted on the neutralized training copy and scored on the test copy.
    """
    train_neutral = X_train.copy()
    test_neutral = X_test.copy()

    # Only labels found in the training frame are neutralized.
    for column in (c for c in cols if c in train_neutral.columns):
        train_neutral[column] = "NEUTRAL"
        test_neutral[column] = "NEUTRAL"

    refit = clone(model)
    refit.fit(train_neutral, y_train)
    positive_scores = refit.predict_proba(test_neutral)[:, 1]
    return roc_auc_score(y_test, positive_scores)

# Column groups to neutralize one at a time; each key is the label printed next to its AUC.
tests = {
    "Only tuition+debtor": ["Tuition fees up to date", "Debtor"],
    "Course only": ["Course"],
    "Course + Application": ["Course", "Application mode", "Application order"],
    "Parents info": [
        "Mother's qualification",
        "Father's qualification",
        "Mother's occupation",
        "Father's occupation",
    ],
    "Socio": ["Scholarship holder", "Displaced", "Educational special needs"],
    "Macro": ["GDP", "Inflation rate", "Unemployment rate"],
}

# Reference point: AUC of the fitted XGBoost pipeline with nothing neutralized.
base_auc = roc_auc_score(
    y_true=y_test,
    y_score=pipe_xgb.predict_proba(X_test)[:, 1],
)
print("Base AUC:", base_auc)

# Refit once per group and report the AUC that remains.
for name, cols in tests.items():
    auc = auc_neutralize_cols(pipe_xgb, X_train, X_test, y_train, y_test, cols=cols)
    print(f"AUC after neutralizing {name}: {auc:.3f}")

Base AUC: 0.8562392454273151
AUC after neutralizing Only tuition+debtor: 0.786
AUC after neutralizing Course only: 0.821
AUC after neutralizing Course + Application: 0.806
AUC after neutralizing Parents info: 0.852
AUC after neutralizing Socio: 0.846
AUC after neutralizing Macro: 0.854


In [31]:
from sklearn.inspection import permutation_importance
import pandas as pd

# Permutation importance of each raw input column, measured as the change in test ROC AUC.
perm = permutation_importance(
    pipe_xgb,
    X_test,
    y_test,
    n_repeats=10,
    random_state=42,
    scoring="roc_auc",
)

# Mean importance per column, largest first.
imp = (
    pd.DataFrame({"feature": X_test.columns, "perm_importance": perm.importances_mean})
    .sort_values("perm_importance", ascending=False)
)
print(imp.head(20))

                       feature  perm_importance
14     Tuition fees up to date         0.117068
3                       Course         0.063766
16          Scholarship holder         0.033812
1             Application mode         0.024726
13                      Debtor         0.014381
15                      Gender         0.013195
19           Unemployment rate         0.011546
17           Age at enrollment         0.009025
9          Mother's occupation         0.005556
10         Father's occupation         0.003822
5       Previous qualification         0.003803
7       Mother's qualification         0.001453
8       Father's qualification         0.001118
11                   Displaced         0.000752
20              Inflation rate         0.000311
12   Educational special needs         0.000069
6                  Nacionality         0.000057
21                         GDP        -0.000007
18               International        -0.000118
4   Daytime/evening attendance        -0

In [37]:
# Neutralize the listed columns all at once.
big_cols = [
    "Tuition fees up to date",
    "Debtor",
    "Course",
    "Application mode",
    "Application order",
]
auc_big = auc_neutralize_cols(pipe_xgb, X_train, X_test, y_train, y_test, cols=big_cols)
print("AUC after neutralizing BIG drivers:", auc_big)

AUC after neutralizing BIG drivers: 0.7243762347842713


In [38]:
from sklearn.base import clone
from sklearn.metrics import precision_score, recall_score, confusion_matrix, roc_auc_score
import numpy as np

def eval_model_probs(model, Xtr, Xte, ytr, yte, thr=0.5):
    """Fit `model` on the training data and score it on the test data.

    The estimator passed in is fitted in place. Class-1 scores from
    `predict_proba` are turned into hard labels with `score >= thr`.

    Returns
    -------
    dict
        Keys "AUC", "Precision", "Recall" and "CM" (confusion matrix).
    """
    model.fit(Xtr, ytr)
    positive_scores = model.predict_proba(Xte)[:, 1]
    flagged = (positive_scores >= thr).astype(int)

    metrics = {}
    metrics["AUC"] = roc_auc_score(yte, positive_scores)
    metrics["Precision"] = precision_score(yte, flagged)
    metrics["Recall"] = recall_score(yte, flagged)
    metrics["CM"] = confusion_matrix(yte, flagged)
    return metrics

def neutralize(X, cols):
    """Return a copy of `X` in which each listed column holds the constant "NEUTRAL".

    Labels in `cols` that are not columns of `X` are ignored; `X` itself is
    not modified.
    """
    neutral = X.copy()
    for column in (c for c in cols if c in neutral.columns):
        neutral[column] = "NEUTRAL"
    return neutral

big_cols = [
    "Tuition fees up to date",
    "Debtor",
    "Course",
    "Application mode",
    "Application order",
]

# Full: a fresh clone trained on the untouched inputs.
full_res = eval_model_probs(
    model=clone(pipe_xgb), Xtr=X_train, Xte=X_test, ytr=y_train, yte=y_test, thr=0.5
)

# Neutralized: the same model trained with the listed columns blanked out in both splits.
Xtr_n, Xte_n = (neutralize(split, big_cols) for split in (X_train, X_test))
neut_res = eval_model_probs(
    model=clone(pipe_xgb), Xtr=Xtr_n, Xte=Xte_n, ytr=y_train, yte=y_test, thr=0.5
)

print("FULL:", full_res)
print("NEUT:", neut_res)


FULL: {'AUC': 0.8562392454273151, 'Precision': 0.7392996108949417, 'Recall': 0.6690140845070423, 'CM': array([[375,  67],
       [ 94, 190]])}
NEUT: {'AUC': 0.7243762347842713, 'Precision': 0.6085271317829457, 'Recall': 0.5528169014084507, 'CM': array([[341, 101],
       [127, 157]])}
