In [39]:
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# This notebook aims to predict the probability of student dropout at different stages, by solving a binary classification problem (dropout = 1, otherwise 0).
# The models we will use:
# 1. LogisticRegression
# 2. RandomForestClassifier
# 3. XGBClassifier (XGBoost)

In [40]:
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

In [41]:
# Load the raw dataset from the repository-relative data folder and show its
# (rows, columns) shape as the cell output.
raw_dataset_path = 'data/raw_dataset.csv'
data = pd.read_csv(raw_dataset_path)
data.shape

(4424, 35)

In [42]:
# For now, leave out every row whose Target is 'Enrolled' and work only with
# the remaining rows.
still_enrolled = data['Target'] == 'Enrolled'
data = data[~still_enrolled]
data.shape

(3630, 35)

In [43]:
# Relative frequency of each Target value (unsorted), rounded to two decimals.
target_share = data['Target'].value_counts(sort=False, normalize=True)
target_share.round(2)

Target
Dropout     0.39
Graduate    0.61
Name: proportion, dtype: float64

### Prepare data for modeling

1. Handle features with high cardinality
2. Change mapping to ordinal values (when it is possible and/or a good idea)
3. Fix potential data leakage / weird data values
4. Align data types

In [44]:
# Name of the outcome column in the raw dataset.
label = 'Target'

# Per-student attributes followed by three macro-economic indicators.
# None of these columns are curricular-unit results.
# Column names are kept exactly as spelled in this list (including
# 'Nacionality'), because they are used to select columns from the data.
individual_features = [
    'Marital status',
    'Application mode',
    'Application order',
    'Course',
    'Daytime/evening attendance',
    'Previous qualification',
    'Nacionality',
    "Mother's qualification",
    "Father's qualification",
    "Mother's occupation",
    "Father's occupation",
    'Displaced',
    'Educational special needs',
    'Debtor',
    'Tuition fees up to date',
    'Gender',
    'Scholarship holder',
    'Age at enrollment',
    'International',
    'Unemployment rate',
    'Inflation rate',
    'GDP',
]

# Both semesters expose the same six curricular-unit measures; only the
# semester tag inside the column name differs.
_semester_measures = (
    'credited',
    'enrolled',
    'evaluations',
    'approved',
    'grade',
    'without evaluations',
)

first_sem_features = [
    f'Curricular units 1st sem ({measure})' for measure in _semester_measures
]
second_sem_features = [
    f'Curricular units 2nd sem ({measure})' for measure in _semester_measures
]

In [45]:
# Independent copy holding only the individual-level features and the label,
# so later edits never touch `data`.
first_stage_columns = individual_features + [label]
data_first_stage = data[first_stage_columns].copy()
data_first_stage.shape

(3630, 23)

In [46]:
# Every individual feature goes through the categorical branch; the numeric
# branch is left empty.
# NOTE(review): this also casts number-like columns (e.g. 'Age at enrollment',
# 'Unemployment rate', 'GDP') to category, so each distinct value will get its
# own one-hot column downstream. Confirm this is intended.
categorical_cols = individual_features
numeric_cols = []

# Cast all categorical columns in one call; other columns keep their dtype.
data_first_stage = data_first_stage.astype(
    {feature: "category" for feature in categorical_cols}
)

In [47]:
# Binary target: 1 when the original label equals 'Dropout', 0 otherwise.
# The text label column is removed and `y` is appended as the last column.
is_dropout = data_first_stage[label] == 'Dropout'
data_first_stage = (
    data_first_stage
    .drop(columns=[label])
    .assign(y=is_dropout.astype(int))
)

In [48]:
from sklearn.model_selection import train_test_split

# Separate the target from the features, then hold out 20% of the rows.
# The split is stratified on y and seeded, so class proportions are kept and
# the partition is repeatable.
y = data_first_stage['y']
X = data_first_stage.drop(columns=['y'])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [49]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# One-hot encode the categorical columns. Categories not seen during fit are
# ignored at transform time, and the result is a dense array.
# No scaler branch is registered; a ('num', StandardScaler(), numeric_cols)
# entry can be added to `transformers` once numeric_cols is populated.
one_hot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

preprocess = ColumnTransformer(
    transformers=[('cat', one_hot, categorical_cols)],
)

In [50]:
# Sanity check: how many columns does the encoding produce on the full frame?
# Fit a throwaway clone rather than `preprocess` itself, so the shared
# transformer that later goes into the model pipelines is not left fitted on
# rows that include the held-out test set.
transformed_data = clone(preprocess).fit_transform(data_first_stage)
transformed_data.shape

(3630, 309)

In [51]:
# import pandas as pd
# from sklearn.preprocessing import OneHotEncoder
#
# encoder = OneHotEncoder(sparse_output=False)
#
# one_hot_encoded = encoder.fit_transform(data_first_stage[categorical_cols])
#
# one_hot_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out(categorical_cols))
# df_sklearn_encoded = pd.concat([data_first_stage.drop(categorical_cols, axis=1), one_hot_df], axis=1)
#
# print(f"One-Hot Encoded Data using Scikit-Learn:\n{df_sklearn_encoded}\n")

In [52]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier


# Linear baseline, with the solver allowed up to 2000 iterations.
log_reg = LogisticRegression(max_iter=2000)

# Random forest: 300 unbounded-depth trees, seeded for repeatability.
rf_params = {
    'n_estimators': 300,
    'max_depth': None,
    'min_samples_split': 4,
    'random_state': 42,
}
rf = RandomForestClassifier(**rf_params)

# Gradient-boosted trees with row and column subsampling.
# NOTE(review): unlike the random forest, no random_state is passed here.
# Confirm the library default gives the reproducibility you need.
xgb_params = {
    'n_estimators': 400,
    'learning_rate': 0.05,
    'max_depth': 5,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'eval_metric': 'logloss',
}
xgb = XGBClassifier(**xgb_params)

In [53]:
from sklearn.pipeline import Pipeline

# Give each pipeline its own unfitted copy of the preprocessing step.
# Pipeline stores the step objects it is given without copying them, so
# passing the same `preprocess` instance to all three would make every
# .fit() call re-fit the transformer the other pipelines are holding.
pipe_log = Pipeline([('prep', clone(preprocess)), ('model', log_reg)])
pipe_rf  = Pipeline([('prep', clone(preprocess)), ('model', rf)])
pipe_xgb = Pipeline([('prep', clone(preprocess)), ('model', xgb)])

In [54]:
# Train the three pipelines on the training split, in the same order as before.
for candidate_pipeline in (pipe_log, pipe_rf, pipe_xgb):
    candidate_pipeline.fit(X_train, y_train)

# fit() returns the pipeline itself, so ending the cell with the last fitted
# pipeline keeps the same displayed output.
pipe_xgb

0,1,2
,steps,"[('prep', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [55]:
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix

def evaluate(model, name, X=None, y=None):
    """Print accuracy, ROC AUC and the confusion matrix of a fitted classifier.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict`` and ``predict_proba``.
    name : str
        Label used in the printed header.
    X, y : optional
        Features and true labels to score against. When omitted, the
        notebook-level ``X_test`` / ``y_test`` hold-out split is used, which
        is what this function always did before these parameters existed.

    Returns
    -------
    None
        Results are printed only, so calling this as the last statement of a
        cell produces no extra output.
    """
    # Fall back to the notebook's hold-out split when no data is supplied.
    if X is None:
        X = X_test
    if y is None:
        y = y_test

    pred = model.predict(X)
    # The second predict_proba column is used as the score for the AUC.
    prob = model.predict_proba(X)[:, 1]

    print(f"\n{name} Results:")
    print("Accuracy:", round(accuracy_score(y, pred),2))
    print("AUC:", round(roc_auc_score(y, prob),2))
    print("Confusion Matrix:\n", confusion_matrix(y, pred))

# Report the hold-out metrics of every fitted pipeline, in a fixed order.
fitted_models = (
    (pipe_log, "Logistic Regression"),
    (pipe_rf, "Random Forest"),
    (pipe_xgb, "XGBoost"),
)
for fitted_pipeline, display_name in fitted_models:
    evaluate(fitted_pipeline, display_name)


Logistic Regression Results:
Accuracy: 0.77
AUC: 0.85
Confusion Matrix:
 [[374  68]
 [ 98 186]]

Random Forest Results:
Accuracy: 0.78
AUC: 0.86
Confusion Matrix:
 [[384  58]
 [ 99 185]]

XGBoost Results:
Accuracy: 0.78
AUC: 0.86
Confusion Matrix:
 [[375  67]
 [ 94 190]]


### Problem 1: predict dropout before first year


### Problem 2: predict dropout after first semester


### Problem 3: predict dropout after second semester

