# TDT4259

Student Graduation Prediction

In [106]:
import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.inspection import permutation_importance

from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from catboost import CatBoostClassifier, Pool

import matplotlib.pyplot as plt
import shap

In [58]:
df = pd.read_csv('data/data.csv',sep=";")

le = LabelEncoder()
y = le.fit_transform(df['Target'])
class_names = list(le.classes_)
X = df.drop(columns=['Target'])


In [59]:
class FeatureEngineer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        # ratios
        X["approval_ratio_1st"] = X["Curricular units 1st sem (approved)"] / X["Curricular units 1st sem (enrolled)"].replace(0, np.nan)
        X["approval_ratio_2nd"] = X["Curricular units 2nd sem (approved)"] / X["Curricular units 2nd sem (enrolled)"].replace(0, np.nan)
        # avg grade across semesters (ignore zeros)
        X["avg_grade"] = X[["Curricular units 1st sem (grade)", "Curricular units 2nd sem (grade)"]].replace(0, np.nan).mean(axis=1)
        # totals & overall ratio
        X["total_approved"] = X["Curricular units 1st sem (approved)"] + X["Curricular units 2nd sem (approved)"]
        X["total_enrolled"] = X["Curricular units 1st sem (enrolled)"] + X["Curricular units 2nd sem (enrolled)"]
        X["total_approval_ratio"] = X["total_approved"] / X["total_enrolled"].replace(0, np.nan)
        return X.fillna(0.0)

X = X.fillna(0)

In [60]:
categorical_cols = [
    "Application mode", "Application order", "Course",
    "Previous qualification", "Nationality", "Mother's qualification",
    "Father's qualification", "Mother's occupation", "Father's occupation",
    "Displaced", "Debtor", "Tuition fees up to date", "Scholarship holder",
    "Gender", "International", "Marital status", "Daytime/evening attendance\t",
    "Educational special needs"
]
categorical_cols = [c for c in categorical_cols if c in X.columns]
numeric_cols = [c for c in X.columns if c not in categorical_cols]

In [61]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

In [99]:
preprocess = ColumnTransformer(
    transformers=[
        ("num", "passthrough", numeric_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
    ],
    remainder="drop",
    sparse_threshold=0.3,
)

classes = np.unique(y_train)
class_counts = np.bincount(y_train)
class_weights = (class_counts.sum() / (len(classes) * class_counts))
sample_weight = class_weights[y_train]

xgb = XGBClassifier(
    n_estimators=400,
    learning_rate=0.08,
    max_depth=6,
    subsample=0.9,
    colsample_bytree=0.9,
    objective="multi:softprob",
    num_class=len(classes),
    eval_metric="mlogloss",
    random_state=42,
    n_jobs=-1,
)

pipe_xgb = Pipeline(steps=[
    ("fe", FeatureEngineer()),
    # ("pre", preprocessor),
    ("clf", xgb),
])

pipe_xgb.fit(X_train, y_train, clf__sample_weight=sample_weight)
y_pred = pipe_xgb.predict(X_test)

print("=== XGBoost + OHE + Scaler (class-weighted) ===")
print("Accuracy:", f"{accuracy_score(y_test, y_pred):.4f}")
print(classification_report(y_test, y_pred, target_names=class_names))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

=== XGBoost + OHE + Scaler (class-weighted) ===
Accuracy: 0.7650
              precision    recall  f1-score   support

     Dropout       0.82      0.74      0.77       284
    Enrolled       0.49      0.54      0.52       159
    Graduate       0.84      0.86      0.85       442

    accuracy                           0.76       885
   macro avg       0.72      0.71      0.71       885
weighted avg       0.77      0.76      0.77       885

Confusion matrix:
 [[209  42  33]
 [ 33  86  40]
 [ 14  46 382]]


In [102]:
smote = SMOTE(random_state=42)

pipe_xgb_smote = ImbPipeline(steps=[
    ("fe", FeatureEngineer()),
    ("pre", preprocess),
    ("smote", smote),
    ("clf", xgb),
])

pipe_xgb_smote.fit(X_train, y_train)
y_pred_sm = pipe_xgb_smote.predict(X_test)

print("\n=== XGBoost + OHE + Scaler + SMOTE ===")
print("Accuracy:", f"{accuracy_score(y_test, y_pred_sm):.4f}")
print(classification_report(y_test, y_pred_sm, target_names=class_names))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred_sm))




=== XGBoost + OHE + Scaler + SMOTE ===
Accuracy: 0.7684
              precision    recall  f1-score   support

     Dropout       0.80      0.74      0.77       284
    Enrolled       0.53      0.47      0.49       159
    Graduate       0.82      0.90      0.86       442

    accuracy                           0.77       885
   macro avg       0.72      0.70      0.71       885
weighted avg       0.76      0.77      0.76       885

Confusion matrix:
 [[209  38  37]
 [ 35  74  50]
 [ 17  28 397]]


In [None]:
catboost_preprocess = ColumnTransformer(
    transformers=[
        ("num", "passthrough", numeric_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse=False), categorical_cols),
    ],
    remainder="drop",
)

catboost_smote = SMOTE(random_state=42)

catboost_clf = CatBoostClassifier(
    iterations=500,
    learning_rate=0.05,
    depth=6,
    loss_function="MultiClass",
    random_state=42,
    verbose=0,
)

pipe_catboost_smote = ImbPipeline(steps=[
    ("fe", FeatureEngineer()),
    ("pre", catboost_preprocess),
    ("smote", catboost_smote),
    ("clf", catboost_clf),
])

pipe_catboost_smote.fit(X_train, y_train)
y_pred_cat = pipe_catboost_smote.predict(X_test)

print("\n=== CatBoost + OHE + Scaler + SMOTE ===")
print("Accuracy:", f"{accuracy_score(y_test, y_pred_cat):.4f}")
print(classification_report(y_test, y_pred_cat, target_names=class_names))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred_cat))
