In [2]:
import math

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBRegressor

In [3]:
# Load the raw dataset from a path relative to the notebook's working directory.
df = pd.read_csv('data/raw_dataset.csv')
# Display (rows, columns) as the cell output to sanity-check the load.
df.shape

(4424, 35)

In [4]:
prev_col = "Previous qualification"

# Merge selected "Previous qualification" codes into other codes:
# 15 -> 3, 17 -> 4, and 8, 9, 10, 11 -> 7.
prev_map = {15: 3, 17: 4, **dict.fromkeys((8, 9, 10, 11), 7)}

# Codes that are not keys of prev_map are left as they are.
df[prev_col] = df[prev_col].replace(to_replace=prev_map)

# Show how many rows carry each code after the merge, ordered by code.
print(
    "Previous qualification value counts after merge:",
    df[prev_col].value_counts().sort_index(),
    sep="\n",
)

Previous qualification value counts after merge:
Previous qualification
1     3717
2       23
3      166
4       14
5        1
6       16
7       63
12     162
13       7
14     219
16      36
Name: count, dtype: int64


In [5]:
# Columns holding the parents' qualification codes.
m_f_qual_cols = ["Mother's qualification", "Father's qualification"]

# Raw qualification codes bucketed into 8 groups. A code's group number is the
# 1-based position of the inner list that contains it; every code from 1 to 34
# appears in exactly one list.
qual_groups = [
    [1, 7, 8, 10, 11, 12, 14, 15, 17, 19, 20],         # group 1
    [25, 26],                                          # group 2
    [9, 18, 21, 27, 28],                               # group 3
    [6, 13, 16, 22, 23, 29, 31, 32],                   # group 4
    [2, 3, 30],                                        # group 5
    [4, 33],                                           # group 6
    [5, 34],                                           # group 7
    [24],                                              # group 8
]

def build_group_map(groups):
    """Return a ``{code: group_number}`` lookup for a sequence of code groups.

    Groups are numbered from 1 in the order they are given. If a code occurs in
    more than one group, the last group that contains it wins.
    """
    return {
        code: group_no
        for group_no, members in enumerate(groups, start=1)
        for code in members
    }

qual_map = build_group_map(qual_groups)

# Recode both parents' qualification columns to the group numbers.
# NOTE(review): the same code -> group map is applied to both columns; confirm
# that the two columns really share one coding scheme.
df = df.assign(**{col: df[col].replace(qual_map) for col in m_f_qual_cols})

# Per-group row counts for each parent, ordered by group number.
print(
    "\nMother's qualification value counts after grouping:",
    df["Mother's qualification"].value_counts().sort_index(),
    sep="\n",
)
print(
    "\nFather's qualification value counts after grouping:",
    df["Father's qualification"].value_counts().sort_index(),
    sep="\n",
)


Mother's qualification value counts after grouping:
Mother's qualification
1    1263
2      15
3      17
4    2530
5     521
6      49
7      21
8       8
Name: count, dtype: int64

Father's qualification value counts after grouping:
Father's qualification
1    1928
2      10
3    1927
4      32
5     355
6      41
7      19
8     112
Name: count, dtype: int64


In [6]:
nat_col = "Nacionality"

# Keep only the rows whose nationality code equals 1; .copy() detaches the result
# from the unfiltered frame.
# NOTE(review): after this filter the column is constant, yet it is not excluded
# when the feature list is built later in the notebook.
df = df.loc[df[nat_col].eq(1)].copy()

print("\nShape after keeping only Nationality == 1 (Portugal):", df.shape)
print("Nationality value counts:", df[nat_col].value_counts(), sep="\n")


Shape after keeping only Nationality == 1 (Portugal): (4314, 35)
Nationality value counts:
Nacionality
1    4314
Name: count, dtype: int64


In [7]:
m_f_occ_cols = ["Mother's occupation", "Father's occupation"]

# Raw occupation codes bucketed into 8 groups (group number = 1-based position).
# Code 9 is not listed in any group, so rows carrying it keep the value 9.
occ_groups = [
    [2, 17, 18, 14, 15, 16],              # group 1
    [3, 19, 20, 21, 22],                  # group 2
    [4, 23, 24, 25, 26],                  # group 3
    [5, 27, 28, 29],                      # group 4
    [6, 30, 31, 32, 33],                  # group 5
    [7, 34, 35, 43],                      # group 6
    [8, 36, 37, 38, 39, 40, 41, 42, 44],  # group 7
    [1, 10, 11, 12, 13, 45, 46],          # group 8
]

occ_map = build_group_map(occ_groups)

# Recode both parents' occupation columns to the group numbers.
df = df.assign(**{col: df[col].replace(occ_map) for col in m_f_occ_cols})

# Per-group row counts for each parent, ordered by group number.
print(
    "\nMother's occupation value counts after grouping:",
    df["Mother's occupation"].value_counts().sort_index(),
    sep="\n",
)
print(
    "\nFather's occupation value counts after grouping:",
    df["Father's occupation"].value_counts().sort_index(),
    sep="\n",
)


Mother's occupation value counts after grouping:
Mother's occupation
1     112
2     325
3     350
4     835
5     541
6      87
7     271
8    1757
9      36
Name: count, dtype: int64

Father's occupation value counts after grouping:
Father's occupation
1     141
2     197
3     382
4     384
5     514
6     244
7     690
8    1452
9     310
Name: count, dtype: int64


In [8]:
# The twelve per-semester "Curricular units" columns (six per semester).
first_year_cols = [
    "Curricular units 1st sem (credited)",
    "Curricular units 1st sem (enrolled)",
    "Curricular units 1st sem (evaluations)",
    "Curricular units 1st sem (approved)",
    "Curricular units 1st sem (grade)",
    "Curricular units 1st sem (without evaluations)",
    "Curricular units 2nd sem (credited)",
    "Curricular units 2nd sem (enrolled)",
    "Curricular units 2nd sem (evaluations)",
    "Curricular units 2nd sem (approved)",
    "Curricular units 2nd sem (grade)",
    "Curricular units 2nd sem (without evaluations)",
]

# Force the columns to numeric; anything unparseable becomes NaN.
df[first_year_cols] = df[first_year_cols].apply(
    lambda series: pd.to_numeric(series, errors="coerce")
)

# Flag rows where every one of these columns is exactly 0. A NaN never equals 0,
# so a row containing NaN is not flagged.
mask_all_zero = df[first_year_cols].eq(0).all(axis="columns")

# Drop the flagged rows.
df = df.loc[~mask_all_zero].copy()

print("Shape after removing NAL observations:", df.shape)
print("Removed rows:", mask_all_zero.sum())

Shape after removing NAL observations: (4140, 35)
Removed rows: 174


In [9]:
# Display the frame's (rows, columns) again after the filtering cells above.
df.shape

(4140, 35)

In [10]:
# Shared name prefixes of the per-semester columns.
semA_prefix = "Curricular units 1st sem ("
semB_prefix = "Curricular units 2nd sem ("

# Regression target (2nd-semester grade) and the two semester columns that are
# referenced by name further down.
target_col = f"{semB_prefix}grade)"
semB_enrolled_col = f"{semB_prefix}enrolled)"
semA_grade_col = f"{semA_prefix}grade)"

macro_numeric = ["GDP", "Inflation rate", "Unemployment rate"]

# Fail fast, naming the first required column that is absent from the frame.
for col in (target_col, semB_enrolled_col, semA_grade_col, *macro_numeric):
    if col in df.columns:
        continue
    raise ValueError(f"Missing required column: {col}")

In [11]:
# Columns belonging to each semester, identified by their name prefix.
semA_cols = [name for name in df.columns if name.startswith(semA_prefix)]
semB_cols = [name for name in df.columns if name.startswith(semB_prefix)]

# Of the 2nd-semester columns, only the target and the "enrolled" count survive;
# every other one is excluded, along with the "Target" column.
semB_to_drop = [
    name for name in semB_cols
    if name != target_col and name != semB_enrolled_col
]
cols_to_exclude = {*semB_to_drop, "Target"}

# Features = every remaining column except the regression target itself.
feature_cols = [
    name for name in df.columns
    if not (name in cols_to_exclude or name == target_col)
]
print(f"Features kept: {len(feature_cols)}")


Features kept: 29


In [12]:
# Enforce dtypes: only the macro indicators and the 1st-semester grade are treated
# as numeric; every other feature is treated as categorical.
# NOTE(review): this makes columns such as "Age at enrollment" and the
# 1st-semester unit counts categorical (they get one-hot encoded downstream) --
# confirm this is intended.
numeric_cols = [
    name for name in feature_cols
    if name == semA_grade_col or name in macro_numeric
]
categorical_cols = [name for name in feature_cols if name not in numeric_cols]

df[numeric_cols] = df[numeric_cols].apply(
    lambda series: pd.to_numeric(series, errors="coerce")
)
df = df.astype({name: "category" for name in categorical_cols})

# Target: coerce to numeric and keep only rows where the 2nd-semester grade is
# present.
y = pd.to_numeric(df[target_col], errors="coerce")
mask = ~y.isna()
X, y = df.loc[mask, feature_cols], y[mask]

print("Modeling shape:", X.shape)


Modeling shape: (4140, 29)


In [13]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [14]:
# Numeric branch: fill gaps with the median, then standardise.
num_pipe = Pipeline(steps=[
    ("simpleimputer", SimpleImputer(strategy="median")),
    ("standardscaler", StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot encode.
# Categories not seen during fit are ignored at transform time. The step name
# "onehotencoder" is looked up by get_feature_names(), so it must stay as is.
cat_pipe = Pipeline(steps=[
    ("simpleimputer", SimpleImputer(strategy="most_frequent")),
    ("onehotencoder", OneHotEncoder(handle_unknown="ignore")),
])

# Numeric columns are listed first, then the categorical columns.
preprocess = ColumnTransformer(
    transformers=[
        ("num", num_pipe, numeric_cols),
        ("cat", cat_pipe, categorical_cols),
    ]
)


In [15]:
# The three regressors being compared; seeds are fixed for reproducibility.
lin = LinearRegression()
rf  = RandomForestRegressor(n_estimators=400, max_depth=None,
                            min_samples_split=3, random_state=42, n_jobs=-1)
xgb = XGBRegressor(n_estimators=600, learning_rate=0.05, max_depth=6,
                   subsample=0.9, colsample_bytree=0.9, reg_lambda=1.0,
                   objective="reg:squarederror", random_state=42, n_jobs=-1)

# Give every pipeline its own copy of the (unfitted) preprocessor. Pipeline.fit
# fits its steps in place, so a single shared ColumnTransformer instance would be
# re-fitted by whichever pipeline was trained last, and all three pipelines would
# then silently expose that same fitted state.
pipe_lin = Pipeline([("prep", clone(preprocess)), ("model", lin)])
pipe_rf  = Pipeline([("prep", clone(preprocess)), ("model", rf)])
pipe_xgb = Pipeline([("prep", clone(preprocess)), ("model", xgb)])


In [16]:
def eval_model(model, name):
    """Fit ``model`` on the training split, print its metrics and return them.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.

    Parameters
    ----------
    model : estimator with ``fit``/``predict`` (e.g. a ``Pipeline``)
        Fitted in place on ``X_train``/``y_train``.
    name : str
        Label printed as the header of the report.

    Returns
    -------
    dict
        Keys ``"model"`` (the label), ``"r2_test"``, ``"cv_r2"``, ``"mae"`` and
        ``"rmse"``. Returned in addition to being printed so the results of
        several models can be collected and compared programmatically.
    """
    model.fit(X_train, y_train)
    pred = model.predict(X_test)

    metrics = {
        "model": name,
        # Hold-out metrics on the test split.
        "r2_test": r2_score(y_test, pred),
        "mae": mean_absolute_error(y_test, pred),
        "rmse": math.sqrt(mean_squared_error(y_test, pred)),
        # Mean R^2 over a 5-fold cross-validation restricted to the training split.
        "cv_r2": cross_val_score(model, X_train, y_train, cv=5, scoring="r2").mean(),
    }

    print("\n" + "="*68)
    print(name)
    print("="*68)
    print(f"R^2 (test):             {metrics['r2_test']:.3f}")
    print(f"CV R^2 (train, 5-fold): {metrics['cv_r2']:.3f}")
    print(f"MAE:                    {metrics['mae']:.3f}")
    print(f"RMSE:                   {metrics['rmse']:.3f}")
    return metrics

# Fit and report each pipeline in turn, pairing every label with its pipeline.
for nm, mdl in zip(
    ("Linear Regression", "Random Forest", "XGBoost"),
    (pipe_lin, pipe_rf, pipe_xgb),
):
    eval_model(model=mdl, name=nm)




Linear Regression
R^2 (test):             0.715
CV R^2 (train, 5-fold): 0.702
MAE:                    1.602
RMSE:                   2.616

Random Forest
R^2 (test):             0.715
CV R^2 (train, 5-fold): 0.701
MAE:                    1.384
RMSE:                   2.616

XGBoost
R^2 (test):             0.717
CV R^2 (train, 5-fold): 0.693
MAE:                    1.435
RMSE:                   2.606


In [17]:
def get_feature_names(prep, cat_cols, num_cols):
    """Return the output feature names of a fitted preprocessor, in column order.

    Parameters
    ----------
    prep : fitted transformer
        Must expose ``named_transformers_["cat"]`` whose ``named_steps`` contains
        an ``"onehotencoder"`` step providing ``get_feature_names_out``.
    cat_cols : sequence of str
        Categorical input column names, passed to the encoder to build the
        one-hot output names.
    num_cols : iterable of str
        Numeric column names; these are placed first in the result.

    Returns
    -------
    list of str
        The numeric names followed by the one-hot encoded names.
    """
    ohe = prep.named_transformers_["cat"].named_steps["onehotencoder"]
    cat_names = list(ohe.get_feature_names_out(cat_cols))
    # list(...) accepts any iterable of names (tuple, Index, array) instead of
    # requiring a plain list, and never aliases the caller's object.
    return list(num_cols) + cat_names

# RF
# eval_model() has already fitted this pipeline on the same split; it is refitted
# here so the cell does not depend on that earlier call.
pipe_rf.fit(X_train, y_train)
rf_names = get_feature_names(pipe_rf.named_steps["prep"], categorical_cols, numeric_cols)
rf_imp = (pd.DataFrame({"feature": rf_names,
                        "importance": pipe_rf.named_steps["model"].feature_importances_})
          .sort_values("importance", ascending=False))
print("\nTop RF importances:\n", rf_imp.head(20))

# XGB (gain)
pipe_xgb.fit(X_train, y_train)
booster = pipe_xgb.named_steps["model"].get_booster()
# Maps booster feature keys to their gain; the keys are parsed below as a
# one-character prefix followed by the feature's positional index.
gain = booster.get_score(importance_type="gain")

xgb_names = get_feature_names(pipe_xgb.named_steps["prep"], categorical_cols, numeric_cols)
xgb_imp = (pd.DataFrame([(xgb_names[int(k[1:])], v) for k, v in gain.items()],
                        columns=["feature", "gain_importance"])
           .sort_values("gain_importance", ascending=False))
print("\nTop XGB gain importances:\n", xgb_imp.head(20))

# collapse dummy columns back to original variable
# Only one-hot names carry a trailing "_<category>" suffix. Numeric features keep
# their original name, so they are left untouched: stripping the text after the
# last "_" would truncate a numeric column whose own name contains "_".
numeric_names = set(numeric_cols)
xgb_imp["base_feature"] = [
    feat if feat in numeric_names else feat.rsplit("_", 1)[0]
    for feat in xgb_imp["feature"]
]
print("\nTop XGB by variable:\n",
      xgb_imp.groupby("base_feature")["gain_importance"].sum()
             .sort_values(ascending=False).head(20))


Top RF importances:
                                        feature  importance
0             Curricular units 1st sem (grade)    0.448645
233      Curricular units 1st sem (approved)_0    0.202155
234      Curricular units 1st sem (approved)_1    0.054018
235      Curricular units 1st sem (approved)_2    0.023134
1                            Unemployment rate    0.012167
2                               Inflation rate    0.011142
3                                          GDP    0.009294
22                         Application mode_14    0.008551
236      Curricular units 1st sem (approved)_3    0.006049
209  Curricular units 1st sem (evaluations)_10    0.005269
204   Curricular units 1st sem (evaluations)_5    0.005051
40                                    Course_7    0.004717
237      Curricular units 1st sem (approved)_4    0.004586
128                       Age at enrollment_34    0.004299
205   Curricular units 1st sem (evaluations)_6    0.004184
208   Curricular units 1st sem (ev