<a href="https://colab.research.google.com/github/ysf-s/Hi-Paris-AI-Data-Science-Hackathon-Group-42-/blob/main/Prediction_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive (Colab only); the CSV inputs are read from /content/drive/MyDrive below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# PISA Math Score Prediction Pipeline

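This notebook builds a two-stage model to predict math scores: a classifier first decides whether a student's score is zero or non-zero, and a regressor then predicts the score for students classified as non-zero. Separate models are trained for each PISA year (2015, 2018, 2022).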

## 1. Load Data and Install Libraries

In [None]:
import pandas as pd

# Single place to edit when the CSV files live somewhere else.
DATA_DIR = '/content/drive/MyDrive'

# Load datasets
y_train = pd.read_csv(f'{DATA_DIR}/Copy of y_train.csv')
X_train = pd.read_csv(f'{DATA_DIR}/Copy of X_train.csv')
X_test = pd.read_csv(f'{DATA_DIR}/Copy of X_test.csv')

# Features and targets are paired row by row in the later cells, so a length
# mismatch here would silently corrupt every model trained below.
assert len(X_train) == len(y_train), "X_train and y_train have different row counts"

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")

X_train shape: (1172086, 307)
X_test shape: (586044, 307)
y_train shape: (1172086, 2)


In [None]:
!pip install lightgbm xgboost

Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-manylinux_2_28_x86_64.whl.metadata (17 kB)
Collecting xgboost
  Downloading xgboost-3.1.2-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting nvidia-nccl-cu12 (from xgboost)
  Downloading nvidia_nccl_cu12-2.29.2-py3-none-manylinux_2_18_x86_64.whl.metadata (2.1 kB)
Downloading lightgbm-4.6.0-py3-none-manylinux_2_28_x86_64.whl (3.6 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m3.6/3.6 MB[0m [31m130.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m86.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading xgboost-3.1.2-py3-none-manylinux_2_28_x86_64.whl (115.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.9/115.9 MB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading nvidia_nccl_cu12-2.29.

## 2. Import Libraries

In [None]:
import numpy as np
import warnings
import lightgbm as lgb
import xgboost as xgb
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, r2_score
import re

# Silence every FutureWarning and UserWarning for the remainder of the notebook.
# NOTE(review): this also hides library deprecation notices — confirm that is intended.
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

## 3. Feature Engineering Functions

In [None]:
def engineer_psychometrics(df):
    """Add binary indicator features derived from questionnaire columns.

    Works on a copy; the input frame is not modified. Each feature is only
    created when its source column is present:

    - ``has_math_attitude``: 1 where ``MATHEASE`` is non-null.
    - ``math_confidence_present``: 1 where ``ST290`` is non-null.
    - ``teacher_feedback_present``: 1 where ``ST213`` is non-null.
    - ``does_homework``: 1 where ``ST296`` parses as a number greater than 0
      (missing or non-numeric values yield 0).

    Args:
        df: Input DataFrame.

    Returns:
        A new DataFrame with the indicator columns appended.
    """
    df = df.copy()

    # source column -> name of the "value is present" flag
    presence_flags = {
        'MATHEASE': 'has_math_attitude',
        'ST290': 'math_confidence_present',
        'ST213': 'teacher_feedback_present',
    }
    for source_col, flag_col in presence_flags.items():
        if source_col in df.columns:
            df[flag_col] = df[source_col].notna().astype(int)

    if 'ST296' in df.columns:
        try:
            df['does_homework'] = (pd.to_numeric(df['ST296'], errors='coerce') > 0).astype(int)
        except (TypeError, ValueError) as exc:
            # Still best-effort (the feature is simply skipped), but only for
            # conversion failures, and no longer silently. RuntimeWarning is
            # used so the message is not hidden by the notebook's
            # FutureWarning/UserWarning filters.
            warnings.warn(
                f"Skipping 'does_homework': could not convert ST296 ({exc})",
                RuntimeWarning,
            )
    return df

def engineer_features(df):
    """Derive timing, clustering, wealth and effort features on a copy of ``df``.

    Columns added:

    - ``started_math`` (only if ``math_q1_total_timing`` exists): 1 where that
      column is non-null.
    - ``total_sci_time`` / ``total_read_time``: row sums of the columns whose
      name contains ``science_q`` / ``reading_q`` together with
      ``total_timing`` (nulls counted as 0); ``total_global_time`` is their sum.
    - ``n_items_attempted``: number of non-null values across those columns.
    - ``std_response_time``: row-wise standard deviation across those columns,
      -1 where it is undefined.
    - ``student_persona``: KMeans label (7 clusters) on the standardized
      triple (total_global_time, n_items_attempted, std_response_time).
    - ``wealth_index``: count of non-null columns whose name starts with ``ST25``.
    - ``EFFORT1_clean``, ``wealth_x_effort``, ``time_x_effort`` (only if
      ``EFFORT1`` exists).

    Note: the scaler and KMeans are fit on whichever frame is passed in, so
    persona ids produced by two separate calls are not comparable.

    Args:
        df: Input DataFrame.

    Returns:
        A new DataFrame with the engineered columns appended.
    """
    out = df.copy()

    # Flag rows that have a recorded timing for the first math item.
    if 'math_q1_total_timing' in out.columns:
        out['started_math'] = out['math_q1_total_timing'].notna().astype(int)

    # Per-item timing columns of the science and reading sections.
    science_timing = [c for c in out.columns if 'science_q' in c and 'total_timing' in c]
    reading_timing = [c for c in out.columns if 'reading_q' in c and 'total_timing' in c]
    all_timing = science_timing + reading_timing

    out['total_sci_time'] = out[science_timing].fillna(0).sum(axis=1)
    out['total_read_time'] = out[reading_timing].fillna(0).sum(axis=1)
    out['total_global_time'] = out['total_sci_time'] + out['total_read_time']
    out['n_items_attempted'] = out[all_timing].notna().sum(axis=1)
    out['std_response_time'] = out[all_timing].std(axis=1).fillna(-1)

    # Cluster rows on the three behaviour summaries computed above.
    persona_inputs = out[['total_global_time', 'n_items_attempted', 'std_response_time']].fillna(0)
    standardized = StandardScaler().fit_transform(persona_inputs)
    persona_model = KMeans(n_clusters=7, random_state=42, n_init=10)
    out['student_persona'] = persona_model.fit_predict(standardized).astype(int)

    # Number of answered ST25* items.
    st25_cols = [c for c in out.columns if c.startswith('ST25')]
    out['wealth_index'] = out[st25_cols].notna().sum(axis=1)

    # Interactions with the numeric effort rating (unparseable/missing -> 0).
    if 'EFFORT1' in out.columns:
        effort = pd.to_numeric(out['EFFORT1'], errors='coerce').fillna(0)
        out['EFFORT1_clean'] = effort
        out['wealth_x_effort'] = out['wealth_index'] * effort
        out['time_x_effort'] = out['total_global_time'] * effort

    return out

def sanitize_cols(df):
    """Rename columns in place so they contain only letters, digits and underscores.

    Every column label is converted to ``str`` and each character outside
    ``[a-zA-Z0-9_]`` is replaced by ``_``. The same DataFrame object is returned.
    """
    invalid_char = re.compile(r'[^a-zA-Z0-9_]')
    df.columns = [invalid_char.sub('_', str(label)) for label in df.columns]
    return df

def smoothed_target_encoding(train_df, test_df, target, cat_col, alpha=5):
    """Add a smoothed target-mean encoding of ``cat_col`` to both frames (in place).

    For each category the encoding is
    ``(sum of target + alpha * global mean) / (count + alpha)``, computed from
    ``train_df``. Values that cannot be mapped fall back to the global mean.

    Args:
        train_df: Training frame; must contain ``cat_col`` and a column named
            ``target.name``.
        test_df: Frame to encode with the statistics learned from ``train_df``.
        target: Series used for the global mean; its ``name`` selects the
            target column inside ``train_df``.
        cat_col: Name of the categorical column to encode.
        alpha: Smoothing strength (pseudo-count given to the global mean).

    Returns:
        ``(train_df, test_df)`` — the same objects, each with a new
        ``TE_<cat_col>`` column.
    """
    prior = target.mean()
    stats = train_df.groupby(cat_col)[target.name].agg(['count', 'sum'])
    encoding = (stats['sum'] + (alpha * prior)) / (stats['count'] + alpha)

    encoded_name = f'TE_{cat_col}'
    for frame in (train_df, test_df):
        frame[encoded_name] = frame[cat_col].map(encoding).astype(float).fillna(prior)
    return train_df, test_df

def process_year_data(X_train, y_train, X_test):
    """Target-encode, add interactions, drop raw categoricals and align columns.

    Steps:

    1. Add smoothed target encodings (learned on the training rows) for
       ``CNTSCHID`` (alpha=5), ``student_persona`` (alpha=50) and, when
       present, ``MATHEASE`` (alpha=50).
    2. Add ``School_x_Effort = TE_CNTSCHID * EFFORT1_clean`` when
       ``EFFORT1_clean`` exists.
    3. Drop the raw categorical columns from both frames.
    4. Keep only the columns both frames share (in training order) and
       sanitize their names.

    The inputs are not modified; copies are processed and returned.

    Args:
        X_train: Training features; must contain ``CNTSCHID`` and
            ``student_persona``.
        y_train: Target values for ``X_train`` (assigned as a column, so a
            Series is aligned on index).
        X_test: Features to transform with the training statistics.

    Returns:
        ``(X_train_processed, X_test_processed)`` with identical columns.
    """
    # Private helper column; it is removed again before returning so the
    # target can never reach the models as a feature.
    target_col = '__te_target__'

    train_enc = X_train.copy()
    test_enc = X_test.copy()
    train_enc[target_col] = y_train
    target = train_enc[target_col]

    # (column, smoothing alpha) pairs to target-encode
    encodings = [('CNTSCHID', 5), ('student_persona', 50)]
    if 'MATHEASE' in train_enc.columns:
        encodings.append(('MATHEASE', 50))
    for cat_col, alpha in encodings:
        train_enc, test_enc = smoothed_target_encoding(train_enc, test_enc, target, cat_col, alpha=alpha)

    # Create interaction features
    if 'EFFORT1_clean' in train_enc.columns:
        train_enc['School_x_Effort'] = train_enc['TE_CNTSCHID'] * train_enc['EFFORT1_clean']
        test_enc['School_x_Effort'] = test_enc['TE_CNTSCHID'] * test_enc['EFFORT1_clean']

    # Drop raw categorical columns from BOTH frames. The previous version
    # dropped the notebook-global `cols_to_ban` from the test frame by mistake.
    cols_to_drop = ['CNTSCHID', 'CNT', 'STRATUM', 'OCOD1', 'OCOD2', 'student_persona', 'MATHEASE', 'ST290']
    train_enc = train_enc.drop(columns=cols_to_drop + [target_col], errors='ignore')
    test_enc = test_enc.drop(columns=cols_to_drop, errors='ignore')

    # Align columns
    common_cols = [c for c in train_enc.columns if c in test_enc.columns]
    X_train_out = sanitize_cols(train_enc[common_cols])
    X_test_out = sanitize_cols(test_enc[common_cols])

    return X_train_out, X_test_out

## 4. Data Preprocessing

In [None]:
print("Preparing datasets...")

y_train_target = y_train['MathScore'].copy()

# Remove leakage columns: every math_q* column except the first item's timing,
# plus the two aggregate math columns.
all_math_cols = [c for c in X_train.columns if c.startswith('math_q')]
cols_to_ban = [c for c in all_math_cols if c != 'math_q1_total_timing'] + ['average_math_question_score', 'last_attempted_math_q']

# .drop() already returns a new frame, so no extra .copy() of these large tables is needed.
X_train_clean = X_train.drop(columns=cols_to_ban, errors='ignore')
X_test_clean = X_test.drop(columns=cols_to_ban, errors='ignore')

print("Engineering features...")
X_train_clean = engineer_features(X_train_clean)
X_train_clean = engineer_psychometrics(X_train_clean)

X_test_clean = engineer_features(X_test_clean)
X_test_clean = engineer_psychometrics(X_test_clean)

# engineer_features fits its own KMeans on each frame it receives, so the
# persona ids it assigns to train and test are unrelated. They are later
# target-encoded on train and mapped onto test, which only makes sense when
# id k denotes the same cluster in both. Re-derive the personas from a single
# scaler + KMeans fitted on the training rows (same inputs and settings).
persona_cols = ['total_global_time', 'n_items_attempted', 'std_response_time']
persona_scaler = StandardScaler().fit(X_train_clean[persona_cols].fillna(0))
persona_kmeans = KMeans(n_clusters=7, random_state=42, n_init=10)
persona_kmeans.fit(persona_scaler.transform(X_train_clean[persona_cols].fillna(0)))
X_train_clean['student_persona'] = persona_kmeans.predict(
    persona_scaler.transform(X_train_clean[persona_cols].fillna(0))
).astype(int)
X_test_clean['student_persona'] = persona_kmeans.predict(
    persona_scaler.transform(X_test_clean[persona_cols].fillna(0))
).astype(int)

print("Converting categorical columns...")
obj_cols = X_train_clean.select_dtypes(include='object').columns
for col in obj_cols:
    X_train_clean[col] = X_train_clean[col].astype('category')
    if col in X_test_clean.columns:
        X_test_clean[col] = X_test_clean[col].astype('category')

X_train_clean['student_persona'] = X_train_clean['student_persona'].astype('category')
X_test_clean['student_persona'] = X_test_clean['student_persona'].astype('category')

print("Preprocessing complete!")

Preparing datasets...
Engineering features...
Converting categorical columns...
Preprocessing complete!


## 5. Optimize Classifier Parameters

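Hold out 20% of the training data, train the LightGBM and XGBoost classifiers per year on the remainder, and grid-search the ensemble weight and decision threshold that maximize F1 on the held-out set.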

In [None]:
print("Splitting data for optimization...")

# Stratify on zero vs. non-zero score so both parts keep the same share of zeros.
y_binary_split = y_train_target.gt(0).astype(int)
X_train_opt, X_val_opt, y_train_opt, y_val_opt = train_test_split(
    X_train_clean,
    y_train_target,
    test_size=0.2,
    stratify=y_binary_split,
    random_state=42,
)

print(f"Optimization split - Train: {len(X_train_opt)}, Val: {len(X_val_opt)}")

Splitting data for optimization...
Optimization split - Train: 937668, Val: 234418


In [None]:
print("Collecting validation predictions for optimization...")

years = [2015, 2018, 2022]

# Per-year hold-out labels and class-1 probabilities; concatenated after the loop.
all_y_binary_val, all_prob_lgb_val, all_prob_xgb_val = [], [], []

for year in years:
    print(f"\nProcessing year {year} for optimization...")

    train_mask = X_train_opt['Year'] == year
    val_mask = X_val_opt['Year'] == year

    # No hold-out rows for this year: nothing to score.
    if not val_mask.any():
        continue

    y_curr_train = y_train_opt[train_mask].copy()
    y_curr_val = y_val_opt[val_mask].copy()

    # Target encoding etc. is learned on this year's training rows only.
    X_curr_train, X_curr_val = process_year_data(
        X_train_opt[train_mask].copy(),
        y_curr_train,
        X_val_opt[val_mask].copy(),
    )

    # Binary task: is the score non-zero?
    y_binary_train = y_curr_train.gt(0).astype(int)
    y_binary_val = y_curr_val.gt(0).astype(int)

    clf_lgb = lgb.LGBMClassifier(
        n_estimators=400,
        learning_rate=0.03,
        random_state=42,
        verbose=-1,
    )
    clf_lgb.fit(X_curr_train, y_binary_train)
    prob_lgb = clf_lgb.predict_proba(X_curr_val)[:, 1]

    clf_xgb = xgb.XGBClassifier(
        n_estimators=400,
        learning_rate=0.03,
        enable_categorical=True,
        tree_method='hist',
        random_state=42,
    )
    clf_xgb.fit(X_curr_train, y_binary_train)
    prob_xgb = clf_xgb.predict_proba(X_curr_val)[:, 1]

    all_y_binary_val.append(y_binary_val)
    all_prob_lgb_val.append(prob_lgb)
    all_prob_xgb_val.append(prob_xgb)

final_y_binary_val = pd.concat(all_y_binary_val)
final_prob_lgb_val = np.concatenate(all_prob_lgb_val)
final_prob_xgb_val = np.concatenate(all_prob_xgb_val)

print(f"Total validation samples: {len(final_y_binary_val)}")

Collecting validation predictions for optimization...

Processing year 2015 for optimization...

Processing year 2018 for optimization...

Processing year 2022 for optimization...
Total validation samples: 234418


In [None]:
print("Optimizing classifier weights and threshold...")

best_f1 = -1
best_lgb_weight = 0.5
best_threshold = 0.5

# Grid: LightGBM weight 0.1 .. 0.9 (XGBoost gets the remainder),
# decision threshold 0.10 .. 0.95 in steps of 0.01.
thresholds = np.arange(0.1, 0.96, 0.01)

for lgb_weight in np.arange(0.1, 1.0, 0.1):
    xgb_weight = 1 - lgb_weight
    # The blended probability depends only on the weight, so it is computed
    # once per weight rather than once per (weight, threshold) pair.
    ensemble_probs = (lgb_weight * final_prob_lgb_val) + (xgb_weight * final_prob_xgb_val)

    for threshold in thresholds:
        binary_predictions = (ensemble_probs > threshold).astype(int)
        f1 = f1_score(final_y_binary_val, binary_predictions)

        # Strict '>' keeps the first combination that reaches the best score.
        if f1 > best_f1:
            best_f1 = f1
            best_lgb_weight = lgb_weight
            best_threshold = threshold

print(f"\nBest F1-score: {best_f1:.4f}")
print(f"Optimal LGBM Weight: {best_lgb_weight:.2f}")
print(f"Optimal Threshold: {best_threshold:.2f}")

# Used by the final training cell.
LGB_WEIGHT_CLASSIFIER = best_lgb_weight
BEST_THRESH = best_threshold

Optimizing classifier weights and threshold...

Best F1-score: 0.9903
Optimal LGBM Weight: 0.80
Optimal Threshold: 0.46


## 6. Train Final Models and Generate Predictions

In [None]:
# One prediction per test row; rows whose year is not handled below stay at 0.0.
final_submission = pd.DataFrame(index=X_test_clean.index)
final_submission['MathScore'] = 0.0

banner = '=' * 50

for year in years:
    print(f"\n{banner}")
    print(f"Training final models for year {year}")
    print(f"{banner}")

    train_mask = X_train_clean['Year'] == year
    test_mask = X_test_clean['Year'] == year

    # Nothing to predict for this year.
    if not test_mask.any():
        continue

    X_curr_train = X_train_clean[train_mask].copy()
    y_curr_train = y_train_target[train_mask].copy()
    X_curr_test = X_test_clean[test_mask].copy()

    print(f"Train: {len(X_curr_train)}, Test: {len(X_curr_test)}")

    # Target encoding, interactions and column alignment for this year.
    X_curr_train, X_curr_test = process_year_data(X_curr_train, y_curr_train, X_curr_test)

    # ---- Stage 1: probability that the score is non-zero ----
    y_binary = y_curr_train.gt(0).astype(int)

    clf_lgb = lgb.LGBMClassifier(
        n_estimators=400,
        learning_rate=0.03,
        random_state=42,
        verbose=-1,
    )
    clf_lgb.fit(X_curr_train, y_binary)
    prob_lgb = clf_lgb.predict_proba(X_curr_test)[:, 1]

    clf_xgb = xgb.XGBClassifier(
        n_estimators=400,
        learning_rate=0.03,
        enable_categorical=True,
        tree_method='hist',
        random_state=42,
    )
    clf_xgb.fit(X_curr_train, y_binary)
    prob_xgb = clf_xgb.predict_proba(X_curr_test)[:, 1]

    # Blend with the weight chosen in the optimization section.
    prob_avg = LGB_WEIGHT_CLASSIFIER * prob_lgb + (1 - LGB_WEIGHT_CLASSIFIER) * prob_xgb

    # ---- Stage 2: regress the score, trained on non-zero rows only ----
    non_zero_mask = y_curr_train > 0
    X_reg = X_curr_train[non_zero_mask]
    y_reg = y_curr_train[non_zero_mask]

    reg_lgb = lgb.LGBMRegressor(
        n_estimators=1000,
        learning_rate=0.02,
        num_leaves=70,
        min_child_samples=10,
        random_state=42,
        verbose=-1,
    )
    reg_lgb.fit(X_reg, y_reg)
    pred_lgb = reg_lgb.predict(X_curr_test)

    reg_xgb = xgb.XGBRegressor(
        n_estimators=1000,
        learning_rate=0.02,
        max_depth=9,
        enable_categorical=True,
        tree_method='hist',
        random_state=42,
    )
    reg_xgb.fit(X_reg, y_reg)
    pred_xgb = reg_xgb.predict(X_curr_test)

    # Fixed 20/80 LightGBM/XGBoost blend of the two regressors.
    final_reg_pred = 0.2 * pred_lgb + 0.8 * pred_xgb

    # Keep the regressed score only where stage 1 predicts a non-zero score.
    final_pred = np.where(prob_avg > BEST_THRESH, final_reg_pred, 0.0)

    final_submission.loc[test_mask, 'MathScore'] = final_pred
    print(f"Completed year {year}")


Training final models for year 2015
Train: 348119, Test: 174102
Completed year 2015

Training final models for year 2018
Train: 411320, Test: 205889
Completed year 2018

Training final models for year 2022
Train: 412647, Test: 206053
Completed year 2022


## 7. Save Submission

In [None]:
print("\nSaving submission.csv...")

# Pair X_test's 'Unnamed: 0' column (written out as ID) with the predicted
# score; the two Series are matched on their index.
submission_df = pd.DataFrame({
    'ID': X_test['Unnamed: 0'],
    'MathScore': final_submission['MathScore'],
})
submission_df.to_csv('submission.csv', index=False)
print("Done! Submission saved.")

# Quick sanity summary of what was written.
n_nonzero = (submission_df['MathScore'] > 0).sum()
print(f"\nTotal predictions: {len(submission_df)}")
print(f"Non-zero predictions: {n_nonzero}")
print("\nPreview:")
print(submission_df.head(10))


Saving submission.csv...
Done! Submission saved.

Total predictions: 586044
Non-zero predictions: 368263

Preview:
        ID   MathScore
0   412660  112.982534
1   554658   76.146082
2   937138    0.000000
3   752986  236.625236
4  1084508  167.005639
5   527030   60.592353
6   782794    0.000000
7   169543  185.386008
8  1697342  138.635419
9   724544    0.000000


## 8. Evaluate

In [None]:
# Score the predictions against the test labels.
# NOTE(review): the two columns are compared row by row — this assumes y_test is
# stored in the same row order as X_test; confirm against the data source.
y_test = pd.read_csv('/content/drive/MyDrive/Copy of y_test.csv')
r2 = r2_score(y_true=y_test['MathScore'], y_pred=final_submission['MathScore'])
print(f"R-squared score: {r2:.2f}")

R-squared score: 0.78
