In [2]:
# 1. load libraries
from pathlib import Path

import joblib
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, confusion_matrix, roc_auc_score)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# 2. configuration
# NOTE: absolute, machine-specific path, kept verbatim; adjust when running elsewhere.
DATA_PATH = '/Users/tamanna/Documents/ML project /Task 1/data/processed/online_course_completion_clean.csv'
TARGET_COL = 'completed_course'
SAMPLE_SIZE = 20000    # rows used for faster training; can increase later for full model
TEST_SIZE = 0.20
RANDOM_STATE = 42
RF_MODEL_PATH = '../saved_models/random_forest_v1.joblib'
GB_MODEL_PATH = '../saved_models/gradient_boosting_v1.joblib'


def subsample(df: pd.DataFrame, n: int, random_state: int = RANDOM_STATE) -> pd.DataFrame:
    """Return a reproducible random sample of at most *n* rows of *df*.

    ``DataFrame.sample`` raises ``ValueError`` when asked for more rows than
    exist (without replacement), so the request is capped at ``len(df)``.
    """
    return df.sample(n=min(n, len(df)), random_state=random_state)


def split_feature_columns(X: pd.DataFrame):
    """Return ``(num_cols, cat_cols)`` that together cover every column of *X*.

    Numeric columns (any integer/float width) are scaled; everything else
    (object, category, bool, ...) is one-hot encoded. Selecting by exact
    dtype names such as ``int64``/``float64``/``object`` would leave other
    columns unlisted, and ``ColumnTransformer`` drops unlisted columns by
    default (``remainder='drop'``), silently losing features.
    """
    num_cols = X.select_dtypes(include='number').columns
    cat_cols = X.select_dtypes(exclude='number').columns
    return num_cols, cat_cols


def build_preprocessor(num_cols, cat_cols) -> "ColumnTransformer":
    """Build a new, unfitted scaler + one-hot-encoder column transformer."""
    return ColumnTransformer([
        ('num', StandardScaler(), num_cols),
        # categories unseen during fit are encoded as all-zero rows at predict time
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
    ])


def evaluate_classifier(pipe, X_test, y_test):
    """Score a fitted pipeline on the hold-out set.

    Returns ``(y_pred, y_proba, metrics)`` where *metrics* is a dict with
    keys ``accuracy``, ``precision``, ``recall``, ``f1``, ``roc_auc`` and
    ``confusion_matrix`` (as nested lists).
    """
    y_pred = pipe.predict(X_test)
    # column 1 = second class of the fitted classifier; assumes a binary target
    y_proba = pipe.predict_proba(X_test)[:, 1]
    metrics = {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'f1': f1_score(y_test, y_pred),
        'roc_auc': roc_auc_score(y_test, y_proba),
        'confusion_matrix': confusion_matrix(y_test, y_pred).tolist()
    }
    return y_pred, y_proba, metrics


def save_model(pipe, path: str, label: str) -> None:
    """Persist *pipe* to *path* with joblib, creating the directory if needed."""
    Path(path).parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(pipe, path)
    print(f"Saved {label} model to {path}")


# Everything below runs when this code is executed as a notebook cell or a
# script (names assigned here remain ordinary globals), but not on import.
if __name__ == "__main__":
    # 2a. load processed data and sample for faster training
    df = pd.read_csv(DATA_PATH)
    df = subsample(df, SAMPLE_SIZE)

    # 3. features & target
    X = df.drop(columns=[TARGET_COL])
    y = df[TARGET_COL]

    # 4. train/test split (stratified so both sets keep the class balance)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
    )

    # 5. identify numeric & categorical columns
    num_cols, cat_cols = split_feature_columns(X)

    # 6. preprocessing
    preprocessor = build_preprocessor(num_cols, cat_cols)

    # ----------------------------
    # 7a. Random Forest pipeline
    rf_pipe = Pipeline([
        ('preprocessor', preprocessor),
        ('clf', RandomForestClassifier(
            n_estimators=50,       # fewer trees for faster training
            max_depth=10,          # limit depth
            random_state=RANDOM_STATE,
            n_jobs=-1              # use all CPU cores
        ))
    ])
    rf_pipe.fit(X_train, y_train)

    y_pred_rf, y_proba_rf, metrics_rf = evaluate_classifier(rf_pipe, X_test, y_test)
    print("Random Forest metrics:", metrics_rf)
    save_model(rf_pipe, RF_MODEL_PATH, "Random Forest")

    # ----------------------------
    # 7b. Gradient Boosting pipeline
    gb_pipe = Pipeline([
        # separate transformer instance: Pipeline.fit fits its steps in place,
        # so sharing one object would refit the transformer inside rf_pipe
        ('preprocessor', build_preprocessor(num_cols, cat_cols)),
        ('clf', GradientBoostingClassifier(
            n_estimators=50,       # fewer trees
            max_depth=5,           # limit depth
            learning_rate=0.1,
            random_state=RANDOM_STATE
        ))
    ])
    gb_pipe.fit(X_train, y_train)

    y_pred_gb, y_proba_gb, metrics_gb = evaluate_classifier(gb_pipe, X_test, y_test)
    print("Gradient Boosting metrics:", metrics_gb)
    save_model(gb_pipe, GB_MODEL_PATH, "Gradient Boosting")

Random Forest metrics: {'accuracy': 0.85625, 'precision': 0.9907192575406032, 'recall': 0.4278557114228457, 'f1': 0.5976207137858642, 'roc_auc': 0.8786737365470447, 'confusion_matrix': [[2998, 4], [571, 427]]}
Saved Random Forest model to ../saved_models/random_forest_v1.joblib
Gradient Boosting metrics: {'accuracy': 0.8565, 'precision': 0.9711111111111111, 'recall': 0.437875751503006, 'f1': 0.6035911602209945, 'roc_auc': 0.8854307549142256, 'confusion_matrix': [[2989, 13], [561, 437]]}
Saved Gradient Boosting model to ../saved_models/gradient_boosting_v1.joblib
