# Model Training

In [1]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from pathlib import Path
import pandas as pd

In [2]:
PROCESSED_DATA_DIR = Path("data/processing_checkpoint")
checkpoint_file_path = PROCESSED_DATA_DIR / "03_10_day_window_sliced.parquet"

In [3]:
df_final = pd.read_parquet(checkpoint_file_path)

In [4]:
X = df_final.drop(columns=["label", "churn_day", "snapshot_day", "userId", "count_cancel", "count_cancellation_confirmation"])
y = df_final["label"]

In [5]:
numerical_features = X.select_dtypes(include=["number"]).columns.tolist()
categorical_features = X.select_dtypes(include=["object", "category"]).columns.tolist()

In [6]:
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
    ],
    remainder="drop"
)

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.model_selection import TimeSeriesSplit, cross_validate
from sklearn.metrics import f1_score, roc_auc_score, make_scorer
from sklearn.ensemble import HistGradientBoostingClassifier

In [8]:
rf_model = RandomForestClassifier(class_weight=None, random_state=42)

In [9]:
final_rf_pipeline_full_fe = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)), 
    ('classifier', rf_model)
])

In [10]:
tscv = TimeSeriesSplit(n_splits=5)
scoring = {'F1': make_scorer(f1_score), 'AUC': make_scorer(roc_auc_score)}

In [11]:
# lr_scores = cross_validate(
#     final_rf_pipeline_full_fe, 
#     X, 
#     y, 
#     scoring=scoring, 
#     cv=tscv, 
#     n_jobs=-1
# )

In [12]:
# print(f"Mean F1 Score (Validation): {lr_scores['test_F1'].mean():.4f}")
# print(f"Mean AUC Score (Validation): {lr_scores['test_AUC'].mean():.4f}")

In [13]:
import numpy as np

In [14]:
# final_rf_pipeline_full_fe.fit(X, y)
# ohe_names = final_rf_pipeline_full_fe['preprocessor'].named_transformers_['cat'].get_feature_names_out(categorical_features)
# numerical_names = final_rf_pipeline_full_fe['preprocessor'].named_transformers_['num'].get_feature_names_out(numerical_features)
# feature_names_final = np.concatenate([numerical_names, ohe_names])
# importances = final_rf_pipeline_full_fe['classifier'].feature_importances_
# feature_importance_df = pd.DataFrame({
#     'Feature': feature_names_final,
#     'Importance': importances
# }).sort_values('Importance', ascending=False)

# print("\n--- Feature Importance Ranking (for Pruning) ---")
# print(feature_importance_df)

In [None]:
final_numerical_features = [
    "count_help", "count_thumbs_down", "count_about", "count_total_sessions", "count_settings", 
    "count_logout", "count_save_settings", "count_roll_advert", "count_add_friend", 
    "count_add_to_playlist", "count_upgrade", "count_downgrade", "count_home", 
    "count_thumbs_up", "count_nextsong", "count_error",
    "thumbs_ratio", "ads_per_session", "frequency", "avg_songs_session", 
    "item_per_session", "errors_per_session", "num_unique_artists",
    "user_lifecycle_h", "ttl_length"
]

final_categorical_features = ["last_level"]

In [16]:
pruned_preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), final_numerical_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), final_categorical_features)
    ],
    remainder="drop" 
)

In [17]:
gbdt_model = HistGradientBoostingClassifier()

In [18]:
final_gbdt_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", gbdt_model)
])

In [19]:
gbdt_scores = cross_validate(
    final_gbdt_pipeline, 
    X, 
    y, 
    scoring=scoring, 
    cv=tscv, 
    n_jobs=-1
)

In [20]:
print(f"GBDT Mean F1 Score (Validation): {gbdt_scores['test_F1'].mean():.4f}")
print(f"GBDT Mean AUC Score (Validation): {gbdt_scores['test_AUC'].mean():.4f}")

GBDT Mean F1 Score (Validation): 0.0184
GBDT Mean AUC Score (Validation): 0.5044
