In [None]:
# import libraries 

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
import joblib



In [13]:
# Read the raw session-level usage data from the CSV file.
# If the file is missing, fail here with an actionable message: printing and
# carrying on would leave `df` undefined and make the next cell die with an
# unrelated NameError.
DATA_PATH = 'data/xclan_fitness_usage_dataset.csv'
try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError as exc:
    raise FileNotFoundError(
        f"Dataset not found. Please make sure the file is located at '{DATA_PATH}'"
    ) from exc


In [None]:
# Build per-session running features for every user.
#
# Each engineered column summarises a user's history up to and including the
# current session:
#   session_frequency - sessions so far / days since the user's first session (inclusive)
#   time_spent        - running mean of session_duration_minutes
#   progress_logs     - running mean of completed_goal_checkin (0/1)

# Convert dates (a no-op on columns that are already datetime)
for date_col in ('session_date', 'signup_date', 'last_login_date'):
    df[date_col] = pd.to_datetime(df[date_col])

# Map check-ins to 0/1. Guarded so that re-running this cell does not re-map an
# already numeric column, which would silently turn every value into NaN.
if not pd.api.types.is_numeric_dtype(df['completed_goal_checkin']):
    df['completed_goal_checkin'] = df['completed_goal_checkin'].map({'Yes': 1, 'No': 0})

# Sort by user and session_date so that "rows so far" within a user means
# "sessions up to this date"; the fresh RangeIndex keeps index alignment simple.
df = df.sort_values(by=['user_id', 'session_date']).reset_index(drop=True)

# NOTE(review): assumes user_id and session_date contain no missing values;
# rows where they are missing may end up with NaN features - confirm against the dataset.
sessions_by_user = df.groupby('user_id')

# Days between the user's first session and the current one, counting both ends,
# so the denominator is always at least 1.
first_session_date = sessions_by_user['session_date'].transform('min')
days_active = (df['session_date'] - first_session_date).dt.days + 1
sessions_so_far = sessions_by_user.cumcount() + 1
session_frequency = sessions_so_far / days_active

# Running means per user. groupby().expanding() returns a (user_id, row) MultiIndex;
# dropping the user level restores the original row index for assignment.
running_means = {
    feature: (
        sessions_by_user[source_col]
        .expanding()
        .mean()
        .reset_index(level=0, drop=True)
    )
    for feature, source_col in (
        ('time_spent', 'session_duration_minutes'),
        ('progress_logs', 'completed_goal_checkin'),
    )
}

df['session_frequency'] = session_frequency
df['time_spent'] = running_means['time_spent']
df['progress_logs'] = running_means['progress_logs']

model_df = df[['user_id','session_frequency', 'time_spent', 'progress_logs', 'dropoff_flag']] # for training the model


In [None]:
# model code

# Prediction is made at the user level: dropout risk is estimated per user,
# not per session.

# Ensure dropoff_flag is binary. Guarded so that re-running this cell does not
# re-map an already numeric column, which would turn every label into NaN.
if not pd.api.types.is_numeric_dtype(df['dropoff_flag']):
    df['dropoff_flag'] = df['dropoff_flag'].map({'Yes': 1, 'No': 0})

# Aggregate user-level features: the mean of each per-session feature, and a
# label that is 1 if any of the user's sessions is flagged as a dropoff.
features_df = df.groupby('user_id')[['session_frequency', 'time_spent', 'progress_logs']].mean()
dropouts = df.groupby('user_id')['dropoff_flag'].max()
user_grouped_df = features_df.copy()
user_grouped_df['dropoff_flag'] = dropouts

# Features and Target
X = user_grouped_df[['session_frequency', 'time_spent', 'progress_logs']]
y = user_grouped_df['dropoff_flag']

# SMOTE sits inside the imblearn Pipeline so oversampling is applied only while
# fitting (on training folds), never to the data being scored.
pipeline = Pipeline([
    ('smote', SMOTE(random_state=42)),
    ('gb', GradientBoostingClassifier(random_state=42))
])

# Cross-validation using the full dataset (X, y); recall on the dropout class
# is the metric being tracked.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(pipeline, X, y, cv=skf, scoring='recall')

print(" Stratified CV Recall Scores (Fold by Fold):", cv_scores)
print(" Average Recall (dropout detection):", np.mean(cv_scores))

# Stratified train-test split to produce a confusion matrix and classification report.
# Note: `pipeline` is left fitted on the 70% training split only.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

print("\n Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\n Classification Report:")
print(classification_report(y_test, y_pred, digits=3))



 Stratified CV Recall Scores (Fold by Fold): [0.31818182 0.47826087 0.47826087 0.30434783 0.52173913]
 Average Recall (dropout detection): 0.42015810276679844

 Confusion Matrix:
[[189  77]
 [ 17  17]]

 Classification Report:
              precision    recall  f1-score   support

           0      0.917     0.711     0.801       266
           1      0.181     0.500     0.266        34

    accuracy                          0.687       300
   macro avg      0.549     0.605     0.533       300
weighted avg      0.834     0.687     0.740       300



In [16]:
# Persist the fitted pipeline to disk; the call's return value (the list of
# written file names) is left as the cell's displayed output.
joblib.dump(value=pipeline, filename='dropout_gb_model.pkl')


['dropout_gb_model.pkl']