In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [4]:
features_df = pd.read_csv('..\data\processed\Dataset_engineered.csv')

  features_df = pd.read_csv('..\data\processed\Dataset_engineered.csv')


In [5]:
features_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125000 entries, 0 to 124999
Data columns (total 35 columns):
 #   Column                              Non-Null Count   Dtype  
---  ------                              --------------   -----  
 0   age                                 125000 non-null  float64
 1   num_subscription_pauses             125000 non-null  int64  
 2   weekly_hours                        125000 non-null  float64
 3   average_session_length              125000 non-null  float64
 4   song_skip_rate                      125000 non-null  float64
 5   weekly_songs_played                 125000 non-null  int64  
 6   weekly_unique_songs                 125000 non-null  int64  
 7   num_favorite_artists                125000 non-null  int64  
 8   num_platform_friends                125000 non-null  int64  
 9   num_playlists_created               125000 non-null  int64  
 10  num_shared_playlists                125000 non-null  int64  
 11  notifications_clicked     

In [6]:
X = features_df.drop(columns=['churned'])
y = features_df['churned']

In [7]:
imputer = SimpleImputer(strategy='mean')
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [12]:
def tune_and_evaluate(model, param_grid, X_train, y_train, X_test, y_test, model_name):
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} Best Parameters: {grid_search.best_params_}")
    print(f"{model_name} Accuracy: {accuracy:.4f}")
    print(f"{model_name} Classification Report:\n{classification_report(y_test, y_pred)}")

    if hasattr(best_model, 'predict_proba'):
        y_pred_proba = best_model.predict_proba(X_test)
        if y_pred_proba.shape[1] > 2:  # Multi-class case
            roc_auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr')
        else:  # Binary case
            roc_auc = roc_auc_score(y_test, y_pred_proba[:, 1] if y_pred_proba.shape[1] > 1 else y_pred_proba.ravel())
        print(f"{model_name} ROC-AUC Score: {roc_auc:.4f}")

    print("-" * 50)
    return accuracy



In [13]:
logistic_params = {'C': [0.01, 0.1, 1, 10], 'solver': ['liblinear', 'lbfgs']}
logistic_acc = tune_and_evaluate(LogisticRegression(max_iter=500), logistic_params, X_train_scaled, y_train, X_test_scaled, y_test, "Logistic Regression")


Fitting 5 folds for each of 8 candidates, totalling 40 fits
Logistic Regression Best Parameters: {'C': 10, 'solver': 'lbfgs'}
Logistic Regression Accuracy: 0.7930
Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.79      0.79     12163
           1       0.80      0.80      0.80     12837

    accuracy                           0.79     25000
   macro avg       0.79      0.79      0.79     25000
weighted avg       0.79      0.79      0.79     25000

Logistic Regression ROC-AUC Score: 0.8805
--------------------------------------------------


In [14]:
dt_params = {'max_depth': [5, 10, 20, None], 'min_samples_split': [2, 5, 10]}
dt_acc = tune_and_evaluate(DecisionTreeClassifier(), dt_params, X_train, y_train, X_test, y_test, "Decision Tree")

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Decision Tree Best Parameters: {'max_depth': 10, 'min_samples_split': 2}
Decision Tree Accuracy: 0.8227
Decision Tree Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.82      0.82     12163
           1       0.83      0.83      0.83     12837

    accuracy                           0.82     25000
   macro avg       0.82      0.82      0.82     25000
weighted avg       0.82      0.82      0.82     25000

Decision Tree ROC-AUC Score: 0.9064
--------------------------------------------------


In [15]:
rf_params = {'n_estimators': [50, 100, 200], 'max_depth': [10, 20, None]}
rf_acc = tune_and_evaluate(RandomForestClassifier(), rf_params, X_train, y_train, X_test, y_test, "Random Forest")


Fitting 5 folds for each of 9 candidates, totalling 45 fits
Random Forest Best Parameters: {'max_depth': None, 'n_estimators': 200}
Random Forest Accuracy: 0.8278
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.82      0.82     12163
           1       0.83      0.84      0.83     12837

    accuracy                           0.83     25000
   macro avg       0.83      0.83      0.83     25000
weighted avg       0.83      0.83      0.83     25000

Random Forest ROC-AUC Score: 0.9149
--------------------------------------------------


In [16]:
print("\nModel Accuracy Comparison:")
print(f"Logistic Regression: {logistic_acc:.4f}")
print(f"Decision Tree: {dt_acc:.4f}")
print(f"Random Forest: {rf_acc:.4f}")


Model Accuracy Comparison:
Logistic Regression: 0.7930
Decision Tree: 0.8227
Random Forest: 0.8278


Step 1: Train the Final Random Forest Model (on full data)

In [17]:
from sklearn.ensemble import RandomForestClassifier

# Create and train model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)


Step 2: Evaluate the Model on Test Data

In [18]:
from sklearn.metrics import classification_report, accuracy_score

y_pred = rf_model.predict(X_test)

# Print evaluation metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.82644
              precision    recall  f1-score   support

           0       0.82      0.82      0.82     12163
           1       0.83      0.83      0.83     12837

    accuracy                           0.83     25000
   macro avg       0.83      0.83      0.83     25000
weighted avg       0.83      0.83      0.83     25000



👉 This checks how well the model performs on new, unseen data

Step 3: Save the Trained Model (Serialize)

In [20]:
import os

# Create 'models' directory if it doesn't exist
os.makedirs('models', exist_ok=True)

import joblib

# Save model to file
joblib.dump(rf_model, 'models/churn_model_v1.pkl')


['models/churn_model_v1.pkl']

👉 This converts your model into a file so it can be used later for prediction.

✅ PART 2: Prepare the Model for Deployment (API)
Step 4: Create a predict.py file in src/models/

In [21]:
import joblib
import pandas as pd

# Load model
model = joblib.load('models/churn_model_v1.pkl')

def predict_churn(input_data):
    df = pd.DataFrame([input_data])  # Convert input to DataFrame
    prediction = model.predict(df)
    return prediction[0]


👉 This will be used to get predictions from your saved model.

✅ PART 3: Create an API for Your Model
Step 5: Set Up a Simple Flask API (app/api.py)

In [24]:
import joblib
import numpy as np

# Load the saved model
model = joblib.load('models/churn_model_v1.pkl')

def predict_churn(input_data):
    """
    Predicts churn from input data.
    input_data: List or 2D array of input features (excluding churn column)
    Returns: Prediction (0 or 1)
    """
    prediction = model.predict(np.array(input_data).reshape(1, -1))
    return int(prediction[0])


In [26]:
import joblib
import numpy as np

# Load the model from file
model = joblib.load('models/churn_model_v1.pkl')

def predict_churn(input_data):
    """
    Predicts churn from input data.
    input_data: List or 2D array of input features (excluding churn column)
    Returns: Prediction (0 or 1)
    """
    input_array = np.array(input_data).reshape(1, -1)
    prediction = model.predict(input_array)
    return int(prediction[0])
