In [None]:
# Libraries
import kagglehub
import os
import warnings
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import skew, kurtosis
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report

# Suppress every Python warning for the rest of the session.
warnings.filterwarnings(action="ignore")

# Fetch the dataset through kagglehub; `path` is used below as the folder
# that contains the CSV file.
DATASET_HANDLE = "nabihazahid/spotify-dataset-for-churn-analysis"
path = kagglehub.dataset_download(DATASET_HANDLE)

In [None]:
# Read the raw churn table from the downloaded dataset folder.
csv_file = f"{path}/spotify_churn_dataset.csv"
df_raw = pd.read_csv(filepath_or_buffer=csv_file)

In [None]:
# Group the column names by kind so later cells can refer to each group by name.

# Columns treated as numeric.
quantitative_cols = [
    "age",
    "listening_time",
    "songs_played_per_day",
    "skip_rate",
    "ads_listened_per_week",
]

# Columns treated as plain strings.
string_cols = ["user_id"]

# Columns treated as categories.
categorical_cols = [
    "gender",
    "country",
    "subscription_type",
    "device_type",
]

# Columns treated as booleans.
boolean_cols = ["offline_listening"]

# Label column, kept as a one-element list.
target = ["is_churned"]

In [None]:
# 1. Drop the columns that are not fed to the models.
# NOTE(review): why offline_listening is excluded is not stated here — confirm it is intentional.
excluded_cols = ["user_id", "offline_listening"]
df_model = df_raw.drop(excluded_cols, axis="columns")

In [None]:
# 2. One-hot encoding
# Every column in categorical_cols is replaced by indicator columns;
# drop_first=True keeps k-1 indicators for a k-level column.
# Note: df_model is rebound here, so re-running this cell on its own fails
# because the categorical columns have already been replaced.
df_model = pd.get_dummies(
    data=df_model,
    columns=categorical_cols,
    drop_first=True,
)

In [None]:
# 3. Standard scaler
# Standardize the numeric columns of df_model in place.
# NOTE(review): the scaler is fitted on every row, before the train/test split
# further down, so its mean/std include rows that later land in the test set.
scaler = StandardScaler()
scaled_quantitative = scaler.fit_transform(df_model[quantitative_cols])
df_model[quantitative_cols] = scaled_quantitative

In [None]:
# Target
y = df_model["is_churned"]

# Features
X = df_model.drop(columns=["is_churned"])

# Split train/test (80/20); stratify=y keeps the churn ratio equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Standardize the numeric features with statistics taken from the training rows
# only. Upstream, df_model was scaled with a StandardScaler fitted on every row,
# which lets test-set mean/std leak into the features. Standardization is
# unaffected by an earlier affine rescale of the same column, so refitting on
# X_train here yields exactly the train-only scaling (and it also works on
# unscaled input). A separate scaler object is used so `scaler` stays untouched.
X_train = X_train.copy()  # explicit copies: avoid writing into slices of X
X_test = X_test.copy()
train_scaler = StandardScaler().fit(X_train[quantitative_cols])
X_train[quantitative_cols] = train_scaler.transform(X_train[quantitative_cols])
X_test[quantitative_cols] = train_scaler.transform(X_test[quantitative_cols])

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)
print("Churn ratio train:", y_train.mean().round(3))
print("Churn ratio test:", y_test.mean().round(3))

In [None]:
# One metrics row (dict) per model is collected here; this cell starts the list.
results = []

# 1. Define and train the model.
# random_state is set because the liblinear solver uses it, and every other
# stochastic step in this notebook is seeded with 42 as well.
log_reg = LogisticRegression(
    max_iter=1000,
    solver="liblinear",
    class_weight="balanced",
    random_state=42,
)
log_reg.fit(X_train, y_train)

# 2. Predictions.
y_pred = log_reg.predict(X_test)
# Second column of predict_proba = probability of the second class label
# (presumably is_churned == 1 — confirm the label encoding).
y_proba = log_reg.predict_proba(X_test)[:, 1]

# 3. Metrics (rounded to 3 decimals; zero_division=0 yields 0 instead of a
# warning when a metric's denominator is empty).
accuracy  = round(accuracy_score(y_test, y_pred), 3)
precision = round(precision_score(y_test, y_pred, zero_division=0), 3)
recall    = round(recall_score(y_test, y_pred, zero_division=0), 3)
f1        = round(f1_score(y_test, y_pred, zero_division=0), 3)
roc_auc   = round(roc_auc_score(y_test, y_proba), 3)

results.append({
    "Model": "Logistic Regression",
    "Accuracy": accuracy,
    "Precision": precision,
    "Recall": recall,
    "F1-score": f1,
    "ROC-AUC": roc_auc
})

In [None]:
# 1. Define and train the model.
# max_depth=5 caps the depth of the tree; random_state=42 makes the fit repeatable.
tree_clf = DecisionTreeClassifier(
    max_depth=5,
    class_weight="balanced",
    random_state=42,
)
tree_clf.fit(X_train, y_train)

# 2. Predictions.
y_pred = tree_clf.predict(X_test)
# Second column of predict_proba = probability of the second class label
# (presumably is_churned == 1 — confirm the label encoding).
y_proba = tree_clf.predict_proba(X_test)[:, 1]

# 3. Metrics (rounded to 3 decimals; zero_division=0 yields 0 instead of a
# warning when a metric's denominator is empty).
accuracy  = round(accuracy_score(y_test, y_pred), 3)
precision = round(precision_score(y_test, y_pred, zero_division=0), 3)
recall    = round(recall_score(y_test, y_pred, zero_division=0), 3)
f1        = round(f1_score(y_test, y_pred, zero_division=0), 3)
roc_auc   = round(roc_auc_score(y_test, y_proba), 3)

tree_row = {
    "Model": "Decision Tree",
    "Accuracy": accuracy,
    "Precision": precision,
    "Recall": recall,
    "F1-score": f1,
    "ROC-AUC": roc_auc
}

# Replace any row left by a previous run of this cell instead of appending a
# duplicate, so the cell can be re-run safely. Slice assignment keeps the same
# list object that other cells append to.
results[:] = [row for row in results if row["Model"] != "Decision Tree"]
results.append(tree_row)