In [1]:
import pandas as pd
from pathlib import Path
import numpy as np

# Location of the intermediate parquet checkpoints (relative to the working directory).
DATA_DIR = Path("data/processing_checkpoint")
path_clean_dataset = DATA_DIR / "01_cleaned_train.parquet"

# Event-level log; later cells read its "time", "page", "userId" (and more) columns.
df_clean_churn = pd.read_parquet(path_clean_dataset)

In [2]:
# First and last event timestamps present in the log.
min_time, max_time = df_clean_churn["time"].min(), df_clean_churn["time"].max()

min_time, max_time

(Timestamp('2018-10-01 00:00:01'), Timestamp('2018-11-20 00:00:00'))

In [3]:
# Whole 24-hour periods elapsed since the first event (the first event is day 0).
df_clean_churn["day"] = df_clean_churn["time"].sub(min_time).dt.days

min_day, max_day = df_clean_churn["day"].min(), df_clean_churn["day"].max()

min_day, max_day

(0, 49)

In [4]:
# Churn events are the rows where the user reached the cancellation confirmation page.
df_churn_events = df_clean_churn.loc[
    df_clean_churn["page"].eq("Cancellation Confirmation")
]

# userId -> churn_day (float, so it can hold NaN once aligned against all users)
user_churn_day = df_churn_events.set_index("userId")["day"].astype(float)

user_churn_day.sample(5)

userId
1944261    19.0
1737216    36.0
1143104    18.0
1063929    24.0
1209949    18.0
Name: day, dtype: float64

In [5]:
# Snapshot cutoff days: start one window before the last observed day and
# step back one window at a time while the cutoff stays non-negative.
window_size = 10
T_values = [*range(max_day - window_size, -1, -window_size)]

T_values

[39, 29, 19, 9]

In [6]:
# Ad-hoc window for the latest cutoff, used for manual inspection below.
T = 39
df_window = df_clean_churn.loc[df_clean_churn["day"].le(T)].copy()

df_window.shape[0]

14480064

In [7]:
# print(df_window.head())

In [8]:
def _page_count_or_zero(df_users, column):
    """Return ``df_users[column]``, or an all-zero Series if the column is absent.

    Page-count columns only exist for pages that occur at least once in the
    window, so a short window can legitimately lack some of them.
    """
    if column in df_users.columns:
        return df_users[column]
    return pd.Series(0, index=df_users.index)


def build_user_features(df_window):
    """Aggregate an event log into one feature row per user.

    Parameters
    ----------
    df_window : pandas.DataFrame
        Event-level rows, already restricted by the caller to the observation
        window (callers pass ``day <= T``). Columns read here: ``userId``,
        ``gender``, ``registration``, ``operating_system``, ``browser``,
        ``artist``, ``song``, ``page``, ``sessionId``, ``time``, ``length``,
        ``itemInSession`` and ``level``. ``time`` and ``registration`` must be
        datetime-like because their differences are converted to hours.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``userId``. Contains the static user attributes, one
        ``count_<page>`` column per page value present in the window, and the
        derived activity / trend features built below. The input frame is not
        modified.
    """
    # Chronological order matters: drop_duplicates keeps each user's first row
    # and groupby(...).last() picks the most recent "level".
    df_window = df_window.sort_values(by="time", ascending=True)
    by_user = df_window.groupby("userId")

    # Window bounds are taken from the data itself (not from the cutoff day).
    window_start = df_window["time"].min()
    window_end = df_window["time"].max()
    current_time = window_end
    window_midpoint = window_start + (window_end - window_start) / 2

    # Row masks reused by several features below.
    is_song = df_window["page"] == "NextSong"
    is_early = df_window["time"] <= window_midpoint
    is_late = df_window["time"] > window_midpoint

    # Base DataFrame: static attributes from each user's earliest row
    df_users = (
        df_window[["userId", "gender", "registration", "operating_system", "browser"]]
        .drop_duplicates(subset=["userId"])
        .set_index("userId")
    )

    # Unique artists
    df_unique_artists = by_user["artist"].nunique().rename("num_unique_artists")
    df_users = df_users.join(df_unique_artists)

    # Page counts: one column per page value seen in this window
    df_page_counts = by_user["page"].value_counts().unstack(fill_value=0)
    df_page_counts.columns = [
        f"count_{col.replace(' ', '_').lower()}" for col in df_page_counts.columns
    ]
    df_users = df_users.join(df_page_counts)

    # Total session count
    df_session_count = by_user["sessionId"].nunique().rename("count_total_sessions")
    df_users = df_users.join(df_session_count)

    # Lifecycle (hours between registration and the user's last event)
    df_last_time = by_user["time"].max().rename("last_time")
    df_users = df_users.join(df_last_time)
    df_users["user_lifecycle_h"] = (
        (df_users["last_time"] - df_users["registration"]).dt.total_seconds() / 3600
    )

    # Total length
    df_length = by_user["length"].sum().rename("ttl_length")
    df_users = df_users.join(df_length)

    # Items per session
    # NOTE(review): this divides the user's single largest itemInSession by the
    # session count, which is not a per-session average -- confirm intent.
    df_item_per_session = (
        by_user["itemInSession"].max() / df_users["count_total_sessions"]
    ).rename("item_per_session")
    df_users = df_users.join(df_item_per_session)

    # Frequency (sessions per hour of user lifecycle); a zero-length lifecycle
    # yields inf, which is mapped to 0.
    df_users["frequency"] = (
        df_users["count_total_sessions"] / df_users["user_lifecycle_h"]
    )
    df_users["frequency"] = df_users["frequency"].replace(np.inf, 0)

    # Average songs per session
    if "count_nextsong" in df_users.columns:
        df_users["avg_songs_session"] = (
            df_users["count_nextsong"] / df_users["count_total_sessions"]
        )
    else:
        df_users["avg_songs_session"] = 0

    # Thumbs up share of all thumbs. Either count column may be missing when no
    # user pressed that button inside the window; treat a missing column as 0
    # instead of raising KeyError. Users with no thumbs at all get 0.
    thumbs_up = _page_count_or_zero(df_users, "count_thumbs_up")
    thumbs_down = _page_count_or_zero(df_users, "count_thumbs_down")
    df_users["thumbs_ratio"] = (
        (thumbs_up / (thumbs_down + thumbs_up)).fillna(0).replace(np.inf, 0)
    )

    # Errors per session
    if "count_error" in df_users.columns:
        df_users["errors_per_session"] = (
            df_users["count_error"] / df_users["count_total_sessions"]
        )
    else:
        df_users["errors_per_session"] = 0

    # Ads per session
    if "count_roll_advert" in df_users.columns:
        df_users["ads_per_session"] = (
            df_users["count_roll_advert"] / df_users["count_total_sessions"]
        )
    else:
        df_users["ads_per_session"] = 0

    # Last level (paid or free) -- relies on the chronological sort above
    df_last_level = by_user["level"].last().rename("last_level")
    df_users = df_users.join(df_last_level)

    # Hours between the user's last event and the last event in the window
    df_users["hours_since_last_session"] = (
        (current_time - df_users["last_time"]).dt.total_seconds() / 3600
    )

    # Active days ratio: distinct calendar days with activity / window length.
    # Normalizing to midnight and using the built-in nunique gives the same
    # count as comparing dates, without a per-group Python callback.
    activity_days = pd.DataFrame(
        {"userId": df_window["userId"], "day": df_window["time"].dt.normalize()}
    )
    df_active_days = (
        activity_days.groupby("userId")["day"].nunique().rename("active_days")
    )
    df_users = df_users.join(df_active_days)
    window_duration = (window_end - window_start).days + 1
    df_users["active_days_ratio"] = df_users["active_days"] / window_duration
    df_users["active_days_ratio"] = df_users["active_days_ratio"].fillna(0)

    # Spread of per-session listening time. Despite the column name this is the
    # standard deviation; users with a single session get 0 instead of NaN.
    session_totals = (
        df_window.groupby(["userId", "sessionId"])["length"].sum().reset_index()
    )
    df_session_variance = (
        session_totals.groupby("userId")["length"]
        .std()
        .rename("session_length_variance")
    )
    df_users = df_users.join(df_session_variance)
    df_users["session_length_variance"] = df_users["session_length_variance"].fillna(0)

    # New user: lifecycle shorter than 480 hours (20 days)
    df_users["is_new_user"] = (df_users["user_lifecycle_h"] < 480).astype(int)

    # Hours since last visit to the Downgrade page; 999 is the sentinel for "never"
    if "count_downgrade" in df_users.columns and df_users["count_downgrade"].sum() > 0:
        df_last_downgrade = (
            df_window[df_window["page"] == "Downgrade"]
            .groupby("userId")["time"]
            .max()
            .rename("last_downgrade_time")
        )
        df_users = df_users.join(df_last_downgrade)
        df_users["hours_since_downgrade"] = (
            (current_time - df_users["last_downgrade_time"]).dt.total_seconds() / 3600
        )
        df_users["hours_since_downgrade"] = df_users["hours_since_downgrade"].fillna(999)
        df_users = df_users.drop(columns=["last_downgrade_time"])
    else:
        df_users["hours_since_downgrade"] = 999

    # Unique songs ratio
    df_unique_songs = (
        df_window[is_song].groupby("userId")["song"].nunique().rename("num_unique_songs")
    )
    df_users = df_users.join(df_unique_songs)

    if "count_nextsong" in df_users.columns:
        df_users["unique_songs_ratio"] = (
            df_users["num_unique_songs"] / df_users["count_nextsong"]
        )
        df_users["unique_songs_ratio"] = df_users["unique_songs_ratio"].fillna(0)
        df_users["unique_songs_ratio"] = df_users["unique_songs_ratio"].replace(np.inf, 0)
    else:
        df_users["unique_songs_ratio"] = 0

    # Trend features: first half of the window ("early") vs second half ("late")
    early_by_user = df_window[is_early].groupby("userId")
    late_by_user = df_window[is_late].groupby("userId")

    # Activity level early and late
    df_users = df_users.join(early_by_user.size().rename("early_actions"))
    df_users = df_users.join(late_by_user.size().rename("late_actions"))
    df_users["early_actions"] = df_users["early_actions"].fillna(0)
    df_users["late_actions"] = df_users["late_actions"].fillna(0)

    # Engagement in window (+1 keeps the ratio finite when there is no early activity)
    df_users["within_window_activity_ratio"] = (
        df_users["late_actions"] / (df_users["early_actions"] + 1)
    )
    df_users["within_window_activity_change"] = (
        df_users["late_actions"] - df_users["early_actions"]
    )

    # Songs early vs late
    df_early_songs = (
        df_window[is_early & is_song].groupby("userId").size().rename("early_songs_played")
    )
    df_late_songs = (
        df_window[is_late & is_song].groupby("userId").size().rename("late_songs_played")
    )
    df_users = df_users.join(df_early_songs)
    df_users = df_users.join(df_late_songs)
    df_users["early_songs_played"] = df_users["early_songs_played"].fillna(0)
    df_users["late_songs_played"] = df_users["late_songs_played"].fillna(0)

    df_users["song_listening_change"] = (
        df_users["late_songs_played"] - df_users["early_songs_played"]
    )

    # Engagement in the last 3 days of the window
    window_last_3_days = window_end - pd.Timedelta(days=3)
    df_recent_actions = (
        df_window[df_window["time"] > window_last_3_days]
        .groupby("userId")
        .size()
        .rename("recent_actions_last_3d")
    )
    df_users = df_users.join(df_recent_actions)
    df_users["recent_actions_last_3d"] = df_users["recent_actions_last_3d"].fillna(0)

    # Share of each user's actions that fall in those 3 days (label-aligned)
    total_actions = by_user.size()
    df_users["recent_activity_ratio"] = (
        df_users["recent_actions_last_3d"]
        / (total_actions.reindex(df_users.index) + 1)
    )

    # Session depth: mean itemInSession over the user's early / late rows
    df_early_session_depth = (
        early_by_user["itemInSession"].mean().rename("early_avg_items_per_session")
    )
    df_late_session_depth = (
        late_by_user["itemInSession"].mean().rename("late_avg_items_per_session")
    )
    df_users = df_users.join(df_early_session_depth)
    df_users = df_users.join(df_late_session_depth)
    df_users["early_avg_items_per_session"] = df_users["early_avg_items_per_session"].fillna(0)
    df_users["late_avg_items_per_session"] = df_users["late_avg_items_per_session"].fillna(0)

    df_users["session_depth_change"] = (
        df_users["late_avg_items_per_session"] - df_users["early_avg_items_per_session"]
    )

    # Whether the last event in the window falls on a Saturday or Sunday
    # (a single value broadcast to every user).
    cutoff_weekday = current_time.weekday()
    df_users['cutoff_weekend'] = 1 if cutoff_weekday in [5, 6] else 0

    return df_users


In [9]:
# build_user_features(df_window)

In [10]:
# print(build_user_features(df_window).dtypes)

In [11]:
def add_label(df_users, user_churn_day, T, window_size=10):
    """Attach the churn label for a snapshot taken at day ``T``.

    Parameters
    ----------
    df_users : pandas.DataFrame
        User-level features indexed by userId. It is not modified; a labelled
        copy is returned.
    user_churn_day : pandas.Series
        Churn day per user, indexed by userId. Users absent from it are treated
        as never having churned. If a userId appears more than once, the
        earliest churn day is used.
    T : int
        Snapshot cutoff day.
    window_size : int, default 10
        Length of the prediction horizon in days.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_users`` restricted to users who had not churned by day
        ``T``, with two extra columns: ``churn_day`` (float, NaN if the user
        never churned) and ``label`` (1 if ``T < churn_day <= T + window_size``,
        else 0).
    """
    T_end_window = T + window_size

    churn_day = user_churn_day.astype("float")
    if not churn_day.index.is_unique:
        # Column assignment cannot align against duplicate labels; collapse to
        # one value per user first.
        churn_day = churn_day.groupby(level=0).min()

    # Work on a copy so the caller's feature frame is left untouched.
    df_labeled = df_users.copy()
    df_labeled["churn_day"] = churn_day

    churn_in_window_mask = (
        (df_labeled["churn_day"] > T) & (df_labeled["churn_day"] <= T_end_window)
    )
    df_labeled["label"] = churn_in_window_mask.astype(int)

    # Users who churned on or before T are no longer at risk at snapshot time.
    still_active_mask = df_labeled["churn_day"].isna() | (df_labeled["churn_day"] > T)

    return df_labeled[still_active_mask].copy()

In [12]:
# df_users_T39 = build_user_features(df_window)
# df_users_T39.head()
# df_users_T39.shape

In [13]:
# df_users_T39 = add_label(df_users_T39, user_churn_day, T=39)

In [14]:
# df_users_T39["snapshot_day"] = 39
# df_users_T39 = df_users_T39.reset_index()
# df_users_T39

In [15]:
# user_churn_day.dtype


In [16]:
# print(user_churn_day.head())
# print(user_churn_day.dtype)
# user_churn_day.index.is_unique



In [17]:
def build_training_dataset(df_clean_churn, user_churn_day, T_values, window_size=10):
    """Stack one labelled user-level snapshot per cutoff day.

    For every ``T`` in ``T_values`` the events with ``day <= T`` are turned
    into user features, labelled against the ``window_size``-day horizon after
    ``T``, tagged with ``snapshot_day = T`` and flattened so ``userId`` is a
    regular column. The snapshots are concatenated in the order given and
    returned with a fresh 0..n-1 index.
    """
    snapshots = []

    for T in T_values:
        print(f"Processing snapshot for T = {T} ...")

        # Events visible at the cutoff
        history = df_clean_churn[df_clean_churn["day"] <= T].copy()

        # Features, then labels
        snapshot = add_label(
            build_user_features(history),
            user_churn_day,
            T=T,
            window_size=window_size,
        )

        # Remember which cutoff this row belongs to and expose userId as a column
        snapshot["snapshot_day"] = T
        snapshots.append(snapshot.reset_index())

    return pd.concat(snapshots, axis=0).reset_index(drop=True)


In [18]:
# Pass the same window_size that generated T_values so the label horizon and
# the snapshot spacing cannot drift apart if the setting is changed above.
df_training = build_training_dataset(
    df_clean_churn, user_churn_day, T_values, window_size=window_size
)


Processing snapshot for T = 39 ...
Processing snapshot for T = 29 ...
Processing snapshot for T = 19 ...
Processing snapshot for T = 9 ...


In [19]:
# df_training.head

In [20]:
# df_training["snapshot_day"].value_counts()


In [21]:
# Class balance across all snapshots (1 = churned within the horizon after the cutoff).
df_training["label"].value_counts()


label
0    57344
1     3090
Name: count, dtype: int64

In [22]:
# df_training.head()

In [23]:
# Persist the stacked snapshots as the next pipeline checkpoint.
# NOTE(review): df_training still carries "churn_day" (written by add_label and
# never dropped), which reveals the label -- confirm it is excluded before modelling.
PROCESSED_DATA_DIR = Path("data/processing_checkpoint")
checkpoint_file_path = PROCESSED_DATA_DIR / "03_10_day_window_sliced.parquet"
df_training.to_parquet(checkpoint_file_path, index=False)

In [24]:
# df_training[df_training["label"]==1]

In [25]:
# df_training[df_training["userId"]==1099753]

In [26]:
# df_training[df_training["userId"]==1749042]

In [27]:
# df_training[df_training["label"]==0]

In [28]:
# df_training[df_training["userId"]==1563081]

In [29]:
# df_training.isna().sum(
# )

In [30]:
# df_training[df_training["churn_day"]==39]

In [31]:
# df_training[df_training["userId"]==1162900] 

In [32]:
# Full list of columns in the stacked training frame.
df_training.columns

Index(['userId', 'gender', 'registration', 'operating_system', 'browser',
       'num_unique_artists', 'count_about', 'count_add_friend',
       'count_add_to_playlist', 'count_cancel',
       'count_cancellation_confirmation', 'count_downgrade', 'count_error',
       'count_help', 'count_home', 'count_logout', 'count_nextsong',
       'count_roll_advert', 'count_save_settings', 'count_settings',
       'count_submit_downgrade', 'count_submit_upgrade', 'count_thumbs_down',
       'count_thumbs_up', 'count_upgrade', 'count_total_sessions', 'last_time',
       'user_lifecycle_h', 'ttl_length', 'item_per_session', 'frequency',
       'avg_songs_session', 'thumbs_ratio', 'errors_per_session',
       'ads_per_session', 'last_level', 'hours_since_last_session',
       'active_days', 'active_days_ratio', 'session_length_variance',
       'is_new_user', 'hours_since_downgrade', 'num_unique_songs',
       'unique_songs_ratio', 'early_actions', 'late_actions',
       'within_window_activity_rati

In [33]:
# Every numeric-dtype column; note that this includes the identifier "userId".
numerical_features = df_training.select_dtypes(include=np.number).columns.tolist()
print(numerical_features)

['userId', 'num_unique_artists', 'count_about', 'count_add_friend', 'count_add_to_playlist', 'count_cancel', 'count_cancellation_confirmation', 'count_downgrade', 'count_error', 'count_help', 'count_home', 'count_logout', 'count_nextsong', 'count_roll_advert', 'count_save_settings', 'count_settings', 'count_submit_downgrade', 'count_submit_upgrade', 'count_thumbs_down', 'count_thumbs_up', 'count_upgrade', 'count_total_sessions', 'user_lifecycle_h', 'ttl_length', 'item_per_session', 'frequency', 'avg_songs_session', 'thumbs_ratio', 'errors_per_session', 'ads_per_session', 'hours_since_last_session', 'active_days', 'active_days_ratio', 'session_length_variance', 'is_new_user', 'hours_since_downgrade', 'num_unique_songs', 'unique_songs_ratio', 'early_actions', 'late_actions', 'within_window_activity_ratio', 'within_window_activity_change', 'early_songs_played', 'late_songs_played', 'song_listening_change', 'recent_actions_last_3d', 'recent_activity_ratio', 'early_avg_items_per_session', '

In [34]:
# Missing-value count per column.
df_training.isna().sum()

userId                                 0
gender                                 0
registration                           0
operating_system                       0
browser                                0
num_unique_artists                     0
count_about                            0
count_add_friend                       0
count_add_to_playlist                  0
count_cancel                           0
count_cancellation_confirmation        0
count_downgrade                        0
count_error                            0
count_help                             0
count_home                             0
count_logout                           0
count_nextsong                         0
count_roll_advert                      0
count_save_settings                    0
count_settings                         0
count_submit_downgrade                 0
count_submit_upgrade                   0
count_thumbs_down                      0
count_thumbs_up                        0
count_upgrade   