In [1]:
import pandas as pd
from pathlib import Path
import numpy as np

DATA_DIR = Path("data/processing_checkpoint")
path_clean_dataset = DATA_DIR / "01_cleaned_train.parquet"

df_clean_churn = pd.read_parquet(path_clean_dataset)
df_clean_churn.dtypes


gender                     category
level                      category
userId                        int32
page                       category
sessionId                     int64
itemInSession                 int64
length                      float64
song                       category
artist                     category
time                 datetime64[us]
registration         datetime64[us]
metropolitan_area          category
region                     category
operating_system           category
browser                    category
dtype: object

In [2]:
df_clean_churn.sample(5)

Unnamed: 0,gender,level,userId,page,sessionId,itemInSession,length,song,artist,time,registration,metropolitan_area,region,operating_system,browser
10962127,F,free,1110476,Home,2492,62,0.0,No song,No artist,2018-10-05 14:04:09,2018-06-05 22:59:17,Chicago-Naperville-Elgin,IL-IN-WI,iPad,Safari
11328629,F,free,1688692,Home,113063,12,0.0,No song,No artist,2018-10-24 00:58:40,2018-09-14 11:12:01,Minneapolis-St. Paul-Bloomington,MN-WI,Macintosh,Safari
4245264,F,free,1494675,NextSong,123925,19,205.03465,The R,Eric B. & Rakim,2018-10-27 20:37:19,2018-08-24 17:58:38,Chicago-Naperville-Elgin,IL-IN-WI,Macintosh,Firefox
8347610,M,paid,1383233,NextSong,164651,12,294.1122,Fix You,Coldplay,2018-11-11 03:22:15,2018-07-14 13:44:44,Dumas,TX,Windows,Chrome
1653312,M,paid,1206824,NextSong,149439,1,191.39873,Hey Scenesters!,The Cribs,2018-11-02 04:20:38,2018-09-16 04:01:50,Atlanta-Sandy Springs-Roswell,GA,Macintosh,Chrome


In [3]:
min_time = df_clean_churn["time"].min()
max_time = df_clean_churn["time"].max()

min_time, max_time

(Timestamp('2018-10-01 00:00:01'), Timestamp('2018-11-20 00:00:00'))

In [4]:
df_clean_churn["day"] = (df_clean_churn["time"] - min_time).dt.days

min_day = df_clean_churn["day"].min()
max_day = df_clean_churn["day"].max()

min_day, max_day

(0, 49)

In [5]:
# find user churn events
df_churn_events = df_clean_churn[
    df_clean_churn["page"] == "Cancellation Confirmation"
]

# userId → churn_day
user_churn_day = df_churn_events.set_index("userId")["day"].astype("float")

user_churn_day.sample(5)

userId
1870948    20.0
1703593     1.0
1749809    21.0
1292942    30.0
1345947    35.0
Name: day, dtype: float64

In [6]:
window_size = 10
T_values = list(range(max_day - window_size, -1, -window_size))

T_values

[39, 29, 19, 9]

In [7]:
T = 39
df_window = df_clean_churn[df_clean_churn["day"] <= T].copy()


len(df_window)

14480064

In [8]:
print(df_window.head())

  gender level   userId      page  sessionId  itemInSession     length  \
0      M  paid  1749042  NextSong      22683            278  524.32934   
1      M  paid  1749042  NextSong      22683            279  178.02404   
2      M  paid  1749042  NextSong      22683            280  232.61995   
3      M  paid  1749042  NextSong      22683            281  265.50812   
4      M  paid  1749042  NextSong      22683            282  471.69261   

                                     song                 artist  \
0  Ich mache einen Spiegel - Dream Part 4              Popol Vuh   
1                 Monster (Album Version)                Skillet   
2                       Seven Nation Army      The White Stripes   
3        Under The Bridge (Album Version)  Red Hot Chili Peppers   
4                            Circlesong 6         Bobby McFerrin   

                 time        registration            metropolitan_area region  \
0 2018-10-01 00:00:01 2018-08-08 13:22:21  Dallas-Fort Worth-Arli

In [16]:
def build_user_features(df_window):
    # Input: raw dataset where day <= cutoff day T
    # Output: user level features aggregated to day T

    # Base DataFrame
    df_users = (
        df_window[["userId", "gender", "registration", "operating_system", "browser"]]
        .drop_duplicates(subset=["userId"])
        .set_index("userId")
    )

    # Unique artist
    df_unique_artists = (
        df_window.groupby("userId")["artist"]
        .nunique()
        .rename("num_unique_artists")
    )
    df_users = df_users.join(df_unique_artists)

    # Page counts
    df_page_counts = (
        df_window.groupby("userId")["page"]
        .value_counts()
        .unstack(fill_value=0)
    )
    df_page_counts.columns = [
        f"count_{col.replace(' ', '_').lower()}" for col in df_page_counts.columns
    ]

    df_users = df_users.join(df_page_counts)


    # Total session count
    df_session_count = (
        df_window.groupby("userId")["sessionId"]
        .nunique()
        .rename("count_total_sessions")
    )
    df_users = df_users.join(df_session_count)

    # Lifecycle (hours)
    df_last_time = (
        df_window.groupby("userId")["time"]
        .max()
        .rename("last_time")
    )
    df_users = df_users.join(df_last_time)

    df_users["user_lifecycle_h"] = (
        (df_users["last_time"] - df_users["registration"]).dt.total_seconds() / 3600
    )

    # Total length
    df_length = (
        df_window.groupby("userId")["length"]
        .sum()
        .rename("ttl_length")
    )
    df_users = df_users.join(df_length)

    # Items per session
    df_item_per_session = (
        df_window.groupby("userId")["itemInSession"].max()
        / df_users["count_total_sessions"]
    )
    df_item_per_session = df_item_per_session.rename("item_per_session")
    df_users = df_users.join(df_item_per_session)

    # Frequency (sessions per user life cycle in hours

    df_users["frequency"] = (
        df_users["count_total_sessions"] / df_users["user_lifecycle_h"]
    )

    # avg songs per session
    if "count_nextsong" in df_users.columns:
        df_users["avg_songs_session"] = (
            df_users["count_nextsong"] / df_users["count_total_sessions"]
        )
    else:
        df_users["avg_songs_session"] = 0

    # Thumbs_up/down ratios
    df_users["thumbs_ratio"] = df_users["count_thumbs_up"] / (df_users["count_thumbs_down"] + df_users["count_thumbs_up"])
    df_users["thumbs_ratio"] = df_users["thumbs_ratio"].replace(np.inf, 0)

    # Errors per session
    if "count_error" in df_users.columns:
        df_users["errors_per_session"] = (
            df_users["count_error"] / df_users["count_total_sessions"]
        )
    else:
        df_users["errors_per_session"] = 0

    # Ads per session
    if "count_roll_advert" in df_users.columns:
        df_users["ads_per_session"] = (
            df_users["count_roll_advert"] / df_users["count_total_sessions"]
        )
    else:
        df_users["ads_per_session"] = 0

    return df_users


In [20]:
build_user_features(df_window)

Unnamed: 0_level_0,gender,registration,operating_system,browser,num_unique_artists,count_about,count_add_friend,count_add_to_playlist,count_cancel,count_cancellation_confirmation,...,count_total_sessions,last_time,user_lifecycle_h,ttl_length,item_per_session,frequency,avg_songs_session,thumbs_ratio,errors_per_session,ads_per_session
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1749042,M,2018-08-08 13:22:21,Windows,Chrome,797,5,18,33,1,1,...,9,2018-10-21 01:16:24,1763.900833,256456.28661,51.444444,0.005102,113.777778,0.850000,0.000000,0.000000
1563081,F,2018-09-21 03:25:18,Macintosh,Chrome,437,1,5,14,0,0,...,6,2018-11-03 02:00:13,1030.581944,123645.65861,29.666667,0.005822,82.833333,0.833333,0.000000,0.000000
1697168,F,2018-09-08 13:48:25,Macintosh,Firefox,683,1,9,21,0,0,...,27,2018-11-09 13:10:30,1487.368056,213208.87262,5.555556,0.018153,31.962963,0.866667,0.037037,2.185185
1222580,M,2018-08-16 02:31:00,Macintosh,Safari,1252,8,35,50,1,1,...,20,2018-10-30 23:17:30,1820.775000,452234.08024,25.650000,0.010984,89.650000,0.833333,0.150000,0.850000
1714398,F,2018-09-19 18:23:35,Windows,Chrome,959,0,20,46,0,0,...,10,2018-11-03 00:29:30,1062.098611,321970.48842,34.800000,0.009415,128.100000,0.780488,0.100000,0.300000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1418744,F,2018-10-22 13:27:55,Macintosh,Safari,87,0,3,5,0,0,...,1,2018-11-09 07:44:45,426.280556,22066.22015,130.000000,0.002346,87.000000,0.857143,0.000000,5.000000
1934047,M,2018-08-31 04:28:43,Macintosh,Chrome,271,0,3,11,0,0,...,1,2018-11-09 23:57:32,1699.480278,73805.63500,353.000000,0.000588,290.000000,0.791667,2.000000,1.000000
1205281,F,2018-09-26 17:21:05,iPad,Safari,88,0,0,5,0,0,...,2,2018-11-09 18:08:22,1056.788056,21789.76404,48.500000,0.001893,46.000000,1.000000,0.000000,5.000000
1266866,F,2018-09-14 22:37:16,Windows,Firefox,3,0,0,0,0,0,...,1,2018-11-09 11:27:54,1332.843889,382.48399,3.000000,0.000750,2.000000,1.000000,0.000000,0.000000


In [23]:
print(build_user_features(df_window).dtypes)

gender                                   category
registration                       datetime64[us]
operating_system                         category
browser                                  category
num_unique_artists                          int64
count_about                                 int64
count_add_friend                            int64
count_add_to_playlist                       int64
count_cancel                                int64
count_cancellation_confirmation             int64
count_downgrade                             int64
count_error                                 int64
count_help                                  int64
count_home                                  int64
count_logout                                int64
count_nextsong                              int64
count_roll_advert                           int64
count_save_settings                         int64
count_settings                              int64
count_submit_downgrade                      int64


In [38]:
def add_label(df_users, user_churn_day, T, window_size=10):

    df_users["churn_day"] = user_churn_day.astype("float")

    df_users["label"] = 0

    df_users.loc[df_users["churn_day"] <= T + window_size, "label"] = 1
    df_users.loc[df_users["churn_day"] <= T, "label"] = 1

    df_users["label"] = df_users["label"].fillna(0).astype(int)

    return df_users

In [31]:
df_users_T39 = build_user_features(df_window)
df_users_T39.head()
df_users_T39.shape

(18880, 34)

In [40]:
df_users_T39 = add_label(df_users_T39, user_churn_day, T=39)

In [41]:
df_users_T39["snapshot_day"] = 39
df_users_T39 = df_users_T39.reset_index()
df_users_T39

Unnamed: 0,userId,gender,registration,operating_system,browser,num_unique_artists,count_about,count_add_friend,count_add_to_playlist,count_cancel,...,ttl_length,item_per_session,frequency,avg_songs_session,thumbs_ratio,errors_per_session,ads_per_session,churn_day,label,snapshot_day
0,1749042,M,2018-08-08 13:22:21,Windows,Chrome,797,5,18,33,1,...,256456.28661,51.444444,0.005102,113.777778,0.850000,0.000000,0.000000,20.0,1,39
1,1563081,F,2018-09-21 03:25:18,Macintosh,Chrome,437,1,5,14,0,...,123645.65861,29.666667,0.005822,82.833333,0.833333,0.000000,0.000000,,0,39
2,1697168,F,2018-09-08 13:48:25,Macintosh,Firefox,683,1,9,21,0,...,213208.87262,5.555556,0.018153,31.962963,0.866667,0.037037,2.185185,,0,39
3,1222580,M,2018-08-16 02:31:00,Macintosh,Safari,1252,8,35,50,1,...,452234.08024,25.650000,0.010984,89.650000,0.833333,0.150000,0.850000,29.0,1,39
4,1714398,F,2018-09-19 18:23:35,Windows,Chrome,959,0,20,46,0,...,321970.48842,34.800000,0.009415,128.100000,0.780488,0.100000,0.300000,,0,39
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18875,1418744,F,2018-10-22 13:27:55,Macintosh,Safari,87,0,3,5,0,...,22066.22015,130.000000,0.002346,87.000000,0.857143,0.000000,5.000000,,0,39
18876,1934047,M,2018-08-31 04:28:43,Macintosh,Chrome,271,0,3,11,0,...,73805.63500,353.000000,0.000588,290.000000,0.791667,2.000000,1.000000,,0,39
18877,1205281,F,2018-09-26 17:21:05,iPad,Safari,88,0,0,5,0,...,21789.76404,48.500000,0.001893,46.000000,1.000000,0.000000,5.000000,,0,39
18878,1266866,F,2018-09-14 22:37:16,Windows,Firefox,3,0,0,0,0,...,382.48399,3.000000,0.000750,2.000000,1.000000,0.000000,0.000000,,0,39


In [42]:
user_churn_day.dtype


dtype('float64')

In [45]:
print(user_churn_day.head())
print(user_churn_day.dtype)
user_churn_day.index.is_unique



userId
1749042    20.0
1222580    29.0
1385500    47.0
1032628     0.0
1009070     2.0
Name: day, dtype: float64
float64


True

In [47]:
def build_training_dataset(df_clean_churn, user_churn_day, T_values, window_size=10):

    all_snapshots = []

    for T in T_values:
        print(f"Processing snapshot for T = {T} ...")

        # filter window
        df_window = df_clean_churn[df_clean_churn["day"] <= T].copy()

        # build features
        df_features = build_user_features(df_window)

        # add labels
        df_labeled = add_label(df_features, user_churn_day, T=T, window_size=window_size)

        # add snapshot day column
        df_labeled["snapshot_day"] = T

        # reset index so userId becomes a column
        df_labeled = df_labeled.reset_index()

        # keep the snapshot
        all_snapshots.append(df_labeled)

    df_final = pd.concat(all_snapshots, axis = 0).reset_index(drop=True)

    return df_final


In [48]:
df_training = build_training_dataset(df_clean_churn, user_churn_day, T_values)


Processing snapshot for T = 39 ...
Processing snapshot for T = 29 ...
Processing snapshot for T = 19 ...
Processing snapshot for T = 9 ...


In [50]:
df_training.head

<bound method NDFrame.head of         userId gender        registration operating_system  browser  \
0      1749042      M 2018-08-08 13:22:21          Windows   Chrome   
1      1563081      F 2018-09-21 03:25:18        Macintosh   Chrome   
2      1697168      F 2018-09-08 13:48:25        Macintosh  Firefox   
3      1222580      M 2018-08-16 02:31:00        Macintosh   Safari   
4      1714398      F 2018-09-19 18:23:35          Windows   Chrome   
...        ...    ...                 ...              ...      ...   
69878  1208557      F 2018-08-02 15:05:09          Windows   Chrome   
69879  1896647      F 2018-08-09 12:48:50          Windows   Chrome   
69880  1494890      F 2018-09-14 03:15:14          Windows   Chrome   
69881  1729584      M 2018-09-14 18:30:17        Macintosh   Safari   
69882  1878214      M 2018-08-04 03:43:33        Macintosh   Chrome   

       num_unique_artists  count_about  count_add_friend  \
0                     797            5                18 

In [51]:
df_training["snapshot_day"].value_counts()


snapshot_day
39    18880
29    18333
19    17416
9     15254
Name: count, dtype: int64

In [52]:
df_training["label"].value_counts()


label
0    57344
1    12539
Name: count, dtype: int64

In [53]:
df_training.head()

Unnamed: 0,userId,gender,registration,operating_system,browser,num_unique_artists,count_about,count_add_friend,count_add_to_playlist,count_cancel,...,ttl_length,item_per_session,frequency,avg_songs_session,thumbs_ratio,errors_per_session,ads_per_session,churn_day,label,snapshot_day
0,1749042,M,2018-08-08 13:22:21,Windows,Chrome,797,5,18,33,1,...,256456.28661,51.444444,0.005102,113.777778,0.85,0.0,0.0,20.0,1,39
1,1563081,F,2018-09-21 03:25:18,Macintosh,Chrome,437,1,5,14,0,...,123645.65861,29.666667,0.005822,82.833333,0.833333,0.0,0.0,,0,39
2,1697168,F,2018-09-08 13:48:25,Macintosh,Firefox,683,1,9,21,0,...,213208.87262,5.555556,0.018153,31.962963,0.866667,0.037037,2.185185,,0,39
3,1222580,M,2018-08-16 02:31:00,Macintosh,Safari,1252,8,35,50,1,...,452234.08024,25.65,0.010984,89.65,0.833333,0.15,0.85,29.0,1,39
4,1714398,F,2018-09-19 18:23:35,Windows,Chrome,959,0,20,46,0,...,321970.48842,34.8,0.009415,128.1,0.780488,0.1,0.3,,0,39
