In [1]:
import pandas as pd
from pathlib import Path
import numpy as np

DATA_DIR = Path("data/processing_checkpoint")
path_clean_dataset = DATA_DIR / "01_cleaned_train.parquet"

df_clean_churn = pd.read_parquet(path_clean_dataset)
df_clean_churn.dtypes

gender                     category
level                      category
userId                        int64
page                       category
sessionId                     int64
itemInSession                 int64
length                      float64
song                       category
artist                     category
time                 datetime64[us]
registration         datetime64[us]
metropolitan_area          category
region                     category
operating_system           category
browser                    category
dtype: object

In [2]:
df_clean_churn.sample(5)

Unnamed: 0,gender,level,userId,page,sessionId,itemInSession,length,song,artist,time,registration,metropolitan_area,region,operating_system,browser
6354325,M,paid,1190352,NextSong,187503,54,280.18893,Forever,Chris Brown,2018-11-17 19:08:00,2018-09-17 05:31:20,Chattanooga,TN-GA,Windows,Chrome
2861903,F,paid,1937580,NextSong,138645,25,238.18404,Bruised (Snakes Remix),Jack's Mannequin,2018-10-30 18:16:06,2018-09-17 09:33:30,Salinas,CA,Linux,Chrome
676106,F,paid,1813112,NextSong,147182,268,308.47955,Lewis Takes Off His Shirt,Owen Pallett,2018-11-03 03:38:05,2018-09-02 23:57:02,New York-Newark-Jersey City,NY-NJ-PA,Windows,Chrome
2530285,M,free,1872583,NextSong,80700,54,119.58812,Go Skate! (Possessed To Skate '97),Suicidal Tendencies,2018-10-15 00:04:03,2018-09-26 04:55:39,Miami-Fort Lauderdale-West Palm Beach,FL,Windows,Firefox
5401237,F,paid,1670523,Home,192531,64,0.0,No song,No artist,2018-11-15 10:10:06,2018-08-08 16:26:01,Bloomington,IN,Windows,Chrome


In [3]:
min_time = df_clean_churn["time"].min()
max_time = df_clean_churn["time"].max()

min_time, max_time

(Timestamp('2018-10-01 00:00:01'), Timestamp('2018-11-20 00:00:00'))

In [4]:
df_clean_churn["day"] = (df_clean_churn["time"] - min_time).dt.days

min_day = df_clean_churn["day"].min()
max_day = df_clean_churn["day"].max()

min_day, max_day

(0, 49)

In [5]:
# find user churn events
df_churn_events = df_clean_churn[
    df_clean_churn["page"] == "Cancellation Confirmation"
]

# userId → churn_day
user_churn_day = df_churn_events.set_index("userId")["day"].astype("float")

user_churn_day.sample(5)

userId
1492237    18.0
1094896    22.0
1475989    45.0
1616650    44.0
1386265    10.0
Name: day, dtype: float64

In [6]:
window_size = 10
T_values = list(range(max_day - window_size, -1, -window_size))

T_values

[39, 29, 19, 9]

In [7]:
T = 39
df_window = df_clean_churn[df_clean_churn["day"] <= T].copy()


len(df_window)

14480064

In [8]:
print(df_window.head())

  gender level   userId      page  sessionId  itemInSession     length  \
0      M  paid  1749042  NextSong      22683            278  524.32934   
1      M  paid  1749042  NextSong      22683            279  178.02404   
2      M  paid  1749042  NextSong      22683            280  232.61995   
3      M  paid  1749042  NextSong      22683            281  265.50812   
4      M  paid  1749042  NextSong      22683            282  471.69261   

                                     song                 artist  \
0  Ich mache einen Spiegel - Dream Part 4              Popol Vuh   
1                 Monster (Album Version)                Skillet   
2                       Seven Nation Army      The White Stripes   
3        Under The Bridge (Album Version)  Red Hot Chili Peppers   
4                            Circlesong 6         Bobby McFerrin   

                 time        registration            metropolitan_area region  \
0 2018-10-01 00:00:01 2018-08-08 13:22:21  Dallas-Fort Worth-Arli

In [9]:
def build_user_features(df_window):
    # Input: raw dataset where day <= cutoff day T
    # Output: user level features aggregated to day T

    # Sort to ensure correct time series
    df_window = df_window.sort_values(by="time", ascending=True)

    # Base DataFrame
    df_users = (
        df_window[["userId", "gender", "registration", "operating_system", "browser"]]
        .drop_duplicates(subset=["userId"])
        .set_index("userId")
    )

    # Unique artist
    df_unique_artists = (
        df_window.groupby("userId")["artist"]
        .nunique()
        .rename("num_unique_artists")
    )
    df_users = df_users.join(df_unique_artists)

    # Page counts
    df_page_counts = (
        df_window.groupby("userId")["page"]
        .value_counts()
        .unstack(fill_value=0)
    )
    df_page_counts.columns = [
        f"count_{col.replace(' ', '_').lower()}" for col in df_page_counts.columns
    ]

    df_users = df_users.join(df_page_counts)


    # Total session count
    df_session_count = (
        df_window.groupby("userId")["sessionId"]
        .nunique()
        .rename("count_total_sessions")
    )
    df_users = df_users.join(df_session_count)

    # Lifecycle (hours)
    df_last_time = (
        df_window.groupby("userId")["time"]
        .max()
        .rename("last_time")
    )
    df_users = df_users.join(df_last_time)

    df_users["user_lifecycle_h"] = (
        (df_users["last_time"] - df_users["registration"]).dt.total_seconds() / 3600
    )

    # Total length
    df_length = (
        df_window.groupby("userId")["length"]
        .sum()
        .rename("ttl_length")
    )
    df_users = df_users.join(df_length)

    # Items per session
    df_item_per_session = (
        df_window.groupby("userId")["itemInSession"].max()
        / df_users["count_total_sessions"]
    )
    df_item_per_session = df_item_per_session.rename("item_per_session")
    df_users = df_users.join(df_item_per_session)

    # Frequency (sessions per user life cycle in hours

    df_users["frequency"] = (
        df_users["count_total_sessions"] / df_users["user_lifecycle_h"]
    )
    df_users["frequency"] = df_users["frequency"].replace(np.inf, 0)

    # avg songs per session
    if "count_nextsong" in df_users.columns:
        df_users["avg_songs_session"] = (
            df_users["count_nextsong"] / df_users["count_total_sessions"]
        )
    else:
        df_users["avg_songs_session"] = 0

    # Thumbs_up/down ratios
    df_users["thumbs_ratio"] = df_users["count_thumbs_up"] / (df_users["count_thumbs_down"] + df_users["count_thumbs_up"])
    df_users["thumbs_ratio"] = df_users["thumbs_ratio"].fillna(0)
    df_users["thumbs_ratio"] = df_users["thumbs_ratio"].replace(np.inf, 0)

    # Errors per session
    if "count_error" in df_users.columns:
        df_users["errors_per_session"] = (
            df_users["count_error"] / df_users["count_total_sessions"]
        )
    else:
        df_users["errors_per_session"] = 0

    # Ads per session
    if "count_roll_advert" in df_users.columns:
        df_users["ads_per_session"] = (
            df_users["count_roll_advert"] / df_users["count_total_sessions"]
        )
    else:
        df_users["ads_per_session"] = 0

    # Last Level (of paid or free)
    df_last_level = (
    df_window.groupby("userId")["level"]
    .last()
    .rename("last_level")
    )
    df_users = df_users.join(df_last_level)
    
    # Hours since last session
    current_time = df_window["time"].max()
    df_users["hours_since_last_session"] = (current_time - df_users["last_time"]).dt.total_seconds() / 3600

    # Active days ratio
    df_active_days = (
    df_window.groupby("userId")["time"]
    .apply(lambda x: x.dt.date.nunique())
    .rename("active_days")
    )
    df_users = df_users.join(df_active_days)
    window_duration = (df_window["time"].max() - df_window["time"].min()).days + 1
    df_users["active_days_ratio"] = df_users["active_days"] / window_duration
    df_users["active_days_ratio"] = df_users["active_days_ratio"].fillna(0)

    # Session length variance
    df_session_lengths = (
    df_window.groupby(["userId", "sessionId"])["length"]
    .sum()
    .reset_index()
    )

    df_session_variance = (
    df_session_lengths.groupby("userId")["length"]
    .std()
    .rename("session_length_variance")
    )

    df_users = df_users.join(df_session_variance)
    df_users["session_length_variance"] = df_users["session_length_variance"].fillna(0)

    # New user
    df_users["is_new_user"] = (df_users["user_lifecycle_h"] < 720).astype(int)

    # Hours since last downgrade
    if "count_downgrade" in df_users.columns and df_users["count_downgrade"].sum() > 0:
        df_last_downgrade = (
            df_window[df_window["page"] == "Downgrade"]
            .groupby("userId")["time"]
            .max()
            .rename("last_downgrade_time")
        )
        df_users = df_users.join(df_last_downgrade)
        df_users["hours_since_downgrade"] = (
            (current_time - df_users["last_downgrade_time"]).dt.total_seconds() / 3600
        )
        df_users["hours_since_downgrade"] = df_users["hours_since_downgrade"].fillna(999)
        df_users = df_users.drop(columns=["last_downgrade_time"])
    else:
        df_users["hours_since_downgrade"] = 999*3600

    # Unique songs ratio
    df_unique_songs = (
    df_window[df_window["page"] == "NextSong"]
    .groupby("userId")["song"]
    .nunique()
    .rename("num_unique_songs")
    )
    df_users = df_users.join(df_unique_songs)

    if "count_nextsong" in df_users.columns:
        df_users["unique_songs_ratio"] = (
            df_users["num_unique_songs"] / df_users["count_nextsong"]
        )
        df_users["unique_songs_ratio"] = df_users["unique_songs_ratio"].fillna(0)
        df_users["unique_songs_ratio"] = df_users["unique_songs_ratio"].replace(np.inf, 0)
    else:
        df_users["unique_songs_ratio"] = 0

    return df_users


In [10]:
build_user_features(df_window)

Unnamed: 0_level_0,gender,registration,operating_system,browser,num_unique_artists,count_about,count_add_friend,count_add_to_playlist,count_cancel,count_cancellation_confirmation,...,ads_per_session,last_level,hours_since_last_session,active_days,active_days_ratio,session_length_variance,is_new_user,hours_since_downgrade,num_unique_songs,unique_songs_ratio
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1749042,M,2018-08-08 13:22:21,Windows,Chrome,797,5,18,33,1,1,...,0.000000,paid,478.726667,14,0.350,32019.507689,0,478.835278,945.0,0.922852
1484921,M,2018-09-16 09:11:42,Linux,Chrome,409,1,10,14,0,0,...,0.666667,paid,195.432222,7,0.175,3941.193667,0,779.140556,446.0,0.971678
1694515,M,2018-09-15 04:03:02,Macintosh,Chrome,259,0,3,6,0,0,...,1.625000,free,0.008611,8,0.200,7954.631237,0,4.231667,282.0,0.955932
1697168,F,2018-09-08 13:48:25,Macintosh,Firefox,683,1,9,21,0,0,...,2.185185,paid,10.825000,25,0.625,6989.034408,0,177.998889,809.0,0.937428
1563081,F,2018-09-21 03:25:18,Macintosh,Chrome,437,1,5,14,0,0,...,0.000000,paid,165.996389,6,0.150,14209.894271,0,251.946111,474.0,0.953722
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1035157,M,2018-09-07 23:46:40,Macintosh,Safari,43,0,0,1,0,0,...,4.000000,free,3.470000,1,0.025,0.000000,0,999.000000,42.0,1.000000
1624824,M,2018-09-10 18:44:36,iPhone,Safari,50,0,2,3,0,0,...,0.000000,paid,0.002500,1,0.025,0.000000,0,999.000000,50.0,1.000000
1668706,F,2018-08-27 08:53:07,Macintosh,Chrome,38,1,0,1,0,0,...,4.000000,free,0.021389,1,0.025,0.000000,0,999.000000,37.0,1.000000
1921154,M,2018-07-10 17:19:26,Windows,Firefox,5,0,0,0,0,0,...,0.000000,free,1.722222,1,0.025,0.000000,0,999.000000,4.0,1.000000


In [11]:
print(build_user_features(df_window).dtypes)

gender                                   category
registration                       datetime64[us]
operating_system                         category
browser                                  category
num_unique_artists                          int64
count_about                                 int64
count_add_friend                            int64
count_add_to_playlist                       int64
count_cancel                                int64
count_cancellation_confirmation             int64
count_downgrade                             int64
count_error                                 int64
count_help                                  int64
count_home                                  int64
count_logout                                int64
count_nextsong                              int64
count_roll_advert                           int64
count_save_settings                         int64
count_settings                              int64
count_submit_downgrade                      int64


In [12]:
def add_label(df_users, user_churn_day, T, window_size=10):

    df_users["churn_day"] = user_churn_day.astype("float")

    df_users["label"] = 0

    T_end_window = T + window_size

    churn_in_window_mask = ((df_users["churn_day"] > T) & (df_users["churn_day"] <= T_end_window))
    df_users.loc[churn_in_window_mask, "label"] = 1

    df_users = df_users[df_users["churn_day"].isna() | (df_users["churn_day"] > T)].copy()

    df_users["label"] = df_users["label"].fillna(0).astype(int)

    return df_users

In [13]:
df_users_T39 = build_user_features(df_window)
df_users_T39.head()
df_users_T39.shape

(18880, 43)

In [14]:
df_users_T39 = add_label(df_users_T39, user_churn_day, T=39)

In [15]:
df_users_T39["snapshot_day"] = 39
df_users_T39 = df_users_T39.reset_index()
df_users_T39

Unnamed: 0,userId,gender,registration,operating_system,browser,num_unique_artists,count_about,count_add_friend,count_add_to_playlist,count_cancel,...,active_days,active_days_ratio,session_length_variance,is_new_user,hours_since_downgrade,num_unique_songs,unique_songs_ratio,churn_day,label,snapshot_day
0,1484921,M,2018-09-16 09:11:42,Linux,Chrome,409,1,10,14,0,...,7,0.175,3941.193667,0,779.140556,446.0,0.971678,,0,39
1,1694515,M,2018-09-15 04:03:02,Macintosh,Chrome,259,0,3,6,0,...,8,0.200,7954.631237,0,4.231667,282.0,0.955932,,0,39
2,1697168,F,2018-09-08 13:48:25,Macintosh,Firefox,683,1,9,21,0,...,25,0.625,6989.034408,0,177.998889,809.0,0.937428,,0,39
3,1563081,F,2018-09-21 03:25:18,Macintosh,Chrome,437,1,5,14,0,...,6,0.150,14209.894271,0,251.946111,474.0,0.953722,,0,39
4,1714398,F,2018-09-19 18:23:35,Windows,Chrome,959,0,20,46,0,...,13,0.325,18296.247913,0,285.468056,1191.0,0.929742,,0,39
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15271,1035157,M,2018-09-07 23:46:40,Macintosh,Safari,43,0,0,1,0,...,1,0.025,0.000000,0,999.000000,42.0,1.000000,,0,39
15272,1624824,M,2018-09-10 18:44:36,iPhone,Safari,50,0,2,3,0,...,1,0.025,0.000000,0,999.000000,50.0,1.000000,,0,39
15273,1668706,F,2018-08-27 08:53:07,Macintosh,Chrome,38,1,0,1,0,...,1,0.025,0.000000,0,999.000000,37.0,1.000000,,0,39
15274,1921154,M,2018-07-10 17:19:26,Windows,Firefox,5,0,0,0,0,...,1,0.025,0.000000,0,999.000000,4.0,1.000000,,0,39


In [16]:
user_churn_day.dtype


dtype('float64')

In [17]:
print(user_churn_day.head())
print(user_churn_day.dtype)
user_churn_day.index.is_unique



userId
1749042    20.0
1222580    29.0
1385500    47.0
1032628     0.0
1009070     2.0
Name: day, dtype: float64
float64


True

In [18]:
def build_training_dataset(df_clean_churn, user_churn_day, T_values, window_size=10):

    all_snapshots = []

    for T in T_values:
        print(f"Processing snapshot for T = {T} ...")

        # filter window
        df_window = df_clean_churn[df_clean_churn["day"] <= T].copy()

        # build features
        df_features = build_user_features(df_window)

        # add labels
        df_labeled = add_label(df_features, user_churn_day, T=T, window_size=window_size)

        # add snapshot day column
        df_labeled["snapshot_day"] = T

        # reset index so userId becomes a column
        df_labeled = df_labeled.reset_index()

        # keep the snapshot
        all_snapshots.append(df_labeled)

    df_final = pd.concat(all_snapshots, axis = 0).reset_index(drop=True)

    return df_final


In [19]:
df_training = build_training_dataset(df_clean_churn, user_churn_day, T_values)


Processing snapshot for T = 39 ...
Processing snapshot for T = 29 ...
Processing snapshot for T = 19 ...
Processing snapshot for T = 9 ...


In [20]:
df_training.head

<bound method NDFrame.head of         userId gender        registration operating_system  browser  \
0      1484921      M 2018-09-16 09:11:42            Linux   Chrome   
1      1694515      M 2018-09-15 04:03:02        Macintosh   Chrome   
2      1697168      F 2018-09-08 13:48:25        Macintosh  Firefox   
3      1563081      F 2018-09-21 03:25:18        Macintosh   Chrome   
4      1714398      F 2018-09-19 18:23:35          Windows   Chrome   
...        ...    ...                 ...              ...      ...   
60429  1662852      F 2018-09-07 21:21:06          Windows  Firefox   
60430  1013799      M 2018-07-27 23:31:47        Macintosh   Safari   
60431  1869507      M 2018-08-21 03:28:42          Windows   Chrome   
60432  1921067      F 2018-09-12 08:00:37          Windows  Firefox   
60433  1317591      M 2018-07-28 02:56:10        Macintosh   Chrome   

       num_unique_artists  count_about  count_add_friend  \
0                     409            1                10 

In [21]:
df_training["snapshot_day"].value_counts()


snapshot_day
29    15530
19    15452
39    15276
9     14176
Name: count, dtype: int64

In [22]:
df_training["label"].value_counts()


label
0    57344
1     3090
Name: count, dtype: int64

In [23]:
df_training.head()

Unnamed: 0,userId,gender,registration,operating_system,browser,num_unique_artists,count_about,count_add_friend,count_add_to_playlist,count_cancel,...,active_days,active_days_ratio,session_length_variance,is_new_user,hours_since_downgrade,num_unique_songs,unique_songs_ratio,churn_day,label,snapshot_day
0,1484921,M,2018-09-16 09:11:42,Linux,Chrome,409,1,10,14,0,...,7,0.175,3941.193667,0,779.140556,446.0,0.971678,,0,39
1,1694515,M,2018-09-15 04:03:02,Macintosh,Chrome,259,0,3,6,0,...,8,0.2,7954.631237,0,4.231667,282.0,0.955932,,0,39
2,1697168,F,2018-09-08 13:48:25,Macintosh,Firefox,683,1,9,21,0,...,25,0.625,6989.034408,0,177.998889,809.0,0.937428,,0,39
3,1563081,F,2018-09-21 03:25:18,Macintosh,Chrome,437,1,5,14,0,...,6,0.15,14209.894271,0,251.946111,474.0,0.953722,,0,39
4,1714398,F,2018-09-19 18:23:35,Windows,Chrome,959,0,20,46,0,...,13,0.325,18296.247913,0,285.468056,1191.0,0.929742,,0,39


In [24]:
PROCESSED_DATA_DIR = Path("data/processing_checkpoint")
checkpoint_file_path = PROCESSED_DATA_DIR / "03_10_day_window_sliced.parquet"
df_training.to_parquet(checkpoint_file_path, index=False)

In [25]:
df_training[df_training["label"]==1]

Unnamed: 0,userId,gender,registration,operating_system,browser,num_unique_artists,count_about,count_add_friend,count_add_to_playlist,count_cancel,...,active_days,active_days_ratio,session_length_variance,is_new_user,hours_since_downgrade,num_unique_songs,unique_songs_ratio,churn_day,label,snapshot_day
12,1385500,M,2018-08-16 04:30:35,Linux,Chrome,1433,4,47,72,0,...,22,0.550,25383.324157,0,2.153056,1921.0,0.874374,47.0,1,39
28,1240184,M,2018-09-19 18:34:21,Linux,Firefox,2444,8,82,153,0,...,38,0.950,30710.775030,0,52.516944,3708.0,0.802424,44.0,1,39
44,1418529,F,2018-09-21 19:27:33,Windows,Firefox,1805,9,57,76,0,...,28,0.700,25858.691421,0,34.960000,2546.0,0.856951,41.0,1,39
50,1295776,F,2018-09-19 22:50:26,Windows,Chrome,1465,7,31,66,0,...,26,0.650,18624.599020,0,172.003333,1938.0,0.867114,46.0,1,39
51,1507760,M,2018-09-13 23:33:17,Macintosh,Chrome,429,1,7,13,0,...,7,0.175,18777.067194,0,385.482500,475.0,0.951904,46.0,1,39
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60322,1772505,F,2018-07-24 10:52:15,Windows,Firefox,1,0,0,0,0,...,1,0.100,0.000000,0,999.000000,,0.000000,14.0,1,9
60364,1064758,F,2018-08-24 02:03:49,Macintosh,Firefox,4,0,0,0,0,...,1,0.100,0.000000,0,999.000000,3.0,1.000000,10.0,1,9
60379,1772854,F,2018-08-30 06:27:46,Macintosh,Chrome,53,0,0,2,0,...,1,0.100,0.000000,0,999.000000,53.0,1.000000,10.0,1,9
60382,1868830,F,2018-07-10 19:45:45,Linux,Chrome,6,0,0,0,0,...,1,0.100,0.000000,0,999.000000,5.0,1.000000,15.0,1,9


In [26]:
df_training[df_training["userId"]==1099753]

Unnamed: 0,userId,gender,registration,operating_system,browser,num_unique_artists,count_about,count_add_friend,count_add_to_playlist,count_cancel,...,active_days,active_days_ratio,session_length_variance,is_new_user,hours_since_downgrade,num_unique_songs,unique_songs_ratio,churn_day,label,snapshot_day
58328,1099753,F,2018-09-15 08:48:59,Windows,Chrome,38,0,0,0,0,...,2,0.2,2962.465541,1,999.0,37.0,1.0,11.0,1,9


In [27]:
df_training[df_training["userId"]==1749042]

Unnamed: 0,userId,gender,registration,operating_system,browser,num_unique_artists,count_about,count_add_friend,count_add_to_playlist,count_cancel,...,active_days,active_days_ratio,session_length_variance,is_new_user,hours_since_downgrade,num_unique_songs,unique_songs_ratio,churn_day,label,snapshot_day
30806,1749042,M,2018-08-08 13:22:21,Windows,Chrome,787,5,18,31,0,...,13,0.65,32254.578767,0,43.683333,933.0,0.922849,20.0,1,19
46258,1749042,M,2018-08-08 13:22:21,Windows,Chrome,417,2,7,13,0,...,8,0.8,24125.96559,0,158.610833,465.0,0.95679,20.0,0,9


In [28]:
df_training[df_training["label"]==0]

Unnamed: 0,userId,gender,registration,operating_system,browser,num_unique_artists,count_about,count_add_friend,count_add_to_playlist,count_cancel,...,active_days,active_days_ratio,session_length_variance,is_new_user,hours_since_downgrade,num_unique_songs,unique_songs_ratio,churn_day,label,snapshot_day
0,1484921,M,2018-09-16 09:11:42,Linux,Chrome,409,1,10,14,0,...,7,0.175,3941.193667,0,779.140556,446.0,0.971678,,0,39
1,1694515,M,2018-09-15 04:03:02,Macintosh,Chrome,259,0,3,6,0,...,8,0.200,7954.631237,0,4.231667,282.0,0.955932,,0,39
2,1697168,F,2018-09-08 13:48:25,Macintosh,Firefox,683,1,9,21,0,...,25,0.625,6989.034408,0,177.998889,809.0,0.937428,,0,39
3,1563081,F,2018-09-21 03:25:18,Macintosh,Chrome,437,1,5,14,0,...,6,0.150,14209.894271,0,251.946111,474.0,0.953722,,0,39
4,1714398,F,2018-09-19 18:23:35,Windows,Chrome,959,0,20,46,0,...,13,0.325,18296.247913,0,285.468056,1191.0,0.929742,,0,39
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60429,1662852,F,2018-09-07 21:21:06,Windows,Firefox,9,0,0,0,0,...,1,0.100,0.000000,0,999.000000,8.0,1.000000,,0,9
60430,1013799,M,2018-07-27 23:31:47,Macintosh,Safari,8,0,0,1,0,...,1,0.100,0.000000,0,999.000000,7.0,1.000000,,0,9
60431,1869507,M,2018-08-21 03:28:42,Windows,Chrome,4,0,0,0,0,...,1,0.100,0.000000,0,999.000000,3.0,1.000000,,0,9
60432,1921067,F,2018-09-12 08:00:37,Windows,Firefox,3,0,0,0,0,...,1,0.100,0.000000,1,999.000000,2.0,1.000000,,0,9


In [29]:
df_training[df_training["userId"]==1563081]

Unnamed: 0,userId,gender,registration,operating_system,browser,num_unique_artists,count_about,count_add_friend,count_add_to_playlist,count_cancel,...,active_days,active_days_ratio,session_length_variance,is_new_user,hours_since_downgrade,num_unique_songs,unique_songs_ratio,churn_day,label,snapshot_day
3,1563081,F,2018-09-21 03:25:18,Macintosh,Chrome,437,1,5,14,0,...,6,0.15,14209.894271,0,251.946111,474.0,0.953722,,0,39
15279,1563081,F,2018-09-21 03:25:18,Macintosh,Chrome,303,1,4,8,0,...,4,0.133333,9487.958575,0,11.946111,316.0,0.960486,,0,29
30810,1563081,F,2018-09-21 03:25:18,Macintosh,Chrome,104,0,1,1,0,...,2,0.1,9965.575105,1,479.084167,107.0,0.963964,,0,19
46262,1563081,F,2018-09-21 03:25:18,Macintosh,Chrome,24,0,0,0,0,...,1,0.1,0.0,1,239.084167,24.0,0.96,,0,9


In [30]:
df_training.isna().sum(
)

userId                                 0
gender                                 0
registration                           0
operating_system                       0
browser                                0
num_unique_artists                     0
count_about                            0
count_add_friend                       0
count_add_to_playlist                  0
count_cancel                           0
count_cancellation_confirmation        0
count_downgrade                        0
count_error                            0
count_help                             0
count_home                             0
count_logout                           0
count_nextsong                         0
count_roll_advert                      0
count_save_settings                    0
count_settings                         0
count_submit_downgrade                 0
count_submit_upgrade                   0
count_thumbs_down                      0
count_thumbs_up                        0
count_upgrade   

In [31]:
df_training[df_training["churn_day"]==39]

Unnamed: 0,userId,gender,registration,operating_system,browser,num_unique_artists,count_about,count_add_friend,count_add_to_playlist,count_cancel,...,active_days,active_days_ratio,session_length_variance,is_new_user,hours_since_downgrade,num_unique_songs,unique_songs_ratio,churn_day,label,snapshot_day
15365,1162900,M,2018-07-25 10:47:59,Macintosh,Firefox,1096,4,32,38,0,...,18,0.600000,25905.432779,0,4.018889,1381.0,0.910349,39.0,1,29
15413,1495207,F,2018-09-24 03:33:41,Windows,Edge,217,1,5,6,0,...,7,0.233333,6166.100651,1,999.000000,229.0,0.974468,39.0,1,29
15542,1063143,M,2018-06-29 06:33:07,iPhone,Safari,737,4,23,27,0,...,22,0.733333,14723.479262,0,394.899167,874.0,0.918067,39.0,1,29
15560,1545772,F,2018-09-01 07:01:42,Windows,Firefox,425,3,9,10,0,...,18,0.600000,4889.882101,0,999.000000,478.0,0.954092,39.0,1,29
15722,1663980,M,2018-09-11 02:03:54,Macintosh,Chrome,332,1,17,11,0,...,12,0.400000,9084.026250,0,999.000000,353.0,0.972452,39.0,1,29
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57403,1589072,M,2018-09-09 04:26:36,Macintosh,Safari,43,0,0,3,0,...,3,0.300000,1594.643274,0,999.000000,42.0,1.000000,39.0,0,9
58739,1905721,M,2018-09-08 22:54:45,Macintosh,Chrome,116,1,4,0,0,...,3,0.300000,20107.793495,0,11.773889,119.0,0.967480,39.0,0,9
58762,1437548,F,2018-09-16 23:15:33,Macintosh,Safari,116,0,3,4,0,...,2,0.200000,16211.017045,1,6.626667,118.0,0.975207,39.0,0,9
59382,1794800,F,2018-09-16 20:31:28,Macintosh,Safari,47,0,0,0,0,...,2,0.200000,31.513409,1,999.000000,47.0,0.979167,39.0,0,9


In [32]:
df_training[df_training["userId"]==1162900] 

Unnamed: 0,userId,gender,registration,operating_system,browser,num_unique_artists,count_about,count_add_friend,count_add_to_playlist,count_cancel,...,active_days,active_days_ratio,session_length_variance,is_new_user,hours_since_downgrade,num_unique_songs,unique_songs_ratio,churn_day,label,snapshot_day
15365,1162900,M,2018-07-25 10:47:59,Macintosh,Firefox,1096,4,32,38,0,...,18,0.6,25905.432779,0,4.018889,1381.0,0.910349,39.0,1,29
30906,1162900,M,2018-07-25 10:47:59,Macintosh,Firefox,527,1,11,14,0,...,9,0.45,19838.801586,0,34.351667,590.0,0.94703,39.0,0,19
46363,1162900,M,2018-07-25 10:47:59,Macintosh,Firefox,142,1,0,5,0,...,4,0.4,7498.368663,0,198.979722,147.0,0.993243,39.0,0,9


In [33]:
df_training.columns

Index(['userId', 'gender', 'registration', 'operating_system', 'browser',
       'num_unique_artists', 'count_about', 'count_add_friend',
       'count_add_to_playlist', 'count_cancel',
       'count_cancellation_confirmation', 'count_downgrade', 'count_error',
       'count_help', 'count_home', 'count_logout', 'count_nextsong',
       'count_roll_advert', 'count_save_settings', 'count_settings',
       'count_submit_downgrade', 'count_submit_upgrade', 'count_thumbs_down',
       'count_thumbs_up', 'count_upgrade', 'count_total_sessions', 'last_time',
       'user_lifecycle_h', 'ttl_length', 'item_per_session', 'frequency',
       'avg_songs_session', 'thumbs_ratio', 'errors_per_session',
       'ads_per_session', 'last_level', 'hours_since_last_session',
       'active_days', 'active_days_ratio', 'session_length_variance',
       'is_new_user', 'hours_since_downgrade', 'num_unique_songs',
       'unique_songs_ratio', 'churn_day', 'label', 'snapshot_day'],
      dtype='object')