In [1]:
import pandas as pd
from pathlib import Path
import numpy as np

DATA_DIR = Path("data/processing_checkpoint")
path_clean_dataset = DATA_DIR / "01_cleaned_train.parquet"

df_clean_churn = pd.read_parquet(path_clean_dataset)
df_clean_churn.dtypes


gender                     category
level                      category
userId                        int64
page                       category
sessionId                     int64
itemInSession                 int64
length                      float64
song                       category
artist                     category
time                 datetime64[us]
registration         datetime64[us]
metropolitan_area          category
region                     category
operating_system           category
browser                    category
dtype: object

In [2]:
df_clean_churn.sample(5)

Unnamed: 0,gender,level,userId,page,sessionId,itemInSession,length,song,artist,time,registration,metropolitan_area,region,operating_system,browser
4641982,M,paid,1745586,NextSong,139716,47,162.19383,I'm Diggin' It,Alecia Elliott,2018-10-30 22:40:59,2018-09-11 16:31:59,Chicago-Naperville-Elgin,IL-IN-WI,iPhone,Safari
7332757,M,paid,1643057,NextSong,171836,34,215.87546,Evil,Interpol,2018-11-09 13:02:11,2018-09-30 12:22:38,San Francisco-Oakland-Hayward,CA,iPad,Safari
2506367,M,paid,1388244,NextSong,135862,42,316.94322,Static_ Oh Static,Mixtapes And Cellmates,2018-10-30 02:23:57,2018-09-15 05:54:30,Beaumont-Port Arthur,TX,Windows,Firefox
7665500,M,paid,1892313,NextSong,139161,8,346.48771,Perfecting Loneliness,Jets To Brazil,2018-11-07 01:02:13,2018-08-05 10:48:33,El Dorado,AR,Windows,Firefox
6245344,M,paid,1212267,Add to Playlist,81928,33,0.0,No song,No artist,2018-10-19 17:48:31,2018-09-24 18:38:33,Mobile,AL,Windows,Chrome


In [3]:
min_time = df_clean_churn["time"].min()
max_time = df_clean_churn["time"].max()

min_time, max_time

(Timestamp('2018-10-01 00:00:01'), Timestamp('2018-11-20 00:00:00'))

In [4]:
df_clean_churn["day"] = (df_clean_churn["time"] - min_time).dt.days

min_day = df_clean_churn["day"].min()
max_day = df_clean_churn["day"].max()

min_day, max_day

(0, 49)

In [5]:
# find user churn events
df_churn_events = df_clean_churn[
    df_clean_churn["page"] == "Cancellation Confirmation"
]

# userId → churn_day
user_churn_day = df_churn_events.set_index("userId")["day"].astype("float")

user_churn_day.sample(5)

userId
1908228    18.0
1806168    37.0
1381416    36.0
1364740    28.0
1610953     2.0
Name: day, dtype: float64

In [6]:
window_size = 10
T_values = list(range(max_day - window_size, -1, -window_size))

T_values

[39, 29, 19, 9]

In [7]:
T = 39
df_window = df_clean_churn[df_clean_churn["day"] <= T].copy()


len(df_window)

14480064

In [8]:
print(df_window.head())

  gender level   userId      page  sessionId  itemInSession     length  \
0      M  paid  1749042  NextSong      22683            278  524.32934   
1      M  paid  1749042  NextSong      22683            279  178.02404   
2      M  paid  1749042  NextSong      22683            280  232.61995   
3      M  paid  1749042  NextSong      22683            281  265.50812   
4      M  paid  1749042  NextSong      22683            282  471.69261   

                                     song                 artist  \
0  Ich mache einen Spiegel - Dream Part 4              Popol Vuh   
1                 Monster (Album Version)                Skillet   
2                       Seven Nation Army      The White Stripes   
3        Under The Bridge (Album Version)  Red Hot Chili Peppers   
4                            Circlesong 6         Bobby McFerrin   

                 time        registration            metropolitan_area region  \
0 2018-10-01 00:00:01 2018-08-08 13:22:21  Dallas-Fort Worth-Arli

In [9]:
def build_user_features(df_window):
    # Input: raw dataset where day <= cutoff day T
    # Output: user level features aggregated to day T

    # Base DataFrame
    df_users = (
        df_window[["userId", "gender", "registration", "operating_system", "browser"]]
        .drop_duplicates(subset=["userId"])
        .set_index("userId")
    )

    # Unique artist
    df_unique_artists = (
        df_window.groupby("userId")["artist"]
        .nunique()
        .rename("num_unique_artists")
    )
    df_users = df_users.join(df_unique_artists)

    # Page counts
    df_page_counts = (
        df_window.groupby("userId")["page"]
        .value_counts()
        .unstack(fill_value=0)
    )
    df_page_counts.columns = [
        f"count_{col.replace(' ', '_').lower()}" for col in df_page_counts.columns
    ]

    df_users = df_users.join(df_page_counts)


    # Total session count
    df_session_count = (
        df_window.groupby("userId")["sessionId"]
        .nunique()
        .rename("count_total_sessions")
    )
    df_users = df_users.join(df_session_count)

    # Lifecycle (hours)
    df_last_time = (
        df_window.groupby("userId")["time"]
        .max()
        .rename("last_time")
    )
    df_users = df_users.join(df_last_time)

    df_users["user_lifecycle_h"] = (
        (df_users["last_time"] - df_users["registration"]).dt.total_seconds() / 3600
    )

    # Total length
    df_length = (
        df_window.groupby("userId")["length"]
        .sum()
        .rename("ttl_length")
    )
    df_users = df_users.join(df_length)

    # Items per session
    df_item_per_session = (
        df_window.groupby("userId")["itemInSession"].max()
        / df_users["count_total_sessions"]
    )
    df_item_per_session = df_item_per_session.rename("item_per_session")
    df_users = df_users.join(df_item_per_session)

    # Frequency (sessions per user life cycle in hours

    df_users["frequency"] = (
        df_users["count_total_sessions"] / df_users["user_lifecycle_h"]
    )

    # avg songs per session
    if "count_nextsong" in df_users.columns:
        df_users["avg_songs_session"] = (
            df_users["count_nextsong"] / df_users["count_total_sessions"]
        )
    else:
        df_users["avg_songs_session"] = 0

    # Thumbs_up/down ratios
    df_users["thumbs_ratio"] = df_users["count_thumbs_up"] / (df_users["count_thumbs_down"] + df_users["count_thumbs_up"])
    df_users["thumbs_ratio"] = df_users["thumbs_ratio"].fillna(0)
    df_users["thumbs_ratio"] = df_users["thumbs_ratio"].replace(np.inf, 0)

    # Errors per session
    if "count_error" in df_users.columns:
        df_users["errors_per_session"] = (
            df_users["count_error"] / df_users["count_total_sessions"]
        )
    else:
        df_users["errors_per_session"] = 0

    # Ads per session
    if "count_roll_advert" in df_users.columns:
        df_users["ads_per_session"] = (
            df_users["count_roll_advert"] / df_users["count_total_sessions"]
        )
    else:
        df_users["ads_per_session"] = 0

    # Last Level (of paid or free)
    df_last_level = (
    df_clean_churn.groupby("userId")["level"]
    .last()
    .rename("last_level")
    )
    df_users = df_users.join(df_last_level)


    return df_users


In [10]:
build_user_features(df_window)

Unnamed: 0_level_0,gender,registration,operating_system,browser,num_unique_artists,count_about,count_add_friend,count_add_to_playlist,count_cancel,count_cancellation_confirmation,...,last_time,user_lifecycle_h,ttl_length,item_per_session,frequency,avg_songs_session,thumbs_ratio,errors_per_session,ads_per_session,last_level
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1749042,M,2018-08-08 13:22:21,Windows,Chrome,797,5,18,33,1,1,...,2018-10-21 01:16:24,1763.900833,256456.28661,51.444444,0.005102,113.777778,0.850000,0.000000,0.000000,paid
1563081,F,2018-09-21 03:25:18,Macintosh,Chrome,437,1,5,14,0,0,...,2018-11-03 02:00:13,1030.581944,123645.65861,29.666667,0.005822,82.833333,0.833333,0.000000,0.000000,paid
1697168,F,2018-09-08 13:48:25,Macintosh,Firefox,683,1,9,21,0,0,...,2018-11-09 13:10:30,1487.368056,213208.87262,5.555556,0.018153,31.962963,0.866667,0.037037,2.185185,paid
1222580,M,2018-08-16 02:31:00,Macintosh,Safari,1252,8,35,50,1,1,...,2018-10-30 23:17:30,1820.775000,452234.08024,25.650000,0.010984,89.650000,0.833333,0.150000,0.850000,paid
1714398,F,2018-09-19 18:23:35,Windows,Chrome,959,0,20,46,0,0,...,2018-11-03 00:29:30,1062.098611,321970.48842,34.800000,0.009415,128.100000,0.780488,0.100000,0.300000,paid
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1418744,F,2018-10-22 13:27:55,Macintosh,Safari,87,0,3,5,0,0,...,2018-11-09 07:44:45,426.280556,22066.22015,130.000000,0.002346,87.000000,0.857143,0.000000,5.000000,free
1934047,M,2018-08-31 04:28:43,Macintosh,Chrome,271,0,3,11,0,0,...,2018-11-09 23:57:32,1699.480278,73805.63500,353.000000,0.000588,290.000000,0.791667,2.000000,1.000000,paid
1205281,F,2018-09-26 17:21:05,iPad,Safari,88,0,0,5,0,0,...,2018-11-09 18:08:22,1056.788056,21789.76404,48.500000,0.001893,46.000000,1.000000,0.000000,5.000000,free
1266866,F,2018-09-14 22:37:16,Windows,Firefox,3,0,0,0,0,0,...,2018-11-09 11:27:54,1332.843889,382.48399,3.000000,0.000750,2.000000,1.000000,0.000000,0.000000,paid


In [11]:
print(build_user_features(df_window).dtypes)

gender                                   category
registration                       datetime64[us]
operating_system                         category
browser                                  category
num_unique_artists                          int64
count_about                                 int64
count_add_friend                            int64
count_add_to_playlist                       int64
count_cancel                                int64
count_cancellation_confirmation             int64
count_downgrade                             int64
count_error                                 int64
count_help                                  int64
count_home                                  int64
count_logout                                int64
count_nextsong                              int64
count_roll_advert                           int64
count_save_settings                         int64
count_settings                              int64
count_submit_downgrade                      int64


In [12]:
def add_label(df_users, user_churn_day, T, window_size=10):

    df_users["churn_day"] = user_churn_day.astype("float")

    df_users["label"] = 0

    T_end_window = T + window_size

    churn_in_window_mask = ((df_users["churn_day"] > T) & (df_users["churn_day"] <= T_end_window))
    df_users.loc[churn_in_window_mask, "label"] = 1

    df_users = df_users[df_users["churn_day"].isna() | (df_users["churn_day"] > T)].copy()

    df_users["label"] = df_users["label"].fillna(0).astype(int)

    return df_users

In [13]:
df_users_T39 = build_user_features(df_window)
df_users_T39.head()
df_users_T39.shape

(18880, 35)

In [14]:
df_users_T39 = add_label(df_users_T39, user_churn_day, T=39)

In [15]:
df_users_T39["snapshot_day"] = 39
df_users_T39 = df_users_T39.reset_index()
df_users_T39

Unnamed: 0,userId,gender,registration,operating_system,browser,num_unique_artists,count_about,count_add_friend,count_add_to_playlist,count_cancel,...,item_per_session,frequency,avg_songs_session,thumbs_ratio,errors_per_session,ads_per_session,last_level,churn_day,label,snapshot_day
0,1563081,F,2018-09-21 03:25:18,Macintosh,Chrome,437,1,5,14,0,...,29.666667,0.005822,82.833333,0.833333,0.000000,0.000000,paid,,0,39
1,1697168,F,2018-09-08 13:48:25,Macintosh,Firefox,683,1,9,21,0,...,5.555556,0.018153,31.962963,0.866667,0.037037,2.185185,paid,,0,39
2,1714398,F,2018-09-19 18:23:35,Windows,Chrome,959,0,20,46,0,...,34.800000,0.009415,128.100000,0.780488,0.100000,0.300000,paid,,0,39
3,1010522,M,2018-09-22 07:49:20,Windows,Chrome,1034,4,14,44,0,...,15.240000,0.021916,58.240000,0.829787,0.120000,1.880000,free,,0,39
4,1475659,M,2018-09-10 07:17:33,Windows,Firefox,734,3,19,24,0,...,24.818182,0.007963,83.636364,0.836364,0.000000,0.272727,paid,,0,39
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15271,1418744,F,2018-10-22 13:27:55,Macintosh,Safari,87,0,3,5,0,...,130.000000,0.002346,87.000000,0.857143,0.000000,5.000000,free,,0,39
15272,1934047,M,2018-08-31 04:28:43,Macintosh,Chrome,271,0,3,11,0,...,353.000000,0.000588,290.000000,0.791667,2.000000,1.000000,paid,,0,39
15273,1205281,F,2018-09-26 17:21:05,iPad,Safari,88,0,0,5,0,...,48.500000,0.001893,46.000000,1.000000,0.000000,5.000000,free,,0,39
15274,1266866,F,2018-09-14 22:37:16,Windows,Firefox,3,0,0,0,0,...,3.000000,0.000750,2.000000,1.000000,0.000000,0.000000,paid,,0,39


In [16]:
user_churn_day.dtype


dtype('float64')

In [17]:
print(user_churn_day.head())
print(user_churn_day.dtype)
user_churn_day.index.is_unique



userId
1749042    20.0
1222580    29.0
1385500    47.0
1032628     0.0
1009070     2.0
Name: day, dtype: float64
float64


True

In [18]:
def build_training_dataset(df_clean_churn, user_churn_day, T_values, window_size=10):

    all_snapshots = []

    for T in T_values:
        print(f"Processing snapshot for T = {T} ...")

        # filter window
        df_window = df_clean_churn[df_clean_churn["day"] <= T].copy()

        # build features
        df_features = build_user_features(df_window)

        # add labels
        df_labeled = add_label(df_features, user_churn_day, T=T, window_size=window_size)

        # add snapshot day column
        df_labeled["snapshot_day"] = T

        # reset index so userId becomes a column
        df_labeled = df_labeled.reset_index()

        # keep the snapshot
        all_snapshots.append(df_labeled)

    df_final = pd.concat(all_snapshots, axis = 0).reset_index(drop=True)

    return df_final


In [19]:
df_training = build_training_dataset(df_clean_churn, user_churn_day, T_values)


Processing snapshot for T = 39 ...
Processing snapshot for T = 29 ...
Processing snapshot for T = 19 ...
Processing snapshot for T = 9 ...


In [20]:
df_training.head

<bound method NDFrame.head of         userId gender        registration operating_system  browser  \
0      1563081      F 2018-09-21 03:25:18        Macintosh   Chrome   
1      1697168      F 2018-09-08 13:48:25        Macintosh  Firefox   
2      1714398      F 2018-09-19 18:23:35          Windows   Chrome   
3      1010522      M 2018-09-22 07:49:20          Windows   Chrome   
4      1475659      M 2018-09-10 07:17:33          Windows  Firefox   
...        ...    ...                 ...              ...      ...   
60429  1208557      F 2018-08-02 15:05:09          Windows   Chrome   
60430  1896647      F 2018-08-09 12:48:50          Windows   Chrome   
60431  1494890      F 2018-09-14 03:15:14          Windows   Chrome   
60432  1729584      M 2018-09-14 18:30:17        Macintosh   Safari   
60433  1878214      M 2018-08-04 03:43:33        Macintosh   Chrome   

       num_unique_artists  count_about  count_add_friend  \
0                     437            1                 5 

In [21]:
df_training["snapshot_day"].value_counts()


snapshot_day
29    15530
19    15452
39    15276
9     14176
Name: count, dtype: int64

In [22]:
df_training["label"].value_counts()


label
0    57344
1     3090
Name: count, dtype: int64

In [23]:
df_training.head()

Unnamed: 0,userId,gender,registration,operating_system,browser,num_unique_artists,count_about,count_add_friend,count_add_to_playlist,count_cancel,...,item_per_session,frequency,avg_songs_session,thumbs_ratio,errors_per_session,ads_per_session,last_level,churn_day,label,snapshot_day
0,1563081,F,2018-09-21 03:25:18,Macintosh,Chrome,437,1,5,14,0,...,29.666667,0.005822,82.833333,0.833333,0.0,0.0,paid,,0,39
1,1697168,F,2018-09-08 13:48:25,Macintosh,Firefox,683,1,9,21,0,...,5.555556,0.018153,31.962963,0.866667,0.037037,2.185185,paid,,0,39
2,1714398,F,2018-09-19 18:23:35,Windows,Chrome,959,0,20,46,0,...,34.8,0.009415,128.1,0.780488,0.1,0.3,paid,,0,39
3,1010522,M,2018-09-22 07:49:20,Windows,Chrome,1034,4,14,44,0,...,15.24,0.021916,58.24,0.829787,0.12,1.88,free,,0,39
4,1475659,M,2018-09-10 07:17:33,Windows,Firefox,734,3,19,24,0,...,24.818182,0.007963,83.636364,0.836364,0.0,0.272727,paid,,0,39


In [24]:
PROCESSED_DATA_DIR = Path("data/processing_checkpoint")
checkpoint_file_path = PROCESSED_DATA_DIR / "03_10_day_window_sliced.parquet"
df_training.to_parquet(checkpoint_file_path, index=False)

In [25]:
df_training[df_training["label"]==1]

Unnamed: 0,userId,gender,registration,operating_system,browser,num_unique_artists,count_about,count_add_friend,count_add_to_playlist,count_cancel,...,item_per_session,frequency,avg_songs_session,thumbs_ratio,errors_per_session,ads_per_session,last_level,churn_day,label,snapshot_day
7,1385500,M,2018-08-16 04:30:35,Linux,Chrome,1433,4,47,72,0,...,18.863636,0.010682,99.863636,0.860870,0.090909,0.818182,paid,47.0,1,39
17,1240184,M,2018-09-19 18:34:21,Linux,Firefox,2444,8,82,153,0,...,13.771930,0.046791,81.070175,0.864151,0.070175,1.087719,paid,44.0,1,39
30,1418529,F,2018-09-21 19:27:33,Windows,Firefox,1805,9,57,76,0,...,16.538462,0.022681,114.269231,0.839779,0.038462,0.538462,paid,41.0,1,39
41,1642905,F,2018-07-22 13:15:58,Windows,Firefox,1336,5,46,50,0,...,45.500000,0.005370,140.000000,0.771186,0.071429,0.000000,paid,43.0,1,39
95,1475989,M,2018-09-22 00:09:18,Windows,Chrome,1838,12,50,104,0,...,15.736842,0.032391,83.973684,0.857923,0.052632,1.368421,paid,45.0,1,39
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59881,1138673,F,2018-09-30 13:33:20,Macintosh,Chrome,456,1,2,14,0,...,46.500000,0.025131,90.666667,0.813559,0.000000,0.166667,paid,19.0,1,9
59896,1081063,M,2018-09-29 13:39:28,Windows,Chrome,341,0,1,16,0,...,23.500000,0.022411,65.666667,0.900000,0.500000,1.833333,paid,16.0,1,9
60035,1172943,F,2018-08-04 20:35:20,Macintosh,Safari,20,0,1,1,0,...,26.000000,0.000681,19.000000,0.500000,0.000000,1.000000,free,17.0,1,9
60179,1099753,F,2018-09-15 08:48:59,Windows,Chrome,38,0,0,0,0,...,8.333333,0.004878,12.333333,1.000000,0.000000,0.666667,paid,11.0,1,9


In [26]:
df_training[df_training["userId"]==1099753]

Unnamed: 0,userId,gender,registration,operating_system,browser,num_unique_artists,count_about,count_add_friend,count_add_to_playlist,count_cancel,...,item_per_session,frequency,avg_songs_session,thumbs_ratio,errors_per_session,ads_per_session,last_level,churn_day,label,snapshot_day
60179,1099753,F,2018-09-15 08:48:59,Windows,Chrome,38,0,0,0,0,...,8.333333,0.004878,12.333333,1.0,0.0,0.666667,paid,11.0,1,9


In [27]:
df_training[df_training["userId"]==1749042]

Unnamed: 0,userId,gender,registration,operating_system,browser,num_unique_artists,count_about,count_add_friend,count_add_to_playlist,count_cancel,...,item_per_session,frequency,avg_songs_session,thumbs_ratio,errors_per_session,ads_per_session,last_level,churn_day,label,snapshot_day
30806,1749042,M,2018-08-08 13:22:21,Windows,Chrome,787,5,18,31,0,...,51.444444,0.005106,112.333333,0.847458,0.0,0.0,paid,20.0,1,19
46258,1749042,M,2018-08-08 13:22:21,Windows,Chrome,417,2,7,13,0,...,54.0,0.003941,81.0,0.75,0.0,0.0,paid,20.0,0,9


In [28]:
df_training[df_training["label"]==0]

Unnamed: 0,userId,gender,registration,operating_system,browser,num_unique_artists,count_about,count_add_friend,count_add_to_playlist,count_cancel,...,item_per_session,frequency,avg_songs_session,thumbs_ratio,errors_per_session,ads_per_session,last_level,churn_day,label,snapshot_day
0,1563081,F,2018-09-21 03:25:18,Macintosh,Chrome,437,1,5,14,0,...,29.666667,0.005822,82.833333,0.833333,0.000000,0.000000,paid,,0,39
1,1697168,F,2018-09-08 13:48:25,Macintosh,Firefox,683,1,9,21,0,...,5.555556,0.018153,31.962963,0.866667,0.037037,2.185185,paid,,0,39
2,1714398,F,2018-09-19 18:23:35,Windows,Chrome,959,0,20,46,0,...,34.800000,0.009415,128.100000,0.780488,0.100000,0.300000,paid,,0,39
3,1010522,M,2018-09-22 07:49:20,Windows,Chrome,1034,4,14,44,0,...,15.240000,0.021916,58.240000,0.829787,0.120000,1.880000,free,,0,39
4,1475659,M,2018-09-10 07:17:33,Windows,Firefox,734,3,19,24,0,...,24.818182,0.007963,83.636364,0.836364,0.000000,0.272727,paid,,0,39
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60429,1208557,F,2018-08-02 15:05:09,Windows,Chrome,26,1,0,0,0,...,38.000000,0.000601,25.000000,1.000000,0.000000,0.000000,paid,,0,9
60430,1896647,F,2018-08-09 12:48:50,Windows,Chrome,29,0,0,3,0,...,40.000000,0.000667,28.000000,0.800000,0.000000,0.000000,paid,,0,9
60431,1494890,F,2018-09-14 03:15:14,Windows,Chrome,26,0,0,1,0,...,25.000000,0.001551,25.000000,0.000000,0.000000,0.000000,paid,,0,9
60432,1729584,M,2018-09-14 18:30:17,Macintosh,Safari,3,0,0,0,0,...,5.000000,0.001592,2.000000,0.000000,0.000000,0.000000,free,,0,9


In [29]:
df_training[df_training["userId"]==1563081]

Unnamed: 0,userId,gender,registration,operating_system,browser,num_unique_artists,count_about,count_add_friend,count_add_to_playlist,count_cancel,...,item_per_session,frequency,avg_songs_session,thumbs_ratio,errors_per_session,ads_per_session,last_level,churn_day,label,snapshot_day
0,1563081,F,2018-09-21 03:25:18,Macintosh,Chrome,437,1,5,14,0,...,29.666667,0.005822,82.833333,0.833333,0.0,0.0,paid,,0,39
15276,1563081,F,2018-09-21 03:25:18,Macintosh,Chrome,303,1,4,8,0,...,33.25,0.004228,82.25,0.705882,0.0,0.0,paid,,0,29
30807,1563081,F,2018-09-21 03:25:18,Macintosh,Chrome,104,0,1,1,0,...,51.5,0.002924,55.5,0.5,0.0,0.0,paid,,0,19
46259,1563081,F,2018-09-21 03:25:18,Macintosh,Chrome,24,0,0,0,0,...,36.0,0.004197,25.0,1.0,0.0,0.0,paid,,0,9


In [30]:
df_training.isna().sum(
)

userId                                 0
gender                                 0
registration                           0
operating_system                       0
browser                                0
num_unique_artists                     0
count_about                            0
count_add_friend                       0
count_add_to_playlist                  0
count_cancel                           0
count_cancellation_confirmation        0
count_downgrade                        0
count_error                            0
count_help                             0
count_home                             0
count_logout                           0
count_nextsong                         0
count_roll_advert                      0
count_save_settings                    0
count_settings                         0
count_submit_downgrade                 0
count_submit_upgrade                   0
count_thumbs_down                      0
count_thumbs_up                        0
count_upgrade   