In [1]:
import pandas as pd
from pathlib import Path
import numpy as np

In [2]:
DATA_DIR = Path("data/processing_checkpoint")
path_clean_dataset = DATA_DIR / "01_cleaned_train.parquet"

In [3]:
df_clean_churn = pd.read_parquet(path_clean_dataset)

In [4]:
df_clean_churn.sample(5)

Unnamed: 0,gender,level,userId,page,sessionId,itemInSession,length,song,artist,time,registration,metropolitan_area,region,operating_system,browser
15672349,F,paid,1908388,NextSong,24646,79,127.242,Tive Sim,Cartola,2018-10-31 21:45:05,2018-09-21 11:18:15,New York-Newark-Jersey City,NY-NJ-PA,Windows,Chrome
11162453,M,paid,1828392,NextSong,110145,302,170.4224,I Feel Better,Frightened Rabbit,2018-10-29 16:00:25,2018-09-10 12:32:29,San Francisco-Oakland-Hayward,CA,Macintosh,Safari
12716641,M,free,1275914,Home,109026,16,0.0,No song,No artist,2018-10-25 15:32:48,2018-09-15 09:16:32,Allentown-Bethlehem-Easton,PA-NJ,Windows,Edge
15720338,M,paid,1129473,NextSong,25573,70,165.642,Again & Again,the bird and the bee,2018-11-01 16:57:55,2018-09-01 17:44:47,Corpus Christi,TX,Macintosh,Chrome
8420474,M,paid,1199649,NextSong,66437,371,201.87383,The Beacon,A Fine Frenzy,2018-10-19 06:25:31,2018-09-13 13:19:44,Cincinnati,OH-KY-IN,Macintosh,Firefox


In [5]:
df_clean_churn = df_clean_churn.sort_values(by="time", ascending=True)
df_clean_churn["page"] = df_clean_churn["page"].str.strip().str.lower()

In [6]:
df_users_base = df_clean_churn[["userId", "gender", "registration", "operating_system", "browser"]].drop_duplicates(subset=["userId"])

In [7]:
len(df_users_base)

19140

In [8]:
df_users_base = df_users_base.set_index("userId")

In [9]:
df_unique_artists = (
    df_clean_churn.groupby("userId")["artist"].nunique().reset_index().rename(columns={"artist": "num_unique_artists"})
)
df_unique_artists = df_unique_artists.set_index("userId")
df_users_base = df_users_base.join(df_unique_artists)

In [10]:
df_users_base.sample(5)

Unnamed: 0_level_0,gender,registration,operating_system,browser,num_unique_artists
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1612169,M,2018-09-28 14:42:25,Macintosh,Safari,23
1935058,M,2018-06-26 13:49:49,Windows,Chrome,823
1770272,M,2018-10-12 08:41:40,Windows,Firefox,67
1950153,F,2018-09-25 10:19:39,Macintosh,Chrome,1008
1060146,M,2018-09-17 11:50:49,iPhone,Safari,1022


In [11]:
df_page_counts = (
    df_clean_churn.groupby("userId")["page"]
    .value_counts()
    .unstack(fill_value=0)
)

df_page_counts.columns = [f"count_{col.replace(' ', '_').lower()}" for col in df_page_counts.columns]

In [12]:
df_users_base = df_users_base.join(df_page_counts)
df_users_base.sample(5)

Unnamed: 0_level_0,gender,registration,operating_system,browser,num_unique_artists,count_about,count_add_friend,count_add_to_playlist,count_cancel,count_cancellation_confirmation,...,count_logout,count_nextsong,count_roll_advert,count_save_settings,count_settings,count_submit_downgrade,count_submit_upgrade,count_thumbs_down,count_thumbs_up,count_upgrade
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1596267,F,2018-09-26 15:58:30,Macintosh,Safari,1163,5,20,53,0,0,...,28,1692,19,3,7,0,1,12,94,2
1683780,F,2018-09-16 14:33:08,Macintosh,Chrome,936,3,32,37,0,0,...,11,1283,7,3,8,0,1,20,60,1
1454224,M,2018-09-12 09:29:03,Windows,Chrome,76,0,3,1,0,0,...,0,77,4,0,3,0,0,1,3,1
1609457,M,2018-07-20 03:55:54,Linux,Chrome,1006,2,18,41,0,0,...,19,1377,50,2,10,0,1,11,71,4
1689133,M,2018-09-11 08:23:31,Macintosh,Firefox,469,2,14,11,0,0,...,11,576,1,1,1,0,0,1,29,0


In [13]:
df_session_count = (
    df_clean_churn.groupby("userId")["sessionId"]
    .nunique()
    .reset_index(name="count_total_sessions")
    .set_index("userId")
)

df_users_base = df_users_base.join(df_session_count)


In [14]:
df_users_base.sample(5)

Unnamed: 0_level_0,gender,registration,operating_system,browser,num_unique_artists,count_about,count_add_friend,count_add_to_playlist,count_cancel,count_cancellation_confirmation,...,count_nextsong,count_roll_advert,count_save_settings,count_settings,count_submit_downgrade,count_submit_upgrade,count_thumbs_down,count_thumbs_up,count_upgrade,count_total_sessions
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1339528,M,2018-06-26 18:23:40,Windows,Firefox,1447,3,31,66,0,0,...,2197,4,4,16,1,1,21,188,1,34
1079351,M,2018-09-17 07:51:20,Macintosh,Chrome,171,0,1,4,0,0,...,174,12,1,1,0,0,1,4,2,7
1328966,M,2018-02-19 00:34:39,Macintosh,Firefox,408,0,11,13,0,0,...,475,18,1,4,1,1,7,27,5,11
1459797,M,2018-07-01 00:37:14,Linux,Firefox,1112,2,36,30,0,0,...,1574,2,1,10,0,1,11,78,1,17
1485510,F,2018-09-18 23:56:36,Macintosh,Safari,820,2,13,34,0,0,...,1096,0,1,10,0,1,5,64,1,12


In [15]:
# Build user lifecycle

df_last_time = (
    df_clean_churn.groupby("userId")["time"]
    .max()
    .reset_index(name="last_time")
    .set_index("userId")
)

df_users_base = df_users_base.join(df_last_time)
df_users_base["user_lifecycle_h"] = df_users_base["last_time"] - df_users_base["registration"]
df_users_base["user_lifecycle_h"] = df_users_base["user_lifecycle_h"].dt.total_seconds() / 3600

df_users_base.head()

Unnamed: 0_level_0,gender,registration,operating_system,browser,num_unique_artists,count_about,count_add_friend,count_add_to_playlist,count_cancel,count_cancellation_confirmation,...,count_save_settings,count_settings,count_submit_downgrade,count_submit_upgrade,count_thumbs_down,count_thumbs_up,count_upgrade,count_total_sessions,last_time,user_lifecycle_h
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1749042,M,2018-08-08 13:22:21,Windows,Chrome,797,5,18,33,1,1,...,2,7,0,0,9,51,0,9,2018-10-21 01:16:24,1763.900833
1484921,M,2018-09-16 09:11:42,Linux,Chrome,465,1,11,15,0,0,...,0,1,0,1,4,48,1,8,2018-11-15 17:33:31,1448.363611
1694515,M,2018-09-15 04:03:02,Macintosh,Chrome,482,0,11,21,0,0,...,1,3,1,2,4,68,3,9,2018-11-15 20:43:13,1480.669722
1697168,F,2018-09-08 13:48:25,Macintosh,Firefox,1117,1,22,44,0,0,...,2,12,1,1,16,83,7,34,2018-11-19 23:59:55,1738.191667
1563081,F,2018-09-21 03:25:18,Macintosh,Chrome,437,1,5,14,0,0,...,0,2,0,0,5,25,0,6,2018-11-03 02:00:13,1030.581944


In [16]:
df_length = (
    df_clean_churn.groupby("userId")["length"].sum()
)
df_length = (
    df_length
    .reset_index(name="ttl_length")
    .set_index("userId")
)
df_users_base = df_users_base.join(df_length)
df_users_base.head(5)


Unnamed: 0_level_0,gender,registration,operating_system,browser,num_unique_artists,count_about,count_add_friend,count_add_to_playlist,count_cancel,count_cancellation_confirmation,...,count_settings,count_submit_downgrade,count_submit_upgrade,count_thumbs_down,count_thumbs_up,count_upgrade,count_total_sessions,last_time,user_lifecycle_h,ttl_length
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1749042,M,2018-08-08 13:22:21,Windows,Chrome,797,5,18,33,1,1,...,7,0,0,9,51,0,9,2018-10-21 01:16:24,1763.900833,256456.28661
1484921,M,2018-09-16 09:11:42,Linux,Chrome,465,1,11,15,0,0,...,1,0,1,4,48,1,8,2018-11-15 17:33:31,1448.363611,136515.28536
1694515,M,2018-09-15 04:03:02,Macintosh,Chrome,482,0,11,21,0,0,...,3,1,2,4,68,3,9,2018-11-15 20:43:13,1480.669722,146479.95366
1697168,F,2018-09-08 13:48:25,Macintosh,Firefox,1117,1,22,44,0,0,...,12,1,1,16,83,7,34,2018-11-19 23:59:55,1738.191667,386582.82117
1563081,F,2018-09-21 03:25:18,Macintosh,Chrome,437,1,5,14,0,0,...,2,0,0,5,25,0,6,2018-11-03 02:00:13,1030.581944,123645.65861


In [17]:
df_item_per_session = (
    df_clean_churn.groupby("userId")["itemInSession"].max() / df_users_base["count_total_sessions"]
)

df_item_per_session = (
    df_item_per_session
    .reset_index(name="item_per_session")
    .set_index("userId")
)

df_users_base = df_users_base.join(df_item_per_session)


In [18]:
df_users_base["frequency"] = df_users_base["count_total_sessions"] / df_users_base["user_lifecycle_h"]

In [19]:
df_users_base.sample(5)

Unnamed: 0_level_0,gender,registration,operating_system,browser,num_unique_artists,count_about,count_add_friend,count_add_to_playlist,count_cancel,count_cancellation_confirmation,...,count_submit_upgrade,count_thumbs_down,count_thumbs_up,count_upgrade,count_total_sessions,last_time,user_lifecycle_h,ttl_length,item_per_session,frequency
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1989655,F,2018-09-21 13:11:02,Windows,Chrome,27,0,0,0,0,0,...,0,0,0,0,1,2018-10-04 08:06:06,306.917778,6165.69596,30.0,0.003258
1715892,F,2018-09-14 20:02:33,Macintosh,Chrome,221,0,5,5,1,1,...,0,5,13,0,8,2018-10-17 00:00:50,771.971389,61933.60271,11.625,0.010363
1230806,M,2018-09-21 11:00:46,iPhone,Safari,708,0,16,27,0,0,...,0,12,55,0,8,2018-11-18 17:09:16,1398.141667,227671.5973,67.625,0.005722
1450484,M,2018-08-19 17:11:22,Macintosh,Chrome,49,0,2,2,0,0,...,0,0,3,2,1,2018-11-05 17:36:34,1872.42,11767.01824,69.0,0.000534
1898080,M,2018-10-26 18:47:28,Macintosh,Chrome,28,0,0,1,0,0,...,0,1,0,0,1,2018-11-13 13:29:37,426.7025,7472.05306,38.0,0.002344


In [20]:
### df_users_base = df_users_base.drop(columns="user_lifecycle_h", errors="ignore")


In [21]:
"user_lifecycle_h" in df_users_base.columns

True

In [22]:
print(df_users_base["count_cancellation_confirmation"].unique())
df_users_base = df_users_base.rename(columns={"count_cancellation_confirmation":"churn_label"})

[1 0]


In [23]:
df_users_base["avg_songs_session"] = df_users_base["count_nextsong"] / df_users_base["count_total_sessions"]

In [24]:
mask = df_users_base["user_lifecycle_h"] < 24
df_users_base[mask]

Unnamed: 0_level_0,gender,registration,operating_system,browser,num_unique_artists,count_about,count_add_friend,count_add_to_playlist,count_cancel,churn_label,...,count_thumbs_down,count_thumbs_up,count_upgrade,count_total_sessions,last_time,user_lifecycle_h,ttl_length,item_per_session,frequency,avg_songs_session
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1714714,F,2018-09-30 22:59:16,Macintosh,Safari,65,1,0,0,1,1,...,0,2,0,1,2018-10-01 09:25:13,10.4325,15332.72528,79.0,0.095854,64.0
1444644,F,2018-10-06 04:01:43,Macintosh,Safari,40,0,0,2,0,0,...,0,0,0,1,2018-10-07 02:00:54,21.986389,9873.95455,46.0,0.045483,39.0
1039566,M,2018-10-07 10:56:31,iPhone,Safari,73,0,0,1,0,0,...,2,5,0,1,2018-10-07 16:32:59,5.607778,20749.91474,104.0,0.178324,75.0
1594905,M,2018-10-09 02:53:40,Windows,Chrome,304,2,2,11,0,0,...,4,15,0,1,2018-10-10 02:30:24,23.612222,84829.05169,399.0,0.042351,338.0
1501690,F,2018-10-23 18:06:07,Macintosh,Chrome,82,0,9,1,0,0,...,0,5,1,1,2018-10-24 00:43:46,6.6275,21688.30803,128.0,0.150886,84.0
1835558,M,2018-10-24 15:11:40,Windows,Firefox,99,1,2,2,1,1,...,0,3,2,1,2018-10-24 22:20:59,7.155278,25865.30558,154.0,0.139757,102.0
1075367,F,2018-10-31 11:48:10,Macintosh,Firefox,10,0,0,0,0,0,...,1,0,0,1,2018-10-31 12:29:58,0.696667,1964.35182,25.0,1.435407,9.0
1745396,F,2018-11-07 15:38:32,Macintosh,Firefox,24,0,0,1,0,0,...,0,1,0,1,2018-11-07 17:02:16,1.395556,5221.4753,33.0,0.716561,23.0
1649197,F,2018-11-07 15:54:59,Macintosh,Chrome,83,0,0,2,0,0,...,2,8,1,1,2018-11-07 21:44:35,5.826667,21060.63695,120.0,0.171625,86.0
1868524,M,2018-11-08 08:19:58,Windows,Firefox,53,0,0,1,0,0,...,1,2,0,1,2018-11-08 12:03:49,3.730833,13668.99197,76.0,0.268037,52.0


In [25]:
df_users_base.loc[[1714714]]

Unnamed: 0_level_0,gender,registration,operating_system,browser,num_unique_artists,count_about,count_add_friend,count_add_to_playlist,count_cancel,churn_label,...,count_thumbs_down,count_thumbs_up,count_upgrade,count_total_sessions,last_time,user_lifecycle_h,ttl_length,item_per_session,frequency,avg_songs_session
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1714714,F,2018-09-30 22:59:16,Macintosh,Safari,65,1,0,0,1,1,...,0,2,0,1,2018-10-01 09:25:13,10.4325,15332.72528,79.0,0.095854,64.0


In [26]:
df_users_base["thumbs_ratio"] = df_users_base["count_thumbs_up"] / (df_users_base["count_thumbs_down"] + df_users_base["count_thumbs_up"])
df_users_base["thumbs_ratio"] = df_users_base["thumbs_ratio"].replace(np.inf, 0)
df_users_base["errors_per_session"] = df_users_base["count_error"] / df_users_base["count_total_sessions"]
df_users_base["ads_per_session"] = df_users_base["count_roll_advert"] / df_users_base["count_total_sessions"]

In [27]:
mask = df_users_base["thumbs_ratio"] ==0
df_users_base[mask]

Unnamed: 0_level_0,gender,registration,operating_system,browser,num_unique_artists,count_about,count_add_friend,count_add_to_playlist,count_cancel,churn_label,...,count_total_sessions,last_time,user_lifecycle_h,ttl_length,item_per_session,frequency,avg_songs_session,thumbs_ratio,errors_per_session,ads_per_session
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1542469,M,2018-07-24 04:08:58,Windows,Chrome,21,0,0,2,1,1,...,1,2018-10-01 01:29:42,1653.345556,5558.01170,93.0,0.000605,21.0,0.0,0.0,5.0
1641130,M,2018-08-29 07:20:08,iPhone,Safari,6,0,0,0,1,1,...,1,2018-10-01 00:21:26,785.021667,1184.10223,187.0,0.001274,5.0,0.0,0.0,0.0
1278056,M,2018-09-24 01:08:46,Linux,Firefox,51,0,0,0,1,1,...,1,2018-10-01 03:21:37,170.214167,11992.53198,58.0,0.005875,51.0,0.0,0.0,1.0
1441204,F,2018-09-05 01:06:30,Macintosh,Safari,29,0,0,1,0,0,...,2,2018-10-12 07:52:34,894.767778,7033.01263,18.0,0.002235,14.0,0.0,0.0,2.0
1072890,M,2018-09-18 06:45:18,Macintosh,Chrome,40,0,3,2,0,0,...,2,2018-10-01 18:28:56,323.727222,10032.43945,21.5,0.006178,19.5,0.0,0.0,1.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1178165,M,2018-08-12 20:34:23,Windows,Chrome,5,0,0,0,0,0,...,1,2018-11-15 15:50:46,2275.273056,1063.26022,5.0,0.000440,4.0,0.0,0.0,1.0
1072628,F,2018-08-19 22:30:44,Windows,Chrome,26,0,0,1,0,0,...,1,2018-11-16 01:22:31,2114.863056,5566.81315,28.0,0.000473,25.0,0.0,0.0,0.0
1802584,F,2018-06-30 04:31:34,Windows,Firefox,13,0,0,0,0,0,...,1,2018-11-16 07:04:40,3338.551667,2815.91619,14.0,0.000300,12.0,0.0,0.0,0.0
1264392,F,2018-11-16 08:43:56,Macintosh,Chrome,49,0,0,1,0,0,...,2,2018-11-19 07:46:49,71.048056,11883.52393,15.5,0.028150,24.5,0.0,0.0,1.0


In [28]:
df_users_base.columns

Index(['gender', 'registration', 'operating_system', 'browser',
       'num_unique_artists', 'count_about', 'count_add_friend',
       'count_add_to_playlist', 'count_cancel', 'churn_label',
       'count_downgrade', 'count_error', 'count_help', 'count_home',
       'count_logout', 'count_nextsong', 'count_roll_advert',
       'count_save_settings', 'count_settings', 'count_submit_downgrade',
       'count_submit_upgrade', 'count_thumbs_down', 'count_thumbs_up',
       'count_upgrade', 'count_total_sessions', 'last_time',
       'user_lifecycle_h', 'ttl_length', 'item_per_session', 'frequency',
       'avg_songs_session', 'thumbs_ratio', 'errors_per_session',
       'ads_per_session'],
      dtype='object')

In [29]:
mask = df_users_base["count_submit_upgrade"] > 3
df_users_base[mask][["count_submit_upgrade", "count_submit_downgrade", "churn_label"]]

Unnamed: 0_level_0,count_submit_upgrade,count_submit_downgrade,churn_label
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1240184,4,3,1
1295776,4,3,1
1839740,4,4,0
1747195,6,5,0
1766016,5,4,0
1504480,4,3,1
1882951,4,3,1
1255222,5,4,0
1646275,4,3,1
1608306,4,3,0


In [30]:
df_last_level = (
    df_clean_churn.groupby("userId")["level"]
    .last()
    .reset_index(name="last_level")
    .set_index("userId")
)

df_users_base = df_users_base.join(df_last_level)

In [31]:
df_users_base.sample(5)

Unnamed: 0_level_0,gender,registration,operating_system,browser,num_unique_artists,count_about,count_add_friend,count_add_to_playlist,count_cancel,churn_label,...,last_time,user_lifecycle_h,ttl_length,item_per_session,frequency,avg_songs_session,thumbs_ratio,errors_per_session,ads_per_session,last_level
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1376645,M,2018-09-26 10:04:32,Windows,Chrome,12,1,2,1,0,0,...,2018-11-18 18:52:22,1280.797222,2688.36074,3.666667,0.002342,3.666667,1.0,0.0,0.666667,free
1650308,M,2018-08-11 14:47:12,Windows,Chrome,197,0,2,4,1,1,...,2018-10-02 13:33:24,1246.77,51823.65391,93.5,0.001604,108.0,0.692308,0.5,0.0,paid
1345334,M,2018-09-20 14:22:20,Macintosh,Chrome,1928,4,41,110,0,0,...,2018-11-19 23:55:41,1449.555833,810018.39535,10.277778,0.024835,91.694444,0.838542,0.222222,0.194444,paid
1133422,M,2018-08-27 19:51:26,Macintosh,Chrome,87,0,3,3,0,0,...,2018-11-08 14:00:00,1746.142778,21360.83827,17.0,0.002863,17.4,0.857143,0.0,2.0,free
1029119,F,2018-09-16 03:45:10,Windows,Firefox,1174,4,35,57,1,1,...,2018-11-02 15:41:56,1139.946111,407561.02027,29.25,0.014036,103.3125,0.836207,0.0,0.9375,paid


In [32]:
PROCESSED_DATA_DIR = Path("data/processing_checkpoint")
checkpoint_file_path = PROCESSED_DATA_DIR / "02_engineered_train.parquet"
df_users_base.to_parquet(checkpoint_file_path, index=True)