In [2]:
import pandas as pd
from pathlib import Path

In [3]:
DATA_DIR = Path("data/processing_checkpoint")
path_clean_dataset = DATA_DIR / "01_cleaned_train.parquet"

In [4]:
df_clean_churn = pd.read_parquet(path_clean_dataset)

In [5]:
df_clean_churn.sample(5)

Unnamed: 0,gender,level,userId,page,sessionId,itemInSession,length,song,artist,time,registration,metropolitan_area,region,operating_system,browser
7080513,F,paid,1019424,NextSong,100306,341,251.92444,Dark Blue (Album Version),Jack's Mannequin,2018-10-20 02:41:49,2018-09-21 14:30:40,Washington-Arlington-Alexandria,DC-VA-MD-WV,Macintosh,Chrome
9465385,F,paid,1607133,NextSong,172482,12,360.51546,Crumpshit,Philippe Rochard,2018-11-10 18:45:42,2018-09-29 11:37:34,Cincinnati,OH-KY-IN,Windows,Chrome
12665467,M,paid,1860475,NextSong,125921,338,206.00118,Opus 23,Dustin O'Halloran,2018-10-27 04:43:36,2018-09-14 06:38:21,New Orleans-Metairie,LA,Macintosh,Safari
4909286,M,paid,1774796,NextSong,186564,621,201.56036,Uno [Live],Muse,2018-11-17 13:41:31,2018-08-31 03:10:53,Texarkana,TX-AR,Windows,Firefox
9830770,M,paid,1768352,Thumbs Down,167922,139,0.0,No song,No artist,2018-11-08 03:16:55,2018-08-13 04:08:31,Houston-The Woodlands-Sugar Land,TX,Linux,Chrome


In [6]:
df_users_base = df_clean_churn[["userId", "gender", "registration", "operating_system", "browser"]].drop_duplicates(subset=["userId"])

In [7]:
len(df_users_base)

19140

In [8]:
df_users_base = df_users_base.set_index("userId")

In [9]:
df_unique_artists = (
    df_clean_churn.groupby("userId")["artist"].nunique().reset_index().rename(columns={"artist": "num_unique_artists"})
)
df_unique_artists = df_unique_artists.set_index("userId")
df_users_base = df_users_base.join(df_unique_artists)

In [10]:
df_users_base.sample(5)

Unnamed: 0_level_0,gender,registration,operating_system,browser,num_unique_artists
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1993389,F,2018-08-09 15:48:55,Macintosh,Safari,1009
1228246,F,2018-09-24 22:27:38,Macintosh,Firefox,726
1700187,M,2018-09-11 15:24:25,Windows,Firefox,344
1887336,F,2018-07-19 04:59:56,Windows,Firefox,334
1929416,F,2018-06-14 17:47:33,Macintosh,Chrome,24


In [11]:
df_page_counts = (
    df_clean_churn.groupby("userId")["page"]
    .value_counts()
    .unstack(fill_value=0)
)

df_page_counts.columns = [f"count_{col.replace(' ', '_').lower()}" for col in df_page_counts.columns]

In [12]:
df_users_base = df_users_base.join(df_page_counts)
df_users_base.sample(5)

Unnamed: 0_level_0,gender,registration,operating_system,browser,num_unique_artists,count_about,count_add_friend,count_add_to_playlist,count_cancel,count_cancellation_confirmation,...,count_logout,count_nextsong,count_roll_advert,count_save_settings,count_settings,count_submit_downgrade,count_submit_upgrade,count_thumbs_down,count_thumbs_up,count_upgrade
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1401040,M,2018-08-29 08:19:07,Windows,Chrome,704,2,12,14,1,1,...,16,910,19,2,4,1,1,17,48,3
1038329,M,2018-09-22 21:40:08,Windows,Firefox,1069,5,39,42,1,1,...,21,1530,3,2,8,0,1,14,79,2
1192394,M,2018-09-27 16:33:32,Windows,Chrome,1782,5,54,94,0,0,...,31,2883,5,6,22,0,0,31,294,0
1321590,F,2018-09-06 16:34:57,Windows,Chrome,466,0,9,19,1,1,...,3,550,1,1,5,0,0,5,23,0
1312550,M,2018-09-17 16:52:24,Macintosh,Chrome,250,1,12,11,1,1,...,3,286,22,1,1,0,0,17,27,1


In [13]:
df_session_count = (
    df_clean_churn.groupby("userId")["sessionId"]
    .nunique()
    .reset_index(name="count_total_sessions")
    .set_index("userId")
)

df_users_base = df_users_base.join(df_session_count)


In [14]:
df_users_base.sample(5)

Unnamed: 0_level_0,gender,registration,operating_system,browser,num_unique_artists,count_about,count_add_friend,count_add_to_playlist,count_cancel,count_cancellation_confirmation,...,count_nextsong,count_roll_advert,count_save_settings,count_settings,count_submit_downgrade,count_submit_upgrade,count_thumbs_down,count_thumbs_up,count_upgrade,count_total_sessions
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1397036,F,2018-09-02 01:32:14,Macintosh,Firefox,266,0,2,5,1,1,...,282,8,0,1,0,1,7,13,2,8
1152545,M,2018-08-27 02:03:00,Macintosh,Safari,1514,2,33,67,1,1,...,2374,17,3,17,1,2,27,125,3,24
1490051,F,2018-08-06 14:17:44,Windows,Chrome,38,0,0,0,0,0,...,37,5,0,0,0,0,0,1,0,1
1135429,M,2018-08-31 01:48:04,Windows,Chrome,215,0,4,10,0,0,...,234,0,1,3,0,0,5,20,0,7
1787163,F,2018-09-22 01:51:42,Linux,Firefox,473,4,14,12,0,0,...,557,0,1,3,0,0,2,24,0,6


In [37]:
# Build user lifecycle
df_user_lifecycle = (
    df_clean_churn.groupby("userId")["time"].max()
    - df_clean_churn.groupby("userId")["registration"].min()
)

df_user_lifecycle = (
    df_user_lifecycle
    .reset_index(name="user_lifecycle")
    .set_index("userId")
)

# Join
df_users_base = df_users_base.join(df_user_lifecycle)
df_users_base["user_lifecycle"] = df_users_base["user_lifecycle"].dt.days

df_users_base.head()

Unnamed: 0_level_0,gender,registration,operating_system,browser,num_unique_artists,count_about,count_add_friend,count_add_to_playlist,count_cancel,count_cancellation_confirmation,...,count_roll_advert,count_save_settings,count_settings,count_submit_downgrade,count_submit_upgrade,count_thumbs_down,count_thumbs_up,count_upgrade,count_total_sessions,user_lifecycle
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1749042,M,2018-08-08 13:22:21,Windows,Chrome,797,5,18,33,1,1,...,0,2,7,0,0,9,51,0,9,73
1563081,F,2018-09-21 03:25:18,Macintosh,Chrome,437,1,5,14,0,0,...,0,0,2,0,0,5,25,0,6,42
1697168,F,2018-09-08 13:48:25,Macintosh,Firefox,1117,1,22,44,0,0,...,61,2,12,1,1,16,83,7,34,72
1222580,M,2018-08-16 02:31:00,Macintosh,Safari,1252,8,35,50,1,1,...,17,1,11,2,2,17,85,5,20,75
1714398,F,2018-09-19 18:23:35,Windows,Chrome,1102,0,22,51,0,0,...,5,2,9,0,0,19,79,0,14,60


In [38]:
df_length = (
    df_clean_churn.groupby("userId")["length"].sum()
)
df_length = (
    df_length
    .reset_index(name="ttl_length")
    .set_index("userId")
)
df_users_base = df_users_base.join(df_length)
df_users_base.head(5)


Unnamed: 0_level_0,gender,registration,operating_system,browser,num_unique_artists,count_about,count_add_friend,count_add_to_playlist,count_cancel,count_cancellation_confirmation,...,count_save_settings,count_settings,count_submit_downgrade,count_submit_upgrade,count_thumbs_down,count_thumbs_up,count_upgrade,count_total_sessions,user_lifecycle,ttl_length
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1749042,M,2018-08-08 13:22:21,Windows,Chrome,797,5,18,33,1,1,...,2,7,0,0,9,51,0,9,73,256456.28661
1563081,F,2018-09-21 03:25:18,Macintosh,Chrome,437,1,5,14,0,0,...,0,2,0,0,5,25,0,6,42,123645.65861
1697168,F,2018-09-08 13:48:25,Macintosh,Firefox,1117,1,22,44,0,0,...,2,12,1,1,16,83,7,34,72,386582.82117
1222580,M,2018-08-16 02:31:00,Macintosh,Safari,1252,8,35,50,1,1,...,1,11,2,2,17,85,5,20,75,452234.08024
1714398,F,2018-09-19 18:23:35,Windows,Chrome,1102,0,22,51,0,0,...,2,9,0,0,19,79,0,14,60,384934.72168


In [None]:
df_item_per_session = (
    df_clean_churn.groupby("userId")["itemInSession"].max() / df_users_base["count_total_sessions"]
)

df_item_per_session = (
    df_item_per_session
    .reset_index(name="item_per_session")
    .set_index("userId")
)

df_users_base = df_users_base.join(df_item_per_session)


In [41]:
df_users_base["frequency"] = df_users_base["count_total_sessions"] / df_users_base["user_lifecycle"]

In [42]:
df_users_base.sample(5)

Unnamed: 0_level_0,gender,registration,operating_system,browser,num_unique_artists,count_about,count_add_friend,count_add_to_playlist,count_cancel,count_cancellation_confirmation,...,count_submit_downgrade,count_submit_upgrade,count_thumbs_down,count_thumbs_up,count_upgrade,count_total_sessions,user_lifecycle,ttl_length,item_per_session,frequency
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1496207,M,2018-09-10 15:17:31,Windows,Chrome,180,1,3,9,1,1,...,0,0,7,10,0,6,48,48292.84778,22.0,0.125
1939313,M,2018-09-09 04:38:37,Macintosh,Firefox,1984,11,56,104,1,1,...,1,2,33,168,2,34,63,812498.66869,10.911765,0.539683
1617154,F,2018-08-09 01:56:28,Windows,Chrome,193,3,2,4,1,1,...,0,1,3,15,2,5,67,51724.07777,28.0,0.074627
1803737,F,2018-08-22 01:17:52,Macintosh,Chrome,954,2,18,43,0,0,...,0,1,18,131,1,19,87,329114.67337,14.736842,0.218391
1001027,M,2018-08-17 04:04:29,Windows,Chrome,1224,3,29,54,0,0,...,2,3,26,102,12,25,94,431174.59752,38.08,0.265957


In [None]:
### df_users_base = df_users_base.drop(columns="user_lifecycle", errors="ignore")


In [27]:
"user_lifecycle" in df_users_base.columns

True

In [22]:
print(df_users_base["count_cancellation_confirmation"].unique())
df_users_base.rename(columns={"count_cancellation_confirmation":"churn_label"})

[1 0]


Unnamed: 0_level_0,gender,registration,operating_system,browser,num_unique_artists,count_about,count_add_friend,count_add_to_playlist,count_cancel,churn_label,...,count_nextsong,count_roll_advert,count_save_settings,count_settings,count_submit_downgrade,count_submit_upgrade,count_thumbs_down,count_thumbs_up,count_upgrade,count_total_sessions
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1749042,M,2018-08-08 13:22:21,Windows,Chrome,797,5,18,33,1,1,...,1024,0,2,7,0,0,9,51,0,9
1563081,F,2018-09-21 03:25:18,Macintosh,Chrome,437,1,5,14,0,0,...,497,0,0,2,0,0,5,25,0,6
1697168,F,2018-09-08 13:48:25,Macintosh,Firefox,1117,1,22,44,0,0,...,1585,61,2,12,1,1,16,83,7,34
1222580,M,2018-08-16 02:31:00,Macintosh,Safari,1252,8,35,50,1,1,...,1793,17,1,11,2,2,17,85,5,20
1714398,F,2018-09-19 18:23:35,Windows,Chrome,1102,0,22,51,0,0,...,1537,5,2,9,0,0,19,79,0,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1494594,M,2018-08-18 18:07:54,iPhone,Safari,12,0,0,0,0,0,...,11,1,0,0,0,0,0,0,0,1
1036641,F,2018-07-09 12:04:12,Windows,Chrome,71,0,3,1,0,0,...,72,4,0,4,0,0,0,12,3,1
1110980,F,2018-08-31 14:17:48,Windows,Chrome,51,0,0,4,0,0,...,55,0,0,0,0,0,0,5,0,1
1594272,M,2018-09-26 21:40:35,Macintosh,Safari,55,0,0,2,0,0,...,56,0,0,4,0,0,0,6,0,1
