In [1]:
import pandas as pd
from pathlib import Path

In [2]:
DATA_DIR = Path("data/processing_checkpoint")
path_clean_dataset = DATA_DIR / "01_cleaned_train.parquet"

In [3]:
df_clean_churn = pd.read_parquet(path_clean_dataset)

In [4]:
df_clean_churn.sample(5)

Unnamed: 0,gender,level,userId,page,sessionId,itemInSession,length,song,artist,time,registration,metropolitan_area,region,operating_system,browser
10155033,F,paid,1891709,Thumbs Up,173701,640,0.0,No song,No artist,2018-11-12 00:04:05,2018-09-25 12:05:24,Danville,VA,Macintosh,Firefox
9772986,M,paid,1135012,NextSong,82839,97,217.41669,Steve Earle,Sugarland,2018-10-15 20:43:23,2018-09-23 18:44:05,Philadelphia-Camden-Wilmington,PA-NJ-DE-MD,Windows,Chrome
6405817,M,paid,1574547,NextSong,169072,118,251.89832,Always,Blink-182,2018-11-08 14:40:18,2018-09-14 06:53:09,La Grande,OR,Windows,Chrome
14204281,M,free,1763609,NextSong,16322,66,218.40934,Re-Hash,Gorillaz,2018-10-21 18:53:58,2018-09-10 02:56:59,Dallas-Fort Worth-Arlington,TX,Windows,Chrome
9147872,M,paid,1092320,NextSong,105945,65,169.50812,Perfect,Flyleaf,2018-10-22 16:09:22,2018-09-02 18:04:44,Greensboro-High Point,NC,Windows,Firefox


In [5]:
df_users_base = df_clean_churn[["userId", "gender", "registration", "operating_system", "browser"]].drop_duplicates(subset=["userId"])

In [6]:
len(df_users_base)

19140

In [7]:
df_users_base = df_users_base.set_index("userId")

In [8]:
df_unique_artists = (
    df_clean_churn.groupby("userId")["artist"].nunique().reset_index().rename(columns={"artist": "num_unique_artists"})
)
df_unique_artists = df_unique_artists.set_index("userId")
df_users_base = df_users_base.join(df_unique_artists)

In [9]:
df_users_base.sample(5)

Unnamed: 0_level_0,gender,registration,operating_system,browser,num_unique_artists
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1149117,F,2018-08-19 21:46:14,Windows,Chrome,355
1000214,M,2018-09-18 00:37:57,Windows,Chrome,994
1637449,M,2018-09-03 05:03:23,iPhone,Safari,255
1261899,M,2018-09-04 10:00:41,Windows,Chrome,275
1646138,F,2018-08-22 11:31:40,Macintosh,Chrome,11


In [10]:
df_page_counts = (
    df_clean_churn.groupby("userId")["page"]
    .value_counts()
    .unstack(fill_value=0)
)

df_page_counts.columns = [f"count_{col.replace(' ', '_').lower()}" for col in df_page_counts.columns]

In [11]:
df_users_base = df_users_base.join(df_page_counts)
df_users_base.sample(5)

Unnamed: 0_level_0,gender,registration,operating_system,browser,num_unique_artists,count_about,count_add_friend,count_add_to_playlist,count_cancel,count_cancellation_confirmation,...,count_logout,count_nextsong,count_roll_advert,count_save_settings,count_settings,count_submit_downgrade,count_submit_upgrade,count_thumbs_down,count_thumbs_up,count_upgrade
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1615201,F,2018-09-30 00:14:16,Windows,Chrome,1021,2,35,45,1,1,...,21,1474,1,3,9,0,0,12,76,0
1344292,M,2018-09-15 22:19:13,Linux,Firefox,214,1,2,4,0,0,...,5,226,17,0,1,0,0,3,9,1
1762547,F,2018-09-04 20:55:24,Windows,Edge,192,0,8,3,0,0,...,6,218,23,1,2,0,0,4,8,3
1901750,M,2018-08-10 04:27:49,Macintosh,Chrome,940,2,23,39,1,1,...,8,1242,9,3,10,1,1,19,78,1
1695464,M,2018-09-28 02:49:47,Windows,Chrome,782,2,18,37,0,0,...,14,1007,3,0,7,0,1,17,47,3


In [12]:
df_session_count = (
    df_clean_churn.groupby("userId")["sessionId"]
    .nunique()
    .reset_index(name="count_total_sessions")
    .set_index("userId")
)

df_users_base = df_users_base.join(df_session_count)


In [13]:
df_users_base.sample(5)

Unnamed: 0_level_0,gender,registration,operating_system,browser,num_unique_artists,count_about,count_add_friend,count_add_to_playlist,count_cancel,count_cancellation_confirmation,...,count_nextsong,count_roll_advert,count_save_settings,count_settings,count_submit_downgrade,count_submit_upgrade,count_thumbs_down,count_thumbs_up,count_upgrade,count_total_sessions
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1801448,M,2018-08-18 06:36:00,Linux,Firefox,532,2,12,27,1,1,...,668,15,1,4,0,1,7,34,3,18
1362621,M,2018-09-19 17:15:26,Windows,Firefox,4,0,0,0,1,1,...,3,0,0,0,0,0,0,0,0,1
1507873,F,2018-09-12 23:21:46,Macintosh,Safari,792,5,22,37,1,1,...,1049,1,1,9,0,0,8,50,0,14
1865406,F,2018-06-21 01:43:09,Macintosh,Safari,8,0,1,0,0,0,...,7,1,0,0,0,0,0,0,0,4
1101469,F,2018-09-23 06:13:36,Macintosh,Firefox,160,1,2,7,0,0,...,167,12,0,3,0,0,3,6,1,3
