In [1]:
import pandas as pd
from pathlib import Path
import numpy as np

In [2]:
DATA_DIR = Path("data/processing_checkpoint")
path_clean_dataset = DATA_DIR / "01_cleaned_train.parquet"

In [3]:
df_clean_churn = pd.read_parquet(path_clean_dataset)

In [4]:
df_clean_churn.sample(5)

Unnamed: 0,gender,level,userId,page,sessionId,itemInSession,length,song,artist,time,registration,metropolitan_area,region,operating_system,browser
10014708,F,paid,1051851,NextSong,161444,207,161.95873,Kansas City,Wilbert Harrison,2018-11-06 15:00:16,2018-09-02 09:18:29,Indianapolis-Carmel-Anderson,IN,Windows,Firefox
6145492,M,paid,1779066,NextSong,106971,27,230.47791,You Belong With Me,Taylor Swift,2018-10-22 17:55:20,2018-09-09 19:38:44,Corpus Christi,TX,Windows,Chrome
9745944,M,paid,1262936,NextSong,64293,141,288.39138,Nagumomo,Susheela Raman,2018-10-11 13:54:38,2018-09-03 02:50:10,Pittsburgh,PA,Macintosh,Safari
10000896,M,paid,1053583,NextSong,144536,230,242.6771,Control,Narco,2018-11-01 22:35:36,2018-08-31 19:49:28,Providence-Warwick,RI-MA,Windows,Chrome
12749538,M,paid,1346579,NextSong,92023,49,202.81424,La Despedida,Daddy Yankee,2018-11-15 03:11:42,2018-09-25 10:54:46,Syracuse,NY,Linux,Firefox


In [5]:
df_clean_churn = df_clean_churn.sort_values(by="time", ascending=True)
df_clean_churn["page"] = df_clean_churn["page"].str.strip().str.lower()

In [6]:
df_users_base = df_clean_churn[["userId", "gender", "registration", "operating_system", "browser", "metropolitan_area", "region"]].drop_duplicates(subset=["userId"])

In [7]:
len(df_users_base)

19140

In [8]:
df_users_base = df_users_base.set_index("userId")

In [9]:
df_unique_artists = (
    df_clean_churn.groupby("userId")["artist"].nunique().reset_index().rename(columns={"artist": "num_unique_artists"})
)
df_unique_artists = df_unique_artists.set_index("userId")
df_users_base = df_users_base.join(df_unique_artists)

In [10]:
df_users_base.sample(5)

Unnamed: 0_level_0,gender,registration,operating_system,browser,metropolitan_area,region,num_unique_artists
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1295025,M,2018-09-20 07:04:45,Macintosh,Safari,Houston-The Woodlands-Sugar Land,TX,239
1574213,F,2018-06-10 07:48:05,Windows,Firefox,Los Angeles-Long Beach-Anaheim,CA,65
1656005,M,2018-09-15 16:21:28,Macintosh,Chrome,Huntington,IN,254
1726109,F,2018-09-30 19:08:40,Windows,Chrome,Madera,CA,1966
1417726,M,2018-10-31 02:04:41,Windows,Firefox,Nashville-Davidson--Murfreesboro--Franklin,TN,280


In [11]:
df_page_counts = (
    df_clean_churn.groupby("userId")["page"]
    .value_counts()
    .unstack(fill_value=0)
)

df_page_counts.columns = [f"count_{col.replace(' ', '_').lower()}" for col in df_page_counts.columns]

In [12]:
df_users_base = df_users_base.join(df_page_counts)
df_users_base.sample(5)

Unnamed: 0_level_0,gender,registration,operating_system,browser,metropolitan_area,region,num_unique_artists,count_about,count_add_friend,count_add_to_playlist,...,count_logout,count_nextsong,count_roll_advert,count_save_settings,count_settings,count_submit_downgrade,count_submit_upgrade,count_thumbs_down,count_thumbs_up,count_upgrade
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1476889,F,2018-09-29 01:53:09,Windows,Chrome,St. Louis,MO-IL,1036,1,15,41,...,19,1410,32,3,4,1,2,18,71,4
1322052,F,2018-09-30 02:27:23,Macintosh,Chrome,San Francisco-Oakland-Hayward,CA,832,4,23,37,...,16,1097,29,0,8,0,1,12,105,4
1902130,M,2018-09-25 17:43:44,Linux,Firefox,London,KY,318,2,5,10,...,9,361,20,1,4,0,0,9,17,2
1551170,M,2018-08-12 12:09:56,Windows,Firefox,Grand Rapids-Wyoming,MI,1793,7,50,101,...,62,2937,150,5,19,2,3,97,149,13
1986288,M,2018-09-13 15:58:34,Macintosh,Chrome,Chicago-Naperville-Elgin,IL-IN-WI,224,0,13,7,...,8,239,8,1,1,0,1,3,15,2


In [13]:
df_session_count = (
    df_clean_churn.groupby("userId")["sessionId"]
    .nunique()
    .reset_index(name="count_total_sessions")
    .set_index("userId")
)

df_users_base = df_users_base.join(df_session_count)


In [14]:
df_users_base.sample(5)

Unnamed: 0_level_0,gender,registration,operating_system,browser,metropolitan_area,region,num_unique_artists,count_about,count_add_friend,count_add_to_playlist,...,count_nextsong,count_roll_advert,count_save_settings,count_settings,count_submit_downgrade,count_submit_upgrade,count_thumbs_down,count_thumbs_up,count_upgrade,count_total_sessions
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1577124,F,2018-09-14 14:58:12,Windows,Chrome,Washington-Arlington-Alexandria,DC-VA-MD-WV,876,0,18,30,...,1174,14,5,14,1,0,26,66,1,12
1967201,M,2018-09-23 02:47:31,Macintosh,Safari,Jonesboro,AR,676,4,18,29,...,865,5,0,6,2,1,15,52,2,17
1821834,M,2018-09-16 13:08:45,Windows,Edge,New York-Newark-Jersey City,NY-NJ-PA,617,4,16,23,...,757,24,4,6,0,1,23,32,4,18
1477130,M,2018-09-24 02:04:20,Macintosh,Chrome,Los Angeles-Long Beach-Anaheim,CA,187,1,1,6,...,206,15,0,0,0,0,8,6,3,3
1505495,M,2018-09-12 05:13:37,Windows,Firefox,Sioux Falls,SD,232,1,5,6,...,258,22,0,2,0,0,4,11,4,9


In [15]:
# Build user lifecycle

df_last_time = (
    df_clean_churn.groupby("userId")["time"]
    .max()
    .reset_index(name="last_time")
    .set_index("userId")
)

df_users_base = df_users_base.join(df_last_time)
df_users_base["user_lifecycle_h"] = df_users_base["last_time"] - df_users_base["registration"]
df_users_base["user_lifecycle_h"] = df_users_base["user_lifecycle_h"].dt.total_seconds() / 3600

df_users_base.head()

Unnamed: 0_level_0,gender,registration,operating_system,browser,metropolitan_area,region,num_unique_artists,count_about,count_add_friend,count_add_to_playlist,...,count_save_settings,count_settings,count_submit_downgrade,count_submit_upgrade,count_thumbs_down,count_thumbs_up,count_upgrade,count_total_sessions,last_time,user_lifecycle_h
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1749042,M,2018-08-08 13:22:21,Windows,Chrome,Dallas-Fort Worth-Arlington,TX,797,5,18,33,...,2,7,0,0,9,51,0,9,2018-10-21 01:16:24,1763.900833
1484921,M,2018-09-16 09:11:42,Linux,Chrome,New York-Newark-Jersey City,NY-NJ-PA,465,1,11,15,...,0,1,0,1,4,48,1,8,2018-11-15 17:33:31,1448.363611
1694515,M,2018-09-15 04:03:02,Macintosh,Chrome,Lafayette,LA,482,0,11,21,...,1,3,1,2,4,68,3,9,2018-11-15 20:43:13,1480.669722
1697168,F,2018-09-08 13:48:25,Macintosh,Firefox,Hilo,HI,1117,1,22,44,...,2,12,1,1,16,83,7,34,2018-11-19 23:59:55,1738.191667
1563081,F,2018-09-21 03:25:18,Macintosh,Chrome,San Francisco-Oakland-Hayward,CA,437,1,5,14,...,0,2,0,0,5,25,0,6,2018-11-03 02:00:13,1030.581944


In [16]:
df_length = (
    df_clean_churn.groupby("userId")["length"].sum()
)
df_length = (
    df_length
    .reset_index(name="ttl_length")
    .set_index("userId")
)
df_users_base = df_users_base.join(df_length)
df_users_base.head(5)


Unnamed: 0_level_0,gender,registration,operating_system,browser,metropolitan_area,region,num_unique_artists,count_about,count_add_friend,count_add_to_playlist,...,count_settings,count_submit_downgrade,count_submit_upgrade,count_thumbs_down,count_thumbs_up,count_upgrade,count_total_sessions,last_time,user_lifecycle_h,ttl_length
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1749042,M,2018-08-08 13:22:21,Windows,Chrome,Dallas-Fort Worth-Arlington,TX,797,5,18,33,...,7,0,0,9,51,0,9,2018-10-21 01:16:24,1763.900833,256456.28661
1484921,M,2018-09-16 09:11:42,Linux,Chrome,New York-Newark-Jersey City,NY-NJ-PA,465,1,11,15,...,1,0,1,4,48,1,8,2018-11-15 17:33:31,1448.363611,136515.28536
1694515,M,2018-09-15 04:03:02,Macintosh,Chrome,Lafayette,LA,482,0,11,21,...,3,1,2,4,68,3,9,2018-11-15 20:43:13,1480.669722,146479.95366
1697168,F,2018-09-08 13:48:25,Macintosh,Firefox,Hilo,HI,1117,1,22,44,...,12,1,1,16,83,7,34,2018-11-19 23:59:55,1738.191667,386582.82117
1563081,F,2018-09-21 03:25:18,Macintosh,Chrome,San Francisco-Oakland-Hayward,CA,437,1,5,14,...,2,0,0,5,25,0,6,2018-11-03 02:00:13,1030.581944,123645.65861


In [17]:
df_item_per_session = (
    df_clean_churn.groupby("userId")["itemInSession"].max() / df_users_base["count_total_sessions"]
)

df_item_per_session = (
    df_item_per_session
    .reset_index(name="item_per_session")
    .set_index("userId")
)

df_users_base = df_users_base.join(df_item_per_session)


In [18]:
df_users_base["frequency"] = df_users_base["count_total_sessions"] / df_users_base["user_lifecycle_h"]

In [19]:
df_users_base.sample(5)

Unnamed: 0_level_0,gender,registration,operating_system,browser,metropolitan_area,region,num_unique_artists,count_about,count_add_friend,count_add_to_playlist,...,count_submit_upgrade,count_thumbs_down,count_thumbs_up,count_upgrade,count_total_sessions,last_time,user_lifecycle_h,ttl_length,item_per_session,frequency
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1402336,F,2018-09-29 19:18:08,Windows,Firefox,Los Angeles-Long Beach-Anaheim,CA,2106,6,71,105,...,1,40,185,1,28,2018-11-19 23:57:20,1228.653333,927720.55291,32.964286,0.022789
1936839,M,2018-05-07 21:31:42,Macintosh,Safari,Duluth,MN-WI,562,2,12,12,...,1,6,36,2,11,2018-10-15 20:46:37,3863.248611,168874.6285,28.0,0.002847
1587060,M,2018-09-17 00:03:51,iPad,Safari,Detroit-Warren-Dearborn,MI,321,2,3,11,...,1,6,14,1,9,2018-11-13 01:38:31,1369.577778,91827.34656,13.777778,0.006571
1540874,M,2018-08-23 16:03:40,Macintosh,Chrome,Charlotte-Concord-Gastonia,NC-SC,293,3,3,10,...,1,2,17,3,9,2018-11-12 15:22:58,1943.321667,80399.78552,12.111111,0.004631
1945504,F,2018-08-27 06:47:01,Macintosh,Safari,Springfield,MA,368,2,8,14,...,1,4,20,6,9,2018-11-17 14:34:23,1975.789444,97544.59256,17.444444,0.004555


In [20]:
### df_users_base = df_users_base.drop(columns="user_lifecycle_h", errors="ignore")


In [21]:
"user_lifecycle_h" in df_users_base.columns

True

In [22]:
print(df_users_base["count_cancellation_confirmation"].unique())
df_users_base = df_users_base.rename(columns={"count_cancellation_confirmation":"churn_label"})

[1 0]


In [23]:
df_users_base["avg_songs_session"] = df_users_base["count_nextsong"] / df_users_base["count_total_sessions"]

In [24]:
mask = df_users_base["user_lifecycle_h"] < 24
df_users_base[mask]

Unnamed: 0_level_0,gender,registration,operating_system,browser,metropolitan_area,region,num_unique_artists,count_about,count_add_friend,count_add_to_playlist,...,count_thumbs_down,count_thumbs_up,count_upgrade,count_total_sessions,last_time,user_lifecycle_h,ttl_length,item_per_session,frequency,avg_songs_session
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1714714,F,2018-09-30 22:59:16,Macintosh,Safari,New Orleans-Metairie,LA,65,1,0,0,...,0,2,0,1,2018-10-01 09:25:13,10.4325,15332.72528,79.0,0.095854,64.0
1444644,F,2018-10-06 04:01:43,Macintosh,Safari,Baltimore-Columbia-Towson,MD,40,0,0,2,...,0,0,0,1,2018-10-07 02:00:54,21.986389,9873.95455,46.0,0.045483,39.0
1039566,M,2018-10-07 10:56:31,iPhone,Safari,Miami-Fort Lauderdale-West Palm Beach,FL,73,0,0,1,...,2,5,0,1,2018-10-07 16:32:59,5.607778,20749.91474,104.0,0.178324,75.0
1594905,M,2018-10-09 02:53:40,Windows,Chrome,San Antonio-New Braunfels,TX,304,2,2,11,...,4,15,0,1,2018-10-10 02:30:24,23.612222,84829.05169,399.0,0.042351,338.0
1501690,F,2018-10-23 18:06:07,Macintosh,Chrome,San Antonio-New Braunfels,TX,82,0,9,1,...,0,5,1,1,2018-10-24 00:43:46,6.6275,21688.30803,128.0,0.150886,84.0
1835558,M,2018-10-24 15:11:40,Windows,Firefox,Miami-Fort Lauderdale-West Palm Beach,FL,99,1,2,2,...,0,3,2,1,2018-10-24 22:20:59,7.155278,25865.30558,154.0,0.139757,102.0
1075367,F,2018-10-31 11:48:10,Macintosh,Firefox,Las Vegas-Henderson-Paradise,NV,10,0,0,0,...,1,0,0,1,2018-10-31 12:29:58,0.696667,1964.35182,25.0,1.435407,9.0
1745396,F,2018-11-07 15:38:32,Macintosh,Firefox,Houston-The Woodlands-Sugar Land,TX,24,0,0,1,...,0,1,0,1,2018-11-07 17:02:16,1.395556,5221.4753,33.0,0.716561,23.0
1649197,F,2018-11-07 15:54:59,Macintosh,Chrome,Washington-Arlington-Alexandria,DC-VA-MD-WV,83,0,0,2,...,2,8,1,1,2018-11-07 21:44:35,5.826667,21060.63695,120.0,0.171625,86.0
1868524,M,2018-11-08 08:19:58,Windows,Firefox,Washington-Arlington-Alexandria,DC-VA-MD-WV,53,0,0,1,...,1,2,0,1,2018-11-08 12:03:49,3.730833,13668.99197,76.0,0.268037,52.0


In [25]:
df_users_base.loc[[1714714]]

Unnamed: 0_level_0,gender,registration,operating_system,browser,metropolitan_area,region,num_unique_artists,count_about,count_add_friend,count_add_to_playlist,...,count_thumbs_down,count_thumbs_up,count_upgrade,count_total_sessions,last_time,user_lifecycle_h,ttl_length,item_per_session,frequency,avg_songs_session
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1714714,F,2018-09-30 22:59:16,Macintosh,Safari,New Orleans-Metairie,LA,65,1,0,0,...,0,2,0,1,2018-10-01 09:25:13,10.4325,15332.72528,79.0,0.095854,64.0


In [26]:
df_users_base["thumbs_ratio"] = df_users_base["count_thumbs_up"] / (df_users_base["count_thumbs_down"] + df_users_base["count_thumbs_up"])
df_users_base["thumbs_ratio"] = df_users_base["thumbs_ratio"].fillna(0)
df_users_base["thumbs_ratio"] = df_users_base["thumbs_ratio"].replace(np.inf, 0)
df_users_base["errors_per_session"] = df_users_base["count_error"] / df_users_base["count_total_sessions"]
df_users_base["ads_per_session"] = df_users_base["count_roll_advert"] / df_users_base["count_total_sessions"]

In [27]:
mask = df_users_base["thumbs_ratio"] ==0
df_users_base[mask]

Unnamed: 0_level_0,gender,registration,operating_system,browser,metropolitan_area,region,num_unique_artists,count_about,count_add_friend,count_add_to_playlist,...,count_total_sessions,last_time,user_lifecycle_h,ttl_length,item_per_session,frequency,avg_songs_session,thumbs_ratio,errors_per_session,ads_per_session
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1154025,M,2018-09-01 01:12:44,Macintosh,Safari,Washington-Arlington-Alexandria,DC-VA-MD-WV,22,0,0,0,...,2,2018-10-01 20:44:18,739.526111,4850.53741,26.5,0.002704,10.5,0.0,0.0,1.5
1534194,M,2018-09-29 21:50:50,Windows,Firefox,Miami-Fort Lauderdale-West Palm Beach,FL,3,0,0,0,...,1,2018-10-01 00:06:42,26.264444,598.02032,15.0,0.038074,2.0,0.0,0.0,1.0
1542469,M,2018-07-24 04:08:58,Windows,Chrome,Augusta-Richmond County,GA-SC,21,0,0,2,...,1,2018-10-01 01:29:42,1653.345556,5558.01170,93.0,0.000605,21.0,0.0,0.0,5.0
1641130,M,2018-08-29 07:20:08,iPhone,Safari,Houston-The Woodlands-Sugar Land,TX,6,0,0,0,...,1,2018-10-01 00:21:26,785.021667,1184.10223,187.0,0.001274,5.0,0.0,0.0,0.0
1883395,F,2018-09-22 19:38:07,Macintosh,Safari,Hartford-West Hartford-East Hartford,CT,33,1,0,2,...,1,2018-10-01 03:42:16,200.069167,8455.53488,65.0,0.004998,32.0,0.0,0.0,9.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1597068,M,2018-09-11 07:33:23,Windows,Firefox,Seattle-Tacoma-Bellevue,WA,9,0,0,0,...,1,2018-11-19 15:54:31,1664.352222,1899.93434,9.0,0.000601,8.0,0.0,0.0,1.0
1776870,F,2018-09-07 16:59:36,Macintosh,Safari,Montgomery,AL,9,0,0,0,...,1,2018-11-19 18:53:25,1753.896944,1567.16045,14.0,0.000570,8.0,0.0,0.0,1.0
1494594,M,2018-08-18 18:07:54,iPhone,Safari,New York-Newark-Jersey City,NY-NJ-PA,12,0,0,0,...,1,2018-11-19 19:52:00,2233.735000,3047.43992,18.0,0.000448,11.0,0.0,0.0,1.0
1413070,F,2018-09-24 16:01:54,iPhone,Safari,Mobile,AL,7,0,0,0,...,1,2018-11-19 21:01:06,1348.986667,1462.67157,6.0,0.000741,6.0,0.0,0.0,0.0


In [28]:
df_users_base.columns

Index(['gender', 'registration', 'operating_system', 'browser',
       'metropolitan_area', 'region', 'num_unique_artists', 'count_about',
       'count_add_friend', 'count_add_to_playlist', 'count_cancel',
       'churn_label', 'count_downgrade', 'count_error', 'count_help',
       'count_home', 'count_logout', 'count_nextsong', 'count_roll_advert',
       'count_save_settings', 'count_settings', 'count_submit_downgrade',
       'count_submit_upgrade', 'count_thumbs_down', 'count_thumbs_up',
       'count_upgrade', 'count_total_sessions', 'last_time',
       'user_lifecycle_h', 'ttl_length', 'item_per_session', 'frequency',
       'avg_songs_session', 'thumbs_ratio', 'errors_per_session',
       'ads_per_session'],
      dtype='object')

In [29]:
mask = df_users_base["count_submit_upgrade"] > 3
df_users_base[mask][["count_submit_upgrade", "count_submit_downgrade", "churn_label"]]

Unnamed: 0_level_0,count_submit_upgrade,count_submit_downgrade,churn_label
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1240184,4,3,1
1295776,4,3,1
1839740,4,4,0
1747195,6,5,0
1766016,5,4,0
1504480,4,3,1
1882951,4,3,1
1255222,5,4,0
1646275,4,3,1
1608306,4,3,0


In [30]:
df_last_level = (
    df_clean_churn.groupby("userId")["level"]
    .last()
    .reset_index(name="last_level")
    .set_index("userId")
)

df_users_base = df_users_base.join(df_last_level)

In [31]:
df_users_base.sample(5)

Unnamed: 0_level_0,gender,registration,operating_system,browser,metropolitan_area,region,num_unique_artists,count_about,count_add_friend,count_add_to_playlist,...,last_time,user_lifecycle_h,ttl_length,item_per_session,frequency,avg_songs_session,thumbs_ratio,errors_per_session,ads_per_session,last_level
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1769620,M,2018-09-04 11:15:48,iPhone,Safari,Kansas City,MO-KS,25,0,0,0,...,2018-10-23 06:43:47,1171.466389,6669.44219,27.0,0.000854,24.0,1.0,0.0,1.0,free
1242455,F,2018-09-06 06:54:34,Macintosh,Chrome,Boulder,CO,746,1,12,25,...,2018-11-19 23:59:56,1793.089444,241996.73303,15.5,0.007808,68.142857,0.783333,0.0,0.857143,paid
1165904,M,2018-09-01 17:10:57,Windows,Chrome,Allentown-Bethlehem-Easton,PA-NJ,104,0,0,3,...,2018-11-15 17:16:53,1800.098889,29421.586,31.666667,0.001667,36.666667,0.875,0.333333,2.666667,free
1978986,F,2018-07-16 04:06:49,Macintosh,Safari,Des Moines-West Des Moines,IA,1336,3,35,69,...,2018-10-22 20:29:57,2368.385556,510467.03078,16.73913,0.009711,87.608696,0.784173,0.0,0.478261,paid
1426268,M,2018-06-08 01:13:45,Macintosh,Chrome,Dallas-Fort Worth-Arlington,TX,448,0,14,16,...,2018-11-14 08:12:36,3822.980833,138998.47604,24.222222,0.002354,61.333333,0.888889,0.111111,0.333333,paid


In [32]:
location_change_counts = df_clean_churn.groupby("userId")["region"].nunique()
users_with_location_changes = location_change_counts[location_change_counts > 1]
print(f"Number of users who changed their location: {len(users_with_location_changes)}")

Number of users who changed their location: 0


In [33]:
location_change_counts = df_clean_churn.groupby("userId")["metropolitan_area"].nunique()
users_with_location_changes = location_change_counts[location_change_counts > 1]
print(f"Number of users who changed their location: {len(users_with_location_changes)}")

Number of users who changed their location: 0


In [34]:
mask = df_users_base["thumbs_ratio"].isna()
print(df_users_base.loc[mask, ["count_thumbs_up", "count_thumbs_down"]])

Empty DataFrame
Columns: [count_thumbs_up, count_thumbs_down]
Index: []


In [35]:
PROCESSED_DATA_DIR = Path("data/processing_checkpoint")
checkpoint_file_path = PROCESSED_DATA_DIR / "02_engineered_train.parquet"
df_users_base.to_parquet(checkpoint_file_path, index=True)