In [1]:
import pandas as pd
from pathlib import Path
import numpy as np

In [2]:
DATA_DIR = Path("data/processing_checkpoint")
path_clean_dataset = DATA_DIR / "01_cleaned_train.parquet"

In [3]:
df_clean_churn = pd.read_parquet(path_clean_dataset)

In [4]:
df_clean_churn.sample(5)

Unnamed: 0,gender,level,userId,page,sessionId,itemInSession,length,song,artist,time,registration,metropolitan_area,region,operating_system,browser
4937406,M,paid,1235208,NextSong,77264,6,333.45261,Saved,Devendra Banhart,2018-10-14 23:16:16,2018-08-13 22:23:38,Detroit-Warren-Dearborn,MI,Macintosh,Chrome
16390336,F,paid,1746527,Home,16785,0,0.0,No song,No artist,2018-10-19 19:11:41,2018-06-13 07:25:18,Las Vegas-Henderson-Paradise,NV,Windows,Edge
8351786,M,free,1605220,NextSong,100973,22,440.63302,Calvados Chopper,State Radio,2018-10-23 10:14:02,2018-08-12 02:16:23,San Antonio-New Braunfels,TX,Windows,Firefox
15799620,F,paid,1355350,NextSong,13697,38,246.83057,Fisheye,Apocalyptica,2018-10-16 03:13:41,2018-09-14 00:53:28,Cortland,NY,Windows,Firefox
11722454,M,paid,1008027,NextSong,167386,70,255.16363,Muito Pouco,Maria Rita,2018-11-11 14:15:46,2018-09-24 10:03:26,Ames,IA,Windows,Chrome


In [5]:
df_clean_churn = df_clean_churn.sort_values(by="time", ascending=True)
df_clean_churn["page"] = df_clean_churn["page"].str.strip().str.lower()

In [6]:
df_users_base = df_clean_churn[["userId", "gender", "registration", "operating_system", "browser", "metropolitan_area", "region"]].drop_duplicates(subset=["userId"])

In [7]:
len(df_users_base)

19140

In [8]:
df_users_base = df_users_base.set_index("userId")

In [9]:
df_unique_artists = (
    df_clean_churn.groupby("userId")["artist"].nunique().reset_index().rename(columns={"artist": "num_unique_artists"})
)
df_unique_artists = df_unique_artists.set_index("userId")
df_users_base = df_users_base.join(df_unique_artists)

In [10]:
df_users_base.sample(5)

Unnamed: 0_level_0,gender,registration,operating_system,browser,metropolitan_area,region,num_unique_artists
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1875874,F,2018-08-30 19:33:49,Linux,Chrome,Rockford,IL,81
1859992,F,2018-06-10 08:15:54,Macintosh,Chrome,Salt Lake City,UT,84
1205229,F,2018-09-17 18:42:42,Macintosh,Safari,El Paso,TX,111
1957044,M,2018-09-14 13:26:54,Macintosh,Safari,Dallas-Fort Worth-Arlington,TX,1573
1278517,M,2018-09-11 19:51:56,Macintosh,Chrome,Macon,GA,1430


In [11]:
df_page_counts = (
    df_clean_churn.groupby("userId")["page"]
    .value_counts()
    .unstack(fill_value=0)
)

df_page_counts.columns = [f"count_{col.replace(' ', '_').lower()}" for col in df_page_counts.columns]

In [12]:
df_users_base = df_users_base.join(df_page_counts)
df_users_base.sample(5)

Unnamed: 0_level_0,gender,registration,operating_system,browser,metropolitan_area,region,num_unique_artists,count_about,count_add_friend,count_add_to_playlist,...,count_logout,count_nextsong,count_roll_advert,count_save_settings,count_settings,count_submit_downgrade,count_submit_upgrade,count_thumbs_down,count_thumbs_up,count_upgrade
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1697983,M,2018-09-28 23:18:48,Windows,Chrome,Chicago-Naperville-Elgin,IL-IN-WI,154,0,0,2,...,3,163,13,3,5,0,0,4,12,3
1246521,M,2018-08-27 12:18:38,Macintosh,Safari,Andrews,TX,108,1,0,3,...,2,110,9,0,0,0,0,0,4,1
1703830,M,2018-07-19 17:54:29,Windows,Chrome,Los Angeles-Long Beach-Anaheim,CA,342,0,8,13,...,9,378,5,2,3,1,1,1,40,1
1076804,F,2018-09-04 16:04:12,iPhone,Safari,Minneapolis-St. Paul-Bloomington,MN-WI,177,0,1,3,...,6,188,14,0,0,0,0,0,12,5
1254491,F,2018-09-01 06:45:48,Windows,Chrome,Miami-Fort Lauderdale-West Palm Beach,FL,1416,5,32,55,...,21,2129,12,8,23,0,1,18,104,1


In [13]:
df_session_count = (
    df_clean_churn.groupby("userId")["sessionId"]
    .nunique()
    .reset_index(name="count_total_sessions")
    .set_index("userId")
)

df_users_base = df_users_base.join(df_session_count)


In [14]:
df_users_base.sample(5)

Unnamed: 0_level_0,gender,registration,operating_system,browser,metropolitan_area,region,num_unique_artists,count_about,count_add_friend,count_add_to_playlist,...,count_nextsong,count_roll_advert,count_save_settings,count_settings,count_submit_downgrade,count_submit_upgrade,count_thumbs_down,count_thumbs_up,count_upgrade,count_total_sessions
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1359838,F,2018-08-15 19:06:38,Macintosh,Chrome,Cincinnati,OH-KY-IN,40,0,0,1,...,39,4,0,0,0,0,0,0,0,2
1378665,F,2018-08-24 23:18:53,Windows,Chrome,Washington,NC,667,2,10,30,...,853,10,0,1,0,1,4,46,1,6
1048011,M,2018-09-28 02:28:22,Linux,Firefox,Dallas-Fort Worth-Arlington,TX,180,1,2,5,...,196,0,0,2,0,0,1,15,0,1
1123874,F,2018-09-07 23:09:13,Windows,Edge,Washington-Arlington-Alexandria,DC-VA-MD-WV,312,0,5,11,...,366,30,0,2,0,0,3,22,2,9
1035633,F,2018-08-05 10:24:26,Macintosh,Safari,Sacramento--Roseville--Arden-Arcade,CA,447,0,7,19,...,526,1,1,4,0,0,6,51,0,10


In [15]:
# Build user lifecycle

df_last_time = (
    df_clean_churn.groupby("userId")["time"]
    .max()
    .reset_index(name="last_time")
    .set_index("userId")
)

df_users_base = df_users_base.join(df_last_time)
df_users_base["user_lifecycle_h"] = df_users_base["last_time"] - df_users_base["registration"]
df_users_base["user_lifecycle_h"] = df_users_base["user_lifecycle_h"].dt.total_seconds() / 3600

df_users_base.head()

Unnamed: 0_level_0,gender,registration,operating_system,browser,metropolitan_area,region,num_unique_artists,count_about,count_add_friend,count_add_to_playlist,...,count_save_settings,count_settings,count_submit_downgrade,count_submit_upgrade,count_thumbs_down,count_thumbs_up,count_upgrade,count_total_sessions,last_time,user_lifecycle_h
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1749042,M,2018-08-08 13:22:21,Windows,Chrome,Dallas-Fort Worth-Arlington,TX,797,5,18,33,...,2,7,0,0,9,51,0,9,2018-10-21 01:16:24,1763.900833
1484921,M,2018-09-16 09:11:42,Linux,Chrome,New York-Newark-Jersey City,NY-NJ-PA,465,1,11,15,...,0,1,0,1,4,48,1,8,2018-11-15 17:33:31,1448.363611
1694515,M,2018-09-15 04:03:02,Macintosh,Chrome,Lafayette,LA,482,0,11,21,...,1,3,1,2,4,68,3,9,2018-11-15 20:43:13,1480.669722
1697168,F,2018-09-08 13:48:25,Macintosh,Firefox,Hilo,HI,1117,1,22,44,...,2,12,1,1,16,83,7,34,2018-11-19 23:59:55,1738.191667
1563081,F,2018-09-21 03:25:18,Macintosh,Chrome,San Francisco-Oakland-Hayward,CA,437,1,5,14,...,0,2,0,0,5,25,0,6,2018-11-03 02:00:13,1030.581944


In [16]:
df_length = (
    df_clean_churn.groupby("userId")["length"].sum()
)
df_length = (
    df_length
    .reset_index(name="ttl_length")
    .set_index("userId")
)
df_users_base = df_users_base.join(df_length)
df_users_base.head(5)


Unnamed: 0_level_0,gender,registration,operating_system,browser,metropolitan_area,region,num_unique_artists,count_about,count_add_friend,count_add_to_playlist,...,count_settings,count_submit_downgrade,count_submit_upgrade,count_thumbs_down,count_thumbs_up,count_upgrade,count_total_sessions,last_time,user_lifecycle_h,ttl_length
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1749042,M,2018-08-08 13:22:21,Windows,Chrome,Dallas-Fort Worth-Arlington,TX,797,5,18,33,...,7,0,0,9,51,0,9,2018-10-21 01:16:24,1763.900833,256456.28661
1484921,M,2018-09-16 09:11:42,Linux,Chrome,New York-Newark-Jersey City,NY-NJ-PA,465,1,11,15,...,1,0,1,4,48,1,8,2018-11-15 17:33:31,1448.363611,136515.28536
1694515,M,2018-09-15 04:03:02,Macintosh,Chrome,Lafayette,LA,482,0,11,21,...,3,1,2,4,68,3,9,2018-11-15 20:43:13,1480.669722,146479.95366
1697168,F,2018-09-08 13:48:25,Macintosh,Firefox,Hilo,HI,1117,1,22,44,...,12,1,1,16,83,7,34,2018-11-19 23:59:55,1738.191667,386582.82117
1563081,F,2018-09-21 03:25:18,Macintosh,Chrome,San Francisco-Oakland-Hayward,CA,437,1,5,14,...,2,0,0,5,25,0,6,2018-11-03 02:00:13,1030.581944,123645.65861


In [17]:
df_item_per_session = (
    df_clean_churn.groupby("userId")["itemInSession"].max() / df_users_base["count_total_sessions"]
)

df_item_per_session = (
    df_item_per_session
    .reset_index(name="item_per_session")
    .set_index("userId")
)

df_users_base = df_users_base.join(df_item_per_session)


In [18]:
df_users_base["frequency"] = df_users_base["count_total_sessions"] / df_users_base["user_lifecycle_h"]

In [19]:
df_users_base.sample(5)

Unnamed: 0_level_0,gender,registration,operating_system,browser,metropolitan_area,region,num_unique_artists,count_about,count_add_friend,count_add_to_playlist,...,count_submit_upgrade,count_thumbs_down,count_thumbs_up,count_upgrade,count_total_sessions,last_time,user_lifecycle_h,ttl_length,item_per_session,frequency
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1509721,F,2018-09-13 08:32:12,Windows,Firefox,Baltimore-Columbia-Towson,MD,588,2,7,15,...,1,10,38,1,8,2018-10-10 05:40:38,645.140556,182224.95675,59.625,0.0124
1918902,M,2018-05-10 09:13:10,Windows,Firefox,Miami-Fort Lauderdale-West Palm Beach,FL,607,2,11,23,...,0,3,44,0,7,2018-11-13 08:39:57,4487.446389,191511.12147,28.571429,0.00156
1693397,F,2018-09-16 13:55:34,Windows,Chrome,Youngstown-Warren-Boardman,OH-PA,120,1,0,4,...,1,5,0,2,4,2018-11-19 10:15:36,1532.333889,32603.73624,30.0,0.00261
1093129,M,2018-09-30 17:33:00,Macintosh,Chrome,Chicago-Naperville-Elgin,IL-IN-WI,136,1,0,3,...,0,1,5,2,3,2018-11-01 20:51:37,771.310278,35037.97553,31.333333,0.003889
1958604,F,2018-09-28 09:39:33,Windows,Chrome,Charleston-North Charleston,SC,82,0,6,2,...,0,0,5,1,4,2018-10-29 20:17:03,754.625,19756.34212,13.25,0.005301


In [20]:
### df_users_base = df_users_base.drop(columns="user_lifecycle_h", errors="ignore")


In [21]:
"user_lifecycle_h" in df_users_base.columns

True

In [22]:
print(df_users_base["count_cancellation_confirmation"].unique())
df_users_base = df_users_base.rename(columns={"count_cancellation_confirmation":"churn_label"})

[1 0]


In [23]:
df_users_base["avg_songs_session"] = df_users_base["count_nextsong"] / df_users_base["count_total_sessions"]

In [24]:
mask = df_users_base["user_lifecycle_h"] < 24
df_users_base[mask]

Unnamed: 0_level_0,gender,registration,operating_system,browser,metropolitan_area,region,num_unique_artists,count_about,count_add_friend,count_add_to_playlist,...,count_thumbs_down,count_thumbs_up,count_upgrade,count_total_sessions,last_time,user_lifecycle_h,ttl_length,item_per_session,frequency,avg_songs_session
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1714714,F,2018-09-30 22:59:16,Macintosh,Safari,New Orleans-Metairie,LA,65,1,0,0,...,0,2,0,1,2018-10-01 09:25:13,10.4325,15332.72528,79.0,0.095854,64.0
1444644,F,2018-10-06 04:01:43,Macintosh,Safari,Baltimore-Columbia-Towson,MD,40,0,0,2,...,0,0,0,1,2018-10-07 02:00:54,21.986389,9873.95455,46.0,0.045483,39.0
1039566,M,2018-10-07 10:56:31,iPhone,Safari,Miami-Fort Lauderdale-West Palm Beach,FL,73,0,0,1,...,2,5,0,1,2018-10-07 16:32:59,5.607778,20749.91474,104.0,0.178324,75.0
1594905,M,2018-10-09 02:53:40,Windows,Chrome,San Antonio-New Braunfels,TX,304,2,2,11,...,4,15,0,1,2018-10-10 02:30:24,23.612222,84829.05169,399.0,0.042351,338.0
1501690,F,2018-10-23 18:06:07,Macintosh,Chrome,San Antonio-New Braunfels,TX,82,0,9,1,...,0,5,1,1,2018-10-24 00:43:46,6.6275,21688.30803,128.0,0.150886,84.0
1835558,M,2018-10-24 15:11:40,Windows,Firefox,Miami-Fort Lauderdale-West Palm Beach,FL,99,1,2,2,...,0,3,2,1,2018-10-24 22:20:59,7.155278,25865.30558,154.0,0.139757,102.0
1075367,F,2018-10-31 11:48:10,Macintosh,Firefox,Las Vegas-Henderson-Paradise,NV,10,0,0,0,...,1,0,0,1,2018-10-31 12:29:58,0.696667,1964.35182,25.0,1.435407,9.0
1745396,F,2018-11-07 15:38:32,Macintosh,Firefox,Houston-The Woodlands-Sugar Land,TX,24,0,0,1,...,0,1,0,1,2018-11-07 17:02:16,1.395556,5221.4753,33.0,0.716561,23.0
1649197,F,2018-11-07 15:54:59,Macintosh,Chrome,Washington-Arlington-Alexandria,DC-VA-MD-WV,83,0,0,2,...,2,8,1,1,2018-11-07 21:44:35,5.826667,21060.63695,120.0,0.171625,86.0
1868524,M,2018-11-08 08:19:58,Windows,Firefox,Washington-Arlington-Alexandria,DC-VA-MD-WV,53,0,0,1,...,1,2,0,1,2018-11-08 12:03:49,3.730833,13668.99197,76.0,0.268037,52.0


In [25]:
df_users_base.loc[[1714714]]

Unnamed: 0_level_0,gender,registration,operating_system,browser,metropolitan_area,region,num_unique_artists,count_about,count_add_friend,count_add_to_playlist,...,count_thumbs_down,count_thumbs_up,count_upgrade,count_total_sessions,last_time,user_lifecycle_h,ttl_length,item_per_session,frequency,avg_songs_session
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1714714,F,2018-09-30 22:59:16,Macintosh,Safari,New Orleans-Metairie,LA,65,1,0,0,...,0,2,0,1,2018-10-01 09:25:13,10.4325,15332.72528,79.0,0.095854,64.0


In [26]:
df_users_base["thumbs_ratio"] = df_users_base["count_thumbs_up"] / (df_users_base["count_thumbs_down"] + df_users_base["count_thumbs_up"])
df_users_base["thumbs_ratio"] = df_users_base["thumbs_ratio"].replace(np.inf, 0)
df_users_base["errors_per_session"] = df_users_base["count_error"] / df_users_base["count_total_sessions"]
df_users_base["ads_per_session"] = df_users_base["count_roll_advert"] / df_users_base["count_total_sessions"]

In [27]:
mask = df_users_base["thumbs_ratio"] ==0
df_users_base[mask]

Unnamed: 0_level_0,gender,registration,operating_system,browser,metropolitan_area,region,num_unique_artists,count_about,count_add_friend,count_add_to_playlist,...,count_total_sessions,last_time,user_lifecycle_h,ttl_length,item_per_session,frequency,avg_songs_session,thumbs_ratio,errors_per_session,ads_per_session
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1542469,M,2018-07-24 04:08:58,Windows,Chrome,Augusta-Richmond County,GA-SC,21,0,0,2,...,1,2018-10-01 01:29:42,1653.345556,5558.01170,93.0,0.000605,21.0,0.0,0.0,5.0
1641130,M,2018-08-29 07:20:08,iPhone,Safari,Houston-The Woodlands-Sugar Land,TX,6,0,0,0,...,1,2018-10-01 00:21:26,785.021667,1184.10223,187.0,0.001274,5.0,0.0,0.0,0.0
1278056,M,2018-09-24 01:08:46,Linux,Firefox,Philadelphia-Camden-Wilmington,PA-NJ-DE-MD,51,0,0,0,...,1,2018-10-01 03:21:37,170.214167,11992.53198,58.0,0.005875,51.0,0.0,0.0,1.0
1441204,F,2018-09-05 01:06:30,Macintosh,Safari,Houston-The Woodlands-Sugar Land,TX,29,0,0,1,...,2,2018-10-12 07:52:34,894.767778,7033.01263,18.0,0.002235,14.0,0.0,0.0,2.0
1072890,M,2018-09-18 06:45:18,Macintosh,Chrome,San Francisco-Oakland-Hayward,CA,40,0,3,2,...,2,2018-10-01 18:28:56,323.727222,10032.43945,21.5,0.006178,19.5,0.0,0.0,1.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1178165,M,2018-08-12 20:34:23,Windows,Chrome,Green Bay,WI,5,0,0,0,...,1,2018-11-15 15:50:46,2275.273056,1063.26022,5.0,0.000440,4.0,0.0,0.0,1.0
1072628,F,2018-08-19 22:30:44,Windows,Chrome,Santa Fe,NM,26,0,0,1,...,1,2018-11-16 01:22:31,2114.863056,5566.81315,28.0,0.000473,25.0,0.0,0.0,0.0
1802584,F,2018-06-30 04:31:34,Windows,Firefox,Lancaster,PA,13,0,0,0,...,1,2018-11-16 07:04:40,3338.551667,2815.91619,14.0,0.000300,12.0,0.0,0.0,0.0
1264392,F,2018-11-16 08:43:56,Macintosh,Chrome,Dallas-Fort Worth-Arlington,TX,49,0,0,1,...,2,2018-11-19 07:46:49,71.048056,11883.52393,15.5,0.028150,24.5,0.0,0.0,1.0


In [28]:
df_users_base.columns

Index(['gender', 'registration', 'operating_system', 'browser',
       'metropolitan_area', 'region', 'num_unique_artists', 'count_about',
       'count_add_friend', 'count_add_to_playlist', 'count_cancel',
       'churn_label', 'count_downgrade', 'count_error', 'count_help',
       'count_home', 'count_logout', 'count_nextsong', 'count_roll_advert',
       'count_save_settings', 'count_settings', 'count_submit_downgrade',
       'count_submit_upgrade', 'count_thumbs_down', 'count_thumbs_up',
       'count_upgrade', 'count_total_sessions', 'last_time',
       'user_lifecycle_h', 'ttl_length', 'item_per_session', 'frequency',
       'avg_songs_session', 'thumbs_ratio', 'errors_per_session',
       'ads_per_session'],
      dtype='object')

In [29]:
mask = df_users_base["count_submit_upgrade"] > 3
df_users_base[mask][["count_submit_upgrade", "count_submit_downgrade", "churn_label"]]

Unnamed: 0_level_0,count_submit_upgrade,count_submit_downgrade,churn_label
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1240184,4,3,1
1295776,4,3,1
1839740,4,4,0
1747195,6,5,0
1766016,5,4,0
1504480,4,3,1
1882951,4,3,1
1255222,5,4,0
1646275,4,3,1
1608306,4,3,0


In [30]:
df_last_level = (
    df_clean_churn.groupby("userId")["level"]
    .last()
    .reset_index(name="last_level")
    .set_index("userId")
)

df_users_base = df_users_base.join(df_last_level)

In [31]:
df_users_base.sample(5)

Unnamed: 0_level_0,gender,registration,operating_system,browser,metropolitan_area,region,num_unique_artists,count_about,count_add_friend,count_add_to_playlist,...,last_time,user_lifecycle_h,ttl_length,item_per_session,frequency,avg_songs_session,thumbs_ratio,errors_per_session,ads_per_session,last_level
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1633767,M,2018-09-03 10:02:55,Macintosh,Chrome,Altoona,PA,1396,5,43,70,...,2018-11-18 15:41:46,1829.6475,512501.43738,16.818182,0.012024,95.181818,0.830769,0.181818,0.045455,paid
1011238,F,2018-09-28 21:45:30,Linux,Chrome,Sacramento--Roseville--Arden-Arcade,CA,958,4,25,39,...,2018-11-17 02:56:01,1181.175278,316713.78701,19.882353,0.014392,75.705882,0.657895,0.058824,2.235294,paid
1200616,F,2018-09-12 10:15:27,Windows,Chrome,Bridgeport-Stamford-Norwalk,CT,494,1,5,22,...,2018-11-13 13:33:31,1491.301111,146863.74461,30.0,0.006706,59.2,0.837209,0.0,2.8,free
1281697,M,2018-09-30 15:06:59,Windows,Chrome,Dallas-Fort Worth-Arlington,TX,828,1,23,32,...,2018-11-19 18:19:08,1203.2025,268302.01057,23.888889,0.01496,59.722222,0.91453,0.0,0.0,paid
1576290,M,2018-07-30 02:38:33,Linux,Firefox,Boston-Cambridge-Newton,MA-NH,817,0,6,36,...,2018-11-16 17:23:38,2630.751389,258188.11348,24.181818,0.004181,95.727273,0.714286,0.0,0.090909,paid


In [32]:
location_change_counts = df_clean_churn.groupby("userId")["region"].nunique()
users_with_location_changes = location_change_counts[location_change_counts > 1]
print(f"Number of users who changed their location: {len(users_with_location_changes)}")

Number of users who changed their location: 0


In [33]:
location_change_counts = df_clean_churn.groupby("userId")["metropolitan_area"].nunique()
users_with_location_changes = location_change_counts[location_change_counts > 1]
print(f"Number of users who changed their location: {len(users_with_location_changes)}")

Number of users who changed their location: 0


In [34]:
PROCESSED_DATA_DIR = Path("data/processing_checkpoint")
checkpoint_file_path = PROCESSED_DATA_DIR / "02_engineered_train.parquet"
df_users_base.to_parquet(checkpoint_file_path, index=True)