In [1]:
import pandas as pd

In [2]:
from pathlib import Path

# Location of the raw competition data and of the training split inside it.
DATA_DIR = Path("data/churn-prediction-25-26")
file_path_dataset = DATA_DIR.joinpath("train.parquet")

In [3]:
# Load the full raw event log into memory (the info() output below reports
# about 17.5M rows and 2.6+ GB).
df_churn = pd.read_parquet(file_path_dataset)

In [4]:
# List the columns available in the raw event log.
df_churn.columns

Index(['status', 'gender', 'firstName', 'level', 'lastName', 'userId', 'ts',
       'auth', 'page', 'sessionId', 'location', 'itemInSession', 'userAgent',
       'method', 'length', 'song', 'artist', 'time', 'registration'],
      dtype='object')

In [5]:
# Inspect the dtypes and the memory footprint of the raw frame.
df_churn.info()

<class 'pandas.core.frame.DataFrame'>
Index: 17499636 entries, 0 to 25661583
Data columns (total 19 columns):
 #   Column         Dtype         
---  ------         -----         
 0   status         int64         
 1   gender         object        
 2   firstName      object        
 3   level          object        
 4   lastName       object        
 5   userId         object        
 6   ts             int64         
 7   auth           object        
 8   page           object        
 9   sessionId      int64         
 10  location       object        
 11  itemInSession  int64         
 12  userAgent      object        
 13  method         object        
 14  length         float64       
 15  song           object        
 16  artist         object        
 17  time           datetime64[us]
 18  registration   datetime64[us]
dtypes: datetime64[us](2), float64(1), int64(4), object(12)
memory usage: 2.6+ GB


In [6]:
# Summary statistics for the numeric and datetime columns.
df_churn.describe()

Unnamed: 0,status,ts,sessionId,itemInSession,length,time,registration
count,17499640.0,17499640.0,17499640.0,17499640.0,14291430.0,17499636,17499636
mean,209.1387,1540428000000.0,84802.94,105.5937,248.7135,2018-10-25 00:47:01.161927,2018-08-25 04:40:21.543066
min,200.0,1538352000000.0,1.0,0.0,0.522,2018-10-01 00:00:01,2017-10-14 22:05:25
25%,200.0,1539340000000.0,25159.0,26.0,199.8885,2018-10-12 10:33:57.750000,2018-08-10 21:14:59
50%,200.0,1540397000000.0,79038.0,66.0,234.0828,2018-10-24 15:58:54,2018-09-05 18:35:50
75%,200.0,1541500000000.0,138368.0,144.0,276.8714,2018-11-06 10:25:35,2018-09-20 17:24:57
max,404.0,1542672000000.0,207003.0,1426.0,3024.666,2018-11-20 00:00:00,2018-11-19 23:34:34
std,30.2305,1233485000.0,61414.27,116.8854,97.22845,,


In [7]:
# Count distinct values per column and remember the low-cardinality columns
# (fewer than 20 distinct values) for a closer look in the next cell.
print("Unique Value Count")
unique_counts = df_churn.nunique()
for column_name, unique_count in unique_counts.items():
    print(f"{column_name}: {unique_count}")
few_unique_columns = [name for name, count in unique_counts.items() if count < 20]

Unique Value Count
status: 3
gender: 2
firstName: 4967
level: 2
lastName: 1000
userId: 19140
ts: 4189091
auth: 2
page: 19
sessionId: 161194
location: 875
itemInSession: 1427
userAgent: 85
method: 2
length: 23379
song: 239299
artist: 37264
time: 4189091
registration: 19118


In [8]:
# Show the actual values of every low-cardinality column collected above.
for column_name in few_unique_columns:
    print(f"{column_name}: {df_churn[column_name].unique()}")

status: [200 307 404]
gender: ['M' 'F']
level: ['paid' 'free']
auth: ['Logged In' 'Cancelled']
page: ['NextSong' 'Downgrade' 'Help' 'Home' 'Thumbs Up' 'Add Friend'
 'Thumbs Down' 'Add to Playlist' 'Logout' 'About' 'Settings'
 'Save Settings' 'Cancel' 'Cancellation Confirmation' 'Submit Downgrade'
 'Roll Advert' 'Upgrade' 'Error' 'Submit Upgrade']
method: ['PUT' 'GET']


In [9]:
# Total number of churn events: each visit to the cancellation confirmation
# page counts as one churn.
churn_page = "Cancellation Confirmation"
is_churn_event = df_churn["page"].eq(churn_page)
all_churn_count = is_churn_event.sum()
print(all_churn_count)

4271


In [10]:
# Total number of unique users, i.e. the number of distinct userId values
# in the event log.
unique_users = df_churn["userId"].nunique()
print(unique_users)

19140


In [11]:
# Number of churn events on or after 2018-11-10; the latest timestamp in the
# data is 2018-11-20, so this covers 10.11.2018 to 20.11.2018.
# Note: despite its name, `end_date` is the lower bound of that window.
end_date = pd.Timestamp("2018-11-10")
is_churn_event = df_churn["page"].eq(churn_page)
is_on_or_after = df_churn["time"].ge(end_date)
churn_after_date = (is_churn_event & is_on_or_after).sum()
print(churn_after_date)

667


### Data cleaning

In [12]:
# Spot-check five random raw rows (sample() is unseeded, so the rows differ
# on every run).
df_churn.sample(5)

Unnamed: 0,status,gender,firstName,level,lastName,userId,ts,auth,page,sessionId,location,itemInSession,userAgent,method,length,song,artist,time,registration
9360390,200,M,David,paid,Leach,1108434,1540803426000,Logged In,NextSong,132649,"Akron, OH",76,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",PUT,188.05506,Haunted,Evanescence,2018-10-29 08:57:06,2018-08-16 06:38:29
23704137,307,F,Zoey,paid,Floyd,1121814,1539648189000,Logged In,Thumbs Up,13695,"Miami-Fort Lauderdale-West Palm Beach, FL",36,"""Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebK...",PUT,,,,2018-10-16 00:03:09,2018-08-23 17:35:26
21199825,200,M,Jacob,paid,Bryant,1838841,1538531603000,Logged In,NextSong,4364,"Portland-Vancouver-Hillsboro, OR-WA",8,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",PUT,210.88608,Jamaica Roots II(Agora E Sempre),Natiruts,2018-10-03 01:53:23,2018-09-12 15:07:09
8855020,307,F,Chloe,paid,Chapman,1018813,1540619506000,Logged In,Save Settings,123430,"Urban Honolulu, HI",70,Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; r...,PUT,,,,2018-10-27 05:51:46,2018-09-17 06:48:14
4612467,307,F,Alyssa,paid,Solomon,1640463,1539603782000,Logged In,Thumbs Up,79340,"Farmington, MO",16,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",PUT,,,,2018-10-15 11:43:02,2018-08-31 23:04:17


In [13]:
# Sanity check: count the userId strings that are not exactly 7 characters long.
print(df_churn["userId"].str.len().ne(7).sum())

0


In [14]:
# Look at a few events whose auth state is "Cancelled".
mask = df_churn["auth"].eq("Cancelled")
df_churn.loc[mask].sample(5)

Unnamed: 0,status,gender,firstName,level,lastName,userId,ts,auth,page,sessionId,location,itemInSession,userAgent,method,length,song,artist,time,registration
5471386,200,M,Jackson,free,Fleming,1275938,1539795961000,Cancelled,Cancellation Confirmation,84901,"Houston-The Woodlands-Sugar Land, TX",57,Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) G...,GET,,,,2018-10-17 17:06:01,2018-09-07 15:39:24
9111282,200,F,Grace,free,Phillips,1763349,1540714746000,Cancelled,Cancellation Confirmation,131848,"Dallas-Fort Worth-Arlington, TX",6,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5...",GET,,,,2018-10-28 08:19:06,2018-08-19 00:39:51
4945879,200,F,Jaycee,paid,Novak,1109212,1539676539000,Cancelled,Cancellation Confirmation,83698,"Lexington-Fayette, KY",307,Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; r...,GET,,,,2018-10-16 07:55:39,2018-08-17 20:23:38
5929703,200,M,Camren,paid,Walker,1000025,1539894785000,Cancelled,Cancellation Confirmation,95858,"New Haven-Milford, CT",120,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",GET,,,,2018-10-18 20:33:05,2018-07-10 09:30:08
20417212,200,M,Andrew,paid,Pittman,1663996,1539709890000,Cancelled,Cancellation Confirmation,10047,"Gainesville, TX",221,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",GET,,,,2018-10-16 17:11:30,2018-08-03 05:59:51


In [15]:
# Cross-check: if both counts match, every "Cancelled" auth row is also a
# cancellation confirmation page.
is_cancelled = df_churn["auth"].eq("Cancelled")
cancelled_count = is_cancelled.sum()
print(cancelled_count)
double_check_auth = (df_churn["page"].eq(churn_page) & is_cancelled).sum()
print(double_check_auth)

4271
4271


In [16]:
# Convert the epoch-millisecond `ts` column to datetimes.
# NOTE(review): the sample below suggests `ts` then equals `time`, and `ts` is
# dropped a few cells later - confirm whether this conversion is still needed.
df_churn['ts'] = pd.to_datetime(df_churn["ts"], unit="ms")

In [17]:
# Spot-check the converted `ts` column next to `time`.
df_churn.sample(5)

Unnamed: 0,status,gender,firstName,level,lastName,userId,ts,auth,page,sessionId,location,itemInSession,userAgent,method,length,song,artist,time,registration
9562851,200,M,Jacob,paid,Graham,1111631,2018-10-29 21:15:07,Logged In,NextSong,18922,"Cleveland-Elyria, OH",136,"""Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537....",PUT,216.2673,Baby It's You,JoJo / Lil' Bow Wow,2018-10-29 21:15:07,2018-09-08 19:03:11
22933322,200,F,Alanna,paid,Cunningham,1482597,2018-10-02 06:58:25,Logged In,NextSong,3605,"Tulsa, OK",19,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",PUT,216.76363,Hey_ Soul Sister,Train,2018-10-02 06:58:25,2018-09-28 11:57:38
9867627,200,F,Kennedy,paid,Ortiz,1046085,2018-10-30 16:09:31,Logged In,NextSong,119979,"New York-Newark-Jersey City, NY-NJ-PA",3,"""Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebK...",PUT,239.59465,Lift Me Up (Metro Edit),Geri Halliwell,2018-10-30 16:09:31,2018-09-16 12:20:48
19928817,200,M,Kaden,paid,Lane,1355852,2018-10-02 01:21:13,Logged In,NextSong,4232,"Cedar Rapids, IA",206,Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) G...,PUT,275.51302,After The Love Has Gone,Damage,2018-10-02 01:21:13,2018-07-22 14:40:01
5489772,200,M,Andriy,paid,Evans,1859459,2018-10-17 18:05:14,Logged In,Home,93050,"Bakersfield, CA",0,"""Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_2 like...",GET,,,,2018-10-17 18:05:14,2018-09-29 13:34:56


In [18]:
# Remove the columns that are not carried into the cleaned data set.
drop_columns = ["firstName", "lastName", "auth", "method", "ts"]
df_churn = df_churn.drop(columns=drop_columns)

In [19]:
# Store userId as an integer instead of a string (the length check above found
# only 7-character values).
df_churn['userId'] = df_churn['userId'].astype(int)

In [20]:
# Spot-check the frame after dropping columns and casting userId.
df_churn.sample(5)

Unnamed: 0,status,gender,level,userId,page,sessionId,location,itemInSession,userAgent,length,song,artist,time,registration
13381233,307,M,paid,1017934,Thumbs Up,172718,"Minneapolis-St. Paul-Bloomington, MN-WI",169,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",,,,2018-11-09 20:59:52,2018-08-31 19:20:18
8418270,200,F,paid,1963108,NextSong,125085,"Detroit-Warren-Dearborn, MI",30,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",227.99628,I'm In Miami Bitch,LMFAO,2018-10-26 01:59:47,2018-08-01 14:33:20
6269219,200,M,free,1924180,NextSong,98322,"Birmingham-Hoover, AL",14,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5...",131.23873,Fight For Your Life,The Casualties,2018-10-19 17:28:56,2018-09-19 22:46:55
10306559,200,F,paid,1763192,NextSong,131352,"Dallas-Fort Worth-Arlington, TX",122,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",233.87383,Redbull (featuring Redman),Wu-Tang Clan featuring Redman,2018-10-31 19:02:38,2018-09-07 13:29:21
25636527,200,F,paid,1621446,NextSong,35496,"Niles-Benton Harbor, MI",46,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",188.31628,This Girl,Kylie Minogue,2018-11-19 16:26:27,2018-06-03 02:30:12


In [21]:
# Number of missing values in each column.
nan_count = df_churn.isna().sum()

In [22]:
# Show the per-column missing-value counts computed above.
print(nan_count)

status                 0
gender                 0
level                  0
userId                 0
page                   0
sessionId              0
location               0
itemInSession          0
userAgent              0
length           3208203
song             3208203
artist           3208203
time                   0
registration           0
dtype: int64


In [23]:
# Count empty strings in every object-dtype column, since an empty string is
# a missing value that isna() does not report.
empty_string_count = df_churn.select_dtypes(include=["object"]).eq("").sum()

In [24]:
# Show the per-column empty-string counts computed above.
print(empty_string_count)

gender       0
level        0
page         0
location     0
userAgent    0
song         0
artist       0
dtype: int64


In [25]:
# Look at a random selection of events that returned HTTP status 307.
mask = df_churn["status"].eq(307)
df_churn.loc[mask].sample(10)

Unnamed: 0,status,gender,level,userId,page,sessionId,location,itemInSession,userAgent,length,song,artist,time,registration
15428193,307,F,paid,1782082,Thumbs Up,194957,"San Jose-Sunnyvale-Santa Clara, CA",22,"""Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_2 like...",,,,2018-11-16 05:41:04,2018-07-29 15:46:40
11526113,307,F,paid,1107583,Logout,145288,"Baltimore-Columbia-Towson, MD",44,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",,,,2018-11-04 15:48:32,2018-09-07 22:19:49
12294895,307,F,paid,1274116,Thumbs Up,150313,"Philadelphia-Camden-Wilmington, PA-NJ-DE-MD",243,Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) G...,,,,2018-11-06 22:27:49,2018-09-20 17:29:52
11932037,307,F,paid,1710351,Thumbs Up,156864,"St. Louis, MO-IL",103,"""Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537....",,,,2018-11-05 23:31:26,2018-03-29 18:12:50
2245301,307,F,free,1214558,Thumbs Up,46651,"New York-Newark-Jersey City, NY-NJ-PA",7,"""Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537....",,,,2018-10-08 09:17:16,2018-09-25 03:36:22
20497182,307,F,paid,1121066,Thumbs Down,11776,"Phoenix-Mesa-Scottsdale, AZ",90,Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; r...,,,,2018-10-19 15:02:33,2018-09-15 05:49:53
942178,307,F,paid,1981399,Add Friend,27830,"San Diego-Carlsbad, CA",165,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",,,,2018-10-03 22:10:51,2018-09-16 16:09:18
3923333,307,M,paid,1157304,Thumbs Down,66346,"Del Rio, TX",153,Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; r...,,,,2018-10-12 19:01:41,2018-09-27 01:46:52
6010349,307,M,free,1600234,Logout,94335,"New York-Newark-Jersey City, NY-NJ-PA",51,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",,,,2018-10-19 01:15:00,2018-09-05 15:20:19
25229921,307,F,paid,1255404,Thumbs Up,27003,"San Jose-Sunnyvale-Santa Clara, CA",70,"""Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebK...",,,,2018-11-12 11:07:15,2018-09-24 23:00:27


In [26]:
# Number of status-307 events that carry a song (0 means none of them do).
mask = df_churn["status"].eq(307) & df_churn["song"].notna()
int(mask.sum())

0

In [27]:
# Pages that occur with status 307, followed by all pages for comparison.
mask = df_churn["status"].eq(307)
unique_values_307 = df_churn.loc[mask, "page"].unique()
print(unique_values_307)
unique_values_page = df_churn["page"].unique()
print(unique_values_page)

['Thumbs Up' 'Add Friend' 'Thumbs Down' 'Logout' 'Save Settings' 'Cancel'
 'Submit Downgrade' 'Submit Upgrade']
['NextSong' 'Downgrade' 'Help' 'Home' 'Thumbs Up' 'Add Friend'
 'Thumbs Down' 'Add to Playlist' 'Logout' 'About' 'Settings'
 'Save Settings' 'Cancel' 'Cancellation Confirmation' 'Submit Downgrade'
 'Roll Advert' 'Upgrade' 'Error' 'Submit Upgrade']


In [28]:
# Display the events that returned HTTP status 404.
mask = df_churn["status"].eq(404)
df_churn.loc[mask]

Unnamed: 0,status,gender,level,userId,page,sessionId,location,itemInSession,userAgent,length,song,artist,time,registration
2788052,404,F,free,1697168,Error,58979,"Hilo, HI",14,Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; r...,,,,2018-10-09 19:12:32,2018-09-08 13:48:25
13897750,404,F,paid,1697168,Error,175256,"Hilo, HI",37,Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; r...,,,,2018-11-12 00:55:32,2018-09-08 13:48:25
16297984,404,F,paid,1697168,Error,201893,"Hilo, HI",318,Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; r...,,,,2018-11-19 13:08:39,2018-09-08 13:48:25
602280,404,M,paid,1222580,Error,30295,"Watertown, SD",16,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",,,,2018-10-02 22:06:26,2018-08-16 02:31:00
602568,404,M,paid,1222580,Error,30295,"Watertown, SD",18,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",,,,2018-10-02 22:07:35,2018-08-16 02:31:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25087053,404,M,paid,1934047,Error,1109,"El Dorado, AR",75,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",,,,2018-11-09 07:50:11,2018-08-31 04:28:43
25089645,404,M,paid,1934047,Error,1109,"El Dorado, AR",100,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",,,,2018-11-09 09:19:53,2018-08-31 04:28:43
25179115,404,F,free,1912269,Error,2534,"Seattle-Tacoma-Bellevue, WA",72,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",,,,2018-11-11 04:03:34,2018-11-11 01:12:59
25248788,404,M,free,1882230,Error,2951,"Houston-The Woodlands-Sugar Land, TX",98,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",,,,2018-11-12 17:33:20,2018-04-06 02:14:55


In [29]:
# Pages that occur with status 404.
# The mask is rebuilt here instead of reusing the `mask` left behind by the
# previous cell, so the result does not depend on cell execution order.
mask = df_churn["status"] == 404
unique_values_404 = df_churn.loc[mask, "page"].unique()
print(unique_values_404)

['Error']


In [30]:
# Pages that occur with status 200.
mask = df_churn["status"].eq(200)
unique_values_200 = df_churn.loc[mask, "page"].unique()
print(unique_values_200)

['NextSong' 'Downgrade' 'Help' 'Home' 'Add to Playlist' 'About' 'Settings'
 'Cancellation Confirmation' 'Roll Advert' 'Upgrade']


In [31]:
# `status` is redundant only if every page belongs to exactly one status code,
# i.e. the page sets of all three observed statuses (200, 307, 404) are
# pairwise disjoint. All three pairs are checked, not just 200 vs 307.
pages_200 = set(unique_values_200)
pages_307 = set(unique_values_307)
pages_404 = set(unique_values_404)

status_overlap = (
    (pages_200 & pages_307)
    | (pages_200 & pages_404)
    | (pages_307 & pages_404)
)
if len(status_overlap) == 0:
    print("No overlap here. Safe to delete status")
else:
    print(f"Overlap in {status_overlap}. Can't delete status")

No overlap here. Safe to delete status


In [32]:
# `page` already determines the status code (see the overlap check above), so
# the status column carries no extra information.
df_churn = df_churn.drop(columns="status")

In [33]:
# Spot-check the frame after removing the status column.
df_churn.sample(5)

Unnamed: 0,gender,level,userId,page,sessionId,location,itemInSession,userAgent,length,song,artist,time,registration
15452425,M,paid,1638697,NextSong,196737,"Phoenix-Mesa-Scottsdale, AZ",1,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",251.0624,New Heights,A Fine Frenzy,2018-11-16 07:32:59,2018-09-25 23:42:50
14531673,F,paid,1517931,NextSong,179373,"Terre Haute, IN",83,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",149.83791,Heaven Is A Truck,Pavement,2018-11-13 19:45:21,2018-09-26 02:36:44
1941155,F,paid,1981692,NextSong,26864,"Phoenix-Mesa-Scottsdale, AZ",597,Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; r...,227.99628,January Wedding,The Avett Brothers,2018-10-07 02:05:27,2018-09-29 09:03:52
22274845,F,free,1621675,NextSong,23347,"New York-Newark-Jersey City, NY-NJ-PA",59,"""Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537....",237.53098,I Think I See The Light,Cat Stevens,2018-11-06 01:57:32,2018-07-25 19:04:37
1369621,F,free,1273869,NextSong,42105,"Phoenix-Mesa-Scottsdale, AZ",4,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",360.4371,Slow Motion [Instrumental],Third Eye Blind,2018-10-05 01:47:53,2018-09-30 19:00:27


In [34]:
# Split "<area>, <state code(s)>" on the comma: column 0 holds the area name
# and column 1 holds the state part (e.g. "TX" or "NY-NJ-PA").
location_split = df_churn['location'].str.split(', ', expand=True)

In [35]:
# Store the two halves of the location as separate, whitespace-trimmed columns.
area_part, state_part = location_split[0], location_split[1]
df_churn["metropolitan_area"] = area_part.str.strip()
df_churn["state"] = state_part.str.strip()

In [36]:
# Events without a song have no artist, song or length: fill them with
# explicit placeholders instead of NaN.
song_field_placeholders = {"artist": "No artist", "song": "No song", "length": 0}
for song_field, placeholder in song_field_placeholders.items():
    df_churn[song_field] = df_churn[song_field].fillna(placeholder)

In [37]:
# Confirm that no missing values remain after filling and splitting.
print(df_churn.isna().sum())

gender               0
level                0
userId               0
page                 0
sessionId            0
location             0
itemInSession        0
userAgent            0
length               0
song                 0
artist               0
time                 0
registration         0
metropolitan_area    0
state                0
dtype: int64


In [38]:
# Compare the cardinality of the two derived columns with the original
# location column.
unique_states = df_churn["state"].unique()
unique_area = df_churn["metropolitan_area"].unique()
unique_location = df_churn["location"].unique()
for distinct_values in (unique_states, unique_area, unique_location):
    print(len(distinct_values))

100
806
875


In [39]:
# Inspect the distinct state values; many are multi-state combinations.
print(unique_states)

['TX' 'CA' 'HI' 'SD' 'MD' 'FL' 'IN' 'PA' 'NY' 'VA' 'PA-NJ-DE-MD' 'VA-NC'
 'MA-NH' 'GA' 'KY' 'MO-IL' 'DC-VA-MD-WV' 'AZ' 'OH' 'OR' 'TN-MS-AR'
 'IL-IN-WI' 'TN-GA' 'NY-NJ-PA' 'MI' 'WA' 'WY' 'AL' 'NH' 'NV' 'SC' 'MN-WI'
 'TN' 'MN' 'CO' 'MO' 'OH-KY-IN' 'IN-KY' 'NE-IA' 'RI-MA' 'MA-CT' 'LA' 'CT'
 'OK' 'NC' 'NJ' 'WV' 'KS' 'MD-WV' 'MS' 'AR' 'PA-NJ' 'IL' 'WI' 'MS-LA' 'IA'
 'OR-WA' 'TN-VA' 'ME' 'NM' 'NC-SC' 'IA-IL' 'UT' 'KY-IN' 'GA-AL' 'MO-KS'
 'OH-PA' 'MA' 'NE' 'IN-MI' 'ID' 'AR-OK' 'TN-KY' 'SC-NC' 'MT' 'WV-KY-OH'
 'GA-SC' 'AR-MO' 'DE' 'ND' 'KY-IL' 'VT' 'AK' 'WY-ID' 'UT-ID' 'WV-OH'
 'TX-AR' 'MD-DE' 'IA-NE-SD' 'OR-ID' 'ND-MN' 'VA-WV' 'ID-WA' 'NH-VT'
 'IL-MO' 'WI-MI' 'IA-IL-MO' 'WI-MN' 'WV-VA' 'MI-WI']


In [40]:
# Values such as "NY-NJ-PA" span several states, so "region" describes the
# column better than "state".
df_churn = df_churn.rename(columns={"state": "region"})

In [41]:
# The location is now represented by metropolitan_area and region.
df_churn = df_churn.drop(columns="location")

In [42]:
# Widen the column display so the user-agent strings are not truncated, then
# look at a few of them. Note that set_option changes the display for all
# later cells as well.
pd.set_option('display.max_colwidth', 500)
df_churn["userAgent"].sample(5)

6172859                                                               Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:30.0) Gecko/20100101 Firefox/30.0
1371291                                 "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36"
876181                                                                Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:31.0) Gecko/20100101 Firefox/31.0
21728063    "Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_2 like Mac OS X) AppleWebKit/537.51.2 (KHTML, like Gecko) Version/7.0 Mobile/11D257 Safari/9537.53"
24441730             "Mozilla/5.0 (iPad; CPU OS 7_1_2 like Mac OS X) AppleWebKit/537.51.2 (KHTML, like Gecko) Version/7.0 Mobile/11D257 Safari/9537.53"
Name: userAgent, dtype: object

In [43]:
# List every distinct user-agent string to see which OS and browser tokens occur.
unique_useragents = df_churn["userAgent"].unique()
print(unique_useragents)

['"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36"'
 '"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36"'
 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:31.0) Gecko/20100101 Firefox/31.0'
 '"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.77.4 (KHTML, like Gecko) Version/7.0.5 Safari/537.77.4"'
 'Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0'
 '"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36"'
 '"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36"'
 '"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36"'
 '"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.78.2 (KHTML, like Gecko) Version/7.0.6 Safari/537.78.2"'
 '"Mozilla/5.0 (Windows NT 6.3; 

In [44]:
# Derive a coarse operating-system label from the lower-cased user-agent string.
# Rules are applied in order and a later match overwrites an earlier one;
# rows that match no rule keep the label "Other".
user_agent_lower = df_churn["userAgent"].str.lower()
os_rules = [
    ("windows", "Windows"),
    ("macintosh", "Macintosh"),
    ("linux|x11|ubuntu", "Linux"),
    ("iphone", "iPhone"),
    ("ipad", "iPad"),
]

df_churn["operating_system"] = "Other"
for os_pattern, os_label in os_rules:
    df_churn.loc[user_agent_lower.str.contains(os_pattern, regex=True), "operating_system"] = os_label


In [45]:
# Distribution of the derived operating-system labels.
print(df_churn["operating_system"].value_counts())

operating_system
Windows      8565354
Macintosh    6931492
Linux        1134259
iPhone        629357
iPad          239174
Name: count, dtype: int64


In [47]:
# Derive a coarse browser label from the lower-cased user-agent string.
# Order matters: Chrome user agents also contain "safari", so the Chrome rule
# runs after the Safari rule and overwrites it. Rows that match no rule keep
# the label "Other".
# NOTE(review): "trident" and "msie" are Internet Explorer tokens but are
# labelled "Edge" here - confirm this grouping is intended.
browser_rules = [
    ("safari", "Safari"),
    ("chrome", "Chrome"),
    ("firefox", "Firefox"),
    ("trident|edge|msie", "Edge"),
]

df_churn["browser"] = "Other"
for browser_pattern, browser_label in browser_rules:
    df_churn.loc[user_agent_lower.str.contains(browser_pattern), "browser"] = browser_label

In [48]:
# Distribution of the derived browser labels.
print(df_churn["browser"].value_counts())

browser
Chrome     9368776
Firefox    4043953
Safari     3165525
Edge        921382
Name: count, dtype: int64


In [49]:
# Spot-check the derived operating_system and browser labels against the raw
# userAgent strings.
df_churn.sample(5)

Unnamed: 0,gender,level,userId,page,sessionId,itemInSession,userAgent,length,song,artist,time,registration,metropolitan_area,region,operating_system,browser
1467336,F,free,1093398,NextSong,45519,27,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36""",219.0624,La Alacena,HÃƒÂƒÃ‚Â©roes del Silencio,2018-10-05 09:20:15,2018-09-24 15:03:07,Salisbury,MD-DE,Macintosh,Chrome
21267575,M,paid,1219995,NextSong,6709,30,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36""",285.54404,Satisfaction,Benny Benassi Presents The Biz,2018-10-04 17:28:51,2018-07-07 00:24:38,Los Angeles-Long Beach-Anaheim,CA,Macintosh,Chrome
2008656,M,free,1586064,Roll Advert,8810,28,"""Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36""",0.0,No song,No artist,2018-10-07 10:48:21,2018-09-13 08:12:58,Brunswick,GA,Windows,Chrome
25182764,M,paid,1424904,NextSong,32052,34,Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:31.0) Gecko/20100101 Firefox/31.0,228.30975,Cupid,Amy Winehouse,2018-11-11 07:05:16,2018-09-23 11:04:07,Boston-Cambridge-Newton,MA-NH,Macintosh,Firefox
25070317,M,paid,1539604,NextSong,30798,6,Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0),194.45506,Yesterdays,Guns N' Roses,2018-11-08 23:34:37,2018-08-01 03:44:40,Virginia Beach-Norfolk-Newport News,VA-NC,Windows,Edge


In [50]:
# The raw user agent is now represented by operating_system and browser.
df_churn = df_churn.drop(columns="userAgent")

In [51]:
# First rows before the index reset; the index labels are not consecutive.
df_churn.head(5)

Unnamed: 0,gender,level,userId,page,sessionId,itemInSession,length,song,artist,time,registration,metropolitan_area,region,operating_system,browser
0,M,paid,1749042,NextSong,22683,278,524.32934,Ich mache einen Spiegel - Dream Part 4,Popol Vuh,2018-10-01 00:00:01,2018-08-08 13:22:21,Dallas-Fort Worth-Arlington,TX,Windows,Chrome
992,M,paid,1749042,NextSong,22683,279,178.02404,Monster (Album Version),Skillet,2018-10-01 00:08:45,2018-08-08 13:22:21,Dallas-Fort Worth-Arlington,TX,Windows,Chrome
1360,M,paid,1749042,NextSong,22683,280,232.61995,Seven Nation Army,The White Stripes,2018-10-01 00:11:43,2018-08-08 13:22:21,Dallas-Fort Worth-Arlington,TX,Windows,Chrome
1825,M,paid,1749042,NextSong,22683,281,265.50812,Under The Bridge (Album Version),Red Hot Chili Peppers,2018-10-01 00:15:35,2018-08-08 13:22:21,Dallas-Fort Worth-Arlington,TX,Windows,Chrome
2366,M,paid,1749042,NextSong,22683,282,471.69261,Circlesong 6,Bobby McFerrin,2018-10-01 00:20:00,2018-08-08 13:22:21,Dallas-Fort Worth-Arlington,TX,Windows,Chrome


In [52]:
# Replace the gappy original index with a fresh 0..n-1 range; drop=True
# discards the old labels instead of keeping them as a column.
df_churn = df_churn.reset_index(drop=True)

In [53]:
# Confirm that the index is now consecutive.
df_churn.head(5)

Unnamed: 0,gender,level,userId,page,sessionId,itemInSession,length,song,artist,time,registration,metropolitan_area,region,operating_system,browser
0,M,paid,1749042,NextSong,22683,278,524.32934,Ich mache einen Spiegel - Dream Part 4,Popol Vuh,2018-10-01 00:00:01,2018-08-08 13:22:21,Dallas-Fort Worth-Arlington,TX,Windows,Chrome
1,M,paid,1749042,NextSong,22683,279,178.02404,Monster (Album Version),Skillet,2018-10-01 00:08:45,2018-08-08 13:22:21,Dallas-Fort Worth-Arlington,TX,Windows,Chrome
2,M,paid,1749042,NextSong,22683,280,232.61995,Seven Nation Army,The White Stripes,2018-10-01 00:11:43,2018-08-08 13:22:21,Dallas-Fort Worth-Arlington,TX,Windows,Chrome
3,M,paid,1749042,NextSong,22683,281,265.50812,Under The Bridge (Album Version),Red Hot Chili Peppers,2018-10-01 00:15:35,2018-08-08 13:22:21,Dallas-Fort Worth-Arlington,TX,Windows,Chrome
4,M,paid,1749042,NextSong,22683,282,471.69261,Circlesong 6,Bobby McFerrin,2018-10-01 00:20:00,2018-08-08 13:22:21,Dallas-Fort Worth-Arlington,TX,Windows,Chrome


In [None]:
# A user who appears with more than one distinct `level` value switched
# between levels at some point.
level_change_counts = df_churn.groupby("userId")["level"].nunique()
users_with_changes = level_change_counts.loc[level_change_counts.gt(1)]
print(f"# of users that changed between levels: {len(users_with_changes)}")

# of users that changed between levels: 10019


In [57]:
# Check whether any user appears with more than one operating system or
# browser; both distinct-value counts come from a single groupby.
device_change_counts = df_churn.groupby("userId")[["operating_system", "browser"]].nunique()
os_change_counts = device_change_counts["operating_system"]
browser_change_counts = device_change_counts["browser"]
users_os_changes = os_change_counts.loc[os_change_counts.gt(1)]
users_browser_changes = browser_change_counts.loc[browser_change_counts.gt(1)]
print(f"# of users that changed between Operating Systems: {len(users_os_changes)}")
print(f"# of users that changed between Browsers: {len(users_browser_changes)}")

# of users that changed between Operating Systems: 0
# of users that changed between Browsers: 0


In [None]:
# Write the cleaned frame to a parquet checkpoint file.
PROCESSED_DATA_DIR = Path("data/processing_checkpoint")
# to_parquet does not create missing parent directories, so make sure the
# checkpoint folder exists first (a no-op when it is already there).
PROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)
checkpoint_file_path = PROCESSED_DATA_DIR / "01_cleaned_train.parquet"
# index=False: the index was reset to a plain range above and holds no information.
df_churn.to_parquet(checkpoint_file_path, index=False)