In [1]:
import pandas as pd

In [2]:
from pathlib import Path

# Location of the raw training parquet file, relative to the working directory.
DATA_DIR = Path("data") / "churn-prediction-25-26"
file_path_dataset = DATA_DIR.joinpath("train.parquet")

In [3]:
# Load the whole training event log into memory; info() below reports
# roughly 17.5M rows and 2.6+ GB, so this cell is the expensive one.
df_churn = pd.read_parquet(file_path_dataset)

In [4]:
# List the column labels of the raw event log.
df_churn.columns

Index(['status', 'gender', 'firstName', 'level', 'lastName', 'userId', 'ts',
       'auth', 'page', 'sessionId', 'location', 'itemInSession', 'userAgent',
       'method', 'length', 'song', 'artist', 'time', 'registration'],
      dtype='object')

In [5]:
# Report per-column dtypes and the frame's memory footprint.
df_churn.info()

<class 'pandas.core.frame.DataFrame'>
Index: 17499636 entries, 0 to 25661583
Data columns (total 19 columns):
 #   Column         Dtype         
---  ------         -----         
 0   status         int64         
 1   gender         object        
 2   firstName      object        
 3   level          object        
 4   lastName       object        
 5   userId         object        
 6   ts             int64         
 7   auth           object        
 8   page           object        
 9   sessionId      int64         
 10  location       object        
 11  itemInSession  int64         
 12  userAgent      object        
 13  method         object        
 14  length         float64       
 15  song           object        
 16  artist         object        
 17  time           datetime64[us]
 18  registration   datetime64[us]
dtypes: datetime64[us](2), float64(1), int64(4), object(12)
memory usage: 2.6+ GB


In [6]:
# Summary statistics for the columns describe() selects by default.
df_churn.describe()

Unnamed: 0,status,ts,sessionId,itemInSession,length,time,registration
count,17499640.0,17499640.0,17499640.0,17499640.0,14291430.0,17499636,17499636
mean,209.1387,1540428000000.0,84802.94,105.5937,248.7135,2018-10-25 00:47:01.161927,2018-08-25 04:40:21.543066
min,200.0,1538352000000.0,1.0,0.0,0.522,2018-10-01 00:00:01,2017-10-14 22:05:25
25%,200.0,1539340000000.0,25159.0,26.0,199.8885,2018-10-12 10:33:57.750000,2018-08-10 21:14:59
50%,200.0,1540397000000.0,79038.0,66.0,234.0828,2018-10-24 15:58:54,2018-09-05 18:35:50
75%,200.0,1541500000000.0,138368.0,144.0,276.8714,2018-11-06 10:25:35,2018-09-20 17:24:57
max,404.0,1542672000000.0,207003.0,1426.0,3024.666,2018-11-20 00:00:00,2018-11-19 23:34:34
std,30.2305,1233485000.0,61414.27,116.8854,97.22845,,


In [7]:
print("Unique Value Count")

# Distinct (non-null) value count per column, computed once and reused below.
unique_counts = {
    column_name: df_churn[column_name].nunique() for column_name in df_churn.columns
}
for column_name, unique_count in unique_counts.items():
    print(f"{column_name}: {unique_count}")

# Columns with fewer than 20 distinct values are inspected value-by-value next.
few_unique_columns = [
    name for name, count in unique_counts.items() if count < 20
]

Unique Value Count
status: 3
gender: 2
firstName: 4967
level: 2
lastName: 1000
userId: 19140
ts: 4189091
auth: 2
page: 19
sessionId: 161194
location: 875
itemInSession: 1427
userAgent: 85
method: 2
length: 23379
song: 239299
artist: 37264
time: 4189091
registration: 19118


In [8]:
# Print the actual distinct values of each low-cardinality column
# collected in `few_unique_columns`.
for column_name in few_unique_columns:
    unique_values = df_churn[column_name].unique()
    print(f"{column_name}: {unique_values}")

status: [200 307 404]
gender: ['M' 'F']
level: ['paid' 'free']
auth: ['Logged In' 'Cancelled']
page: ['NextSong' 'Downgrade' 'Help' 'Home' 'Thumbs Up' 'Add Friend'
 'Thumbs Down' 'Add to Playlist' 'Logout' 'About' 'Settings'
 'Save Settings' 'Cancel' 'Cancellation Confirmation' 'Submit Downgrade'
 'Roll Advert' 'Upgrade' 'Error' 'Submit Upgrade']
method: ['PUT' 'GET']


In [9]:
# Every event on the "Cancellation Confirmation" page is counted as one churn.
churn_page = "Cancellation Confirmation"
all_churn_count = df_churn["page"].eq(churn_page).sum()
print(all_churn_count)

4271


In [10]:
# Total number of distinct users: nunique() counts the distinct non-null
# values of `userId`.
unique_users = df_churn["userId"].nunique()
print(unique_users)

19140


In [11]:
# Number of cancellation confirmations logged on or after 2018-11-10
# (the log's latest timestamp is 2018-11-20, so this covers 10.11–20.11.2018).
# Note: despite its name, `end_date` is the inclusive lower bound of that window.
end_date = pd.Timestamp("2018-11-10")
is_churn_event = df_churn["page"].eq(churn_page)
is_on_or_after_cutoff = df_churn["time"].ge(end_date)
churn_after_date = (is_churn_event & is_on_or_after_cutoff).sum()
print(churn_after_date)

667


### Data cleaning

In [12]:
# Eyeball five random rows of the raw log.
# NOTE(review): no random_state is passed, so the displayed rows change on every run.
df_churn.sample(5)

Unnamed: 0,status,gender,firstName,level,lastName,userId,ts,auth,page,sessionId,location,itemInSession,userAgent,method,length,song,artist,time,registration
9864715,200,M,Roshan,paid,Cohen,1411673,1540915194000,Logged In,NextSong,138480,"Los Angeles-Long Beach-Anaheim, CA",98,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",PUT,202.29179,Out Loud,Dispatch,2018-10-30 15:59:54,2018-06-20 03:26:07
13682239,200,M,Daken,paid,Fox,1213955,1541898404000,Logged In,NextSong,177718,"Riverside-San Bernardino-Ontario, CA",19,Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; r...,PUT,256.83546,Seven-Mile Island,Jason Isbell and the 400 Unit,2018-11-11 01:06:44,2018-08-25 11:17:46
516805,200,F,Alexandra,paid,Estrada,1418545,1538498672000,Logged In,NextSong,31368,"Cleveland-Elyria, OH",118,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",PUT,173.45261,Mean Old World,Little Walter,2018-10-02 16:44:32,2018-08-08 17:46:34
11667040,200,M,Owen,paid,Simmons,1893507,1541398165000,Logged In,NextSong,144981,"Detroit-Warren-Dearborn, MI",29,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",PUT,229.58975,Drop The World,Lil Wayne / Eminem,2018-11-05 06:09:25,2018-07-29 18:25:52
23866999,200,F,Shylah,paid,Ross,1081909,1539871549000,Logged In,NextSong,6151,"New York-Newark-Jersey City, NY-NJ-PA",70,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",PUT,204.17261,Orange TrÃÂ¤gt Nur Die MÃÂ¼llabfuhr (Go West),Mickie Krause,2018-10-18 14:05:49,2018-09-23 15:28:03


In [13]:
# Count userId strings whose length is not exactly 7 characters.
user_id_lengths = df_churn["userId"].str.len()
print(user_id_lengths.ne(7).sum())

0


In [15]:
# Look at a few events whose auth state is "Cancelled".
mask = df_churn["auth"].eq("Cancelled")
df_churn.loc[mask].sample(n=5)

Unnamed: 0,status,gender,firstName,level,lastName,userId,ts,auth,page,sessionId,location,itemInSession,userAgent,method,length,song,artist,time,registration
20714776,200,F,Abigail,free,King,1912179,1540910216000,Cancelled,Cancellation Confirmation,14943,"Nashville-Davidson--Murfreesboro--Franklin, TN",33,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",GET,,,,2018-10-30 14:36:56,2018-09-25 06:37:28
11602929,200,M,Aaron,paid,Reyes,1369953,1541375378000,Cancelled,Cancellation Confirmation,127984,"Los Angeles-Long Beach-Anaheim, CA",123,Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7....,GET,,,,2018-11-04 23:49:38,2018-09-22 04:42:46
1440738,200,M,Javier,free,Clark,1325198,1538723705000,Cancelled,Cancellation Confirmation,43092,"Knoxville, TN",10,"""Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537....",GET,,,,2018-10-05 07:15:05,2018-09-13 21:27:53
20319617,200,F,Kaavya,free,Zamora,1976950,1539351674000,Cancelled,Cancellation Confirmation,9178,"Nashville-Davidson--Murfreesboro--Franklin, TN",15,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5...",GET,,,,2018-10-12 13:41:14,2018-05-09 23:32:53
22502821,200,M,Sampson,paid,Marks,1728973,1542223084000,Cancelled,Cancellation Confirmation,26827,"San Diego-Carlsbad, CA",87,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",GET,,,,2018-11-14 19:18:04,2018-09-04 09:14:20


In [16]:
# If the number of "Cancelled" auth rows equals the number of rows that are
# both "Cancelled" and on the churn page, `auth` adds nothing beyond `page`.
is_cancelled = df_churn["auth"].eq("Cancelled")
cancelled_count = is_cancelled.sum()
print(cancelled_count)
double_check_auth = (df_churn["page"].eq(churn_page) & is_cancelled).sum()
print(double_check_auth)

4271
4271


In [17]:
# Convert the epoch-millisecond integers in `ts` to pandas datetimes so they
# can be compared visually with the existing `time` column.
df_churn['ts'] = pd.to_datetime(df_churn["ts"], unit="ms")

In [18]:
# Spot-check random rows after the `ts` conversion (compare `ts` with `time`).
# NOTE(review): no random_state is passed, so the displayed rows change on every run.
df_churn.sample(5)

Unnamed: 0,status,gender,firstName,level,lastName,userId,ts,auth,page,sessionId,location,itemInSession,userAgent,method,length,song,artist,time,registration
13977259,200,M,Javier,paid,Davis,1826456,2018-11-12 08:05:58,Logged In,NextSong,167717,"Philadelphia-Camden-Wilmington, PA-NJ-DE-MD",286,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",PUT,251.81995,Pretty Noose,Soundgarden,2018-11-12 08:05:58,2018-09-21 00:32:12
730179,200,M,Isaias,free,Rodriguez,1077302,2018-10-03 08:05:55,Logged In,NextSong,27130,"Miami-Fort Lauderdale-West Palm Beach, FL",27,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8...",PUT,655.77751,Sehr kosmisch,Harmonia,2018-10-03 08:05:55,2018-08-14 19:44:34
4620656,200,M,Micah,paid,Chen,1416393,2018-10-15 12:15:59,Logged In,NextSong,82360,"Seattle-Tacoma-Bellevue, WA",111,Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7....,PUT,284.3424,Theme From Gutbuster (Junior Cartier Remix Edit),Bentley Rhythm Ace,2018-10-15 12:15:59,2018-09-29 12:47:55
13644502,200,M,James,paid,Mckee,1617874,2018-11-10 21:01:14,Logged In,NextSong,174362,"Brenham, TX",102,Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) G...,PUT,228.70159,Dancing In The Street,The Mamas & The Papas,2018-11-10 21:01:14,2018-09-05 06:15:42
10239538,200,F,Kaylee,paid,Sparks,1085138,2018-10-31 15:21:25,Logged In,NextSong,135362,"Columbus, NE",145,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",PUT,207.12444,Sorry for Laughing,Propaganda,2018-10-31 15:21:25,2018-06-09 00:00:02


In [19]:
# Columns removed here: the name fields, `auth` (its "Cancelled" rows coincide
# exactly with the cancellation page, per the counts above), `method`, and `ts`
# (appears to duplicate `time` in the sampled rows — TODO confirm on the full frame).
drop_columns = ["firstName", "lastName", "auth", "method", "ts"]
df_churn = df_churn.drop(columns=drop_columns)

In [20]:
# Cast the string user ids to integers (raises if any id is not a valid integer literal).
df_churn['userId'] = df_churn['userId'].astype(int)

In [21]:
# Spot-check random rows after dropping columns and casting `userId`.
# NOTE(review): no random_state is passed, so the displayed rows change on every run.
df_churn.sample(5)

Unnamed: 0,status,gender,level,userId,page,sessionId,location,itemInSession,userAgent,length,song,artist,time,registration
1532520,200,F,paid,1703006,NextSong,44930,"Minneapolis-St. Paul-Bloomington, MN-WI",87,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",383.73832,Make Love To Your Mind,Bill Withers,2018-10-05 14:01:48,2018-09-09 08:12:20
1725853,200,F,paid,1299911,NextSong,48075,"Houston-The Woodlands-Sugar Land, TX",65,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",331.38893,Nearly Home,Lange,2018-10-06 02:21:46,2018-05-17 21:46:04
23767380,200,F,paid,1437552,NextSong,2789,"Fort Collins, CO",173,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",255.50322,The Remedy (I Won't Worry) (New EQ'd LP Version),Jason Mraz,2018-10-16 23:12:36,2018-07-29 07:57:46
15814036,200,M,paid,1049186,NextSong,198298,"Panama City, FL",124,Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) G...,283.92444,Sleeping In My Bed,Darius Rucker,2018-11-17 09:27:31,2018-09-07 20:51:41
6419857,200,M,paid,1400558,NextSong,102484,"Spencer, IA",29,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",317.17832,Ronnie,Metallica,2018-10-20 01:53:03,2018-08-17 01:42:03


In [22]:
# Number of missing (NaN/None) entries per column.
nan_count = df_churn.isna().sum()

In [23]:
# Show the per-column missing-value counts computed in `nan_count`.
print(nan_count)

status                 0
gender                 0
level                  0
userId                 0
page                   0
sessionId              0
location               0
itemInSession          0
userAgent              0
length           3208203
song             3208203
artist           3208203
time                   0
registration           0
dtype: int64


In [24]:
# Missing text may also be encoded as "" rather than NaN; count those per object column.
object_columns = df_churn.select_dtypes(include=["object"])
empty_string_count = object_columns.eq("").sum()

In [25]:
# Show the per-column empty-string counts computed in `empty_string_count`.
print(empty_string_count)

gender       0
level        0
page         0
location     0
userAgent    0
song         0
artist       0
dtype: int64


In [26]:
# Inspect a random handful of events that carry status code 307.
mask = df_churn["status"].eq(307)
df_churn.loc[mask].sample(n=10)

Unnamed: 0,status,gender,level,userId,page,sessionId,location,itemInSession,userAgent,length,song,artist,time,registration
4003605,307,F,paid,1108397,Add Friend,66001,"College Station-Bryan, TX",711,Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; r...,,,,2018-10-12 23:38:35,2018-07-31 16:19:00
6321713,307,M,paid,1373847,Add Friend,82337,"Racine, WI",186,Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) G...,,,,2018-10-19 20:07:54,2018-09-05 13:03:19
15950995,307,F,paid,1633577,Thumbs Up,199550,"Los Angeles-Long Beach-Anaheim, CA",51,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",,,,2018-11-18 00:27:43,2018-07-21 02:16:12
9803865,307,M,paid,1240421,Thumbs Up,120204,"Atlanta-Sandy Springs-Roswell, GA",539,Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; r...,,,,2018-10-30 12:25:28,2018-07-10 01:00:59
4933963,307,M,paid,1036042,Logout,81401,"McAllen-Edinburg-Mission, TX",302,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",,,,2018-10-16 07:04:36,2018-09-09 17:31:46
12168576,307,F,paid,1045410,Thumbs Up,153628,"Portland-Vancouver-Hillsboro, OR-WA",38,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",,,,2018-11-06 15:22:38,2018-08-22 04:24:41
10802415,307,F,paid,1099028,Thumbs Up,144978,"Miami-Fort Lauderdale-West Palm Beach, FL",208,Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20...,,,,2018-11-02 01:52:21,2018-09-19 06:12:58
7387363,307,F,paid,1893294,Thumbs Up,113829,"New York-Newark-Jersey City, NY-NJ-PA",28,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",,,,2018-10-23 11:15:20,2018-09-20 10:08:53
14520512,307,M,paid,1913353,Thumbs Up,182247,"Bakersfield, CA",26,Mozilla/5.0 (Windows NT 6.1; WOW64; rv:32.0) G...,,,,2018-11-13 19:07:44,2018-06-08 05:41:46
280459,307,M,paid,1052337,Logout,12925,"Memphis, TN-MS-AR",48,"""Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebK...",,,,2018-10-01 23:04:17,2018-05-19 03:10:05


In [33]:
# How many status-307 events have a song attached? Counting the True entries
# of the boolean mask gives the same number as the length of the filtered frame.
mask = df_churn["status"].eq(307) & df_churn["song"].notna()
int(mask.sum())

0

In [34]:
# Pages that occur with status 307, followed by all pages for comparison.
mask = df_churn["status"].eq(307)
unique_values_307 = df_churn.loc[mask, "page"].unique()
unique_values_page = df_churn["page"].unique()
print(unique_values_307)
print(unique_values_page)

['Thumbs Up' 'Add Friend' 'Thumbs Down' 'Logout' 'Save Settings' 'Cancel'
 'Submit Downgrade' 'Submit Upgrade']
['NextSong' 'Downgrade' 'Help' 'Home' 'Thumbs Up' 'Add Friend'
 'Thumbs Down' 'Add to Playlist' 'Logout' 'About' 'Settings'
 'Save Settings' 'Cancel' 'Cancellation Confirmation' 'Submit Downgrade'
 'Roll Advert' 'Upgrade' 'Error' 'Submit Upgrade']


In [35]:
# Display the events that carry status code 404.
mask = df_churn["status"].eq(404)
df_churn.loc[mask]

Unnamed: 0,status,gender,level,userId,page,sessionId,location,itemInSession,userAgent,length,song,artist,time,registration
2788052,404,F,free,1697168,Error,58979,"Hilo, HI",14,Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; r...,,,,2018-10-09 19:12:32,2018-09-08 13:48:25
13897750,404,F,paid,1697168,Error,175256,"Hilo, HI",37,Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; r...,,,,2018-11-12 00:55:32,2018-09-08 13:48:25
16297984,404,F,paid,1697168,Error,201893,"Hilo, HI",318,Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; r...,,,,2018-11-19 13:08:39,2018-09-08 13:48:25
602280,404,M,paid,1222580,Error,30295,"Watertown, SD",16,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",,,,2018-10-02 22:06:26,2018-08-16 02:31:00
602568,404,M,paid,1222580,Error,30295,"Watertown, SD",18,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",,,,2018-10-02 22:07:35,2018-08-16 02:31:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25087053,404,M,paid,1934047,Error,1109,"El Dorado, AR",75,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",,,,2018-11-09 07:50:11,2018-08-31 04:28:43
25089645,404,M,paid,1934047,Error,1109,"El Dorado, AR",100,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",,,,2018-11-09 09:19:53,2018-08-31 04:28:43
25179115,404,F,free,1912269,Error,2534,"Seattle-Tacoma-Bellevue, WA",72,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",,,,2018-11-11 04:03:34,2018-11-11 01:12:59
25248788,404,M,free,1882230,Error,2951,"Houston-The Woodlands-Sugar Land, TX",98,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",,,,2018-11-12 17:33:20,2018-04-06 02:14:55


In [36]:
# Pages that occur with status 404.
# The filter is built here rather than reusing the global `mask` from another
# cell: `mask` is reassigned by several cells, so depending on it would make the
# result change with execution order.
unique_values_404 = df_churn.loc[df_churn["status"] == 404, "page"].unique()
print(unique_values_404)

['Error']


In [37]:
# Pages that occur with status 200.
mask = df_churn["status"].eq(200)
unique_values_200 = df_churn.loc[mask, "page"].unique()
print(unique_values_200)

['NextSong' 'Downgrade' 'Help' 'Home' 'Add to Playlist' 'About' 'Settings'
 'Cancellation Confirmation' 'Roll Advert' 'Upgrade']


In [38]:
# `status` is redundant with `page` only if no page is ever served under two
# different status codes. The data contains three codes (200, 307, 404), so the
# page sets of all three must be pairwise disjoint — not just 200 vs 307.
pages_200 = set(unique_values_200)
pages_307 = set(unique_values_307)
pages_404 = set(unique_values_404)

status_overlap = (
    (pages_200 & pages_307) | (pages_200 & pages_404) | (pages_307 & pages_404)
)
if len(status_overlap) == 0:
    print("No overlap here. Safe to delete status")
else:
    print(f"Overlap in {status_overlap}. Can't delete status")

No overlap here. Safe to delete status


In [39]:
# `status` is dropped as redundant with `page` (see the overlap check).
df_churn = df_churn.drop(columns=["status"])

In [40]:
# Spot-check random rows after removing `status`.
# NOTE(review): no random_state is passed, so the displayed rows change on every run.
df_churn.sample(5)

Unnamed: 0,gender,level,userId,page,sessionId,location,itemInSession,userAgent,length,song,artist,time,registration
6265265,F,paid,1341040,NextSong,91175,"Rochester, NY",59,Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) G...,186.87955,Swan Lake (Beat Poets Mix),Steinski,2018-10-19 17:17:02,2018-05-18 00:57:27
16182988,M,paid,1635382,NextSong,148219,"Orlando-Kissimmee-Sanford, FL",29,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",187.6371,Smothered (Album Version),Spineshank,2018-11-19 03:27:57,2018-09-27 15:01:02
21868725,M,free,1752294,NextSong,16811,"Sanford, NC",79,"""Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebK...",193.27955,Ten Feet Tall,XTC,2018-10-22 19:01:08,2018-09-11 18:33:42
20881246,F,free,1410948,NextSong,16192,"Austin-Round Rock, TX",81,Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7....,235.28444,Learn To Fly,Foo Fighters,2018-11-09 17:43:34,2018-07-26 18:05:48
1049006,M,paid,1332589,NextSong,36253,"Traverse City, MI",91,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",326.66077,Sexy M.F. (LP Version),Prince & The New Power Generation,2018-10-04 05:33:23,2018-07-10 17:53:46


In [41]:
# Locations look like "<metro area>, <state codes>". Split on the LAST ", "
# only, so the result always has exactly two parts (columns 0 and 1) and the
# trailing state token stays in column 1 even if an area name contains ", ".
location_split = df_churn["location"].str.rsplit(", ", n=1, expand=True)

In [42]:
# Part 0 of the split is the metropolitan area, part 1 the state code(s);
# surrounding whitespace is stripped from both.
for part_index, new_column in enumerate(("metropolitan_area", "state")):
    df_churn[new_column] = location_split[part_index].str.strip()

In [43]:
# artist/song/length are missing on the same number of rows (see the NaN counts);
# replace the gaps with explicit placeholders instead of leaving NaN.
placeholder_by_column = {"artist": "No artist", "song": "No song", "length": 0}
for fill_column, placeholder in placeholder_by_column.items():
    df_churn[fill_column] = df_churn[fill_column].fillna(placeholder)

In [44]:
# Re-count missing values per column after the placeholder fill.
print(df_churn.isna().sum())

gender               0
level                0
userId               0
page                 0
sessionId            0
location             0
itemInSession        0
userAgent            0
length               0
song                 0
artist               0
time                 0
registration         0
metropolitan_area    0
state                0
dtype: int64


In [45]:
# Distinct values of the two derived columns and of the original `location`;
# the three counts are printed in that order.
unique_states = df_churn["state"].unique()
unique_area = df_churn["metropolitan_area"].unique()
unique_location = df_churn["location"].unique()
for distinct_values in (unique_states, unique_area, unique_location):
    print(len(distinct_values))

100
806
875


In [46]:
# Show every distinct value found in the `state` column.
print(unique_states)

['TX' 'CA' 'HI' 'SD' 'MD' 'FL' 'IN' 'PA' 'NY' 'VA' 'PA-NJ-DE-MD' 'VA-NC'
 'MA-NH' 'GA' 'KY' 'MO-IL' 'DC-VA-MD-WV' 'AZ' 'OH' 'OR' 'TN-MS-AR'
 'IL-IN-WI' 'TN-GA' 'NY-NJ-PA' 'MI' 'WA' 'WY' 'AL' 'NH' 'NV' 'SC' 'MN-WI'
 'TN' 'MN' 'CO' 'MO' 'OH-KY-IN' 'IN-KY' 'NE-IA' 'RI-MA' 'MA-CT' 'LA' 'CT'
 'OK' 'NC' 'NJ' 'WV' 'KS' 'MD-WV' 'MS' 'AR' 'PA-NJ' 'IL' 'WI' 'MS-LA' 'IA'
 'OR-WA' 'TN-VA' 'ME' 'NM' 'NC-SC' 'IA-IL' 'UT' 'KY-IN' 'GA-AL' 'MO-KS'
 'OH-PA' 'MA' 'NE' 'IN-MI' 'ID' 'AR-OK' 'TN-KY' 'SC-NC' 'MT' 'WV-KY-OH'
 'GA-SC' 'AR-MO' 'DE' 'ND' 'KY-IL' 'VT' 'AK' 'WY-ID' 'UT-ID' 'WV-OH'
 'TX-AR' 'MD-DE' 'IA-NE-SD' 'OR-ID' 'ND-MN' 'VA-WV' 'ID-WA' 'NH-VT'
 'IL-MO' 'WI-MI' 'IA-IL-MO' 'WI-MN' 'WV-VA' 'MI-WI']


In [47]:
# Many values combine several states (e.g. 'PA-NJ-DE-MD'), so the column is
# renamed from "state" to "region".
df_churn = df_churn.rename(columns={"state": "region"})

In [48]:
# The raw `location` string is no longer needed once it has been split
# into `metropolitan_area` and `region`.
df_churn = df_churn.drop(columns=["location"])

In [49]:
# Widen the column display limit so user-agent strings are not truncated.
# This is a global pandas option and stays in effect for later cells.
pd.options.display.max_colwidth = 500
df_churn["userAgent"].sample(n=5)

13916917     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.94 Safari/537.36"
4159007     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36"
3586942        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.78.2 (KHTML, like Gecko) Version/7.0.6 Safari/537.78.2"
3599180                                                           Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko
25614540                                                             Mozilla/5.0 (Windows NT 6.1; rv:31.0) Gecko/20100101 Firefox/31.0
Name: userAgent, dtype: object

In [52]:
# Print every distinct user-agent string to see which OS/browser tokens occur.
unique_useragents = df_churn["userAgent"].unique()
print(unique_useragents)

['"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36"'
 '"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36"'
 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:31.0) Gecko/20100101 Firefox/31.0'
 '"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.77.4 (KHTML, like Gecko) Version/7.0.5 Safari/537.77.4"'
 'Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0'
 '"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36"'
 '"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36"'
 '"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36"'
 '"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.78.2 (KHTML, like Gecko) Version/7.0.6 Safari/537.78.2"'
 '"Mozilla/5.0 (Windows NT 6.3; 

In [53]:
# Lower-cased copy of the user agent; reused by the browser classification cell.
user_agent_lower = df_churn["userAgent"].str.lower()
df_churn["operating_system"] = "Other"

# Ordered (regex pattern, label) rules. They are applied top to bottom and a
# later match overwrites an earlier one, so the order is part of the logic.
os_rules = (
    ("windows", "Windows"),
    ("macintosh", "Macintosh"),
    ("linux|x11|ubuntu", "Linux"),
    ("iphone", "iPhone"),
    ("ipad", "iPad"),
)
for os_pattern, os_label in os_rules:
    matches = user_agent_lower.str.contains(os_pattern, regex=True)
    df_churn.loc[matches, "operating_system"] = os_label


In [54]:
# Row count per derived operating-system label.
print(df_churn["operating_system"].value_counts())

operating_system
Windows      8565354
Macintosh    6931492
Linux        1134259
iPhone        629357
iPad          239174
Name: count, dtype: int64


In [55]:
df_churn["browser"] = "Other"

# Ordered (regex pattern, label) rules; a later match overwrites an earlier one.
# The order matters: the Chrome user agents listed above also contain
# "Safari", so "safari" must be applied before "chrome".
# NOTE(review): "trident" and "msie" are Internet Explorer tokens, yet the rows
# are labelled "Edge" — confirm the label is intended before relying on it.
browser_rules = (
    ("safari", "Safari"),
    ("chrome", "Chrome"),
    ("firefox", "Firefox"),
    ("trident|edge|msie", "Edge"),
)
for browser_pattern, browser_label in browser_rules:
    browser_matches = user_agent_lower.str.contains(browser_pattern)
    df_churn.loc[browser_matches, "browser"] = browser_label

In [56]:
# Row count per derived browser label.
print(df_churn["browser"].value_counts())

browser
Chrome     9368776
Firefox    4043953
Safari     3165525
Edge        921382
Name: count, dtype: int64


In [57]:
# Spot-check the derived operating_system/browser labels against the raw user agent.
# NOTE(review): no random_state is passed, so the displayed rows change on every run.
df_churn.sample(5)

Unnamed: 0,gender,level,userId,page,sessionId,itemInSession,userAgent,length,song,artist,time,registration,metropolitan_area,region,operating_system,browser
6012248,F,paid,1234911,NextSong,97964,146,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.77.4 (KHTML, like Gecko) Version/7.0.5 Safari/537.77.4""",256.522,Ode to LRC (Album),Band Of Horses,2018-10-19 01:22:10,2018-09-18 10:21:40,Pensacola-Ferry Pass-Brent,FL,Macintosh,Safari
8399476,M,paid,1879733,Thumbs Up,123123,15,Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:31.0) Gecko/20100101 Firefox/31.0,0.0,No song,No artist,2018-10-26 00:47:27,2018-08-15 09:09:20,Memphis,TN-MS-AR,Macintosh,Firefox
20033478,M,paid,1768550,NextSong,1234,281,"""Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36""",265.29914,Hi-De-Ho,Blood_ Sweat & Tears,2018-10-04 08:07:23,2018-08-26 01:31:49,Burlington-South Burlington,VT,Windows,Chrome
10219872,M,paid,1264670,NextSong,143690,2,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36""",272.84853,"Dope Boy Magic [Feat. Nicholas ""Play Boy Nick"" Smith_ Corey ""Black Owned C Bone"" Andrews and Chino Dolla] [Amended Album Version]",Yung Joc,2018-10-31 14:11:55,2018-09-24 15:34:06,Detroit-Warren-Dearborn,MI,Windows,Chrome
5020467,M,paid,1878524,NextSong,44293,76,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36""",231.18322,Paris 2004,Peter Bjorn And John,2018-10-16 13:04:23,2018-09-11 18:17:58,Birmingham-Hoover,AL,Linux,Chrome


In [58]:
# The raw user-agent string is dropped once `operating_system` and `browser`
# have been derived from it.
df_churn = df_churn.drop(columns=["userAgent"])

In [59]:
# First rows before the index reset; note the non-contiguous index labels.
df_churn.head(5)

Unnamed: 0,gender,level,userId,page,sessionId,itemInSession,length,song,artist,time,registration,metropolitan_area,region,operating_system,browser
0,M,paid,1749042,NextSong,22683,278,524.32934,Ich mache einen Spiegel - Dream Part 4,Popol Vuh,2018-10-01 00:00:01,2018-08-08 13:22:21,Dallas-Fort Worth-Arlington,TX,Windows,Chrome
992,M,paid,1749042,NextSong,22683,279,178.02404,Monster (Album Version),Skillet,2018-10-01 00:08:45,2018-08-08 13:22:21,Dallas-Fort Worth-Arlington,TX,Windows,Chrome
1360,M,paid,1749042,NextSong,22683,280,232.61995,Seven Nation Army,The White Stripes,2018-10-01 00:11:43,2018-08-08 13:22:21,Dallas-Fort Worth-Arlington,TX,Windows,Chrome
1825,M,paid,1749042,NextSong,22683,281,265.50812,Under The Bridge (Album Version),Red Hot Chili Peppers,2018-10-01 00:15:35,2018-08-08 13:22:21,Dallas-Fort Worth-Arlington,TX,Windows,Chrome
2366,M,paid,1749042,NextSong,22683,282,471.69261,Circlesong 6,Bobby McFerrin,2018-10-01 00:20:00,2018-08-08 13:22:21,Dallas-Fort Worth-Arlington,TX,Windows,Chrome


In [60]:
# Replace the index inherited from the raw file with a fresh 0..n-1 integer
# index; drop=True discards the old labels instead of keeping them as a column.
df_churn = df_churn.reset_index(drop=True)

In [61]:
# First rows after the index reset; the index now starts at 0 and is contiguous.
df_churn.head(5)

Unnamed: 0,gender,level,userId,page,sessionId,itemInSession,length,song,artist,time,registration,metropolitan_area,region,operating_system,browser
0,M,paid,1749042,NextSong,22683,278,524.32934,Ich mache einen Spiegel - Dream Part 4,Popol Vuh,2018-10-01 00:00:01,2018-08-08 13:22:21,Dallas-Fort Worth-Arlington,TX,Windows,Chrome
1,M,paid,1749042,NextSong,22683,279,178.02404,Monster (Album Version),Skillet,2018-10-01 00:08:45,2018-08-08 13:22:21,Dallas-Fort Worth-Arlington,TX,Windows,Chrome
2,M,paid,1749042,NextSong,22683,280,232.61995,Seven Nation Army,The White Stripes,2018-10-01 00:11:43,2018-08-08 13:22:21,Dallas-Fort Worth-Arlington,TX,Windows,Chrome
3,M,paid,1749042,NextSong,22683,281,265.50812,Under The Bridge (Album Version),Red Hot Chili Peppers,2018-10-01 00:15:35,2018-08-08 13:22:21,Dallas-Fort Worth-Arlington,TX,Windows,Chrome
4,M,paid,1749042,NextSong,22683,282,471.69261,Circlesong 6,Bobby McFerrin,2018-10-01 00:20:00,2018-08-08 13:22:21,Dallas-Fort Worth-Arlington,TX,Windows,Chrome


In [62]:
# Persist the cleaned frame as a checkpoint file.
PROCESSED_DATA_DIR = Path("data/processing_checkpoint")
# Writing to a path whose parent directory does not exist fails, so create the
# checkpoint directory first (no-op when it already exists).
PROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)
checkpoint_file_path = PROCESSED_DATA_DIR / "01_cleaned_train.parquet"
# index=False: the index was just reset to a plain 0..n-1 range, so it is not stored.
df_churn.to_parquet(checkpoint_file_path, index=False)