In [2]:
import pandas as pd

In [3]:
from pathlib import Path

# Location of the raw competition data, relative to the notebook's working directory.
DATA_DIR = Path("data/churn-prediction-25-26")
file_path_dataset = DATA_DIR.joinpath("train.parquet")

In [4]:
# Load the training parquet file into a DataFrame.
df_churn = pd.read_parquet(file_path_dataset)

In [5]:
# List the column names of the loaded frame.
df_churn.columns

Index(['status', 'gender', 'firstName', 'level', 'lastName', 'userId', 'ts',
       'auth', 'page', 'sessionId', 'location', 'itemInSession', 'userAgent',
       'method', 'length', 'song', 'artist', 'time', 'registration'],
      dtype='object')

In [6]:
# Summarise the column dtypes and the frame's memory usage.
df_churn.info()

<class 'pandas.core.frame.DataFrame'>
Index: 17499636 entries, 0 to 25661583
Data columns (total 19 columns):
 #   Column         Dtype         
---  ------         -----         
 0   status         int64         
 1   gender         object        
 2   firstName      object        
 3   level          object        
 4   lastName       object        
 5   userId         object        
 6   ts             int64         
 7   auth           object        
 8   page           object        
 9   sessionId      int64         
 10  location       object        
 11  itemInSession  int64         
 12  userAgent      object        
 13  method         object        
 14  length         float64       
 15  song           object        
 16  artist         object        
 17  time           datetime64[us]
 18  registration   datetime64[us]
dtypes: datetime64[us](2), float64(1), int64(4), object(12)
memory usage: 2.6+ GB


In [7]:
# Summary statistics for the numeric and datetime columns.
df_churn.describe()

Unnamed: 0,status,ts,sessionId,itemInSession,length,time,registration
count,17499640.0,17499640.0,17499640.0,17499640.0,14291430.0,17499636,17499636
mean,209.1387,1540428000000.0,84802.94,105.5937,248.7135,2018-10-25 00:47:01.161927,2018-08-25 04:40:21.543066
min,200.0,1538352000000.0,1.0,0.0,0.522,2018-10-01 00:00:01,2017-10-14 22:05:25
25%,200.0,1539340000000.0,25159.0,26.0,199.8885,2018-10-12 10:33:57.750000,2018-08-10 21:14:59
50%,200.0,1540397000000.0,79038.0,66.0,234.0828,2018-10-24 15:58:54,2018-09-05 18:35:50
75%,200.0,1541500000000.0,138368.0,144.0,276.8714,2018-11-06 10:25:35,2018-09-20 17:24:57
max,404.0,1542672000000.0,207003.0,1426.0,3024.666,2018-11-20 00:00:00,2018-11-19 23:34:34
std,30.2305,1233485000.0,61414.27,116.8854,97.22845,,


In [8]:
# Report the number of distinct values per column and remember the
# low-cardinality columns (fewer than 20 distinct values) for a closer look.
print("Unique Value Count")
unique_counts = df_churn.nunique()

for column_name, unique_count in unique_counts.items():
    print(f"{column_name}: {unique_count}")

few_unique_columns = unique_counts[unique_counts < 20].index.tolist()

Unique Value Count
status: 3
gender: 2
firstName: 4967
level: 2
lastName: 1000
userId: 19140
ts: 4189091
auth: 2
page: 19
sessionId: 161194
location: 875
itemInSession: 1427
userAgent: 85
method: 2
length: 23379
song: 239299
artist: 37264
time: 4189091
registration: 19118


In [9]:
# Print the distinct values of every column listed in `few_unique_columns`.
for column_name in few_unique_columns:
    unique_values = df_churn[column_name].unique()
    print(f"{column_name}: {unique_values}")

status: [200 307 404]
gender: ['M' 'F']
level: ['paid' 'free']
auth: ['Logged In' 'Cancelled']
page: ['NextSong' 'Downgrade' 'Help' 'Home' 'Thumbs Up' 'Add Friend'
 'Thumbs Down' 'Add to Playlist' 'Logout' 'About' 'Settings'
 'Save Settings' 'Cancel' 'Cancellation Confirmation' 'Submit Downgrade'
 'Roll Advert' 'Upgrade' 'Error' 'Submit Upgrade']
method: ['PUT' 'GET']


In [10]:
# Total number of churn events: rows whose page is "Cancellation Confirmation".
churn_page = "Cancellation Confirmation"
is_churn_event = df_churn["page"].eq(churn_page)
all_churn_count = is_churn_event.sum()
print(all_churn_count)

4271


In [11]:
# Number of distinct users (distinct `userId` values) in the dataset.
unique_users = df_churn["userId"].nunique()
print(unique_users)

19140


In [12]:
# Number of churn events on or after 2018-11-10 (the data ends on 2018-11-20).
end_date = pd.Timestamp("2018-11-10")
is_churn_row = df_churn["page"] == churn_page
is_on_or_after_cutoff = df_churn["time"] >= end_date
churn_after_date = (is_churn_row & is_on_or_after_cutoff).sum()
print(churn_after_date)

667


### Data cleaning

In [13]:
# Peek at a few random rows; the fixed seed keeps the displayed rows the same
# on every Restart & Run All.
df_churn.sample(5, random_state=42)

Unnamed: 0,status,gender,firstName,level,lastName,userId,ts,auth,page,sessionId,location,itemInSession,userAgent,method,length,song,artist,time,registration
23827794,200,M,Austen,paid,Horton,1484921,1539815266000,Logged In,NextSong,15304,"New York-Newark-Jersey City, NY-NJ-PA",1,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",PUT,1670.45179,Raga Haripriya,Pandit Hariprasad Chaurasia,2018-10-17 22:27:46,2018-09-16 09:11:42
9904477,200,F,Jasmine,paid,Patel,1627556,1540923084000,Logged In,NextSong,130280,"Torrington, CT",58,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",PUT,267.91138,Almost Lover (Album Version),A Fine Frenzy,2018-10-30 18:11:24,2018-06-04 13:16:00
5011934,200,F,Alexis,free,Brown,1234928,1539693030000,Logged In,NextSong,66419,"Los Angeles-Long Beach-Anaheim, CA",24,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",PUT,239.3073,You're The One,Dwight Yoakam,2018-10-16 12:30:30,2018-09-10 09:40:07
7818288,307,M,Aydyn,paid,Robinson,1575924,1540390586000,Logged In,Thumbs Up,112486,"Los Angeles-Long Beach-Anaheim, CA",443,"""Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebK...",PUT,,,,2018-10-24 14:16:26,2018-08-20 08:47:24
25448384,307,M,Richard,paid,Singh,1177669,1542309829000,Logged In,Thumbs Down,34618,"New York-Newark-Jersey City, NY-NJ-PA",67,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",PUT,,,,2018-11-15 19:23:49,2018-09-21 06:59:52


In [14]:
# Sanity check: count the userId strings that are not exactly 7 characters long.
bad_user_id_count = df_churn["userId"].str.len().ne(7).sum()
print(bad_user_id_count)

0


In [15]:
# Inspect rows whose auth state is "Cancelled"; the fixed seed keeps the
# displayed rows reproducible across runs.
mask = df_churn["auth"] == "Cancelled"
df_churn[mask].sample(5, random_state=42)

Unnamed: 0,status,gender,firstName,level,lastName,userId,ts,auth,page,sessionId,location,itemInSession,userAgent,method,length,song,artist,time,registration
7689239,200,M,Braylon,paid,Hendrix,1188019,1540358776000,Cancelled,Cancellation Confirmation,86892,"Lafayette, LA",129,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",GET,,,,2018-10-24 05:26:16,2018-08-15 17:52:50
20155032,200,M,Lucas,free,Brown,1717006,1538958684000,Cancelled,Cancellation Confirmation,7024,"Roanoke, VA",89,Mozilla/5.0 (Windows NT 6.3; WOW64; rv:31.0) G...,GET,,,,2018-10-08 00:31:24,2018-09-05 20:40:03
21391908,200,F,Paetyn,free,Webb,1186765,1539012301000,Cancelled,Cancellation Confirmation,8378,"Boston-Cambridge-Newton, MA-NH",23,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",GET,,,,2018-10-08 15:25:01,2018-09-21 22:47:06
21889306,200,F,Zia,free,Clark,1368544,1540288497000,Cancelled,Cancellation Confirmation,15696,"New York-Newark-Jersey City, NY-NJ-PA",26,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",GET,,,,2018-10-23 09:54:57,2018-09-29 16:36:10
21357848,200,M,Eric,paid,Rodriguez,1377031,1538920848000,Cancelled,Cancellation Confirmation,8305,"Thomasville, GA",122,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",GET,,,,2018-10-07 14:00:48,2018-09-07 17:39:43


In [16]:
# Cross-check: every "Cancelled" auth row should also be a churn-page row,
# so both counts are expected to match.
is_cancelled = df_churn["auth"] == "Cancelled"
is_churn_page = df_churn["page"] == churn_page

cancelled_count = is_cancelled.sum()
print(cancelled_count)
double_check_auth = (is_churn_page & is_cancelled).sum()
print(double_check_auth)

4271
4271


In [17]:
# Interpret `ts` as epoch milliseconds and convert it to datetimes so it can
# be compared side by side with the existing `time` column.
df_churn['ts'] = pd.to_datetime(df_churn["ts"], unit="ms")

In [18]:
# Compare `ts` and `time` on a few random rows; the fixed seed keeps the
# displayed rows reproducible across runs.
df_churn.sample(5, random_state=42)

Unnamed: 0,status,gender,firstName,level,lastName,userId,ts,auth,page,sessionId,location,itemInSession,userAgent,method,length,song,artist,time,registration
2024249,200,M,Baylen,free,Simmons,1380061,2018-10-07 12:41:33,Logged In,Roll Advert,49785,"New Philadelphia-Dover, OH",4,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",GET,,,,2018-10-07 12:41:33,2018-09-28 04:23:12
22070219,200,M,Aaron,free,Lawrence,1197367,2018-10-29 16:56:44,Logged In,NextSong,13709,"San Jose-Sunnyvale-Santa Clara, CA",34,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5...",PUT,147.1473,Piss And Vinegar (Album Version),Against Me!,2018-10-29 16:56:44,2018-09-15 18:48:48
20700335,200,M,Samuel,free,Wallace,1798955,2018-10-29 19:03:28,Logged In,NextSong,13934,"Chicago-Naperville-Elgin, IL-IN-WI",67,Mozilla/5.0 (Windows NT 6.1; WOW64; rv:32.0) G...,PUT,267.15383,Ghost Under Rocks,Ra Ra Riot,2018-10-29 19:03:28,2018-09-22 08:23:09
247947,200,F,Hiba,paid,White,1593526,2018-10-01 20:44:11,Logged In,NextSong,6785,"Los Angeles-Long Beach-Anaheim, CA",293,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5...",PUT,263.60118,Day Too Soon (Album Version),Sia,2018-10-01 20:44:11,2018-09-23 23:12:47
25587663,200,M,Angel,paid,Sanchez,1168061,2018-11-18 15:56:38,Logged In,NextSong,36555,"Nashville-Davidson--Murfreesboro--Franklin, TN",4,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",PUT,493.5571,Fire Wire,Cosmic Gate,2018-11-18 15:56:38,2018-08-30 05:39:41


In [19]:
# Remove the columns that are not carried into the cleaned dataset.
drop_columns = ["firstName", "lastName", "auth", "method", "ts"]
df_churn = df_churn.drop(columns=drop_columns)

In [21]:
# Cast the 7-character userId strings to a fixed-width integer. Plain `int`
# resolves to a platform-dependent width (int32 on Windows with NumPy < 2,
# int64 elsewhere); naming the dtype keeps the saved schema identical on
# every machine. Seven-digit ids fit comfortably in int32.
df_churn['userId'] = df_churn['userId'].astype("int32")

In [22]:
# Spot-check the frame after dropping columns; the fixed seed keeps the
# displayed rows reproducible across runs.
df_churn.sample(5, random_state=42)

Unnamed: 0,status,gender,level,userId,page,sessionId,location,itemInSession,userAgent,length,song,artist,time,registration
160360,200,M,paid,1405047,NextSong,26250,"Detroit-Warren-Dearborn, MI",56,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",162.87302,Camarera De Mi Amor,Antonio MachÃÂ­n,2018-10-01 15:03:38,2018-09-12 08:21:09
9009096,200,M,paid,1661966,NextSong,129416,"Dallas-Fort Worth-Arlington, TX",213,"""Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebK...",192.93995,Let Your Love Come Down,Blind Blake,2018-10-27 21:05:25,2018-03-26 08:55:31
20269907,200,F,free,1086740,Add to Playlist,8815,"Phoenix-Mesa-Scottsdale, AZ",44,Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) G...,,,,2018-10-10 22:44:41,2018-10-08 11:49:04
2552505,200,F,paid,1806776,NextSong,52111,"Rochester, NY",115,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",406.15138,Taxi (LP Version),Harry Chapin,2018-10-09 04:38:24,2018-05-24 09:49:08
7664915,200,F,paid,1752022,Settings,111467,"Springfield, MO",211,"""Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebK...",,,,2018-10-24 03:44:38,2018-07-28 23:02:09


In [23]:
# Count missing values per column.
nan_count = df_churn.isna().sum()

In [24]:
# Show the per-column missing-value counts.
print(nan_count)

status                 0
gender                 0
level                  0
userId                 0
page                   0
sessionId              0
location               0
itemInSession          0
userAgent              0
length           3208203
song             3208203
artist           3208203
time                   0
registration           0
dtype: int64


In [25]:
# Count empty-string entries per object column; these would not be caught by isna().
object_columns_frame = df_churn.select_dtypes(include=["object"])
empty_string_count = object_columns_frame.eq("").sum()

In [26]:
# Show the per-column empty-string counts.
print(empty_string_count)

gender       0
level        0
page         0
location     0
userAgent    0
song         0
artist       0
dtype: int64


In [27]:
# Inspect rows with status 307; the fixed seed keeps the displayed rows
# reproducible across runs.
mask = df_churn["status"] == 307
df_churn[mask].sample(10, random_state=42)

Unnamed: 0,status,gender,level,userId,page,sessionId,location,itemInSession,userAgent,length,song,artist,time,registration
5690379,307,M,paid,1102502,Thumbs Up,87157,"New York-Newark-Jersey City, NY-NJ-PA",21,Mozilla/5.0 (X11; Linux x86_64; rv:31.0) Gecko...,,,,2018-10-18 06:19:56,2018-09-02 06:56:51
4616984,307,M,paid,1004060,Add Friend,78672,"Russellville, AR",2,"""Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_2 like...",,,,2018-10-15 12:01:21,2018-09-22 07:40:46
5787734,307,F,paid,1621832,Thumbs Down,95686,"San Antonio-New Braunfels, TX",41,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",,,,2018-10-18 12:49:58,2018-09-07 06:45:46
23489524,307,F,paid,1500075,Thumbs Up,11801,"Urban Honolulu, HI",92,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",,,,2018-10-11 20:34:21,2018-09-11 12:51:17
7086162,307,F,free,1134610,Logout,107067,"Shreveport-Bossier City, LA",34,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",,,,2018-10-22 16:32:27,2018-08-16 15:46:26
16178170,307,F,paid,1508496,Thumbs Up,202805,"Washington-Arlington-Alexandria, DC-VA-MD-WV",18,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",,,,2018-11-19 02:58:35,2018-08-30 17:26:14
5196997,307,M,paid,1822046,Thumbs Up,84564,"Houston-The Woodlands-Sugar Land, TX",86,"""Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_2 like...",,,,2018-10-16 23:01:15,2018-07-20 17:23:53
23239595,307,M,paid,1768791,Thumbs Up,8816,"Boone, IA",132,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",,,,2018-10-07 21:18:32,2018-08-22 08:29:29
3241291,307,F,paid,1160692,Add Friend,58000,"El Campo, TX",1,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_...",,,,2018-10-10 23:35:26,2018-09-21 13:20:32
12120690,307,M,paid,1591288,Thumbs Up,160727,"Grants, NM",171,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",,,,2018-11-06 12:29:10,2018-09-28 21:33:47


In [28]:
# Count the status-307 rows that carry a song value.
mask = df_churn["song"].notna() & (df_churn["status"] == 307)
int(mask.sum())

0

In [29]:
# Pages that occur with status 307, followed by all pages for comparison.
mask = df_churn["status"] == 307
unique_values_307 = df_churn.loc[mask, "page"].unique()
unique_values_page = df_churn["page"].unique()

print(unique_values_307)
print(unique_values_page)

['Thumbs Up' 'Add Friend' 'Thumbs Down' 'Logout' 'Save Settings' 'Cancel'
 'Submit Downgrade' 'Submit Upgrade']
['NextSong' 'Downgrade' 'Help' 'Home' 'Thumbs Up' 'Add Friend'
 'Thumbs Down' 'Add to Playlist' 'Logout' 'About' 'Settings'
 'Save Settings' 'Cancel' 'Cancellation Confirmation' 'Submit Downgrade'
 'Roll Advert' 'Upgrade' 'Error' 'Submit Upgrade']


In [30]:
# Show the rows that returned status 404.
mask = df_churn["status"].eq(404)
df_churn.loc[mask]

Unnamed: 0,status,gender,level,userId,page,sessionId,location,itemInSession,userAgent,length,song,artist,time,registration
2788052,404,F,free,1697168,Error,58979,"Hilo, HI",14,Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; r...,,,,2018-10-09 19:12:32,2018-09-08 13:48:25
13897750,404,F,paid,1697168,Error,175256,"Hilo, HI",37,Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; r...,,,,2018-11-12 00:55:32,2018-09-08 13:48:25
16297984,404,F,paid,1697168,Error,201893,"Hilo, HI",318,Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; r...,,,,2018-11-19 13:08:39,2018-09-08 13:48:25
602280,404,M,paid,1222580,Error,30295,"Watertown, SD",16,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",,,,2018-10-02 22:06:26,2018-08-16 02:31:00
602568,404,M,paid,1222580,Error,30295,"Watertown, SD",18,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",,,,2018-10-02 22:07:35,2018-08-16 02:31:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25087053,404,M,paid,1934047,Error,1109,"El Dorado, AR",75,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",,,,2018-11-09 07:50:11,2018-08-31 04:28:43
25089645,404,M,paid,1934047,Error,1109,"El Dorado, AR",100,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",,,,2018-11-09 09:19:53,2018-08-31 04:28:43
25179115,404,F,free,1912269,Error,2534,"Seattle-Tacoma-Bellevue, WA",72,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",,,,2018-11-11 04:03:34,2018-11-11 01:12:59
25248788,404,M,free,1882230,Error,2951,"Houston-The Woodlands-Sugar Land, TX",98,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",,,,2018-11-12 17:33:20,2018-04-06 02:14:55


In [31]:
# Pages that occur with status 404. The filter is built here instead of
# reusing `mask`: that name is reassigned by many cells, so depending on it
# would make this result depend on which cell happened to run last.
status_404_mask = df_churn["status"] == 404
unique_values_404 = df_churn.loc[status_404_mask, "page"].unique()
print(unique_values_404)

['Error']


In [32]:
# Pages that occur with status 200.
mask = df_churn["status"].eq(200)
unique_values_200 = df_churn.loc[mask, "page"].unique()
print(unique_values_200)

['NextSong' 'Downgrade' 'Help' 'Home' 'Add to Playlist' 'About' 'Settings'
 'Cancellation Confirmation' 'Roll Advert' 'Upgrade']


In [33]:
# `status` is redundant only if `page` alone determines it, i.e. no page shows
# up under more than one status code. Compare all three status groups
# (200, 307 and 404) pairwise before declaring the column safe to drop.
pages_200 = set(unique_values_200)
pages_307 = set(unique_values_307)
pages_404 = set(unique_values_404)

status_overlap = (
    (pages_200 & pages_307) | (pages_200 & pages_404) | (pages_307 & pages_404)
)
if not status_overlap:
    print("No overlap here. Safe to delete status")
else:
    print(f"Overlap in {status_overlap}. Can't delete status")

No overlap here. Safe to delete status


In [34]:
# `status` carries no information beyond `page`, so remove it.
df_churn = df_churn.drop(columns="status")

In [35]:
# Spot-check the frame after removing `status`; the fixed seed keeps the
# displayed rows reproducible across runs.
df_churn.sample(5, random_state=42)

Unnamed: 0,gender,level,userId,page,sessionId,location,itemInSession,userAgent,length,song,artist,time,registration
25518394,F,paid,1221049,NextSong,35439,"Boston-Cambridge-Newton, MA-NH",128,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",232.22812,Hero/Heroine,Boys Like Girls,2018-11-16 20:28:31,2018-07-02 06:53:14
4332188,F,paid,1125199,NextSong,75864,"Bloomington, IN",23,Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20...,280.13669,Du bist nicht allein,WESTERNHAGEN (HCL),2018-10-14 08:48:16,2018-09-25 07:24:20
20279844,M,paid,1838648,NextSong,9200,"Plymouth, IN",168,"""Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebK...",195.29098,Last Scene Of Struggling,Finger Eleven,2018-10-11 08:22:20,2018-09-03 13:27:50
11000236,F,paid,1984747,NextSong,146516,"Charlotte-Concord-Gastonia, NC-SC",25,"""Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebK...",219.79383,X Gon' Give It To Ya,DMX,2018-11-02 15:07:54,2018-05-20 17:46:31
25257009,F,paid,1562005,Help,32313,"Glens Falls, NY",10,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",,,,2018-11-12 19:51:21,2018-09-30 14:43:42


In [36]:
# Split "<area>, <state codes>" on the LAST ", " only. This always yields
# exactly two parts (column 0 = area, column 1 = state codes), even if an
# area name itself contains a comma, instead of silently truncating it.
location_split = df_churn['location'].str.rsplit(', ', n=1, expand=True)

In [37]:
# Store the two halves of the split location as separate, whitespace-trimmed columns.
area_part = location_split[0]
state_part = location_split[1]

df_churn['metropolitan_area'] = area_part.str.strip()
df_churn['state'] = state_part.str.strip()

In [38]:
# Replace missing song metadata with explicit placeholders so the columns
# contain no NaN values.
# NOTE(review): a length of 0 for rows without a song will pull down any
# average computed over `length` - filter those rows out when aggregating.
placeholders = {"artist": "No artist", "song": "No song", "length": 0}
for field_name, placeholder in placeholders.items():
    df_churn[field_name] = df_churn[field_name].fillna(placeholder)

In [39]:
# Count missing values per column again.
print(df_churn.isna().sum())

gender               0
level                0
userId               0
page                 0
sessionId            0
location             0
itemInSession        0
userAgent            0
length               0
song                 0
artist               0
time                 0
registration         0
metropolitan_area    0
state                0
dtype: int64


In [40]:
# Compare the cardinality of the derived columns with the original location.
unique_states = df_churn["state"].unique()
unique_area = df_churn["metropolitan_area"].unique()
unique_location = df_churn["location"].unique()

for distinct_values in (unique_states, unique_area, unique_location):
    print(len(distinct_values))

100
806
875


In [41]:
# Show the distinct values of the `state` column.
print(unique_states)

['TX' 'CA' 'HI' 'SD' 'MD' 'FL' 'IN' 'PA' 'NY' 'VA' 'PA-NJ-DE-MD' 'VA-NC'
 'MA-NH' 'GA' 'KY' 'MO-IL' 'DC-VA-MD-WV' 'AZ' 'OH' 'OR' 'TN-MS-AR'
 'IL-IN-WI' 'TN-GA' 'NY-NJ-PA' 'MI' 'WA' 'WY' 'AL' 'NH' 'NV' 'SC' 'MN-WI'
 'TN' 'MN' 'CO' 'MO' 'OH-KY-IN' 'IN-KY' 'NE-IA' 'RI-MA' 'MA-CT' 'LA' 'CT'
 'OK' 'NC' 'NJ' 'WV' 'KS' 'MD-WV' 'MS' 'AR' 'PA-NJ' 'IL' 'WI' 'MS-LA' 'IA'
 'OR-WA' 'TN-VA' 'ME' 'NM' 'NC-SC' 'IA-IL' 'UT' 'KY-IN' 'GA-AL' 'MO-KS'
 'OH-PA' 'MA' 'NE' 'IN-MI' 'ID' 'AR-OK' 'TN-KY' 'SC-NC' 'MT' 'WV-KY-OH'
 'GA-SC' 'AR-MO' 'DE' 'ND' 'KY-IL' 'VT' 'AK' 'WY-ID' 'UT-ID' 'WV-OH'
 'TX-AR' 'MD-DE' 'IA-NE-SD' 'OR-ID' 'ND-MN' 'VA-WV' 'ID-WA' 'NH-VT'
 'IL-MO' 'WI-MI' 'IA-IL-MO' 'WI-MN' 'WV-VA' 'MI-WI']


In [42]:
# The `state` values can combine several state codes (e.g. "NY-NJ-PA"),
# so name the column `region` instead.
df_churn = df_churn.rename(columns={"state": "region"})

In [43]:
# `location` has been split into separate columns, so drop the original.
df_churn = df_churn.drop(columns="location")

In [44]:
# Widen the column display so long user-agent strings are shown in full.
pd.set_option('display.max_colwidth', 500)
# Fixed seed keeps the displayed rows reproducible across runs.
df_churn["userAgent"].sample(5, random_state=42)

1265610                                 "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36"
21821498                                                              Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:31.0) Gecko/20100101 Firefox/31.0
1483041                         "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.78.2 (KHTML, like Gecko) Version/7.0.6 Safari/537.78.2"
12465771    "Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_2 like Mac OS X) AppleWebKit/537.51.2 (KHTML, like Gecko) Version/7.0 Mobile/11D257 Safari/9537.53"
23113094                                "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36"
Name: userAgent, dtype: object

In [45]:
# List every distinct user-agent string to decide how to parse them.
unique_useragents = df_churn["userAgent"].unique()
print(unique_useragents)

['"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36"'
 '"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36"'
 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:31.0) Gecko/20100101 Firefox/31.0'
 '"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.77.4 (KHTML, like Gecko) Version/7.0.5 Safari/537.77.4"'
 'Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0'
 '"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36"'
 '"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36"'
 '"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36"'
 '"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.78.2 (KHTML, like Gecko) Version/7.0.6 Safari/537.78.2"'
 '"Mozilla/5.0 (Windows NT 6.3; 

In [46]:
# Derive the operating system from the lower-cased user-agent string.
user_agent_lower = df_churn["userAgent"].str.lower()
df_churn["operating_system"] = "Other"

# Applied in order: a later match overwrites an earlier one.
os_patterns = [
    ("windows", "Windows"),
    ("macintosh", "Macintosh"),
    ("linux|x11|ubuntu", "Linux"),
    ("iphone", "iPhone"),
    ("ipad", "iPad"),
]
for os_pattern, os_label in os_patterns:
    matches = user_agent_lower.str.contains(os_pattern, regex=True)
    df_churn.loc[matches, "operating_system"] = os_label


In [47]:
# Row counts per derived operating system.
print(df_churn["operating_system"].value_counts())

operating_system
Windows      8565354
Macintosh    6931492
Linux        1134259
iPhone        629357
iPad          239174
Name: count, dtype: int64


In [48]:
# Derive the browser from the lower-cased user-agent string.
df_churn['browser'] = 'Other'

# Applied in order: a later match overwrites an earlier one. Order matters
# because Chrome user agents also contain the token "safari".
# NOTE(review): "trident" and "msie" are Internet Explorer tokens, yet they
# are labelled "Edge" here - confirm that grouping is intended.
browser_patterns = [
    ("safari", "Safari"),
    ("chrome", "Chrome"),
    ("firefox", "Firefox"),
    ("trident|edge|msie", "Edge"),
]
for browser_pattern, browser_label in browser_patterns:
    matches = user_agent_lower.str.contains(browser_pattern)
    df_churn.loc[matches, "browser"] = browser_label

In [49]:
# Row counts per derived browser.
print(df_churn["browser"].value_counts())

browser
Chrome     9368776
Firefox    4043953
Safari     3165525
Edge        921382
Name: count, dtype: int64


In [50]:
# Spot-check the derived operating_system / browser columns against the raw
# user agent; the fixed seed keeps the displayed rows reproducible across runs.
df_churn.sample(5, random_state=42)

Unnamed: 0,gender,level,userId,page,sessionId,itemInSession,userAgent,length,song,artist,time,registration,metropolitan_area,region,operating_system,browser
15049877,F,paid,1610016,NextSong,179241,264,"""Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36""",207.20281,Rosario Tijeras,Juanes,2018-11-15 04:49:09,2018-09-04 04:15:13,Brunswick,GA,Windows,Chrome
1465998,M,free,1073857,NextSong,44292,36,Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20100101 Firefox/31.0,297.69098,Ordinary World,Red,2018-10-05 09:14:04,2018-08-07 21:26:30,Hagerstown-Martinsburg,MD-WV,Windows,Firefox
12448460,F,free,1316155,NextSong,148207,4,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36""",219.76771,I Don't Know,Wax Tailor,2018-11-07 08:33:48,2018-09-04 20:07:59,Detroit-Warren-Dearborn,MI,Macintosh,Chrome
7099116,F,free,1115949,NextSong,109613,24,"""Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_2 like Mac OS X) AppleWebKit/537.51.2 (KHTML, like Gecko) Version/7.0 Mobile/11D257 Safari/9537.53""",259.94404,The Memory Of Trees (Instrumental),Enya,2018-10-22 17:14:08,2018-07-31 16:48:23,Oklahoma City,OK,iPhone,Safari
5191483,F,paid,1195247,NextSong,86941,259,Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20100101 Firefox/31.0,210.20689,Memphis Jellyroll,Stefan Grossman,2018-10-16 22:41:58,2018-08-04 01:06:50,Los Angeles-Long Beach-Anaheim,CA,Windows,Firefox


In [51]:
# The raw user-agent string has been reduced to OS and browser columns; remove it.
df_churn.drop(columns="userAgent", inplace=True)

In [52]:
# First rows before re-indexing: the index labels still have gaps.
df_churn.head(5)

Unnamed: 0,gender,level,userId,page,sessionId,itemInSession,length,song,artist,time,registration,metropolitan_area,region,operating_system,browser
0,M,paid,1749042,NextSong,22683,278,524.32934,Ich mache einen Spiegel - Dream Part 4,Popol Vuh,2018-10-01 00:00:01,2018-08-08 13:22:21,Dallas-Fort Worth-Arlington,TX,Windows,Chrome
992,M,paid,1749042,NextSong,22683,279,178.02404,Monster (Album Version),Skillet,2018-10-01 00:08:45,2018-08-08 13:22:21,Dallas-Fort Worth-Arlington,TX,Windows,Chrome
1360,M,paid,1749042,NextSong,22683,280,232.61995,Seven Nation Army,The White Stripes,2018-10-01 00:11:43,2018-08-08 13:22:21,Dallas-Fort Worth-Arlington,TX,Windows,Chrome
1825,M,paid,1749042,NextSong,22683,281,265.50812,Under The Bridge (Album Version),Red Hot Chili Peppers,2018-10-01 00:15:35,2018-08-08 13:22:21,Dallas-Fort Worth-Arlington,TX,Windows,Chrome
2366,M,paid,1749042,NextSong,22683,282,471.69261,Circlesong 6,Bobby McFerrin,2018-10-01 00:20:00,2018-08-08 13:22:21,Dallas-Fort Worth-Arlington,TX,Windows,Chrome


In [54]:
# Replace the gappy index labels with a contiguous 0..n-1 range.
df_churn.index = pd.RangeIndex(len(df_churn))

In [None]:
# First rows after re-indexing: labels now run 0, 1, 2, ...
df_churn.head(5)

Unnamed: 0,gender,level,userId,page,sessionId,itemInSession,length,song,artist,time,registration,metropolitan_area,region,operating_system,browser
0,M,paid,1749042,NextSong,22683,278,524.32934,Ich mache einen Spiegel - Dream Part 4,Popol Vuh,2018-10-01 00:00:01,2018-08-08 13:22:21,Dallas-Fort Worth-Arlington,TX,Windows,Chrome
1,M,paid,1749042,NextSong,22683,279,178.02404,Monster (Album Version),Skillet,2018-10-01 00:08:45,2018-08-08 13:22:21,Dallas-Fort Worth-Arlington,TX,Windows,Chrome
2,M,paid,1749042,NextSong,22683,280,232.61995,Seven Nation Army,The White Stripes,2018-10-01 00:11:43,2018-08-08 13:22:21,Dallas-Fort Worth-Arlington,TX,Windows,Chrome
3,M,paid,1749042,NextSong,22683,281,265.50812,Under The Bridge (Album Version),Red Hot Chili Peppers,2018-10-01 00:15:35,2018-08-08 13:22:21,Dallas-Fort Worth-Arlington,TX,Windows,Chrome
4,M,paid,1749042,NextSong,22683,282,471.69261,Circlesong 6,Bobby McFerrin,2018-10-01 00:20:00,2018-08-08 13:22:21,Dallas-Fort Worth-Arlington,TX,Windows,Chrome


In [56]:
# Count the users that appear with more than one subscription level.
level_change_counts = df_churn.groupby("userId")["level"].nunique()
has_multiple_levels = level_change_counts.gt(1)
users_with_changes = level_change_counts.loc[has_multiple_levels]
print(f"# of users that changed between levels: {len(users_with_changes)}")

# of users that changed between levels: 10019


In [57]:
# Count the users that appear with more than one operating system or browser.
events_by_user = df_churn.groupby("userId")

os_change_counts = events_by_user["operating_system"].nunique()
browser_change_counts = events_by_user["browser"].nunique()

users_os_changes = os_change_counts.loc[os_change_counts > 1]
users_browser_changes = browser_change_counts.loc[browser_change_counts > 1]

print(f"# of users that changed between Operating Systems: {len(users_os_changes)}")
print(f"# of users that changed between Browsers: {len(users_browser_changes)}")

# of users that changed between Operating Systems: 0
# of users that changed between Browsers: 0


In [58]:
# Measure the real memory footprint (including string contents) before dtype optimisation.
df_churn.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17499636 entries, 0 to 17499635
Data columns (total 15 columns):
 #   Column             Dtype         
---  ------             -----         
 0   gender             object        
 1   level              object        
 2   userId             int32         
 3   page               object        
 4   sessionId          int64         
 5   itemInSession      int64         
 6   length             float64       
 7   song               object        
 8   artist             object        
 9   time               datetime64[us]
 10  registration       datetime64[us]
 11  metropolitan_area  object        
 12  region             object        
 13  operating_system   object        
 14  browser            object        
dtypes: datetime64[us](2), float64(1), int32(1), int64(2), object(9)
memory usage: 10.4 GB


In [None]:
# Convert every object column whose distinct-value count is below half the
# row count to the memory-efficient `category` dtype.
row_count = len(df_churn)
object_columns = df_churn.select_dtypes(include="object").columns

for col in object_columns:
    if df_churn[col].nunique() < row_count * 0.5:
        df_churn[col] = df_churn[col].astype("category")


In [60]:
# Measure the memory footprint again after the category conversion.
df_churn.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17499636 entries, 0 to 17499635
Data columns (total 15 columns):
 #   Column             Dtype         
---  ------             -----         
 0   gender             category      
 1   level              category      
 2   userId             int32         
 3   page               category      
 4   sessionId          int64         
 5   itemInSession      int64         
 6   length             float64       
 7   song               category      
 8   artist             category      
 9   time               datetime64[us]
 10  registration       datetime64[us]
 11  metropolitan_area  category      
 12  region             category      
 13  operating_system   category      
 14  browser            category      
dtypes: category(9), datetime64[us](2), float64(1), int32(1), int64(2)
memory usage: 1.0 GB


In [61]:
# Confirm the displayed values are unchanged by the dtype conversion.
df_churn.head(5)

Unnamed: 0,gender,level,userId,page,sessionId,itemInSession,length,song,artist,time,registration,metropolitan_area,region,operating_system,browser
0,M,paid,1749042,NextSong,22683,278,524.32934,Ich mache einen Spiegel - Dream Part 4,Popol Vuh,2018-10-01 00:00:01,2018-08-08 13:22:21,Dallas-Fort Worth-Arlington,TX,Windows,Chrome
1,M,paid,1749042,NextSong,22683,279,178.02404,Monster (Album Version),Skillet,2018-10-01 00:08:45,2018-08-08 13:22:21,Dallas-Fort Worth-Arlington,TX,Windows,Chrome
2,M,paid,1749042,NextSong,22683,280,232.61995,Seven Nation Army,The White Stripes,2018-10-01 00:11:43,2018-08-08 13:22:21,Dallas-Fort Worth-Arlington,TX,Windows,Chrome
3,M,paid,1749042,NextSong,22683,281,265.50812,Under The Bridge (Album Version),Red Hot Chili Peppers,2018-10-01 00:15:35,2018-08-08 13:22:21,Dallas-Fort Worth-Arlington,TX,Windows,Chrome
4,M,paid,1749042,NextSong,22683,282,471.69261,Circlesong 6,Bobby McFerrin,2018-10-01 00:20:00,2018-08-08 13:22:21,Dallas-Fort Worth-Arlington,TX,Windows,Chrome


In [62]:
# Persist the cleaned frame as a checkpoint for the following notebooks.
PROCESSED_DATA_DIR = Path("data/processing_checkpoint")
# Create the checkpoint directory if needed; on a fresh checkout it does not
# exist yet and the write below would fail.
PROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)
checkpoint_file_path = PROCESSED_DATA_DIR / "01_cleaned_train.parquet"
df_churn.to_parquet(checkpoint_file_path, index=False)

In [63]:
# Final column dtypes of the frame that was written to the checkpoint.
df_churn.dtypes

gender                     category
level                      category
userId                        int32
page                       category
sessionId                     int64
itemInSession                 int64
length                      float64
song                       category
artist                     category
time                 datetime64[us]
registration         datetime64[us]
metropolitan_area          category
region                     category
operating_system           category
browser                    category
dtype: object