# Setup

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
import time

# Data Load

Loading data from pickle files for faster and smaller storage footprint.  
Sessions Data has been prefiltered to remove:  
- sessions.is_session == True
- sessions.is_wau == False
- sessions.is_mau == False
- sessions.is_developer == False

In [2]:
# Read Files
data_path = "Data/"
events = pd.read_pickle(data_path + 'events.pkl')
attr = pd.read_pickle(data_path + 'attr.pkl')
session = pd.read_pickle(data_path + 'session.pkl')

In [3]:
events.head()

Unnamed: 0,session_id,event,event_timestamp,event_value,user_id_hash
0,5558845121177764917,45,1542215397132,0.0,9943447915df3a45fd6720a026af905b6da6b56a37701b...
1,5558845121177764917,45,1542215484895,0.0,9943447915df3a45fd6720a026af905b6da6b56a37701b...
2,7689508378645584666,.m5100869650219008,1541124410372,0.0,9943447915df3a45fd6720a026af905b6da6b56a37701b...
3,2201961907282901522,4,1543713091129,0.0,9943447915df3a45fd6720a026af905b6da6b56a37701b...
4,2201961907282901522,6,1543713093116,0.0,9943447915df3a45fd6720a026af905b6da6b56a37701b...


In [4]:
session.head()

Unnamed: 0,session_id,start_timestamp,timezone,timezone_offset,previous_sessions_duration,user_created_timestamp,is_user_first_session,country,region,city,latitude,longitude,locale,os_name,session_index,device_id,user_id_hash
0,5558845121177764917,1542215364580,Asia/Manila,28800000.0,25837591,1538874289458,False,PH,0,makati,14.554729,121.024445,en_GB,Android OS,30,546a3d98-d540-4e72-ad82-9ebd64e0839b,9943447915df3a45fd6720a026af905b6da6b56a37701b...
1,18781111175537580,1539215568666,Asia/Manila,28800000.0,11343848,1538874289458,False,PH,0,makati,14.554729,121.024445,en_GB,Android OS,10,546a3d98-d540-4e72-ad82-9ebd64e0839b,9943447915df3a45fd6720a026af905b6da6b56a37701b...
2,1477540082628742048,1540120743010,Asia/Manila,28800000.0,13499724,1538874289458,False,PH,11,davao city,7.190708,125.455338,en_GB,Android OS,13,546a3d98-d540-4e72-ad82-9ebd64e0839b,9943447915df3a45fd6720a026af905b6da6b56a37701b...
3,8184875317380844086,1542671625528,Asia/Manila,28800000.0,32788010,1538874289458,False,PH,0,makati,14.554729,121.024445,en_GB,Android OS,41,546a3d98-d540-4e72-ad82-9ebd64e0839b,9943447915df3a45fd6720a026af905b6da6b56a37701b...
4,4706180700083856343,1538997913013,Asia/Manila,28800000.0,5872534,1538874289458,False,PH,11,davao city,7.190708,125.455338,en_GB,Android OS,4,546a3d98-d540-4e72-ad82-9ebd64e0839b,9943447915df3a45fd6720a026af905b6da6b56a37701b...


# Time Split

In [5]:
print(
    f"Beginning of Sessions Data: {datetime.utcfromtimestamp(session.start_timestamp.min()/1000).strftime('%Y-%m-%d %H:%M:%S')}")
print(
    f"End of Sessions Data: {datetime.utcfromtimestamp(session.start_timestamp.max()/1000).strftime('%Y-%m-%d %H:%M:%S')}")

Beginning of Sessions Data: 2018-10-01 07:00:04
End of Sessions Data: 2018-12-14 23:59:59


We will create labels during period Dec 1st and Dec 14th, and use features from Oct 1st to Nov 30th to train the model.

In [6]:
# Get the time stamp to split data.
nov_30_2018 = datetime(2018, 11, 30, 23, 59, 59).timestamp() * 1000
dec_7_2018 = datetime(2018, 12, 7, 23, 59, 59).timestamp() * 1000

print(f"Nov. 30th, 2018 in UTC: {nov_30_2018}")
print(f"Dec. 7th, 2018 in UTC: {dec_7_2018}")

Nov. 30th, 2018 in UTC: 1543622399000.0
Dec. 7th, 2018 in UTC: 1544227199000.0


In [7]:
# Split the data specifically for df feature generation
events_training = events[events.event_timestamp <= nov_30_2018]
session_training = session[session.start_timestamp <= dec_7_2018]

# Training Set Labels

In [8]:
# Label1 7-day purchase
purchase_user_7 = set(events[(events.event == '8') & (
    events.event_timestamp > nov_30_2018) & (events.event_timestamp <= dec_7_2018)].user_id_hash)
# Label2 14-day purchase
purchase_user_14 = set(events[(events.event == '8') & (
    events.event_timestamp > nov_30_2018)].user_id_hash)

In [9]:
print(f"# of Users Purchasing in 7 Days: {len(purchase_user_7)}")
print(f"# of Users Purchasing in 14 Days: {len(purchase_user_14)}")

# of Users Purchasing in 7 Days: 4729
# of Users Purchasing in 14 Days: 6126


### Create Data Frame for Training Set

In [10]:
training = pd.DataFrame(data=list(set(events.user_id_hash) & set(session.user_id_hash)),
                        columns=['user_id_hash'])

### Create Labels

In [11]:
def label_create(df, label_1='user_purchase_binary_7_days', label_2='user_purchase_binary_14_days'):
    df[label_1] = df['user_id_hash'].apply(lambda x: x in purchase_user_7)
    df[label_2] = df['user_id_hash'].apply(lambda x: x in purchase_user_14)
    return df

In [12]:
training = label_create(training)

In [13]:
training.head()

Unnamed: 0,user_id_hash,user_purchase_binary_7_days,user_purchase_binary_14_days
0,2880cfd05d5afc0784ddb111f9ff4d0f0ebcb2e10337c0...,False,False
1,5d04148ec90fcb81d610a2664e1c26c07ff4c844344fc6...,False,False
2,8d746e4ab853f23b9f62e5e77a0396baecd63b372f1e14...,False,False
3,0aa6b40eb5885be065ddfd9459c3d83b4e2c633fd83ea8...,False,False
4,ca1c270bdc376000df83d8be3114df02c88f6931b77ced...,False,False


<br>
<br>
<br>

# Save Notebook

In [14]:
import dill
dill.dump_session('Notebook_Saves/compute_target.db')