In [1]:
import pandas as pd
import numpy as np
import pickle

In [2]:
event1 = pickle.load(open('../Data/event1.pkl', 'rb'))
event2 = pickle.load(open('../Data/event2.pkl', 'rb'))
events = pd.concat([event1, event2])

In [3]:
sessions = pickle.load(open('../Data/sessions.pkl', 'rb'))

### feature from 10.1-11.30, label from 12.1-12.14

In [4]:
events.columns

Index(['session_id', 'event', 'event_timestamp', 'event_value',
       'user_id_hash'],
      dtype='object')

In [5]:
sessions.columns

Index(['session_id', 'start_timestamp', 'timezone', 'timezone_offset',
       'previous_sessions_duration', 'user_created_timestamp',
       'is_user_first_session', 'is_session', 'country', 'region', 'city',
       'latitude', 'longitude', 'locale', 'os_name', 'session_index',
       'device_id', 'user_id_hash'],
      dtype='object')

In [6]:
events['event_time'] = pd.to_datetime(events['event_timestamp'],unit='ms')
sessions['start_time'] = pd.to_datetime(sessions['start_timestamp'],unit='ms')

In [7]:
events = events.sort_values(by='event_time')
sessions = sessions.sort_values(by='start_time')
events = events.reset_index(drop=True)
sessions = sessions.reset_index(drop=True)

In [8]:
events.tail()

Unnamed: 0,session_id,event,event_timestamp,event_value,user_id_hash,event_time
111946529,177520398663374058,55,1544831999738,0.0,9c57e2aac03f96e2934ffdb2284c7a715e2db154add6f4...,2018-12-14 23:59:59.738
111946530,177520398663374058,63,1544831999820,0.0,9c57e2aac03f96e2934ffdb2284c7a715e2db154add6f4...,2018-12-14 23:59:59.820
111946531,177520398663374058,47,1544831999847,0.0,9c57e2aac03f96e2934ffdb2284c7a715e2db154add6f4...,2018-12-14 23:59:59.847
111946532,6897354445902686402,5,1544831999968,0.0,96bf27d07f30a55533f968729140a15efe4fbea13acf9d...,2018-12-14 23:59:59.968
111946533,6897354445902686402,55,1544831999977,0.0,96bf27d07f30a55533f968729140a15efe4fbea13acf9d...,2018-12-14 23:59:59.977


In [9]:
sessions.tail()

Unnamed: 0,session_id,start_timestamp,timezone,timezone_offset,previous_sessions_duration,user_created_timestamp,is_user_first_session,is_session,country,region,city,latitude,longitude,locale,os_name,session_index,device_id,user_id_hash,start_time
6199331,4630145786485338110,1544831992054,America/New_York,4276967296,57572666,1543168229079,False,True,US,ny,new york,40.712776,-74.005974,en_US,Android OS,29,2286f0ad-fe81-46c1-bbe2-c53fa47dc0b7,acbe8be8a5385c199ba6cf46d99583f5c934ab6e178e57...,2018-12-14 23:59:52.054
6199332,5151306523957173938,1544831993499,America/Los_Angeles,4266167296,49514614,1543599631466,False,True,US,ca,los angeles,34.052235,-118.243683,en-US_US,iOS,50,81D95D32-3E50-435F-9E93-1A03F60BAEF1,6c72cc39fbcac0cc069851ebdec246df36fefd6429117f...,2018-12-14 23:59:53.499
6199333,3875638290458693581,1544831996682,Europe/Budapest,3600000,194801716,1542807614627,False,True,HU,bk,baja,46.181793,18.954306,hu_HU,Android OS,106,e7fbf700-5a7a-4936-8eb7-7e42473c6545,0c3ee12e6bb7a0c41b9d76783beb7d15266ade9c60cf4a...,2018-12-14 23:59:56.682
6199334,4641995394180881661,1544831997984,America/Chicago,4273367296,5428285,1541706994860,False,True,US,ar,maumelle,34.866756,-92.40432,en_US,Android OS,5,cd7fb7ba-6e2a-4cec-8e7a-1e6ecca5f71b,df83ee2cc447f02ace5af1422780f047749eec51b6f68f...,2018-12-14 23:59:57.984
6199335,4396398648740242752,1544831999484,America/Los_Angeles,4266167296,14045050,1541982118974,False,True,US,wa,seattle,47.606209,-122.332069,en-US_US,iOS,25,0956AA94-F0D9-4295-B67F-AF03672E10A3,1168d0cd1e76c4cc78098df49a93afa63b5f19e3b0d43e...,2018-12-14 23:59:59.484


In [10]:
event_end_time = events['event_time'][events.shape[0]-1] - pd.Timedelta(days=14)
session_end_time = sessions['start_time'][sessions.shape[0]-1] - pd.Timedelta(days=14)

In [11]:
# features from 10.1 - 11.30
events_feature = events[events['event_time'] <= event_end_time]
sessions_feature = sessions[sessions['start_time'] <= session_end_time]

In [12]:
events_feature = events_feature.drop('event_time', axis=1)

In [13]:
events_feature_1 = events_feature[:48902576]
events_feature_2 = events_feature[48902576:]

In [14]:
pickle.dump(events_feature_1, open('../Data/events_feature_1.pkl', 'wb'))
pickle.dump(events_feature_2, open('../Data/events_feature_2.pkl', 'wb'))
pickle.dump(sessions_feature, open('../Data/sessions_feature.pkl', 'wb'))

In [15]:
# labels from 12.1 - 12.14
events_label = events[events['event_time'] > event_end_time]

### compute labels

In [16]:
events_label.head()

Unnamed: 0,session_id,event,event_timestamp,event_value,user_id_hash,event_time
97805153,77389957355639787,5,1543622400044,1.0,20450cdc6f2baa71f5a7614923abb3097a92dfadfa95c5...,2018-12-01 00:00:00.044
97805154,4167658236034840927,45,1543622400083,0.0,150d1a292b55f6d71c5b09f714ae08af579675b2fb590c...,2018-12-01 00:00:00.083
97805155,7138928556980582923,45,1543622400106,0.0,715c63497a42c283199a5565b6bbb9aaa7e7b512b4b54d...,2018-12-01 00:00:00.106
97805156,3524213767395801970,45,1543622400112,0.0,90abb567b69e5f7746f7911f00a8f2f66f13d9b3a54915...,2018-12-01 00:00:00.112
97805157,3103980515953267388,45,1543622400124,0.0,b62a4d5bb100f060b6bdc9bb8239dbe80ccc1fbb7e10ad...,2018-12-01 00:00:00.124


In [17]:
seven_endtime = pd.to_datetime(event_end_time) + pd.Timedelta(days=7)
fourteen_endtime = pd.to_datetime(event_end_time) + pd.Timedelta(days=14)

In [18]:
events_label['purchases'] = list(events_label['event'].eq('8'))
# events_label['event'] = [e=='8' for e in events_label.event]
events_7 = events_label[np.logical_and(events_label['event_time'] > event_end_time, 
                                       events_label['event_time'] <= seven_endtime)]
events_14 = events_label[events_label['event_time'] > event_end_time]  # = events_label.copy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [19]:
print(len(events_label), len(events_14))

14141381 14141381


In [20]:
events_14 = events_14.groupby('user_id_hash')['purchases'].sum().reset_index(name='purchase_in_14_days')
events_7 = events_7.groupby('user_id_hash')['purchases'].sum().reset_index(name='purchase_in_7_days')

In [21]:
training_label = events_14.merge(events_7, how='left', on='user_id_hash')

In [22]:
training_label['purchase_in_7_days'] = [int(e!=0) for e in training_label.purchase_in_7_days]
training_label['purchase_in_14_days'] = [int(e!=0) for e in training_label.purchase_in_14_days]

In [23]:
training_label.head()

Unnamed: 0,user_id_hash,purchase_in_14_days,purchase_in_7_days
0,00000db35682058bb7916cb90f85709f54c1a0f7a3b6de...,0,0
1,000059859ec188af6035870faf885c3038cedda05b3a54...,0,0
2,000062e9be78f3da274fec338e78f89d12000e781967f2...,0,0
3,00006ad44c0d33a2a526caf75514643639b7b7070dfffd...,0,0
4,000081a7d841b0f953bdba6c185fd5cc335fd6015a30c3...,0,0


In [24]:
pickle.dump(training_label, open('../Data/training_label.pkl', 'wb'))