In [1]:
import numpy as np
import pandas as pd

In [2]:
data_path = "/Users/christopherolley/data/Leanplum-data"

In [3]:
attr_types = {
   'app_id': 'category',
   'session_id': 'category',
   'attribute': np.int16,
   'attribute_value': 'category',
   'user_id_hash': 'category'
}

event_types = {
   'user_id_hash': 'category',
   'app_id': 'category',
   'session_id': 'category',
   'event': 'category',
   'event_timestamp': np.int64,
   'event_value': np.float16
}

sessions_types = {
   'app_id': 'category',
   'session_id': 'category',
   'start_timestamp': np.int64,
   'timezone': 'category',
   'timezone_offset': np.float64,
   'previous_sessions_duration': np.int64,
   'user_created_timestamp': np.int64,
   'is_user_first_session': bool,
   'is_session': bool,
   'is_wau': bool,
   'is_mau': bool,
   'country': 'category',
   'region': 'category',
   'city': 'category',
   'latitude': np.float16,
   'longitude': np.float16,
   'locale': 'category',
   'os_name': 'category',
   'session_index': np.int32,
   'device_id': 'category',
   'user_id_hash': 'category'
}

Read in sessions and events from sessions, and remove sessions that are not true sessions:

In [4]:
sessions = pd.read_csv("%s/sessions.csv" % data_path, dtype=sessions_types)
events = pd.read_csv("%s/events.csv" % data_path, dtype=event_types)
sessions = sessions[(sessions.is_session == True) & (sessions.is_wau == False) & (sessions.is_mau == False) & (sessions.is_developer == False)]

Load in the labels:

In [13]:
train_labels_df = pd.read_feather('train_labels_df')
val_labels_df = pd.read_feather('val_labels_df')
label_df = pd.read_feather('label_df')

Define the time period for computing features (using 1st October to 30th November):

In [8]:
from datetime import datetime, timezone

In [9]:
start_date_features_train = '01.10.2018 00:00:00'
end_date_features_train = '30.11.2018 23:59:59'
pattern = '%d.%m.%Y %H:%M:%S'
features_train_start_date = int(datetime.strptime(start_date_features_train, pattern).replace(tzinfo=timezone.utc).timestamp())*1000
features_train_end_date = int(datetime.strptime(end_date_features_train, pattern).replace(tzinfo=timezone.utc).timestamp())*1000

Features are: 
- Number of sessions for each user in the last 2 months
- Average amount of time spent in previous sessions for each user
- Number of purchases made per user in the last month

In [10]:
def create_features_df(data_df):
    # compute features over the last two months
    prev_month_window_start = data_df.event_timestamp.max() - 2*28*24*60*60*1000
    prev_month_data = data_df[data_df.event_timestamp > prev_month_window_start]
    # Number of sessions for each user in the last month
    num_sessions = pd.DataFrame(prev_month_data['user_id_hash'].value_counts().reset_index())
    num_sessions.columns = ['user_id_hash', 'num_previous_sessions']
    # Average amount of time spent in previous sessions per user
    previous_sessions_time = pd.DataFrame(data_df.groupby('user_id_hash').previous_sessions_duration.mean().reset_index())
    previous_sessions_time.columns = ['user_id_hash', 'previous_sessions_avg_duration']
    # Number of purchases made per user in the last month
    prev_month_data['purchase'] = np.where(prev_month_data.event=='8', 1, 0)
    num_purchases = pd.DataFrame(prev_month_data.groupby('user_id_hash').purchase.sum().reset_index())
    # Create overall features dataframe
    features_df = pd.merge(previous_sessions_time, num_sessions, on='user_id_hash')
    features_df = pd.merge(features_df, num_purchases, on='user_id_hash')
    
    num_sessions, previous_sessions_time, num_purchases = None, None, None
    return features_df

In [14]:
sessions_features_df = sessions[(sessions.start_timestamp > features_train_start_date) & (sessions.start_timestamp < features_train_end_date)]
data_df = pd.merge(sessions_features_df, events, on=['session_id', 'user_id_hash'])
train_data_df = pd.merge(data_df, train_labels_df, on='user_id_hash')
val_data_df = pd.merge(data_df, val_labels_df, on='user_id_hash')
del data_df

In [15]:
train_features_df = create_features_df(train_data_df)
val_features_df = create_features_df(val_data_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


In [16]:
train_df = pd.merge(train_features_df, train_labels_df, on='user_id_hash')
del train_features_df
val_df = pd.merge(val_features_df, val_labels_df, on='user_id_hash')
del val_features_df

In [17]:
train_df.to_feather('train_df')
val_df.to_feather('val_df')

In [18]:
del train_df
del val_df

In [19]:
full_data_df = pd.merge(sessions, events, on=['session_id', 'user_id_hash'])
full_data_features_df = create_features_df(full_data_df)
del full_data_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


In [20]:
full_data_features_df.to_feather('full_data_df')