In [23]:
import os
from datetime import datetime

import pandas as pd

pd.set_option('display.max_columns', None)

In [3]:
profiles = pd.read_json('../data/raw/profile.json')
offers = pd.read_json('../data/raw/offers.json')
transactions = pd.read_json('../data/raw/transactions.json')

In [4]:
# Profiles
profiles['registered_on'] = pd.to_datetime(profiles['registered_on'], format='%Y%m%d')
today = pd.Timestamp('2018-07-27')

profiles['days_since_register'] = (
    today - pd.to_datetime(profiles['registered_on'])
).dt.days.clip(lower=0).astype('Int64')
profiles['months_since_register'] = profiles['days_since_register'] / 30

# Offers
offers['category'] = offers.index

# Transactions
for idx, item in transactions.iterrows():
    value_dict = item.value
    item.value['offer_id'] = value_dict['offer_id'] if value_dict['offer_id'] else value_dict['offer id']
    item.value.pop('offer id', None)

expanded = pd.DataFrame(transactions['value'].tolist())
transactions = pd.concat([transactions, expanded], axis=1)
transactions = transactions.drop('value', axis=1)

In [5]:
profiles['is_new_customer'] = (profiles['months_since_register'] <= 8)
profiles['is_continuous_customer'] = ((profiles['months_since_register'] > 8) & (profiles['months_since_register'] <= 12))
profiles['is_tenured_customer'] = ((profiles['months_since_register'] > 12) & (profiles['months_since_register'] <= 26))
profiles['is_high_tenured_customer'] = ((profiles['months_since_register'] > 26) & (profiles['months_since_register'] <= 46))
profiles['is_extreme_tenured_customer'] = ((profiles['months_since_register'] > 46) & (profiles['months_since_register'] <= 57))

In [6]:
category_names = offers['channels'].explode().dropna().unique().tolist()

dummies = (
    pd.get_dummies(offers['channels'].explode())
      .groupby(level=0).max()
      .reindex(offers.index, fill_value=0)
      .reindex(columns=category_names, fill_value=0)
)

offers = offers.join(dummies.add_prefix('has_'))

In [7]:
data = transactions.merge(
    profiles.add_prefix('profile_'), left_on='account_id', right_on='profile_id', how='left'
).merge(
    offers.add_prefix('offer_'), left_on='offer_id', right_on='offer_id', how='left',
)

In [8]:
offers_received = data[data.event == "offer received"].copy()
offers_completed = transactions[transactions.event == "offer completed"].copy()
offers_completed.rename(columns={'time_since_test_start': 'completion_time'}, inplace=True)

merged = offers_received.merge(offers_completed, on=['account_id', 'offer_id'], how='left')
merged['label'] = (
    (merged['completion_time'] >= merged['time_since_test_start']) &
    (merged['completion_time'] <= merged['time_since_test_start'] + merged['offer_duration'])
).astype(int)



offers_received_labeled = (
    merged.groupby(['account_id', 'offer_id', 'time_since_test_start', 'offer_duration'], as_index=False)
          .agg({'label': 'max'})
)

In [9]:
tx = transactions[transactions.event == "transaction"].copy()

daily_tx = tx.groupby(['account_id', 'time_since_test_start'], as_index=False).agg(
    daily_spent=('amount', 'sum'),
    daily_count=('amount', 'count')
)

daily_tx['cum_spent'] = daily_tx.groupby('account_id')['daily_spent'].cumsum()
daily_tx['cum_count'] = daily_tx.groupby('account_id')['daily_count'].cumsum()

offers_received_w_daily = offers_received.merge(
    daily_tx,
    on=['account_id', 'time_since_test_start'],
    how='left'
)

offers_received_w_daily[['daily_spent', 'daily_count', 'cum_spent', 'cum_count']] = \
    offers_received_w_daily[['daily_spent', 'daily_count', 'cum_spent', 'cum_count']].fillna(0)

offers_received_w_daily['hist_spent'] = offers_received_w_daily['cum_spent'] - offers_received_w_daily['daily_spent']
offers_received_w_daily['hist_count'] = offers_received_w_daily['cum_count'] - offers_received_w_daily['daily_count']

In [10]:
dataset = offers_received_labeled.merge(
    offers_received_w_daily[
        ['account_id', 'offer_id', 'time_since_test_start', 'hist_spent', 'hist_count']
    ],
    on=['account_id', 'offer_id', 'time_since_test_start'],
    how='left'
)

In [11]:
dataset = dataset.merge(
    offers[['duration', 'min_value', 'offer_type', 'discount_value', 'has_email', 'has_mobile', 'has_social', 'has_web', 'id']],
    left_on='offer_id',
    right_on='id',
    how='left'
)

In [12]:
dataset = dataset.merge(
    profiles[[
        'age', 
        'gender', 
        'months_since_register', 
        'is_new_customer', 
        'is_continuous_customer', 
        'is_tenured_customer', 
        'is_high_tenured_customer', 
        'is_extreme_tenured_customer',
        'credit_card_limit',
        'id'
    ]],
    left_on='account_id',
    right_on='id',
    how='left'
)

In [13]:
daily_tx = tx.groupby(['account_id', 'time_since_test_start'], as_index=False).agg(
    daily_spent=('amount', 'sum'),
    daily_count=('amount', 'count')
).sort_values(['account_id', 'time_since_test_start'])

daily_tx['rolling_spent_30d'] = daily_tx.groupby('account_id')['daily_spent'].transform(
    lambda x: x.shift(1).rolling(window=30, min_periods=1).sum()
)
daily_tx['rolling_count_30d'] = daily_tx.groupby('account_id')['daily_count'].transform(
    lambda x: x.shift(1).rolling(window=30, min_periods=1).sum()
)

dataset = dataset.merge(
    daily_tx[['account_id', 'time_since_test_start', 'rolling_spent_30d', 'rolling_count_30d']],
    on=['account_id', 'time_since_test_start'], how='left'
)

dataset[['rolling_spent_30d', 'rolling_count_30d']] = dataset[['rolling_spent_30d', 'rolling_count_30d']].fillna(0)

In [14]:
offers_received_all = data[data.event == "offer received"].copy()
offers_completed_all = data[data.event == "offer completed"].copy()

offers_received_all = offers_received_all.sort_values(['account_id', 'time_since_test_start'])
offers_received_all['offer_count'] = 1
offers_received_all['cum_offers_received'] = offers_received_all.groupby('account_id')['offer_count'].cumsum()

offers_completed_all = offers_completed_all[['account_id', 'offer_id', 'time_since_test_start']]
offers_completed_all = offers_completed_all.rename(columns={'time_since_test_start': 'completion_time'})

offers_received_all = offers_received_all.merge(
    offers_completed_all,
    on=['account_id', 'offer_id'],
    how='left'
)

offers_received_all['completed_before'] = (
    (offers_received_all['completion_time'].notna()) &
    (offers_received_all['completion_time'] < offers_received_all['time_since_test_start'])
).astype(int)

offers_received_all['cum_completed_before'] = offers_received_all.groupby('account_id')['completed_before'].cumsum()

offers_received_all['hist_offer_completion_rate'] = (
    (offers_received_all['cum_completed_before'] - offers_received_all['completed_before']) /
    (offers_received_all['cum_offers_received'] - 1).replace(0, 1)
)

hist_completion = offers_received_all[['account_id', 'offer_id', 'time_since_test_start', 'hist_offer_completion_rate']]

dataset = dataset.merge(hist_completion, on=['account_id', 'offer_id', 'time_since_test_start'], how='left')
dataset['hist_offer_completion_rate'] = dataset['hist_offer_completion_rate'].fillna(0)

In [15]:
dataset.head()

Unnamed: 0,account_id,offer_id,time_since_test_start,offer_duration,label,hist_spent,hist_count,duration,min_value,offer_type,discount_value,has_email,has_mobile,has_social,has_web,id_x,age,gender,months_since_register,is_new_customer,is_continuous_customer,is_tenured_customer,is_high_tenured_customer,is_extreme_tenured_customer,credit_card_limit,id_y,rolling_spent_30d,rolling_count_30d,hist_offer_completion_rate
0,0009655768c64bdeb2e877511632db8f,2906b810c7d4411798c6938adc9daaa5,24.0,7.0,1,58.4,4.0,7,10,discount,2,True,True,False,True,2906b810c7d4411798c6938adc9daaa5,33,M,15.4,False,False,True,False,False,72000.0,0009655768c64bdeb2e877511632db8f,58.4,4.0,0.0
1,0009655768c64bdeb2e877511632db8f,3f207df678b143eea3cee63160fa8bed,14.0,4.0,0,0.0,0.0,4,0,informational,0,True,True,False,True,3f207df678b143eea3cee63160fa8bed,33,M,15.4,False,False,True,False,False,72000.0,0009655768c64bdeb2e877511632db8f,0.0,0.0,0.0
2,0009655768c64bdeb2e877511632db8f,5a8bc65990b245e5a138643cd4eb9837,7.0,3.0,0,0.0,0.0,3,0,informational,0,True,True,True,False,5a8bc65990b245e5a138643cd4eb9837,33,M,15.4,False,False,True,False,False,72000.0,0009655768c64bdeb2e877511632db8f,0.0,0.0,0.0
3,0009655768c64bdeb2e877511632db8f,f19421c1d4aa40978ebb69ca19b0e20d,17.0,5.0,1,0.0,0.0,5,5,bogo,5,True,True,True,True,f19421c1d4aa40978ebb69ca19b0e20d,33,M,15.4,False,False,True,False,False,72000.0,0009655768c64bdeb2e877511632db8f,0.0,0.0,0.0
4,0009655768c64bdeb2e877511632db8f,fafdcd668e3743c1bb461111dcafc2a4,21.0,10.0,1,0.0,0.0,10,10,discount,2,True,True,True,True,fafdcd668e3743c1bb461111dcafc2a4,33,M,15.4,False,False,True,False,False,72000.0,0009655768c64bdeb2e877511632db8f,0.0,0.0,0.0


In [16]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86432 entries, 0 to 86431
Data columns (total 29 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   account_id                   86432 non-null  object 
 1   offer_id                     86432 non-null  object 
 2   time_since_test_start        86432 non-null  float64
 3   offer_duration               86432 non-null  float64
 4   label                        86432 non-null  int64  
 5   hist_spent                   86432 non-null  float64
 6   hist_count                   86432 non-null  float64
 7   duration                     86432 non-null  int64  
 8   min_value                    86432 non-null  int64  
 9   offer_type                   86432 non-null  object 
 10  discount_value               86432 non-null  int64  
 11  has_email                    86432 non-null  bool   
 12  has_mobile                   86432 non-null  bool   
 13  has_social      

In [26]:
os.makedirs('../data/processed', exist_ok=True)
now = datetime.now().strftime('%Y-%m-%d-%H-%M')
dataset.to_csv(f'../data/processed/dataset_{now}.csv', index=False)

In [21]:
1

1