## Assignment 2

### Task 2: Create Training Data for Incomplete Journeys

In [36]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Load the flattened journeys table from parquet into `df`.
df = pd.read_parquet("data/journeys_flattened.parquet")

In [4]:
# Reference "now" (UTC) for the analysis: a journey's inactivity is measured
# back from this instant when outcomes are assigned.
# NOTE(review): presumably the latest event time in the dataset — confirm and
# record where this value comes from.
end_timestamp = pd.Timestamp('2023-01-23 12:29:56+0000', tz='UTC')

In [6]:
# Preview the first rows of the loaded journeys table.
df.head()

Unnamed: 0,id,events,event_names,timestamps,journey_length
0,-1356064073 -3165934,"[19, 19, 19, 19, 19, 3, 19, 12, 4, 4, 4, 4, 4,...","[application_web_view, application_web_view, a...","[2022-04-23T05:03:55.000000, 2022-04-23T05:03:...",61
1,1016646926 834076938,"[12, 1, 4, 4, 4, 11, 1, 4, 4, 4, 5, 5, 4, 4, 4...","[application_web_approved, promotion_created, ...","[2021-11-21T17:25:30.000000, 2021-11-21T17:25:...",33
2,1999477972 1122102787,"[2, 12, 1, 1, 21, 1, 21, 1, 1, 1, 21]","[campaign_click, application_web_approved, pro...","[2021-01-06T06:00:00.000000, 2021-01-06T13:19:...",11
3,1396677495 -1818244372,"[12, 4, 4, 4, 4, 2, 4, 4, 1, 1, 1, 21, 4, 1, 1...","[application_web_approved, browse_products, br...","[2021-03-18T04:54:46.000000, 2021-03-18T04:55:...",19
4,-1227506087 1921269525,"[29, 19, 19, 3, 19, 19, 19, 3, 19, 19, 3, 12, ...","[account_activitation, application_web_view, a...","[2021-07-18T00:00:00.000000, 2021-07-18T11:07:...",33


In [None]:
# Label every journey with one of three outcomes:
#   'successful'   - the success event occurs anywhere in the journey
#   'unsuccessful' - no success event and the last event is more than the
#                    inactivity cut-off before `end_timestamp`
#   'ongoing'      - everything else (still recently active, not yet successful)

# Event id whose presence marks a journey as successful.
SUCCESS_EVENT_ID = 28
# Journeys silent for longer than this (measured at `end_timestamp`) are
# treated as unsuccessful.
INACTIVITY_CUTOFF = pd.Timedelta(days=60)

# Last event time of each journey, converted in one pass and localised to UTC
# so it can be compared with the tz-aware `end_timestamp`.
last_event_times = pd.to_datetime([ts[-1] for ts in df['timestamps']], utc=True)
is_inactive = np.asarray((end_timestamp - last_event_times) > INACTIVITY_CUTOFF)

has_success = np.fromiter(
    (SUCCESS_EVENT_ID in events for events in df['events']),
    dtype=bool,
    count=len(df),
)

# np.select takes the first matching condition, so success wins over
# inactivity, matching the if / elif / else precedence of a row-wise rule.
outcomes = np.select(
    [has_success, is_inactive],
    ['successful', 'unsuccessful'],
    default='ongoing',
).tolist()

In [29]:
# Attach the labels as a new column. This mutates `df` in place and relies on
# `outcomes` having been built from this same `df` in the same row order.
df['outcome'] = outcomes

In [30]:
# Preview the table again to check the new `outcome` column.
df.head()

Unnamed: 0,id,events,event_names,timestamps,journey_length,outcome
0,-1356064073 -3165934,"[19, 19, 19, 19, 19, 3, 19, 12, 4, 4, 4, 4, 4,...","[application_web_view, application_web_view, a...","[2022-04-23T05:03:55.000000, 2022-04-23T05:03:...",61,unsuccessful
1,1016646926 834076938,"[12, 1, 4, 4, 4, 11, 1, 4, 4, 4, 5, 5, 4, 4, 4...","[application_web_approved, promotion_created, ...","[2021-11-21T17:25:30.000000, 2021-11-21T17:25:...",33,successful
2,1999477972 1122102787,"[2, 12, 1, 1, 21, 1, 21, 1, 1, 1, 21]","[campaign_click, application_web_approved, pro...","[2021-01-06T06:00:00.000000, 2021-01-06T13:19:...",11,unsuccessful
3,1396677495 -1818244372,"[12, 4, 4, 4, 4, 2, 4, 4, 1, 1, 1, 21, 4, 1, 1...","[application_web_approved, browse_products, br...","[2021-03-18T04:54:46.000000, 2021-03-18T04:55:...",19,unsuccessful
4,-1227506087 1921269525,"[29, 19, 19, 3, 19, 19, 19, 3, 19, 19, 3, 12, ...","[account_activitation, application_web_view, a...","[2021-07-18T00:00:00.000000, 2021-07-18T11:07:...",33,successful


In [31]:
# Count journeys per outcome by summing boolean masks, then report them.
n_successful = (df['outcome'] == 'successful').sum()
n_unsuccessful = (df['outcome'] == 'unsuccessful').sum()
n_ongoing = (df['outcome'] == 'ongoing').sum()

print(f"There are {n_successful} successful journeys")
print(f"There are {n_unsuccessful} unsuccessful journeys")
print(f"There are {n_ongoing} ongoing journeys")

There are 279363 successful journeys
There are 992757 unsuccessful journeys
There are 158325 ongoing journeys


In [None]:
# Percentages by outcome type
print(279363 / df.shape[0] * 100)
print(992757 / df.shape[0] * 100)
print(158325 / df.shape[0] * 100)

19.52979667166511
69.40196931724044
11.06823401109445


In [None]:
# Keep only journeys whose final outcome is known; 'ongoing' rows are dropped.
# .copy() detaches the result from `df`.
training_dataset = df.loc[df['outcome'].ne('ongoing')].copy()

In [38]:
# Preview the first few labels of the filtered dataset.
training_dataset['outcome'].head()

0    unsuccessful
1      successful
2    unsuccessful
3    unsuccessful
4      successful
Name: outcome, dtype: object

In [48]:
# Split whole journeys 70/30 into train and validation (fixed random_state for
# reproducibility). Splitting at journey level happens before any truncation,
# so samples derived from one journey never land in both sets.
train_df, val_df = train_test_split(training_dataset, test_size=0.3, random_state=0)

In [67]:
# Build training samples that look like ongoing journeys: every completed
# journey is cut off at several random observation times, and each prefix
# keeps the journey's final outcome as its label.

# Set seed for reproducibility
np.random.seed(742)

# Event id that makes a journey 'successful'; must match the labelling rule.
SUCCESS_EVENT_ID = 28
# An unsuccessful journey can still be observed as "ongoing" for this long
# after its last event.
OBSERVATION_WINDOW = np.timedelta64(60, 'D')
# Number of random observation times drawn per journey.
SPLITS_PER_JOURNEY = 5

samples = []

for row in train_df.itertuples(index=False):
    first_event = row.timestamps[0]

    # Pick the time range to sample observation times from, based on outcome.
    if row.outcome == 'successful':
        # A journey is successful when the success event occurs anywhere in
        # it, not necessarily last. Prefixes must stop strictly before the
        # first success event; otherwise the sample contains the very event
        # that defines its label, which a real ongoing journey never does.
        max_steps = list(row.events).index(SUCCESS_EVENT_ID)
        horizon = row.timestamps[max_steps]
    else:
        max_steps = len(row.timestamps)
        horizon = row.timestamps[-1] + OBSERVATION_WINDOW

    time_splits = first_event + (horizon - first_event) * np.random.random(size=SPLITS_PER_JOURNEY)

    # Truncate the journey at each sampled observation time.
    for split in time_splits:
        # The cap also covers events sharing a timestamp with the success
        # event and zero-duration journeys, where split == horizon.
        journey_steps = min(int((row.timestamps <= split).sum()), max_steps)
        if journey_steps == 0:
            # The success event is the first event: nothing was observable
            # before it, so there is no valid prefix to emit.
            continue
        samples.append([
            row.id,
            row.events[:journey_steps],
            row.event_names[:journey_steps],
            row.timestamps[:journey_steps],
            journey_steps,
            row.outcome
        ])

In [68]:
# Assemble the truncated samples into a frame with the same column names as
# `train_df`; each sample list was built in that column order.
complete_train_df = pd.DataFrame(samples, columns=train_df.columns)

In [69]:
# Check the number of generated training samples and columns.
complete_train_df.shape

(4452420, 6)

We now have almost 4.5 million training samples that look like ongoing journeys (each is a real journey truncated at a random point in time) but carry the journey's actual final outcome as their label.

Let's now do the same for our validation samples so that we are able to test the generalizability of our models.

In [72]:
# Build validation samples the same way as the training samples: every
# completed journey is cut off at several random observation times, and each
# prefix keeps the journey's final outcome as its label.

# Set seed for reproducibility
np.random.seed(284)

# Event id that makes a journey 'successful'; must match the labelling rule.
SUCCESS_EVENT_ID = 28
# An unsuccessful journey can still be observed as "ongoing" for this long
# after its last event.
OBSERVATION_WINDOW = np.timedelta64(60, 'D')
# Number of random observation times drawn per journey.
SPLITS_PER_JOURNEY = 5

val_samples = []

for row in val_df.itertuples(index=False):
    first_event = row.timestamps[0]

    # Pick the time range to sample observation times from, based on outcome.
    if row.outcome == 'successful':
        # A journey is successful when the success event occurs anywhere in
        # it, not necessarily last. Prefixes must stop strictly before the
        # first success event; otherwise the sample contains the very event
        # that defines its label and validation scores are inflated.
        max_steps = list(row.events).index(SUCCESS_EVENT_ID)
        horizon = row.timestamps[max_steps]
    else:
        max_steps = len(row.timestamps)
        horizon = row.timestamps[-1] + OBSERVATION_WINDOW

    time_splits = first_event + (horizon - first_event) * np.random.random(size=SPLITS_PER_JOURNEY)

    # Truncate the validation journey at each sampled observation time.
    for split in time_splits:
        # The cap also covers events sharing a timestamp with the success
        # event and zero-duration journeys, where split == horizon.
        journey_steps = min(int((row.timestamps <= split).sum()), max_steps)
        if journey_steps == 0:
            # The success event is the first event: nothing was observable
            # before it, so there is no valid prefix to emit.
            continue
        val_samples.append([
            row.id,
            row.events[:journey_steps],
            row.event_names[:journey_steps],
            row.timestamps[:journey_steps],
            journey_steps,
            row.outcome
        ])

In [73]:
# Assemble the truncated validation samples into a frame with the same column
# names as `val_df`; each sample list was built in that column order.
complete_val_df = pd.DataFrame(val_samples, columns=val_df.columns)

In [78]:
# Check the number of generated validation samples and columns.
complete_val_df.shape

(1908180, 6)

We have nearly 2 million validation samples to test our models.

In [79]:
# Persist both sample sets as parquet, without writing the index.
for frame, destination in (
    (complete_train_df, 'data/training_data.parquet'),
    (complete_val_df, 'data/validation_data.parquet'),
):
    frame.to_parquet(destination, index=False)