## Assignment 3

In [1]:
import numpy as np
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

### Task 1: ICE and PDP Plots

### Task 2: Create Training Data for Incomplete Journeys

In [13]:
# Load the flattened journeys table. The cells below read its per-journey `events` and
# `timestamps` sequence columns (one row per journey).
df = pd.read_parquet("data/journeys_flattened.parquet")

In [14]:
# Bounds of the observation window, in UTC. `end_timestamp` is the reference "now" used below to
# label journeys and as the cutoff for still-ongoing ones; random cutoff times are drawn between
# the two bounds.
# NOTE(review): presumably the earliest and latest event times in the dataset -- confirm.
start_timestamp = pd.Timestamp('2020-11-03 03:31:30+0000', tz='UTC')
end_timestamp = pd.Timestamp('2023-01-23 12:29:56+0000', tz='UTC')

In [15]:
def _label_outcome(journey):
    """Label one journey row as 'successful', 'unsuccessful' or 'ongoing'."""
    # Event 28 anywhere in the journey marks it as successful.
    if 28 in journey.events:
        return 'successful'

    # Otherwise the journey is unsuccessful once its last event lies more than 60 days
    # before the end of the observation window; anything more recent is still ongoing.
    last_event_at = pd.Timestamp(journey.timestamps[-1], tz='UTC')
    idle_days = (end_timestamp - last_event_at) / np.timedelta64(1, 'D')
    return 'unsuccessful' if idle_days > 60 else 'ongoing'


outcomes = [_label_outcome(journey) for journey in df.itertuples(index=False)]

df['outcome'] = outcomes

In [16]:
def _journey_window(journey):
    """Return the (start, end) UTC timestamps between which a journey counts as open."""
    opened_at = pd.Timestamp(journey.timestamps[0], tz='UTC')

    if journey.outcome == 'successful':
        # Closed by the first occurrence of event 28.
        first_success = np.where(journey.events == 28)[0][0]
        closed_at = pd.Timestamp(journey.timestamps[first_success], tz='UTC')
    elif journey.outcome == 'unsuccessful':
        # Closed 60 days after the last recorded event.
        closed_at = pd.Timestamp(journey.timestamps[-1] + np.timedelta64(60, 'D'), tz='UTC')
    else:
        # Still ongoing: open until the end of the observation window.
        closed_at = end_timestamp

    return opened_at, closed_at


first_timestamps = []
last_timestamps = []

for journey in df.itertuples(index=False):
    opened_at, closed_at = _journey_window(journey)
    first_timestamps.append(opened_at)
    last_timestamps.append(closed_at)

df['start_timestamp'] = first_timestamps
df['end_timestamp'] = last_timestamps

In [17]:
# Journeys with a known outcome are used for training/validation; the ones that are
# still ongoing become the test set.
is_ongoing = df['outcome'] == 'ongoing'
training_dataset = df[~is_ongoing].copy()
test_dataset = df[is_ongoing].copy()

In [18]:
# Hold out 30% of the labelled journeys for validation. The split is made per journey, before any
# cutoff sampling below, so every snapshot derived from one journey ends up on the same side.
train_df, val_df = train_test_split(training_dataset, test_size=0.3, random_state=0)

In [19]:
np.random.seed(823)

# We have about 27 months worth of data and want a cutoff about once every 2 weeks (on average)
n_time_splits = 54
# We want about 4.5 million training snapshots from the 54 time splits => about 85,000 per time split
max_train_per_split = 85000
max_val_per_split = 45000

time_splits = start_timestamp + (end_timestamp - start_timestamp) * np.random.random(size=n_time_splits)


def _sample_ongoing_at(journeys, cutoff, max_rows):
    """Sample up to ``max_rows`` journeys that are open at ``cutoff`` and tag them with it.

    A journey is open when ``start_timestamp < cutoff < end_timestamp``. The returned frame is a
    new object with an added ``current_datetime`` column holding ``cutoff``.
    """
    is_open = (journeys['start_timestamp'] < cutoff) & (journeys['end_timestamp'] > cutoff)
    ongoing = journeys[is_open]
    # No random_state is passed on purpose: pandas then draws from NumPy's global RNG,
    # which is seeded at the top of this cell, so the draws stay reproducible.
    ongoing = ongoing.sample(n=min(ongoing.shape[0], max_rows))
    # assign() returns a fresh frame instead of writing a column onto the sampled result.
    return ongoing.assign(current_datetime=cutoff)


sampled_journeys_train = []
sampled_journeys_val = []

# Train is sampled before validation at every cutoff so the random stream is consumed
# in a fixed order.
for split in time_splits:
    sampled_journeys_train.append(_sample_ongoing_at(train_df, split, max_train_per_split))
    sampled_journeys_val.append(_sample_ongoing_at(val_df, split, max_val_per_split))

# ignore_index=True for both tables: a journey can be sampled at several cutoffs, so keeping
# the original labels would leave duplicates in the index.
sampled_train_df = pd.concat(sampled_journeys_train, ignore_index=True)
sampled_val_df = pd.concat(sampled_journeys_val, ignore_index=True)

In [20]:
# Give both snapshot tables a fresh 0..n-1 index (the old labels are discarded).
sampled_train_df = sampled_train_df.reset_index(drop=True)
sampled_val_df = sampled_val_df.reset_index(drop=True)

In [30]:
# Update columns in datasets so that they reflect the journey up until the cutoff time
def truncate_journeys(df):
    for i in range(len(df)):
        timestamps = df.at[i, 'timestamps']
        cutoff_time = df.at[i, 'current_datetime'].tz_localize(None)

        cutoff_index = np.searchsorted(timestamps, cutoff_time, side='right')

        df.at[i, 'timestamps'] = timestamps[:cutoff_index]
        df.at[i, 'events'] = df.at[i, 'events'][:cutoff_index]
        df.at[i, 'event_names'] = df.at[i, 'event_names'][:cutoff_index]
        df.at[i, 'journey_length'] = cutoff_index

# Truncate both snapshot tables in place, training first.
for snapshot_df in (sampled_train_df, sampled_val_df):
    truncate_journeys(snapshot_df)

In [31]:
# Each row is one (journey, cutoff) snapshot; the same journey can appear at several cutoffs.
# This gives us about 4 million snapshots for training and 2 million for validation, which seems sufficient for now
print(f"There are {sampled_train_df.shape[0]} training journeys and {sampled_val_df.shape[0]} validation journeys")

There are 4004170 training journeys and 2041474 validation journeys


In [32]:
# Shuffle rows: the tables were concatenated one cutoff at a time, so without this all
# snapshots that share a cutoff would sit in one contiguous block. Fixed seeds keep the
# order reproducible; the index is rebuilt as 0..n-1 afterwards.
sampled_train_df = sampled_train_df.sample(frac=1, random_state=523).reset_index(drop=True)
sampled_val_df = sampled_val_df.sample(frac=1, random_state=784).reset_index(drop=True)

In [33]:
# Still-ongoing journeys are observed right up to the end of the observation window,
# so that timestamp is their cutoff.
test_dataset['current_datetime'] = end_timestamp

In [50]:
def normalize_list_column(col):
    """Return a copy of ``col`` in which every cell is a plain Python list.

    Cell conversion rules:
      * ``None`` or a float NaN  -> ``[]`` (an empty list, not NaN)
      * ``numpy.ndarray``        -> ``ndarray.tolist()``
      * ``list``                 -> returned unchanged
      * anything else (a scalar) -> wrapped in a one-element list

    NOTE(review): ``ndarray.tolist()`` turns ``datetime64[ns]`` values into integer
    nanoseconds rather than datetime objects -- confirm the timestamp columns passed
    through here round-trip as intended.
    """
    def normalize(x):
        missing = x is None or (isinstance(x, float) and np.isnan(x))
        if missing:
            return []
        if isinstance(x, np.ndarray):
            return x.tolist()
        # A list passes through untouched; any other value is a scalar -> wrap it in a list.
        return x if isinstance(x, list) else [x]

    return col.apply(normalize)

In [None]:
# Convert every sequence column of both snapshot tables to plain Python lists.
for snapshot_df in (sampled_train_df, sampled_val_df):
    for sequence_col in ('events', 'timestamps', 'event_names'):
        snapshot_df[sequence_col] = normalize_list_column(snapshot_df[sequence_col])

In [55]:
# Persist the three datasets so the feature-engineering section can start from disk.
# index=False: the row index is not written to the files.
sampled_train_df.to_parquet('data/training_data.parquet', index=False)
sampled_val_df.to_parquet('data/validation_data.parquet', index=False)
test_dataset.to_parquet('data/testing_data.parquet', index=False)

#### Feature Engineering

In [2]:
# Reload the snapshot tables written above. Note that `train_df` / `val_df` are rebound here:
# from this point on they hold the cutoff snapshots, not the journey-level frames that
# train_test_split produced earlier in the notebook.
train_df = pd.read_parquet('data/training_data.parquet')
val_df = pd.read_parquet('data/validation_data.parquet')

In [3]:
# Important to notice that there is massive class imbalance!!
# (in the output below, only about 6.7% of the training rows are 'successful')
# We should use a metric like F1 score to evaluate our model rather than accuracy
train_df['outcome'].value_counts()

outcome
unsuccessful    3735256
successful       268914
Name: count, dtype: int64

In [4]:
def _days_between(later, earlier):
    """Element-wise ``later - earlier`` expressed in fractional days."""
    one_day = np.timedelta64(1, 'D')
    return [(end - begin) / one_day for end, begin in zip(later, earlier)]


# Number of days that the journey has been going for up until the cutoff
for frame in (train_df, val_df):
    frame['days_into_journey'] = _days_between(frame['current_datetime'], frame['start_timestamp'])

In [5]:
# Create one boolean indicator feature per event id: did the event occur before the cutoff?
# Ids 17, 25 and 28 get no indicator. Event 28 is what defines a successful outcome.
# NOTE(review): the reason for skipping 17 and 25 is not visible here -- confirm.
skipped_event_ids = (17, 25, 28)
for event_id in range(1, 30):
    if event_id in skipped_event_ids:
        continue
    indicator_col = f"event{event_id}"
    train_df[indicator_col] = [event_id in journey_events for journey_events in train_df["events"]]
    val_df[indicator_col] = [event_id in journey_events for journey_events in val_df["events"]]

In [6]:
def _days_since_last_event(frame):
    """Days between each row's cutoff and the last event it contains."""
    # The stored event timestamps are compared against timezone-naive cutoffs.
    naive_cutoffs = frame["current_datetime"].dt.tz_localize(None)
    one_day = np.timedelta64(1, 'D')
    return [
        (cutoff - stamps[-1]) / one_day
        for stamps, cutoff in zip(frame["timestamps"], naive_cutoffs)
    ]


# Add a feature that measures the time difference (in days) between the cutoff timestamp and the last action
train_df["time_diff_days"] = _days_since_last_event(train_df)
val_df["time_diff_days"] = _days_since_last_event(val_df)

In [7]:
# Quantile levels of the gaps between consecutive actions that become features.
quantiles = [0.25, 0.5, 0.75, 0.85, 0.9, 0.95, 1]
# round() rather than int(): int() truncates, so a product such as 0.29 * 100
# (28.999...) would produce the wrong column label if that level were ever added.
q_cols = [f"days_between_actions_q_{round(q * 100)}" for q in quantiles]

def compute_row_quantiles(ts):
    """Quantiles (in days) of the gaps between consecutive timestamps of one journey.

    Args:
        ts: sequence of datetime64 timestamps for one journey, in chronological order.

    Returns:
        1-D float ndarray with one value per entry of ``quantiles``. It is all-NaN when
        the journey has fewer than two timestamps, because no gap can be measured.
        Both paths return an ndarray so callers always get the same type.
    """
    ts = np.asarray(ts)
    if ts.size < 2:
        return np.full(len(quantiles), np.nan)

    diffs_days = np.diff(ts) / np.timedelta64(1, 'D')
    return np.quantile(diffs_days, quantiles)

# One row of gap quantiles per snapshot, stacked into an (n_rows, n_quantiles) matrix.
train_result = np.vstack(list(map(compute_row_quantiles, train_df['timestamps'])))
val_result = np.vstack(list(map(compute_row_quantiles, val_df['timestamps'])))

# Each matrix column becomes the matching q_cols feature.
train_df[q_cols] = train_result
val_df[q_cols] = val_result

In [8]:
# Include the first event as a feature
for frame in (train_df, val_df):
    frame['first_event'] = [journey_events[0] for journey_events in frame['events']]

In [9]:
# Get the counts for events that occur more than once during journeys in the training dataset
train_event_counts = train_df['events'].apply(Counter)
val_event_counts = val_df['events'].apply(Counter)

# An event is "repeatable" if it occurs more than once within at least one training journey.
# The set is derived from the training data only and reused for validation, and it is built
# from the Counters computed above so the training journeys are not counted twice.
repeatable_events = set()
for journey_counts in train_event_counts:
    repeatable_events.update(event for event, count in journey_counts.items() if count > 1)

# Sets are unordered; sort once so the count columns have a stable, explicit order.
count_columns = sorted(repeatable_events)


def _event_count_features(event_counts, index):
    """Build the ``event_<id>_count`` columns from one Counter per row.

    Args:
        event_counts: Series holding one ``Counter`` of event ids per row.
        index: row index to give the result so that ``DataFrame.join`` aligns it row-for-row
            with the frame it is joined to, whatever index that frame has.

    Returns:
        Integer DataFrame with one column per repeatable event; events that a row never
        contains are counted as 0.
    """
    return (
        pd.DataFrame(event_counts.tolist(), index=index)
          .reindex(columns=count_columns)
          .fillna(0)
          .astype(int)
          .rename(columns=lambda e: f'event_{e}_count')
    )


train_counts_df = _event_count_features(train_event_counts, train_df.index)
train_df = train_df.join(train_counts_df)

val_counts_df = _event_count_features(val_event_counts, val_df.index)
val_df = val_df.join(val_counts_df)

In [10]:
# Add extra indicator about if this milestone was reached
# Maps a milestone number to the event ids that count as reaching it.
milestone_to_events = {
    1: (12, 15),
    2: (7, 18),
    3: (29,),
    4: (8,),
    5: (27,)
}


def _reached_any(frame, milestone_events):
    """Per row: True when the journey contains at least one of ``milestone_events``."""
    return [
        any(event in journey_events for event in milestone_events)
        for journey_events in frame['events']
    ]


for milestone, events in milestone_to_events.items():
    flag_col = f'reached_milestone_{milestone}'
    train_df[flag_col] = _reached_any(train_df, events)
    val_df[flag_col] = _reached_any(val_df, events)

In [11]:
# Add the number of days that it takes to reach each milestone in the journey (assuming that the milestone was actually achieved)
def _days_until_first(timestamps, events, target_events):
    """Days from the first event of a journey to the first occurrence of any target event."""
    first_hit = np.flatnonzero(np.isin(events, target_events))[0]
    return (timestamps[first_hit] - timestamps[0]) / np.timedelta64(1, 'D')


for milestone in range(1, 6):
    target_events = milestone_to_events[milestone]
    reached_col = f'reached_milestone_{milestone}'
    days_col = f'days_to_milestone_{milestone}'

    # NaN marks journeys that had not reached the milestone by the cutoff.
    train_df[days_col] = [
        _days_until_first(timestamps, events, target_events) if indicator else np.nan
        for timestamps, events, indicator in zip(train_df['timestamps'], train_df['events'], train_df[reached_col])
    ]
    val_df[days_col] = [
        _days_until_first(timestamps, events, target_events) if indicator else np.nan
        for timestamps, events, indicator in zip(val_df['timestamps'], val_df['events'], val_df[reached_col])
    ]

In [12]:
# Persist the snapshot tables together with every engineered feature column added above.
# index=False: the row index is not written to the files.
train_df.to_parquet('data/training_data_features.parquet', index=False)
val_df.to_parquet('data/validation_data_features.parquet', index=False)

In [19]:
# Columns that are not model inputs: the identifier, the raw sequences, the label itself and
# the bookkeeping timestamps.
non_feature_columns = ['id', 'events', 'event_names', 'timestamps', 'outcome', 'start_timestamp', 'end_timestamp', 'current_datetime']


def _features_and_label(frame):
    """Split a snapshot table into the feature matrix and the 0/1 label (1 = successful)."""
    features = frame.drop(columns=non_feature_columns)
    label = (frame["outcome"] == 'successful').astype('int')
    return features, label


X_train, y_train = _features_and_label(train_df)

X_val, y_val = _features_and_label(val_df)

### Task 3: Fit a Predictive Model

### Task 4: Cross Validation

### Task 5: Prediction on Incomplete Journeys