## Sampling data

This script applies some initial cleaning steps and splits the data into a train and test set

In [1]:
import numpy as np
import pandas as pd
import os
import json
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.utils import shuffle


In [2]:
# set variables from config file
# The config file is looked up one directory above the current working directory.
config_path = os.path.abspath('..')

# Build the path with os.path.join instead of a hard-coded '\' separator so it
# resolves on every OS (the old literal also contained the invalid escape '\c').
with open(os.path.join(config_path, 'config-example.json'), 'r') as f:
    config = json.load(f)

# File names and locations used by the load and export cells below.
dataset_output_fname = config['DEFAULT']['dataset_output_fname']
dataset_sampled_fname = config['DEFAULT']['dataset_sampled_fname']
processing_path = config['DEFAULT']['processing_path']
dataset_fname_suffix = config['DEFAULT']['dataset_fname_suffix']

In [None]:
# Load the configured dataset CSV: first row is the header, fields are comma-separated.
dataset_path = os.path.join(processing_path, dataset_output_fname) + dataset_fname_suffix
client_data = pd.read_csv(dataset_path, header=0, delimiter=',')

In [4]:
# Preview the first rows of the loaded frame to sanity-check the columns.
client_data.head()

Unnamed: 0,EventName,StartDate,EventType,BookingReference,AttendeeReference,GroupSize,IsLeadAttendee,AttendeeGrossCost,EventId,BookingStatus,AttendeeType,TicketType,StatusCreatedDate,ClientId
0,Developing Reading and Writing Skills,28/11/2017 00:00,"Others, Group 1",B462687,A514297,1,True,0.0,14983,Registered,Attendee,,11/11/2017 09:15,153.0
1,Teaching Grammar: Classroom choices,13/12/2017 00:00,"Others, Group 1",B462689,A514299,1,True,0.0,14960,Registered,Attendee,,11/11/2017 09:18,153.0
2,Teaching Grammar: Classroom choices,13/12/2017 00:00,"Others, Group 1",B467150,A519135,1,True,0.0,14960,Registered,Attendee,,22/11/2017 17:02,153.0
3,Academic vocabulary: what do students need to ...,19/04/2018 00:00,"Others, Group 2",B533234,A588469,1,True,0.0,15893,Registered,Attendee,,11/04/2018 13:38,153.0
4,Researching the classroom,14/11/2018 00:00,"Others, Group 4",B604614,A662691,1,True,0.0,17350,Registered,Attendee,,15/10/2018 14:33,153.0


In [5]:
# Summarise column dtypes and non-null counts to see which columns have gaps.
client_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 658446 entries, 0 to 658445
Data columns (total 14 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   EventName          658446 non-null  object 
 1   StartDate          658446 non-null  object 
 2   EventType          655202 non-null  object 
 3   BookingReference   658446 non-null  object 
 4   AttendeeReference  658446 non-null  object 
 5   GroupSize          658446 non-null  int64  
 6   IsLeadAttendee     658446 non-null  object 
 7   AttendeeGrossCost  658446 non-null  float64
 8   EventId            658446 non-null  int64  
 9   BookingStatus      658442 non-null  object 
 10  AttendeeType       658446 non-null  object 
 11  TicketType         166470 non-null  object 
 12  StatusCreatedDate  658442 non-null  object 
 13  ClientId           658440 non-null  float64
dtypes: float64(2), int64(2), object(10)
memory usage: 70.3+ MB


### Imputing missing values

For fields that rarely change over time, such as EventType, BookingStatus, StatusCreatedDate and ClientId (all of which are already established), it is safe to forward-fill: each missing value is replaced with the value from the nearest preceding row that has one.

In [6]:
# for client_dataset
# Columns whose gaps are filled from the nearest preceding non-missing row.
fill_columns = ['EventType','BookingStatus','StatusCreatedDate', 'ClientId']

for c in fill_columns:
    print(c)
    # Series.ffill() is the supported spelling; fillna(method='ffill') is
    # deprecated in pandas 2.x. Leading gaps (no earlier value) stay missing.
    client_data[c] = client_data[c].ffill()

EventType
BookingStatus
StatusCreatedDate
ClientId


In [7]:
# checking length of filled data (forward fill should leave the row count unchanged)
len(client_data)

658446

In [8]:
# Re-check the non-null counts now that the selected columns have been forward-filled.
client_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 658446 entries, 0 to 658445
Data columns (total 14 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   EventName          658446 non-null  object 
 1   StartDate          658446 non-null  object 
 2   EventType          658446 non-null  object 
 3   BookingReference   658446 non-null  object 
 4   AttendeeReference  658446 non-null  object 
 5   GroupSize          658446 non-null  int64  
 6   IsLeadAttendee     658446 non-null  object 
 7   AttendeeGrossCost  658446 non-null  float64
 8   EventId            658446 non-null  int64  
 9   BookingStatus      658446 non-null  object 
 10  AttendeeType       658446 non-null  object 
 11  TicketType         166470 non-null  object 
 12  StatusCreatedDate  658446 non-null  object 
 13  ClientId           658446 non-null  float64
dtypes: float64(2), int64(2), object(10)
memory usage: 70.3+ MB


### Clean up

In [9]:
# reset_index returns a new frame rather than modifying in place, so assign the
# result back; otherwise the reset is only displayed and never persists.
client_data = client_data.reset_index(drop=True)
client_data

Unnamed: 0,EventName,StartDate,EventType,BookingReference,AttendeeReference,GroupSize,IsLeadAttendee,AttendeeGrossCost,EventId,BookingStatus,AttendeeType,TicketType,StatusCreatedDate,ClientId
0,Developing Reading and Writing Skills,28/11/2017 00:00,"Others, Group 1",B462687,A514297,1,True,0.0,14983,Registered,Attendee,,11/11/2017 09:15,153.0
1,Teaching Grammar: Classroom choices,13/12/2017 00:00,"Others, Group 1",B462689,A514299,1,True,0.0,14960,Registered,Attendee,,11/11/2017 09:18,153.0
2,Teaching Grammar: Classroom choices,13/12/2017 00:00,"Others, Group 1",B467150,A519135,1,True,0.0,14960,Registered,Attendee,,22/11/2017 17:02,153.0
3,Academic vocabulary: what do students need to ...,19/04/2018 00:00,"Others, Group 2",B533234,A588469,1,True,0.0,15893,Registered,Attendee,,11/04/2018 13:38,153.0
4,Researching the classroom,14/11/2018 00:00,"Others, Group 4",B604614,A662691,1,True,0.0,17350,Registered,Attendee,,15/10/2018 14:33,153.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
658441,Corruption and Standard in British Politics: T...,20/07/2022 17:00,Other Events,IHR1106055,A1178476,1,True,0.0,26428,Attending,Attendee,Standard,12/07/2022 10:04,219.0
658442,Johanna of Austria (1547-1578) – An Austrian A...,18/07/2022 18:00,Seminar,B1106057,A1178478,1,True,0.0,26342,Attending,Attendee,Standard,12/07/2022 10:31,219.0
658443,The Future of History,15/07/2022 18:00,Lecture,IHR1106063,A1178485,1,True,0.0,26117,Attending,Attendee,Standard,12/07/2022 11:16,219.0
658444,Media and Democracy in India: An Insider’s Per...,14/07/2022 10:00,Seminar,B1106065,A1178487,1,True,0.0,26430,Attending,Attendee,Standard,12/07/2022 11:42,219.0


In [10]:
# Replacing values in columns
# Outer keys are column names; each inner mapping is {value-or-pattern: replacement}.
# With regex=True the string keys are interpreted as regular expressions.
# NOTE(review): the integer keys (0, 2020) are not regex patterns; confirm they
# still match the intended cells when regex=True.
# NOTE(review): the '.' in 'Day 1. 24th June 2022' is unescaped and so matches
# any character; confirm that is intended.
replacement_dict = {
    'IsLeadAttendee': {r'^Registered$': 'True', 0: 'False'},
    'BookingStatus': {r'^Registered$': 'Attending', 0: 'Cancelled'},
    'AttendeeType': {r'^Day 1. 24th June 2022$': 'Attendee', 2020: 'Attendee'},
}

client_data_cleaned = client_data.replace(to_replace=replacement_dict, regex=True)

### Export Data

In [None]:
# export
# Write the cleaned frame to the configured sampled-dataset CSV, without the index column.
sampled_path = os.path.join(processing_path, dataset_sampled_fname) + dataset_fname_suffix
client_data_cleaned.to_csv(sampled_path, index=False)