In [1]:
from datetime import datetime

import pandas as pd

In [2]:
# Raise the pandas display limits (rows, columns, line width) for this session.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

In [3]:
def parser(s):
    """Convert one raw date cell into a ``datetime``.

    Values that ``pd.isna`` reports as missing yield ``None``. Strings of at
    most 10 characters are read as ``YYYY-MM-DD``; anything longer is read as
    ``YYYY-MM-DD HH:MM:SS``. A string matching neither layout makes
    ``datetime.strptime`` raise ``ValueError``.
    """
    if pd.isna(s):
        return None
    date_format = '%Y-%m-%d' if len(s) <= 10 else '%Y-%m-%d %H:%M:%S'
    return datetime.strptime(s, date_format)


In [4]:
# Load the raw competition dataset; the three date columns are converted
# value-by-value with `parser` defined above.
# NOTE(review): `date_parser` is deprecated in pandas 2.x in favour of
# `date_format`; revisit this call when upgrading pandas.
df = pd.read_csv(
    'Cross Road Analytics Competition Dataset.csv',
    parse_dates=['event_date', 'add_datetime', 'ResaleDate'],
    date_parser=parser,
    low_memory=False,
)

In [5]:
# Preview five random rows; the fixed seed keeps the preview identical across
# "Restart & Run All" executions.
df.sample(5, random_state=0)

Unnamed: 0,acct_id,acct_type_desc,event_name,event_date,plan_event_name,comp_name,section_name,row_name,SeatNum,price_code,PC1,Price,paid,add_datetime,class_name,status,Sales_Source,isHost,SeatType,TicketClass,Start Year,LastYear,Term,TicketType,SeatUniqueID,Season,ClubExpYear,Tenure,UniqueID,isAttended,Resold,ResalePrice,ResaleDate,isSTM
87264,124621590,Season,CLT21JAX,2021-11-14,21FS,Not Comp,228,1,11,FR,F,111.0,Y,2021-03-31 16:19:39,OPEN,SOLD,,0,GA,Manifest,,,,Renewal,228-1-11,2021,,21.0,CLT21JAX-228-1-11,Yes,,,NaT,
359401,1773331790,Personal,CLT21NYJ,2021-11-04,,Not Comp,431,9,11,LGZ4,L,81.0,Y,2021-11-03 10:28:11,GROUP,SOLD,,0,GA,Manifest,,,,Group,431-9-11,2021,,1.0,CLT21NYJ-431-9-11,Yes,,,NaT,
109773,117622090,Season,CLT21JAX,2021-11-14,21FS,Not Comp,537,1,5,KR,K,86.0,Y,2021-04-01 08:24:32,OPEN,SOLD,,0,GA,Manifest,,,,Renewal,537-1-5,2021,,25.0,CLT21JAX-537-1-5,Yes,,,NaT,
636151,1247817290,Employee,CLT21TEN,2021-10-31,21FS,Not Comp,629,11,7,TR,T,53.0,Y,2021-07-29 09:27:13,OPEN,SOLD,,0,GA,Manifest,,,,Employee,629-11-7,2021,,8.0,CLT21TEN-629-11-7,Yes,,,NaT,
694469,1828698990,House,CLT22HOU,2023-01-08,,Not Comp,614,20,9,NE1,N,58.0,N,2022-12-26 12:30:42,VT SELL,SOLD,,0,GA,Manifest,,,,Broker,614-20-9,2022,,0.0,CLT22HOU-614-20-9,???,,,NaT,


# Event Level

In [None]:
# Per event date, the percentage share of each `isAttended` value
# (missing values are counted as their own category because dropna=False).
fill_rate_df = (
    df.groupby(['event_date'])['isAttended']
    .value_counts(normalize=True, dropna=False)
    .mul(100)
    .to_frame('fill_rate')
    .reset_index()
)

# Drop the 'No' shares. Rows carrying any other label (e.g. the '???'
# placeholder) are kept so those events still appear in the frame, but only
# the 'Yes' share is an attendance rate: every other share is blanked to NaN
# instead of being reported -- and later lagged -- as if it were a fill rate.
fill_rate_df = fill_rate_df[fill_rate_df.isAttended != 'No'].copy()
fill_rate_df['fill_rate'] = fill_rate_df['fill_rate'].where(
    fill_rate_df['isAttended'] == 'Yes')

In [None]:
# Lagged fill rates: the value found 1, 2 and 3 rows earlier in `fill_rate_df`.
# The shift is positional, so it relies on the frame's current row order.
for lag in (1, 2, 3):
    fill_rate_df[f'fill_rate_lag_{lag}'] = fill_rate_df['fill_rate'].shift(lag)

In [None]:
# fill_rate_df.to_csv('fill_rate.csv', index=False)

# Seat Level

In [6]:
# Number of rows per seat identifier.
df['SeatUniqueID'].value_counts()

101-10-1     20
453-20-16    20
453-2-4      20
453-2-5      20
453-2-6      20
             ..
218-8-25     20
616-19-19    19
616-19-17    19
616-19-20    19
616-19-18    19
Name: SeatUniqueID, Length: 64346, dtype: int64

In [26]:
# Seat-level attendance per event, ordered by event date, then seat, then flag.
event_seat_is_attended_df = (
    df.loc[:, ['event_date', 'SeatUniqueID', 'isAttended']]
    .sort_values(by=['event_date', 'SeatUniqueID', 'isAttended'])
)

In [27]:
# First five rows of the sorted seat-level frame.
event_seat_is_attended_df.head(n=5)

Unnamed: 0,event_date,SeatUniqueID,isAttended
386076,2021-08-15,101-10-1,Yes
386077,2021-08-15,101-10-10,No
386078,2021-08-15,101-10-11,No
386079,2021-08-15,101-10-12,No
386080,2021-08-15,101-10-13,Yes


In [28]:
# One row per distinct event date, in chronological order. The index is reset
# before sorting, so index labels reflect order of first appearance in `df`.
event_date_df = (
    df[['event_date']]
    .drop_duplicates()
    .reset_index(drop=True)
    .sort_values(by='event_date')
)

# Dates of the 1st, 2nd and 3rd preceding events (NaT where there is none).
for lag in (1, 2, 3):
    event_date_df[f'lagged_date_{lag}'] = event_date_df['event_date'].shift(lag)

In [29]:
# First five event dates with their lagged dates.
event_date_df.head(n=5)

Unnamed: 0,event_date,lagged_date_1,lagged_date_2,lagged_date_3
6,2021-08-15,NaT,NaT,NaT
7,2021-09-12,2021-08-15,NaT,NaT
2,2021-09-19,2021-09-12,2021-08-15,NaT
0,2021-10-17,2021-09-19,2021-09-12,2021-08-15
9,2021-10-31,2021-10-17,2021-09-19,2021-09-12


In [30]:
# Attach the three lagged event dates to every seat/event row.
merged_df = pd.merge(
    event_seat_is_attended_df,
    event_date_df,
    how='left',
    on='event_date',
)

In [31]:
# Shows the seat-level input frame again (same preview as the earlier cell).
# NOTE(review): this follows the cell that builds `merged_df`; possibly
# `merged_df.head()` was intended here -- confirm.
event_seat_is_attended_df.head(n=5)

Unnamed: 0,event_date,SeatUniqueID,isAttended
386076,2021-08-15,101-10-1,Yes
386077,2021-08-15,101-10-10,No
386078,2021-08-15,101-10-11,No
386079,2021-08-15,101-10-12,No
386080,2021-08-15,101-10-13,Yes


In [32]:
# Spot-check 100 random merged rows; the fixed seed keeps the preview identical
# across "Restart & Run All" executions.
merged_df.sample(100, random_state=0)

Unnamed: 0,event_date,SeatUniqueID,isAttended,lagged_date_1,lagged_date_2,lagged_date_3
726269,2022-08-27,204-7-16,Yes,2022-08-20,2022-01-02,2021-12-18
1025327,2022-10-30,637-5-2,Yes,2022-10-16,2022-10-02,2022-09-25
369562,2021-11-04,608-18-12,Yes,2021-10-31,2021-10-17,2021-09-19
12651,2021-08-15,139-22-19,Yes,NaT,NaT,NaT
578720,2021-12-18,645-6-25,Yes,2021-11-28,2021-11-14,2021-11-04
502035,2021-11-28,615-7-6,Yes,2021-11-14,2021-11-04,2021-10-31
543332,2021-12-18,320-4N-7,Yes,2021-11-28,2021-11-14,2021-11-04
699993,2022-08-20,628-18-24,No,2022-01-02,2021-12-18,2021-11-28
862717,2022-10-02,244-13-6,Yes,2022-09-25,2022-08-27,2022-08-20
436916,2021-11-14,614-14-20,No,2021-11-04,2021-10-31,2021-10-17


In [33]:
# For each lag k in 1..3, look up the same seat's record at the k-th previous
# event date. Every join adds `event_date_lagg_k` and `isAttended_lagg_k`.
for lag in (1, 2, 3):
    merged_df = merged_df.merge(
        event_seat_is_attended_df,
        left_on=[f'lagged_date_{lag}', 'SeatUniqueID'],
        right_on=['event_date', 'SeatUniqueID'],
        how='left',
        suffixes=('', f'_lagg_{lag}'),
        # Each (event_date, SeatUniqueID) pair must be unique on the right
        # side; otherwise the left join would silently multiply rows. Raise
        # a MergeError instead of producing an inflated frame.
        validate='many_to_one',
    )

In [36]:
# First five rows after the lag joins.
merged_df.head(n=5)

Unnamed: 0,event_date,SeatUniqueID,isAttended,lagged_date_1,lagged_date_2,lagged_date_3,event_date_lagg_1,isAttended_lagg_1,event_date_lagg_2,isAttended_lagg_2,event_date_lagg_3,isAttended_lagg_3
0,2021-08-15,101-10-1,Yes,NaT,NaT,NaT,NaT,,NaT,,NaT,
1,2021-08-15,101-10-10,No,NaT,NaT,NaT,NaT,,NaT,,NaT,
2,2021-08-15,101-10-11,No,NaT,NaT,NaT,NaT,,NaT,,NaT,
3,2021-08-15,101-10-12,No,NaT,NaT,NaT,NaT,,NaT,,NaT,
4,2021-08-15,101-10-13,Yes,NaT,NaT,NaT,NaT,,NaT,,NaT,


In [43]:
# Keep only the row keys and the three lagged attendance flags. The copy lets
# later column assignments on `merged_df_` leave `merged_df` untouched.
merged_df_ = merged_df.loc[
    :,
    [
        'event_date',
        'SeatUniqueID',
        'isAttended_lagg_1',
        'isAttended_lagg_2',
        'isAttended_lagg_3',
    ],
].copy()



In [52]:
# Turn the '???' placeholder into a missing value in each lagged flag column.
# `Series.mask` is used instead of `.fillna('???').replace('???', None)`:
# before pandas 1.4 an explicit `value=None` with a scalar `to_replace` was
# ignored and `replace` fell back to pad-filling, so placeholder cells would
# take the previous row's value instead of becoming missing. Cells that are
# already missing compare unequal to '???' and simply stay missing, and
# re-running this cell is a no-op.
for lag_col in ('isAttended_lagg_1', 'isAttended_lagg_2', 'isAttended_lagg_3'):
    merged_df_[lag_col] = merged_df_[lag_col].mask(merged_df_[lag_col] == '???')

In [59]:
# Encode the lagged flags numerically: 'Yes' -> 1, 'No' -> 0. Any other value
# is left as it is, so re-running this cell on encoded columns changes nothing.
for lag_col in ('isAttended_lagg_1', 'isAttended_lagg_2', 'isAttended_lagg_3'):
    merged_df_[lag_col] = merged_df_[lag_col].replace({'Yes': 1, 'No': 0})

In [60]:
# Distinct values remaining in each lagged flag column.
for lag_col in ('isAttended_lagg_1', 'isAttended_lagg_2', 'isAttended_lagg_3'):
    print(merged_df_[lag_col].unique())

[nan  1.  0.]
[nan  1.  0.]
[nan  1.  0.]


In [63]:
# Frequency of each value, missing included, in each lagged flag column.
for lag_col in ('isAttended_lagg_1', 'isAttended_lagg_2', 'isAttended_lagg_3'):
    print(merged_df_[lag_col].value_counts(dropna=False))

1.0    900941
0.0    257283
NaN    128692
Name: isAttended_lagg_1, dtype: int64
1.0    847701
0.0    246177
NaN    193038
Name: isAttended_lagg_2, dtype: int64
1.0    791833
NaN    257384
0.0    237699
Name: isAttended_lagg_3, dtype: int64


In [68]:
# Spot-check 50 random rows of the final seat-level lag table; the fixed seed
# keeps the preview identical across "Restart & Run All" executions.
merged_df_.sample(50, random_state=0)

Unnamed: 0,event_date,SeatUniqueID,isAttended_lagg_1,isAttended_lagg_2,isAttended_lagg_3
1031922,2022-11-20,105-19-9,1.0,1.0,1.0
518012,2021-12-18,109-12-5,1.0,1.0,1.0
525190,2021-12-18,132-19-23,1.0,0.0,0.0
323119,2021-11-04,102-36W-12,0.0,0.0,0.0
395532,2021-11-14,127-9-4,1.0,1.0,1.0
6310,2021-08-15,116-14-18,,,
345185,2021-11-04,230-10-15,1.0,1.0,1.0
883014,2022-10-02,604-14-4,1.0,1.0,0.0
1098022,2022-11-28,110-23-15,1.0,1.0,1.0
492102,2021-11-28,451-23N-5,1.0,0.0,0.0


In [66]:
# merged_df_.to_csv('seat_level_lagged_isattended.csv', index=False)