In [20]:
import sys
import os
import pandas as pd
import numpy as np
sys.path.insert(0, os.path.abspath("../data_model/"))

In [21]:
import enums as e

### Remote I/O

In [22]:
processed_dir = "../data/processed"
interim_dir = "../data/interim"
external_dir = "../data/external"

survey_data_input_file = os.path.join(processed_dir, "data_model_output.csv")
sdia_arrivals_input_file = os.path.join(external_dir, "sdia", "san-daily-arrivals-sept-and-oct-2024-as-csv.csv")
sdia_departures_input_file = os.path.join(external_dir, "sdia", "san-daily-departures-sept-and-oct-2024-as-csv.csv")

output_file = os.path.join(interim_dir, "compare-flight-time-of-day.csv")

### Data Reads

In [23]:
survey_df = pd.read_csv(survey_data_input_file)
arrivals_df = pd.read_csv(sdia_arrivals_input_file)
departures_df = pd.read_csv(sdia_departures_input_file)

  survey_df = pd.read_csv(survey_data_input_file)


### Reductions

In [24]:
arrivals_df.head()

Unnamed: 0,Date,Mkt Al,Alliance,Op Al,Orig,Dest,Miles,Flight,Stops,Equip,Seats,Dep Term,Arr Term,Dep Time,Arr Time,Block Mins,Arr Flag,Orig WAC,Dest WAC
0,"Sep 30, 2024",AA,oneworld,AA,CLT,SAN,2077,2032,0,321,190,,2,714,902,288,0,36,91
1,"Sep 30, 2024",AA,oneworld,AA,CLT,SAN,2077,1954,0,321,190,,2,855,1043,288,0,36,91
2,"Sep 30, 2024",AA,oneworld,AA,CLT,SAN,2077,2059,0,321,190,,2,1131,1336,305,0,36,91
3,"Sep 30, 2024",AA,oneworld,AA,CLT,SAN,2077,1740,0,321,190,,2,1644,1843,299,0,36,91
4,"Sep 30, 2024",AA,oneworld,AA,DFW,SAN,1171,2991,0,32Q,196,,2,720,824,184,0,74,91


In [25]:
bin_edges = []
for hour in range(24):
    bin_edges.append(hour * 100)
    bin_edges.append(hour * 100 + 30)

In [26]:
arrivals_df["binned_arrival"] = pd.cut(arrivals_df["Arr Time"], bins=bin_edges, include_lowest=False)
arrivals_summed_df = arrivals_df.groupby(["binned_arrival", "Date", "Arr Term"], observed=True)["Seats"].sum().reset_index()
arrivals_mean_df = arrivals_summed_df.groupby(["binned_arrival", "Arr Term"], observed=True)["Seats"].mean().reset_index()
arrivals_mean_df.columns = ["time_interval", "terminal", "observed_mean_arrival_seats"]
sum(arrivals_mean_df["observed_mean_arrival_seats"])

44999.276697175286

In [27]:
departures_df.head()

Unnamed: 0,Date,Mkt Al,Alliance,Op Al,Orig,Dest,Miles,Flight,Stops,Equip,Seats,Dep Term,Arr Term,Dep Time,Arr Time,Block Mins,Arr Flag,Orig WAC,Dest WAC
0,"Nov 1, 2024",AA,oneworld,AA,SAN,CLT,2077,598,0,321,190,2,,645,1427,282,0,91,36
1,"Nov 1, 2024",AA,oneworld,AA,SAN,CLT,2077,1645,0,321,190,2,,1204,1942,278,0,91,36
2,"Nov 1, 2024",AA,oneworld,AA,SAN,CLT,2077,2512,0,321,190,2,,2125,503,278,1,91,36
3,"Nov 1, 2024",AA,oneworld,AA,SAN,CLT,2077,2056,0,321,190,2,,2210,548,278,1,91,36
4,"Nov 1, 2024",AA,oneworld,AA,SAN,DFW,1171,2090,0,321,190,2,,615,1126,191,0,91,74


In [28]:
departures_df["binned_departure"] = pd.cut(departures_df["Dep Time"], bins=bin_edges, include_lowest=False)
departures_summed_df = departures_df.groupby(["binned_departure", "Date", "Dep Term"], observed=True)["Seats"].sum().reset_index()
departures_mean_df = departures_summed_df.groupby(["binned_departure", "Dep Term"], observed=True)["Seats"].mean().reset_index()
departures_mean_df.columns = ["time_interval", "terminal", "observed_mean_departure_seats"]
sum(departures_mean_df["observed_mean_departure_seats"])

44851.606438016

In [29]:
start_time = 500
interval_duration = 30
intervals = []
for i in range(48):
    start = start_time + i * interval_duration
    end = start + interval_duration
    intervals.append((start, end))

interval_index = pd.IntervalIndex.from_tuples(intervals, closed='right')

time_index_map_df = pd.DataFrame(interval_index, columns=["time_interval"])

# add a column with the index values
time_index_map_df["time_index"] = range(1, 49)
time_index_map_df.head()

Unnamed: 0,time_interval,time_index
0,"(500, 530]",1
1,"(530, 560]",2
2,"(560, 590]",3
3,"(590, 620]",4
4,"(620, 650]",5


In [30]:
intervals = [
    (500, 530),
    (530, 600),
    (600, 630),
    (630, 700),
    (700, 730),
    (730, 800),
    (800, 830),
    (830, 900),
    (900, 930),
    (930, 1000),
    (1000, 1030),
    (1030, 1100),
    (1100, 1130),
    (1130, 1200),
    (1200, 1230),
    (1230, 1300),
    (1300, 1330),
    (1330, 1400),
    (1400, 1430),
    (1430, 1500),
    (1500, 1530),
    (1530, 1600),
    (1600, 1630),
    (1630, 1700),
    (1700, 1730),
    (1730, 1800),
    (1800, 1830),
    (1830, 1900),
    (1900, 1930),
    (1930, 2000),
    (2000, 2030),
    (2030, 2100),
    (2100, 2130),
    (2130, 2200),
    (2200, 2230),
    (2230, 2300),
    (2300, 2330),
    (2330, 2400),
    (0, 30),
    (30, 100),
    (100, 130),
    (130, 200),
    (200, 230),
    (230, 300),
    (300, 330),
    (330, 400),
    (400, 430),
    (430, 500)
]

interval_index = pd.IntervalIndex.from_tuples(intervals, closed="right")
index_map_df = pd.DataFrame(interval_index, columns=["time_interval"])
index_map_df["time_index"] = range(1, 49)

In [31]:
observed_df = pd.merge(arrivals_mean_df, departures_mean_df, on=["time_interval", "terminal"], how = "outer")
observed_df = pd.merge(observed_df, index_map_df, on=["time_interval"], how="left")
observed_df.head()

Unnamed: 0,time_interval,terminal,observed_mean_arrival_seats,observed_mean_departure_seats,time_index
0,"(0, 30]",2,194.0,,39
1,"(30, 100]",2,194.0,,40
2,"(600, 630]",1,,892.625,3
3,"(600, 630]",2,,2056.4375,3
4,"(630, 700]",1,232.0,665.15625,4


In [32]:
working_df = survey_df[[
    "unique_id", "is_completed", "inbound_or_outbound", "marketsegment", "airport_terminal", "flight_arrival_time", "flight_departure_time", "party_size_flight",  "weight"
    ]].copy()
working_df = working_df.loc[working_df["marketsegment"] == e.Type.PASSENGER].copy()
working_df["travelers"] = working_df["weight"] * (working_df["party_size_flight"]+1)
working_df.head()

Unnamed: 0,unique_id,is_completed,inbound_or_outbound,marketsegment,airport_terminal,flight_arrival_time,flight_departure_time,party_size_flight,weight,travelers
807,808,True,2.0,1,1.0,17.0,,1.0,,
808,809,True,2.0,1,2.0,20.0,,0.0,,
809,810,True,2.0,1,2.0,24.0,,0.0,,
810,811,True,2.0,1,2.0,17.0,,0.0,,
811,812,True,2.0,1,1.0,18.0,,0.0,,


In [33]:
survey_arrivals_df = working_df.groupby(["flight_arrival_time", "airport_terminal"], observed=True)["travelers"].agg(["sum", "count"]).reset_index()
survey_arrivals_df.columns = ["time_index", "terminal", "survey_arrivals", "survey_arrivals_observations"]

survey_departures_df = working_df.groupby(["flight_departure_time", "airport_terminal"], observed=True)["travelers"].agg(["sum", "count"]).reset_index()
survey_departures_df.columns = ["time_index", "terminal", "survey_departures", "survey_departures_observations"]

join_df = pd.merge(survey_arrivals_df, survey_departures_df, on=["time_index", "terminal"], how = 'outer')
join_df["terminal"] = join_df["terminal"].astype("int")
join_df["time_index"] = join_df["time_index"].astype("int")
join_df.head()

Unnamed: 0,time_index,terminal,survey_arrivals,survey_arrivals_observations,survey_departures,survey_departures_observations
0,1,2,0.0,0.0,0.119188,1.0
1,1,99,,,0.0,0.0
2,2,2,0.0,0.0,0.485859,3.0
3,3,1,,,53.028256,2.0
4,3,2,0.0,0.0,2889.682869,18.0


In [34]:
output_df = pd.merge(observed_df, join_df, on=["time_index", "terminal"], how = "outer")
output_df["time_label"] = output_df["time_index"].apply(lambda x: e.DepartTime(x).name)
output_df["time_interval"] = output_df["time_interval"].fillna(output_df["time_index"].map(index_map_df.set_index("time_index")["time_interval"].to_dict()))
output_df = output_df.fillna(0.0)
output_df.head()

Unnamed: 0,time_interval,terminal,observed_mean_arrival_seats,observed_mean_departure_seats,time_index,survey_arrivals,survey_arrivals_observations,survey_departures,survey_departures_observations,time_label
0,"(500.0, 530.0]",2,0.0,0.0,1,0.0,0.0,0.119188,1.0,FIVE_TO_FIVE_THIRTY
1,"(500.0, 530.0]",99,0.0,0.0,1,0.0,0.0,0.0,0.0,FIVE_TO_FIVE_THIRTY
2,"(530.0, 600.0]",2,0.0,0.0,2,0.0,0.0,0.485859,3.0,FIVE_THIRTY_TO_SIX
3,"(600.0, 630.0]",1,0.0,892.625,3,0.0,0.0,53.028256,2.0,SIX_TO_SIX_THIRTY
4,"(600.0, 630.0]",2,0.0,2056.4375,3,0.0,0.0,2889.682869,18.0,SIX_TO_SIX_THIRTY


In [35]:
output_df['time_index'].max()

np.int64(40)

In [36]:
output_df.to_csv(output_file, index=False)

In [37]:
departures_mean_df = pd.merge(departures_mean_df, index_map_df, on=["time_interval"], how="left")
departures_mean_df.head()
departures_mean_df.to_csv(os.path.join(interim_dir, "time_of_day_targets.csv"), index=False)