In [83]:
import sys
import os
import pandas as pd
import numpy as np
sys.path.insert(0, os.path.abspath("../data_model/"))

In [84]:
import enums as e

### Remote I/O

In [85]:
# Data folders, all relative to the directory the notebook is run from.
processed_dir, interim_dir, external_dir = (
    "../data/processed",
    "../data/interim",
    "../data/external",
)

# Inputs: the survey data-model export plus the two SDIA daily schedule extracts.
survey_data_input_file = os.path.join(processed_dir, "data_model_output.csv")
sdia_arrivals_input_file, sdia_departures_input_file = (
    os.path.join(external_dir, "sdia", "san-daily-arrivals-sept-and-oct-2024-as-csv.csv"),
    os.path.join(external_dir, "sdia", "san-daily-departures-sept-and-oct-2024-as-csv.csv"),
)

# Output: the observed-vs-survey comparison table written by the last cell.
output_file = os.path.join(interim_dir, "compare-flight-time-of-day.csv")

### Data Reads

In [86]:
# Load the three CSV inputs in a fixed order: survey, SDIA arrivals, SDIA departures.
# NOTE(review): the saved output of this cell echoes the survey read line, which
# looks like a pandas warning (presumably a mixed-dtype warning) - consider
# passing explicit dtypes for the survey file once the schema is confirmed.
survey_df, arrivals_df, departures_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        survey_data_input_file,
        sdia_arrivals_input_file,
        sdia_departures_input_file,
    )
)

  survey_df = pd.read_csv(survey_data_input_file)


### Reductions

In [87]:
# Preview the first five scheduled arrivals to check the column layout.
arrivals_df.head(n=5)

Unnamed: 0,Date,Mkt Al,Alliance,Op Al,Orig,Dest,Miles,Flight,Stops,Equip,Seats,Dep Term,Arr Term,Dep Time,Arr Time,Block Mins,Arr Flag,Orig WAC,Dest WAC
0,"Sep 30, 2024",AA,oneworld,AA,CLT,SAN,2077,2032,0,321,190,,2,714,902,288,0,36,91
1,"Sep 30, 2024",AA,oneworld,AA,CLT,SAN,2077,1954,0,321,190,,2,855,1043,288,0,36,91
2,"Sep 30, 2024",AA,oneworld,AA,CLT,SAN,2077,2059,0,321,190,,2,1131,1336,305,0,36,91
3,"Sep 30, 2024",AA,oneworld,AA,CLT,SAN,2077,1740,0,321,190,,2,1644,1843,299,0,36,91
4,"Sep 30, 2024",AA,oneworld,AA,DFW,SAN,1171,2991,0,32Q,196,,2,720,824,184,0,74,91


In [88]:
# Half-hour bin edges in HHMM form (0, 30, 100, 130, ..., 2300, 2330, 2400).
# These are passed to pd.cut on the "Arr Time" / "Dep Time" columns, which hold
# clock times as HHMM integers (e.g. 714, 1843 in the previews).
bin_edges = [hour * 100 + minute for hour in range(24) for minute in (0, 30)]

# Close the final half hour of the day. Without this edge the list stops at
# 2330, so pd.cut assigns NaN to every time from 2331 to 2359 and those flights
# silently drop out of the per-interval totals.
bin_edges.append(2400)

In [89]:
# Tag every scheduled arrival with its half-hour interval. Intervals are
# right-closed and the lowest edge is excluded (include_lowest=False); a time
# that falls outside bin_edges gets NaN and is dropped by the groupbys below.
arrivals_df["binned_arrival"] = pd.cut(
    arrivals_df["Arr Time"],
    bins=bin_edges,
    include_lowest=False,
)

# Step 1: total seats per (interval, day).
arrivals_summed_df = (
    arrivals_df
    .groupby(["binned_arrival", "Date"], observed=True)["Seats"]
    .sum()
    .reset_index()
)

# Step 2: average those daily totals within each interval.
# NOTE(review): because observed=True keeps only (interval, Date) pairs that
# have flights, a day with no flights in an interval is left out of that
# interval's average rather than counted as zero - confirm this is intended.
arrivals_mean_df = (
    arrivals_summed_df
    .groupby(["binned_arrival"], observed=True)["Seats"]
    .mean()
    .reset_index()
    .rename(columns={
        "binned_arrival": "time_interval",
        "Seats": "observed_mean_arrival_seats",
    })
)

# Sanity check: sum of the per-interval means.
sum(arrivals_mean_df["observed_mean_arrival_seats"])

44121.21875

In [90]:
# Preview the first five scheduled departures to check the column layout.
departures_df.head(n=5)

Unnamed: 0,Date,Mkt Al,Alliance,Op Al,Orig,Dest,Miles,Flight,Stops,Equip,Seats,Dep Term,Arr Term,Dep Time,Arr Time,Block Mins,Arr Flag,Orig WAC,Dest WAC
0,"Nov 1, 2024",AA,oneworld,AA,SAN,CLT,2077,598,0,321,190,2,,645,1427,282,0,91,36
1,"Nov 1, 2024",AA,oneworld,AA,SAN,CLT,2077,1645,0,321,190,2,,1204,1942,278,0,91,36
2,"Nov 1, 2024",AA,oneworld,AA,SAN,CLT,2077,2512,0,321,190,2,,2125,503,278,1,91,36
3,"Nov 1, 2024",AA,oneworld,AA,SAN,CLT,2077,2056,0,321,190,2,,2210,548,278,1,91,36
4,"Nov 1, 2024",AA,oneworld,AA,SAN,DFW,1171,2090,0,321,190,2,,615,1126,191,0,91,74


In [91]:
# Tag every scheduled departure with its half-hour interval. Intervals are
# right-closed and the lowest edge is excluded (include_lowest=False); a time
# that falls outside bin_edges gets NaN and is dropped by the groupbys below.
departures_df["binned_departure"] = pd.cut(
    departures_df["Dep Time"],
    bins=bin_edges,
    include_lowest=False,
)

# Step 1: total seats per (interval, day).
departures_summed_df = (
    departures_df
    .groupby(["binned_departure", "Date"], observed=True)["Seats"]
    .sum()
    .reset_index()
)

# Step 2: average those daily totals within each interval.
# NOTE(review): because observed=True keeps only (interval, Date) pairs that
# have flights, a day with no flights in an interval is left out of that
# interval's average rather than counted as zero - confirm this is intended.
departures_mean_df = (
    departures_summed_df
    .groupby(["binned_departure"], observed=True)["Seats"]
    .mean()
    .reset_index()
    .rename(columns={
        "binned_departure": "time_interval",
        "Seats": "observed_mean_departure_seats",
    })
)

# Sanity check: sum of the per-interval means.
sum(departures_mean_df["observed_mean_departure_seats"])

43864.06754032258

In [92]:
# Inner join: keep only the half-hour intervals that have both an arrivals
# mean and a departures mean.
observed_df = pd.merge(arrivals_mean_df, departures_mean_df, on="time_interval")


def _interval_to_time_index(interval):
    """Return the survey time-of-day code for one half-hour interval.

    The code is derived from the interval's own left edge (HHMM) instead of
    the row position in the merged frame. A positional `index + 4` is only
    right when the merged intervals start at (630, 700] and have no gaps; any
    interval missing from either schedule would shift every later label.

    Anchor: (630, 700] maps to code 4, and each later half hour adds one.
    NOTE(review): assumes the survey codes advance by exactly one per half
    hour, which the positional version assumed as well - confirm against
    the DepartTime enum.
    """
    left_edge = int(interval.left)
    # Half-hour slot number since midnight: 630 -> 13, 700 -> 14, ...
    half_hours_since_midnight = (left_edge // 100) * 2 + (left_edge % 100) // 30
    # Slot 13 (06:30) corresponds to survey code 4.
    return half_hours_since_midnight - 9


observed_df["time_index"] = [
    _interval_to_time_index(interval) for interval in observed_df["time_interval"]
]
observed_df.head()

Unnamed: 0,time_interval,observed_mean_arrival_seats,observed_mean_departure_seats,time_index
0,"(630, 700]",232.0,1599.03125,4
1,"(700, 730]",794.375,1285.53125,5
2,"(730, 800]",1117.5,2680.28125,6
3,"(800, 830]",976.46875,1063.40625,7
4,"(830, 900]",954.46875,1593.6875,8


In [97]:
# Build the survey working table: keep the columns needed for the time-of-day
# comparison, restrict to passenger records, and add a weighted traveler count.
# NOTE(review): the preview shows rows with party_size_flight == 0.0, which
# yields travelers == 0 for those respondents - confirm that is the intended
# meaning of party_size_flight.
working_df = (
    survey_df[
        [
            "unique_id",
            "is_completed",
            "inbound_or_outbound",
            "marketsegment",
            "flight_arrival_time",
            "flight_departure_time",
            "party_size_flight",
            "weight",
        ]
    ]
    .loc[lambda frame: frame["marketsegment"] == e.Type.PASSENGER]
    .assign(travelers=lambda frame: frame["weight"] * frame["party_size_flight"])
)
working_df.head()

Unnamed: 0,unique_id,is_completed,inbound_or_outbound,marketsegment,flight_arrival_time,flight_departure_time,party_size_flight,weight,travelers
458,459,True,2.0,1,17.0,,1.0,3.681496,3.681496
459,460,True,2.0,1,20.0,,0.0,1.805546,0.0
460,461,True,2.0,1,24.0,,0.0,1.805546,0.0
461,462,True,2.0,1,17.0,,0.0,1.187836,0.0
462,463,True,2.0,1,18.0,,0.0,1.48425,0.0


In [101]:
# Weighted travelers ("sum") and number of non-null records ("count") per
# arrival-time code. Rows with a missing arrival time are not part of any
# group (groupby drops NaN keys by default).
survey_arrivals_df = (
    working_df
    .groupby(["flight_arrival_time"], observed=True)["travelers"]
    .agg(["sum", "count"])
    .reset_index()
    .rename(columns={
        "flight_arrival_time": "time_index",
        "sum": "survey_arrivals",
        "count": "survey_arrivals_observations",
    })
)

# Same summary keyed by departure-time code.
survey_departures_df = (
    working_df
    .groupby(["flight_departure_time"], observed=True)["travelers"]
    .agg(["sum", "count"])
    .reset_index()
    .rename(columns={
        "flight_departure_time": "time_index",
        "sum": "survey_departures",
        "count": "survey_departures_observations",
    })
)

# Inner join: only time codes present in both summaries are kept.
join_df = survey_arrivals_df.merge(survey_departures_df, on="time_index")
join_df.head()

Unnamed: 0,time_index,survey_arrivals,survey_arrivals_observations,survey_departures,survey_departures_observations
0,3.0,2.357886,3,19.888498,10
1,4.0,7.241262,2,159.331921,12
2,5.0,14.127536,9,138.335744,52
3,6.0,65.544844,6,89.369871,40
4,7.0,8.484612,7,2216.95164,314


In [102]:
# Combine the schedule-based means with the survey totals on the shared time
# code (inner join), store the code as an integer, and attach the name of the
# matching DepartTime enum member as a readable label.
output_df = (
    observed_df
    .merge(join_df, on="time_index")
    .astype({"time_index": "int"})
    .assign(
        time_label=lambda frame: frame["time_index"].map(
            lambda code: e.DepartTime(code).name
        )
    )
)
output_df.head()

Unnamed: 0,time_interval,observed_mean_arrival_seats,observed_mean_departure_seats,time_index,survey_arrivals,survey_arrivals_observations,survey_departures,survey_departures_observations,time_label
0,"(630, 700]",232.0,1599.03125,4,7.241262,2,159.331921,12,SIX_THIRTY_TO_SEVEN
1,"(700, 730]",794.375,1285.53125,5,14.127536,9,138.335744,52,SEVEN_TO_SEVEN_THIRTY
2,"(730, 800]",1117.5,2680.28125,6,65.544844,6,89.369871,40,SEVEN_THIRTY_TO_EIGHT
3,"(800, 830]",976.46875,1063.40625,7,8.484612,7,2216.95164,314,EIGHT_TO_EIGHT_THIRTY
4,"(830, 900]",954.46875,1593.6875,8,77.316995,6,2244.515197,254,EIGHT_THIRTY_TO_NINE


In [103]:
# Write the comparison table without the DataFrame index.
# Create the target folder first: to_csv raises an OSError when the parent
# directory is missing, which would fail the notebook at its very last step.
output_dir = os.path.dirname(output_file)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
output_df.to_csv(output_file, index=False)