In [79]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import glob
import calendar
import datetime

In [80]:
# Let pandas render up to 999 rows before truncating a displayed frame.
pd.set_option("display.max_rows", 999)

In [None]:
# Gather the first 10 rows of every parquet file under data/ into one frame,
# ordered from the lowest to the highest price.
# sorted(): glob returns paths in an OS-dependent order; sorting makes runs repeatable.
listfiles = sorted(glob.glob("data/*.parquet"))
if not listfiles:
    # pd.concat([]) would otherwise fail with the opaque "No objects to concatenate".
    raise FileNotFoundError("No parquet files found matching data/*.parquet")

# ignore_index=True renumbers rows 0..n-1 without materialising (and then dropping)
# a helper "index" column, so it also works when a file has a named index or
# already contains a column called "index".
df = pd.concat(
    [pd.read_parquet(path).head(10) for path in listfiles],
    ignore_index=True,
).sort_values("price")
print("Retrieved flights:", df.shape[0])

### Add information about the dates

In [None]:
# Derive time-of-day, calendar-date and weekday columns from the four raw
# timestamp columns.
# Assumes each raw value is a string whose first 10 characters are "YYYY-MM-DD"
# and whose time of day starts at character 11 — TODO confirm against the data.
timestamp_cols = ["outbound_Departure", "inbound_Departure", "outbound_Arrival", "inbound_Arrival"]

# Time-of-day part, kept as text.
for col in timestamp_cols:
    df[col + "_Hour"] = df[col].str[11:]

# Calendar date, parsed once per column instead of once per row.
for col in timestamp_cols:
    df[col + "_Date"] = pd.to_datetime(df[col].str[:10], format="%Y-%m-%d")

# Trip length as a timedelta between the two departure dates.
df["days"] = df["inbound_Departure_Date"] - df["outbound_Departure_Date"]

# English weekday names ("Monday", ...). Unlike calendar.day_name, the default
# of Series.dt.day_name() does not depend on the process locale.
for col in ["inbound_Departure", "outbound_Departure", "inbound_Arrival", "outbound_Arrival"]:
    df[col + "_Date_weekday"] = df[col + "_Date"].dt.day_name()


def holiday(_x):
    """Count the public holidays that fall inside a trip.

    The holidays considered are 2019-12-25, 2019-12-26 and 2020-01-01 (all of
    them weekdays). A holiday is counted when it lies in the half-open window
    ``[outbound_Arrival_Date, inbound_Departure_Date)``: the arrival day is
    included and the return day is excluded. This is the same window
    ``np.busday_count(begin, end)`` uses, so the result can be subtracted
    directly from that count.

    Parameters
    ----------
    _x : mapping (e.g. a DataFrame row)
        Must provide ``"outbound_Arrival_Date"`` and ``"inbound_Departure_Date"``
        as datetime-like values.

    Returns
    -------
    int
        Number of listed holidays inside the trip window (0 to 3).
    """
    holidays = (
        datetime.datetime(2019, 12, 25),
        datetime.datetime(2019, 12, 26),
        datetime.datetime(2020, 1, 1),
    )
    trip_start = _x["outbound_Arrival_Date"]
    trip_end = _x["inbound_Departure_Date"]
    # The trip must start on/before the holiday and end after it.
    return sum(1 for day in holidays if trip_start <= day < trip_end)

def _working_days_in_trip(row):
    """Business days between arrival and return for one row, minus holiday(row)."""
    first_day = row["outbound_Arrival_Date"].date()
    return_day = row["inbound_Departure_Date"].date()
    # np.busday_count counts Mon-Fri in [first_day, return_day).
    return np.busday_count(first_day, return_day) - holiday(row)


df["weekdays"] = df.apply(_working_days_in_trip, axis=1)

# Express the trip length as a plain number of days before taking the ratio.
trip_length_in_days = df["days"] / np.timedelta64(1, 'D')
df["weekdays_to_days_ratio"] = df["weekdays"] / trip_length_in_days


In [None]:
# Show the first 500 rows of the frame as the cell output.
df.iloc[:500]

In [None]:
# Rows with at most 8 weekdays, a trip longer than 8 days,
# and destinationPlace equal to "CEB".
at_most_8_weekdays = df["weekdays"] <= 8
longer_than_8_days = df["days"] > np.timedelta64(8, 'D')
destination_is_ceb = df["destinationPlace"] == "CEB"

df[at_most_8_weekdays & longer_than_8_days & destination_is_ceb]

In [None]:
# Histogram of the "price" column with a logarithmic count axis.
fig, ax = plt.subplots(dpi=200)
# `nonposy` was deprecated in Matplotlib 3.3 and later removed; `nonpositive`
# is the current keyword. 'clip' keeps empty bins (count 0) drawable on a log axis.
ax.set_yscale('log', nonpositive='clip')
# 100 edges -> 99 equal-width bins over [150, 1000]; prices outside that range
# are not drawn.
ax.hist(df["price"].values, bins=np.linspace(150, 1000, 100), edgecolor="black")
ax.set_xlabel("price")
ax.set_ylabel("number of flights")
plt.show()