# Section 1: Downloading Flight Testing datasets

This section downloads both the departure and arrival datasets from [tablebuilder.singstat](https://tablebuilder.singstat.gov.sg/), the Department of Statistics Singapore website. The files were downloaded to our Google Drive and are fetched from there by the code. The 2023-2024 data (24 months) will represent 30% of the dataset.

In [17]:
#To download from google drive
pip install gdown

SyntaxError: invalid syntax (<ipython-input-17-585e64f7f42d>, line 2)

## Section 1.1 : Downloading Flight Departure For Testing

In [2]:
import gdown
import pandas as pd

# Google Drive file ID and local file name for the 2023/2024 departures extract
fileDep_id = "1CfEgn8RMfwpG_0RyUc3RvQFf03_LWQRQ"
fileDep_name = "departure_test-23-24.csv"

# Fetch the CSV from Google Drive into the working directory
gdown.download(f"https://drive.google.com/uc?id={fileDep_id}", fileDep_name, quiet=False)

# Read the table (header row is at zero-based line 9), drop rows with missing
# values, index by the first column, then transpose so that each data series
# becomes a row (labelled "Country") and each month becomes a column.
df_d_test = (
    pd.read_csv(fileDep_name, header=9)
    .dropna()
    .pipe(lambda frame: frame.set_index(frame.columns[0]))
    .T
    .reset_index()
    .rename(columns={"index": "Country"})
)

# Wide -> long: one row per (Country, Month) pair
df_long_test = df_d_test.melt(
    id_vars=["Country"],
    var_name="Month (YYYY-MM)",
    value_name="Departures",
)

# Preview the first few rows
print(df_long_test.head())


Downloading...
From: https://drive.google.com/uc?id=1CfEgn8RMfwpG_0RyUc3RvQFf03_LWQRQ
To: /content/departure_test-23-24.csv
100%|██████████| 6.58k/6.58k [00:00<00:00, 17.0MB/s]

                                             Country Month (YYYY-MM)  \
0        Number Of Air Passenger Departures (Number)        2024 Dec   
1  Number Of Air Passenger Departures -> South Ea...        2024 Dec   
2  Number Of Air Passenger Departures -> South Ea...        2024 Dec   
3  Number Of Air Passenger Departures -> South Ea...        2024 Dec   
4  Number Of Air Passenger Departures -> South Ea...        2024 Dec   

   Departures  
0   3144724.0  
1   1248739.0  
2    311140.0  
3    323182.0  
4    155269.0  





In [3]:

# Display settings: never truncate columns, rows or cell text when printing frames
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

# Strip the shared series prefix and the " (Number)" suffix so that only the
# region/country path is left in the 'Country' column
df_long_test['Country'] = (
    df_long_test['Country']
    .str.replace('Number Of Air Passenger Departures -> ', '', regex=False)
    .str.replace(' (Number)', '', regex=False)
)

# Preview the first 30 cleaned rows
print(df_long_test.head(30))

                               Country Month (YYYY-MM)  Departures
0   Number Of Air Passenger Departures        2024 Dec   3144724.0
1                      South East Asia        2024 Dec   1248739.0
2         South East Asia -> Indonesia        2024 Dec    311140.0
3          South East Asia -> Malaysia        2024 Dec    323182.0
4       South East Asia -> Philippines        2024 Dec    155269.0
5          South East Asia -> Thailand        2024 Dec    294105.0
6           South East Asia -> Vietnam        2024 Dec    110293.0
7                      North East Asia        2024 Dec    888607.0
8    North East Asia -> Mainland China        2024 Dec    370922.0
9         North East Asia -> Hong Kong        2024 Dec    117393.0
10            North East Asia -> Japan        2024 Dec    184868.0
11                          South Asia        2024 Dec    331296.0
12                         Middle East        2024 Dec     96868.0
13                             Oceania        2024 Dec    3120

In [4]:
# Keep only the "Region -> Country" rows; labels without an arrow are totals
df_countries_test = df_long_test[df_long_test['Country'].str.contains('->')].copy()

# Reduce each label to its last path component, i.e. the bare country name
df_countries_test['Country'] = (
    df_countries_test['Country'].str.rsplit('->', n=1).str[-1].str.strip()
)

# Renumber the rows, cast departures to integers and rewrite the month
# labels from "YYYY Mon" (e.g. "2024 Dec") to "YYYY-MM"
df_countries_index_test = (
    df_countries_test
    .reset_index(drop=True)
    .assign(**{
        'Departures': lambda frame: frame['Departures'].astype(int),
        'Month (YYYY-MM)': lambda frame: pd.to_datetime(
            frame['Month (YYYY-MM)'].str.strip(), format='%Y %b'
        ).dt.strftime('%Y-%m'),
    })
)

# Export into CSV
df_countries_index_test.to_csv("departures_test_final.csv", index=False)

# Alias under an explicit "dep" name for the merge with arrivals
# (this is the same object, not a copy)
df_countries_index_dep_test = df_countries_index_test




## Section 1.2 : Downloading Flight Arrivals For Testing

In [5]:
import gdown
import pandas as pd

# Google Drive file ID and local file name for the 2023/2024 arrivals extract
fileArr_id = "10KuTnelehV6N54pZos56sXkVH7Z5HIfY"
fileArr_name = "arrival_test-23-24.csv"

# Fetch the CSV from Google Drive into the working directory
gdown.download(f"https://drive.google.com/uc?id={fileArr_id}", fileArr_name, quiet=False)

# Read the table (header row is at zero-based line 9), drop rows with missing
# values, index by the first column, then transpose so that each data series
# becomes a row (labelled "Country") and each month becomes a column.
df_a_test = (
    pd.read_csv(fileArr_name, header=9)
    .dropna()
    .pipe(lambda frame: frame.set_index(frame.columns[0]))
    .T
    .reset_index()
    .rename(columns={"index": "Country"})
)

# Wide -> long: one row per (Country, Month) pair
df_arr_test = df_a_test.melt(
    id_vars=["Country"],
    var_name="Month (YYYY-MM)",
    value_name="Arrivals",
)

# Preview the first few rows
print(df_arr_test.head())


Downloading...
From: https://drive.google.com/uc?id=10KuTnelehV6N54pZos56sXkVH7Z5HIfY
To: /content/arrival_test-23-24.csv
100%|██████████| 93.1k/93.1k [00:00<00:00, 78.0MB/s]

                                                                       Country  \
0                                    Number Of Air Passenger Arrivals (Number)   
1                 Number Of Air Passenger Arrivals -> South East Asia (Number)   
2    Number Of Air Passenger Arrivals -> South East Asia -> Indonesia (Number)   
3     Number Of Air Passenger Arrivals -> South East Asia -> Malaysia (Number)   
4  Number Of Air Passenger Arrivals -> South East Asia -> Philippines (Number)   

  Month (YYYY-MM)   Arrivals  
0        2024 Dec  3218469.0  
1        2024 Dec    1244205  
2        2024 Dec   341960.0  
3        2024 Dec     337008  
4        2024 Dec   123812.0  





In [6]:

# Display settings: never truncate columns, rows or cell text when printing frames
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

# Strip the shared series prefix and the " (Number)" suffix so that only the
# region/country path is left in the 'Country' column
df_arr_test['Country'] = (
    df_arr_test['Country']
    .str.replace('Number Of Air Passenger Arrivals -> ', '', regex=False)
    .str.replace(' (Number)', '', regex=False)
)

# Preview the first 30 cleaned rows
print(df_arr_test.head(30))

                              Country Month (YYYY-MM)   Arrivals
0    Number Of Air Passenger Arrivals        2024 Dec  3218469.0
1                     South East Asia        2024 Dec    1244205
2        South East Asia -> Indonesia        2024 Dec   341960.0
3         South East Asia -> Malaysia        2024 Dec     337008
4      South East Asia -> Philippines        2024 Dec   123812.0
5         South East Asia -> Thailand        2024 Dec   279506.0
6          South East Asia -> Vietnam        2024 Dec     105848
7                     North East Asia        2024 Dec     878499
8   North East Asia -> Mainland China        2024 Dec     361183
9        North East Asia -> Hong Kong        2024 Dec   114321.0
10           North East Asia -> Japan        2024 Dec   183767.0
11                         South Asia        2024 Dec     315485
12                        Middle East        2024 Dec     105289
13                            Oceania        2024 Dec     375999
14                       

In [7]:
# Keep only the "Region -> Country" rows; labels without an arrow are totals
df_countries_arr_test = df_arr_test[df_arr_test['Country'].str.contains('->')].copy()

# Reduce each label to its last path component, i.e. the bare country name
df_countries_arr_test['Country'] = (
    df_countries_arr_test['Country'].str.rsplit('->', n=1).str[-1].str.strip()
)

# Renumber the rows, convert arrivals to integers and rewrite the month labels
# from "YYYY Mon" to "YYYY-MM".
# NOTE: values that cannot be parsed as numbers are coerced to NaN and then
# stored as 0, so a 0 here may mean "missing" rather than "no arrivals".
df_countries_index_arr_test = (
    df_countries_arr_test
    .reset_index(drop=True)
    .assign(**{
        'Arrivals': lambda frame: pd.to_numeric(
            frame['Arrivals'], errors='coerce'
        ).fillna(0).astype(int),
        'Month (YYYY-MM)': lambda frame: pd.to_datetime(
            frame['Month (YYYY-MM)'].str.strip(), format='%Y %b'
        ).dt.strftime('%Y-%m'),
    })
)

# Export to CSV
df_countries_index_arr_test.to_csv("arrivals_test_final.csv", index=False)

# Display the new DataFrame
df_countries_index_arr_test

Unnamed: 0,Country,Month (YYYY-MM),Arrivals
0,Indonesia,2024-12,341960
1,Malaysia,2024-12,337008
2,Philippines,2024-12,123812
3,Thailand,2024-12,279506
4,Vietnam,2024-12,105848
5,Mainland China,2024-12,361183
6,Hong Kong,2024-12,114321
7,Japan,2024-12,183767
8,France,2024-12,22678
9,Germany,2024-12,44055


## Section 1.3 : Merging Flight Departures & Arrivals For Testing

In [8]:
import pandas as pd

# Inner join on country and month: only (Country, Month) pairs present in both
# the arrivals and the departures tables are kept
merged_df_test = df_countries_index_arr_test.merge(
    df_countries_index_dep_test,
    how="inner",
    on=["Country", "Month (YYYY-MM)"],
)

# Persist the merged testing set
merged_df_test.to_csv("merged_flight_testing.csv", index=False)


# Section 2: Downloading Flight Training datasets

This section downloads both the departure and arrival datasets from [tablebuilder.singstat](https://tablebuilder.singstat.gov.sg/), the Department of Statistics Singapore website. The files were downloaded to our Google Drive and are fetched from there by the code. The remaining 70% is based on (24 months / 30%) * 70% = **56 months**. We take the 56 months before [COVID-19](https://en.wikipedia.org/wiki/COVID-19_pandemic#:~:text=The%20COVID%2D19%20pandemic%20(also,then%20worldwide%20in%20early%202020.)) (Dec 19), as the pandemic period would be an outlier in our dataset.

## Section 2.1 : Downloading Flight Departure For Training

In [9]:
import gdown
import pandas as pd

# Google Drive file ID and local file name for the 2015-2019 departures extract.
# NOTE(review): the local file name says "15-24" although this is described as
# the 15/19 dataset; the name is kept unchanged so the written file stays the same.
fileDep_id = "12IQx7qGJGd3i1rUVUo5L1eaN7lGa9H16"
fileDep_name = "departure_train-15-24.csv"

# Fetch the CSV from Google Drive into the working directory
gdown.download(f"https://drive.google.com/uc?id={fileDep_id}", fileDep_name, quiet=False)

# Read the table (header row is at zero-based line 9), drop rows with missing
# values, index by the first column, then transpose so that each data series
# becomes a row (labelled "Country") and each month becomes a column.
df_d_train = (
    pd.read_csv(fileDep_name, header=9)
    .dropna()
    .pipe(lambda frame: frame.set_index(frame.columns[0]))
    .T
    .reset_index()
    .rename(columns={"index": "Country"})
)

# Wide -> long: one row per (Country, Month) pair
df_long_train = df_d_train.melt(
    id_vars=["Country"],
    var_name="Month (YYYY-MM)",
    value_name="Departures",
)

# Preview the first few rows
print(df_long_train.head())


Downloading...
From: https://drive.google.com/uc?id=12IQx7qGJGd3i1rUVUo5L1eaN7lGa9H16
To: /content/departure_train-15-24.csv
100%|██████████| 11.4k/11.4k [00:00<00:00, 16.4MB/s]

                                                                         Country  \
0                                    Number Of Air Passenger Departures (Number)   
1                 Number Of Air Passenger Departures -> South East Asia (Number)   
2    Number Of Air Passenger Departures -> South East Asia -> Indonesia (Number)   
3     Number Of Air Passenger Departures -> South East Asia -> Malaysia (Number)   
4  Number Of Air Passenger Departures -> South East Asia -> Philippines (Number)   

  Month (YYYY-MM)  Departures  
0        2019 Nov   2855679.0  
1        2019 Nov   1208027.0  
2        2019 Nov    343364.0  
3        2019 Nov    276244.0  
4        2019 Nov    134604.0  





In [10]:

# Display settings: never truncate columns, rows or cell text when printing frames
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

# Strip the shared series prefix and the " (Number)" suffix so that only the
# region/country path is left in the 'Country' column
df_long_train['Country'] = (
    df_long_train['Country']
    .str.replace('Number Of Air Passenger Departures -> ', '', regex=False)
    .str.replace(' (Number)', '', regex=False)
)

# Preview the first 30 cleaned rows
print(df_long_train.head(30))

                               Country Month (YYYY-MM)  Departures
0   Number Of Air Passenger Departures        2019 Nov   2855679.0
1                      South East Asia        2019 Nov   1208027.0
2         South East Asia -> Indonesia        2019 Nov    343364.0
3          South East Asia -> Malaysia        2019 Nov    276244.0
4       South East Asia -> Philippines        2019 Nov    134604.0
5          South East Asia -> Thailand        2019 Nov    267407.0
6           South East Asia -> Vietnam        2019 Nov    116564.0
7                      North East Asia        2019 Nov    733082.0
8    North East Asia -> Mainland China        2019 Nov    291946.0
9         North East Asia -> Hong Kong        2019 Nov    110116.0
10            North East Asia -> Japan        2019 Nov    155997.0
11                          South Asia        2019 Nov    285565.0
12                         Middle East        2019 Nov     79867.0
13                             Oceania        2019 Nov    2917

In [11]:
# Keep only the "Region -> Country" rows; labels without an arrow are totals
df_countries_train = df_long_train[df_long_train['Country'].str.contains('->')].copy()

# Reduce each label to its last path component, i.e. the bare country name
df_countries_train['Country'] = (
    df_countries_train['Country'].str.rsplit('->', n=1).str[-1].str.strip()
)

# Renumber the rows, cast departures to integers and rewrite the month
# labels from "YYYY Mon" (e.g. "2019 Nov") to "YYYY-MM"
df_countries_index_train = (
    df_countries_train
    .reset_index(drop=True)
    .assign(**{
        'Departures': lambda frame: frame['Departures'].astype(int),
        'Month (YYYY-MM)': lambda frame: pd.to_datetime(
            frame['Month (YYYY-MM)'].str.strip(), format='%Y %b'
        ).dt.strftime('%Y-%m'),
    })
)

# Export into CSV
df_countries_index_train.to_csv("departures_train_final.csv", index=False)

# Alias under an explicit "dep" name for the merge with arrivals
# (this is the same object, not a copy)
df_countries_index_dep_train = df_countries_index_train




## Section 2.2 : Downloading Flight Arrivals For Training

In [12]:
import gdown
import pandas as pd

# Google Drive file ID and local file name for the 2015-2019 arrivals extract.
# NOTE(review): the local file name says "test" although this cell builds the
# training data; the name is kept unchanged so the written file stays the same.
fileArr_id = "1KayLfND03TnB7JNCERYAV1d6GaDPpN2L"
fileArr_name = "arrival_test-15-19.csv"

# Fetch the CSV from Google Drive into the working directory
gdown.download(f"https://drive.google.com/uc?id={fileArr_id}", fileArr_name, quiet=False)

# Read the table (header row is at zero-based line 9), drop rows with missing
# values, index by the first column, then transpose so that each data series
# becomes a row (labelled "Country") and each month becomes a column.
df_a_train = (
    pd.read_csv(fileArr_name, header=9)
    .dropna()
    .pipe(lambda frame: frame.set_index(frame.columns[0]))
    .T
    .reset_index()
    .rename(columns={"index": "Country"})
)

# Wide -> long: one row per (Country, Month) pair
df_arr_train = df_a_train.melt(
    id_vars=["Country"],
    var_name="Month (YYYY-MM)",
    value_name="Arrivals",
)

# Preview the first few rows
print(df_arr_train.head())


Downloading...
From: https://drive.google.com/uc?id=1KayLfND03TnB7JNCERYAV1d6GaDPpN2L
To: /content/arrival_test-15-19.csv
100%|██████████| 11.4k/11.4k [00:00<00:00, 23.4MB/s]

                                                                       Country  \
0                                    Number Of Air Passenger Arrivals (Number)   
1                 Number Of Air Passenger Arrivals -> South East Asia (Number)   
2    Number Of Air Passenger Arrivals -> South East Asia -> Indonesia (Number)   
3     Number Of Air Passenger Arrivals -> South East Asia -> Malaysia (Number)   
4  Number Of Air Passenger Arrivals -> South East Asia -> Philippines (Number)   

  Month (YYYY-MM)   Arrivals  
0        2019 Nov  2814414.0  
1        2019 Nov  1217887.0  
2        2019 Nov   343394.0  
3        2019 Nov   294208.0  
4        2019 Nov   125603.0  





In [13]:

# Display settings: never truncate columns, rows or cell text when printing frames
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

# Strip the shared series prefix and the " (Number)" suffix so that only the
# region/country path is left in the 'Country' column
df_arr_train['Country'] = (
    df_arr_train['Country']
    .str.replace('Number Of Air Passenger Arrivals -> ', '', regex=False)
    .str.replace(' (Number)', '', regex=False)
)

# Preview the first 30 cleaned rows
print(df_arr_train.head(30))

                              Country Month (YYYY-MM)   Arrivals
0    Number Of Air Passenger Arrivals        2019 Nov  2814414.0
1                     South East Asia        2019 Nov  1217887.0
2        South East Asia -> Indonesia        2019 Nov   343394.0
3         South East Asia -> Malaysia        2019 Nov   294208.0
4      South East Asia -> Philippines        2019 Nov   125603.0
5         South East Asia -> Thailand        2019 Nov   268198.0
6          South East Asia -> Vietnam        2019 Nov   115505.0
7                     North East Asia        2019 Nov   714433.0
8   North East Asia -> Mainland China        2019 Nov   283753.0
9        North East Asia -> Hong Kong        2019 Nov   111919.0
10           North East Asia -> Japan        2019 Nov   149539.0
11                         South Asia        2019 Nov   266295.0
12                        Middle East        2019 Nov    73404.0
13                            Oceania        2019 Nov   285487.0
14                       

In [14]:
# Keep only the "Region -> Country" rows; labels without an arrow are totals
df_countries_arr_train = df_arr_train[df_arr_train['Country'].str.contains('->')].copy()

# Reduce each label to its last path component, i.e. the bare country name
df_countries_arr_train['Country'] = (
    df_countries_arr_train['Country'].str.rsplit('->', n=1).str[-1].str.strip()
)

# Renumber the rows, convert arrivals to integers and rewrite the month labels
# from "YYYY Mon" to "YYYY-MM".
# NOTE: values that cannot be parsed as numbers are coerced to NaN and then
# stored as 0, so a 0 here may mean "missing" rather than "no arrivals".
df_countries_index_arr_train = (
    df_countries_arr_train
    .reset_index(drop=True)
    .assign(**{
        'Arrivals': lambda frame: pd.to_numeric(
            frame['Arrivals'], errors='coerce'
        ).fillna(0).astype(int),
        'Month (YYYY-MM)': lambda frame: pd.to_datetime(
            frame['Month (YYYY-MM)'].str.strip(), format='%Y %b'
        ).dt.strftime('%Y-%m'),
    })
)

# Export to CSV
df_countries_index_arr_train.to_csv("arrivals_train_final.csv", index=False)

# Display the new DataFrame
df_countries_index_arr_train

Unnamed: 0,Country,Month (YYYY-MM),Arrivals
0,Indonesia,2019-11,343394
1,Malaysia,2019-11,294208
2,Philippines,2019-11,125603
3,Thailand,2019-11,268198
4,Vietnam,2019-11,115505
5,Mainland China,2019-11,283753
6,Hong Kong,2019-11,111919
7,Japan,2019-11,149539
8,France,2019-11,17410
9,Germany,2019-11,45538


## Section 2.3 : Merging Flight Departures & Arrivals For Training

In [15]:
import pandas as pd

# Inner join on country and month: only (Country, Month) pairs present in both
# the arrivals and the departures tables are kept
merged_df_train = df_countries_index_arr_train.merge(
    df_countries_index_dep_train,
    how="inner",
    on=["Country", "Month (YYYY-MM)"],
)

# Persist the merged training set
merged_df_train.to_csv("merged_flight_train.csv", index=False)


## Section 3.0 : Web crawling from www.timeanddate.com

In [18]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
from datetime import datetime

# Load the merged training flights; its countries decide which calendars to scrape
flights_df = pd.read_csv("merged_flight_train.csv")

# Map flight-dataset country names to the names used in the website's dropdown
country_name_mapping = {
    "Mainland China": "China"
}

# Apply the mapping, then collect the distinct country names
flights_df["Country"] = flights_df["Country"].replace(country_name_mapping)
unique_countries = flights_df["Country"].unique()

# Base URL template for holiday scraping
BASE_URL = "https://www.timeanddate.com/calendar/custom.html?year={year}&country={country}&cols=3&df=1&hol=1&lang=en"

# Main URL to get country list
URL = "https://www.timeanddate.com/calendar/custom.html"

# Fetch the page holding the country dropdown. The timeout stops a stalled
# connection from hanging the cell, and raise_for_status() turns an HTTP error
# into an explicit exception instead of parsing an error page.
resp = requests.get(URL, timeout=30)
resp.raise_for_status()
soup = BeautifulSoup(resp.text, "html.parser")
country_select = soup.find("select", {"id": "sf_country"})
if country_select is None:
    # Without this check the failure would be an opaque AttributeError below
    raise RuntimeError(f"Country dropdown 'sf_country' not found at {URL}")

# Map each dropdown option's code to its display name (options without a
# value attribute carry no country code and are skipped)
countries = {
    option["value"]: option.text.strip()
    for option in country_select.find_all("option")
    if option.has_attr("value")
}

# Keep only the countries that appear in the flights data (after mapping)
filtered_countries = {code: name for code, name in countries.items() if name in unique_countries}

# Years 2015 - 2024
years = list(range(2015, 2025))

# List to store extracted holiday data
holiday_data = []

def format_date(dd_mmm, year):
    """Convert a day/month string plus a year into an ISO date string.

    Args:
        dd_mmm: Day and abbreviated month name, e.g. "1 Jan".
        year: Year to attach to the day/month (anything that formats as "YYYY").

    Returns:
        The date as "YYYY-MM-DD", or None when the text does not parse as a
        valid date. A single None is returned (not a tuple) so that callers
        can reject the result with a plain truthiness check.
    """
    try:
        # Parse e.g. "1 Jan 2020"; impossible dates such as "31 Feb" raise ValueError
        date_obj = datetime.strptime(f"{dd_mmm} {year}", "%d %b %Y")
    except ValueError:
        return None  # Unexpected or invalid format

    return date_obj.strftime("%Y-%m-%d")

# Scrape the holiday table for every relevant country and year
for country_code, country_name in tqdm(filtered_countries.items(), desc="Scraping holidays"):
    for year in years:
        url = BASE_URL.format(year=year, country=country_code)
        # The timeout keeps one stalled request from hanging the whole crawl
        resp = requests.get(url, timeout=30)

        if resp.status_code != 200:
            # Report skipped pages instead of dropping them silently
            tqdm.write(f"Skipped {country_name} {year}: HTTP {resp.status_code}")
            continue

        soup = BeautifulSoup(resp.text, "html.parser")
        holiday_table = soup.find("table", {"class": "cl1h"})
        if not holiday_table:
            continue

        for holiday_row in holiday_table.find_all("tr"):
            date_span = holiday_row.find("span", {"class": "co1"})
            name_td = holiday_row.find("a")
            if not (date_span and name_td):
                continue

            raw_date = date_span.text.strip()
            full_date = format_date(raw_date, year)  # Format date
            holiday_name = name_td.text.strip()

            # Only accept a real date string; any failure sentinel (None or a
            # tuple of Nones) must not be stored as a date
            if isinstance(full_date, str):
                holiday_data.append([full_date, holiday_name, country_name, year])

# Convert to DataFrame
df = pd.DataFrame(holiday_data, columns=["Date", "Event", "Country", "Year"])

# Save to Excel
df.to_excel("holidays.xlsx", index=False)

print("Data successfully saved to holidays.xlsx")

Scraping holidays: 100%|██████████| 11/11 [00:21<00:00,  1.99s/it]


Data successfully saved to holidays.xlsx


## Section 3.1 : Clean holiday dataset

In [19]:
# Input: the workbook written by the crawling cell (Section 3.0);
# output: the same rows with exact duplicates removed
file_path = "holidays.xlsx"
cleaned_file_path = "holidays_cleaned.xlsx"

# Read the scraped holidays and drop rows that are identical in every column
df = pd.read_excel(file_path)
df_cleaned = df.drop_duplicates()

# Write the de-duplicated table to its own workbook
df_cleaned.to_excel(cleaned_file_path, index=False)

print("Duplicates removed. Cleaned data saved to:", cleaned_file_path)

Duplicates removed. Cleaned data saved to: holidays_cleaned.xlsx


## Section 3.2 : Number of holidays for that month

In [20]:
# Map the website's country names back to the names used in the flight files
country_name_mapping = {
    "China" : "Mainland China"
}

# Load the cleaned holidays, rename countries, parse the dates and derive
# the 'YYYY-MM' month key used for joining with the flight data
holiday_df = (
    pd.read_excel("holidays_cleaned.xlsx")
    .assign(
        Country=lambda frame: frame["Country"].replace(country_name_mapping),
        Date=lambda frame: pd.to_datetime(frame["Date"]),
    )
    .assign(**{"Month (YYYY-MM)": lambda frame: frame["Date"].dt.strftime("%Y-%m")})
)

# Number of holiday rows per country per month
holiday_counts = (
    holiday_df
    .groupby(["Country", "Month (YYYY-MM)"])
    .size()
    .reset_index(name="No of holidays")
)

# Input flight file -> output file enriched with the holiday count
files = {
    "merged_flight_testing.csv": "merged_flight_testing_with_holidays.csv",
    "merged_flight_train.csv": "merged_flight_train_with_holidays.csv",
}

for input_file, output_file in files.items():
    df = pd.read_csv(input_file)

    # Left join keeps every flight row; months without any holiday get NaN,
    # which is then replaced by 0 and stored as an integer count
    merged_df = (
        df
        .merge(holiday_counts, how="left", on=["Country", "Month (YYYY-MM)"])
        .assign(**{
            "No of holidays": lambda frame: frame["No of holidays"].fillna(0).astype(int)
        })
    )

    merged_df.to_csv(output_file, index=False)

    print(f"Processed and saved: {output_file}")

Processed and saved: merged_flight_testing_with_holidays.csv
Processed and saved: merged_flight_train_with_holidays.csv
