In [4]:
import pandas as pd

# Load the wide-format departures table (one column per period)
df = pd.read_csv("M650041.csv")

# Melt the data from wide to long format: one row per (series, period) pair
df_long = df.melt(id_vars=["Data Series"], var_name="Month_Year", value_name="Departures")

# Split the period label on whitespace: the first token is stored as 'Year'
# and the second as 'Month'. The assignment requires exactly two tokens per label.
df_long[['Year', 'Month']] = df_long['Month_Year'].str.split(expand=True)

# Reorder and rename columns
df_long = df_long[['Month', 'Year', 'Data Series', 'Departures']].rename(columns={'Data Series': 'Country'})

# Convert 'Departures' to numeric; values that cannot be parsed become NaN
df_long['Departures'] = pd.to_numeric(df_long['Departures'], errors='coerce')

# Display result
print(df_long.head())

# Save under the file name that the merge cell reads ("departures.csv"),
# mirroring "arrivals.csv", so the notebook runs top to bottom without a manual rename.
df_long.to_csv("departures.csv", index=False)


  Month  Year                             Country  Departures
0   Jan  2025  Number Of Air Passenger Departures   2971086.0
1   Jan  2025                     South East Asia   1146441.0
2   Jan  2025                           Indonesia    330478.0
3   Jan  2025                            Malaysia    286238.0
4   Jan  2025                         Philippines    129892.0


In [5]:
import pandas as pd

# Read the wide-format arrivals table (one column per period)
df = pd.read_csv("M650051.csv")

# Wide -> long: one row per (series, period) pair
df_long = pd.melt(df, id_vars=["Data Series"], var_name="Month_Year", value_name="Arrivals")

# Break the period label on whitespace; first token goes to 'Year', second to 'Month'
year_month = df_long["Month_Year"].str.split(expand=True)
df_long[["Year", "Month"]] = year_month

# Rename the series column, fix the column order, and coerce the counts to numbers
# (values that cannot be parsed become NaN)
df_long = (
    df_long
    .rename(columns={"Data Series": "Country"})
    .loc[:, ["Month", "Year", "Country", "Arrivals"]]
    .assign(Arrivals=lambda frame: pd.to_numeric(frame["Arrivals"], errors="coerce"))
)

# Preview the first rows
print(df_long.head())

# Persist the long-format table for the merge step
df_long.to_csv("arrivals.csv", index=False)


  Month  Year                           Country   Arrivals
0   Jan  2025  Number Of Air Passenger Arrivals  3130805.0
1   Jan  2025                   South East Asia  1189806.0
2   Jan  2025                         Indonesia   319040.0
3   Jan  2025                          Malaysia   303021.0
4   Jan  2025                       Philippines   150016.0


In [23]:
import pandas as pd

# Read the two long-format tables produced earlier
df_departures = pd.read_csv("departures.csv")
df_arrivals = pd.read_csv("arrivals.csv")

# Inner join: only (Year, Month, Country) combinations present in both tables survive
merge_keys = ['Year', 'Month', 'Country']
df_combined = df_departures.merge(df_arrivals, how='inner', on=merge_keys)

# Preview the merged table, then write it out
print(df_combined.head())
df_combined.to_csv("combined_data.csv", index=False)


  Month  Year            Country  Departures   Arrivals
0   Jan  2025    South East Asia   1146441.0  1189806.0
1   Jan  2025          Indonesia    330478.0   319040.0
2   Jan  2025           Malaysia    286238.0   303021.0
3   Jan  2025        Philippines    129892.0   150016.0
4   Jan  2025           Thailand    246544.0   264939.0


In [51]:
import pandas as pd

# Load the merged departures/arrivals dataset
df = pd.read_csv("combined_data.csv")

# Keep only data from 2000 onwards. The explicit .copy() makes the filtered
# frame independent of the loaded one, so later column assignments on `df`
# do not trigger pandas' SettingWithCopyWarning.
df = df[df['Year'] >= 2000].copy()

# Sanity check: list the years that remain
print(df['Year'].unique())

[2025 2024 2023 2022 2021 2020 2019 2018 2017 2016 2015 2014 2013 2012
 2011 2010 2009 2008 2007 2006 2005 2004 2003 2002 2001 2000]


In [52]:
# Strip leading and trailing spaces from the 'Country' column.
# .assign() builds a new frame instead of writing into `df`, which is a
# filtered slice; this avoids pandas' SettingWithCopyWarning.
df = df.assign(Country=df['Country'].str.strip())

# Remove rows whose 'Country' is exactly 'Other Regions' (case-sensitive match)
df = df[df['Country'] != 'Other Regions']

# Order rows by year (row order within a year is not otherwise controlled here)
df = df.sort_values(by=['Year'], ascending=[True]).reset_index(drop=True)

# Define training and testing sets based on years.
# 2020 to 2022 are left out of both sets: these are the years when Covid-19
# happened and the laws then may have affected flights.
train_df = df[df['Year'] <= 2019]  # Train: everything up to and including 2019
# Also drop 2002 to 2004 (inclusive) from training because the SARS outbreak happened then
train_df = train_df[~train_df['Year'].between(2002, 2004)]
test_df = df[df['Year'] >= 2023]   # Test: 2023 onwards

# Save train and test datasets
train_df.to_csv("train_data.csv", index=False)
test_df.to_csv("test_data.csv", index=False)

print(f"Training set size: {train_df.shape}")
print(f"Testing set size: {test_df.shape}")


Training set size: (3672, 5)
Testing set size: (450, 5)
