# Section 1: Downloading Flight datasets

This section downloads both the departure and arrival datasets, which originate from [tablebuilder.singstat](https://tablebuilder.singstat.gov.sg/), the Department of Statistics Singapore website. The files are stored on our Google Drive and are downloaded from there in the code.

In [None]:
# Install gdown, which the cells below import to download the dataset files from Google Drive.
# NOTE(review): a bare `pip install` is not Python syntax; this presumably relies on
# IPython/Jupyter automagic (the explicit form would be `%pip install gdown`) - confirm.
pip install gdown

## Section 1.1 : Downloading Flight Departure For Testing

In [None]:
import gdown
import pandas as pd

# Departure dataset (23/24): Google Drive file ID and the local name to save it under
fileDep_id = "1CfEgn8RMfwpG_0RyUc3RvQFf03_LWQRQ"
fileDep_name = "departure-23-24.csv"

# Fetch the file from Google Drive into the working directory
gdown.download(f"https://drive.google.com/uc?id={fileDep_id}", fileDep_name, quiet=False)

# Read the CSV (header=9 selects the row that holds the column labels)
# and discard every row that contains a missing value
df_d = pd.read_csv(fileDep_name, header=9).dropna()

# Index by the first column, transpose (months → columns, countries → rows),
# then turn the new index back into a regular column; reset_index() names
# that column "index", so it is renamed to "Country"
df_d = (
    df_d.set_index(df_d.columns[0])
    .T.reset_index()
    .rename(columns={"index": "Country"})
)

# Wide → long: one row per (Country, month) pair with the count in "Departures"
df_long = df_d.melt(
    id_vars=["Country"],
    var_name="Month (YYYY-MM)",
    value_name="Departures",
)

# Optional export of the reshaped table (left disabled)
#df_long.to_csv("departures_refactor.csv", index=False)
print(df_long.head())  # Preview the first few rows


In [None]:

# Lift pandas' display limits: full cell contents, all rows, all columns
for _display_option in ('display.max_colwidth', 'display.max_rows', 'display.max_columns'):
    pd.set_option(_display_option, None)

# Strip the shared prefix and suffix from every 'Country' label.
# regex=False makes both patterns literal, so the parentheses in
# ' (Number)' are matched as plain characters.
df_long['Country'] = (
    df_long['Country']
    .str.replace('Number Of Air Passenger Departures -> ', '', regex=False)
    .str.replace(' (Number)', '', regex=False)
)

# Show the first 30 cleaned rows
print(df_long.head(30))

In [None]:
# Keep only the rows whose label still contains '->' (matched literally);
# presumably these are the "<region> -> <country>" entries, while labels
# without an arrow are totals/region aggregates - verify against the data
df_countries = df_long[df_long['Country'].str.contains('->', regex=False)].copy()

# Reduce each label to the text after the last '->', trimmed of whitespace
df_countries['Country'] = df_countries['Country'].apply(lambda x: x.split('->')[-1].strip())

# Reset the index so the remaining rows are numbered 0..n-1
df_countries_index = df_countries.reset_index(drop=True)

# Convert the Departures column to integers. Values that cannot be parsed as
# numbers are coerced to NaN and then replaced by 0, instead of making
# astype(int) raise; this matches how the Arrivals column is converted.
df_countries_index['Departures'] = (
    pd.to_numeric(df_countries_index['Departures'], errors='coerce').fillna(0).astype(int)
)

# Display the new DataFrame
df_countries_index

## Section 1.2 : Downloading Flight Arrivals For Testing

In [None]:
import gdown
import pandas as pd

# Arrival dataset (23/24): Google Drive file ID and the local name to save it under
fileArr_id = "10KuTnelehV6N54pZos56sXkVH7Z5HIfY"
fileArr_name = "arrival-23-24.csv"

# Fetch the file from Google Drive into the working directory
gdown.download(f"https://drive.google.com/uc?id={fileArr_id}", fileArr_name, quiet=False)

# Read the CSV (header=9 selects the row that holds the column labels)
# and discard every row that contains a missing value
df_a = pd.read_csv(fileArr_name, header=9).dropna()

# Index by the first column, transpose (months → columns, countries → rows),
# then turn the new index back into a regular column; reset_index() names
# that column "index", so it is renamed to "Country"
df_a = (
    df_a.set_index(df_a.columns[0])
    .T.reset_index()
    .rename(columns={"index": "Country"})
)

# Wide → long: one row per (Country, month) pair with the count in "Arrivals"
df_arr = df_a.melt(
    id_vars=["Country"],
    var_name="Month (YYYY-MM)",
    value_name="Arrivals",
)

# Write the reshaped table to disk and preview the first few rows
df_arr.to_csv("arrival_refactor.csv", index=False)
print(df_arr.head())


In [None]:

# Lift pandas' display limits: full cell contents, all rows, all columns
for _display_option in ('display.max_colwidth', 'display.max_rows', 'display.max_columns'):
    pd.set_option(_display_option, None)

# Strip the shared prefix and suffix from every 'Country' label.
# regex=False makes both patterns literal, so the parentheses in
# ' (Number)' are matched as plain characters.
df_arr['Country'] = (
    df_arr['Country']
    .str.replace('Number Of Air Passenger Arrivals -> ', '', regex=False)
    .str.replace(' (Number)', '', regex=False)
)

# Show the first 30 cleaned rows
print(df_arr.head(30))

In [None]:
# Select the rows whose label contains '->' and work on an independent copy
_has_arrow = df_arr['Country'].str.contains('->')
df_countries_arr = df_arr.loc[_has_arrow].copy()

# Keep only the text after the last '->' in each label, trimmed of whitespace
df_countries_arr['Country'] = (
    df_countries_arr['Country'].str.rsplit('->', n=1).str[-1].str.strip()
)

# Reset the index so the remaining rows are numbered 0..n-1
df_countries_index_arr = df_countries_arr.reset_index(drop=True)

# Convert the Arrivals column to integers: values that cannot be parsed
# become NaN, NaN becomes 0, and the result is cast to int
_arrivals_numeric = pd.to_numeric(df_countries_index_arr['Arrivals'], errors='coerce')
df_countries_index_arr['Arrivals'] = _arrivals_numeric.fillna(0).astype(int)

# Display the new DataFrame
df_countries_index_arr