In [4]:
import pandas as pd
import numpy as np
import calendar

from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [5]:
# Load datasets
df_train = pd.read_csv("../preprocessing/train_data.csv")
df_test = pd.read_csv("../preprocessing/test_data.csv")
df_holidays = pd.read_excel("../Dataset/singapore_holidays_00_25_month.xlsx")
df_inflation = pd.read_csv("global_inflation_data.csv")

# Define mapping of month names to numbers
month_mapping = {
    'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6,
    'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12
}

# Apply the month mapping to both train and test data
df_train['Month'] = df_train['Month'].map(month_mapping)
df_test['Month'] = df_test['Month'].map(month_mapping)

# Sort by Year and Month (ascending order) for both datasets
df_train = df_train.sort_values(by=['Year', 'Month'], ascending=[True, True]).reset_index(drop=True)
df_test = df_test.sort_values(by=['Year', 'Month'], ascending=[True, True]).reset_index(drop=True)

# Convert Month to integer type for both datasets
df_train['Month'] = df_train['Month'].astype(int)
df_test['Month'] = df_test['Month'].astype(int)

# Merge holidays data with train and test data
df_train = pd.merge(df_train, df_holidays, how='left', on=['Year', 'Month'])
df_test = pd.merge(df_test, df_holidays, how='left', on=['Year', 'Month'])

# Transform inflation data from wide to long format (melt)
inflation_df = df_inflation.melt(id_vars=["country_name", "indicator_name"], 
                                 var_name="Year", value_name="Inflation")

# Convert "Year" to integer
inflation_df["Year"] = inflation_df["Year"].astype(int)

# Drop the "indicator_name" column
inflation_df.drop(columns=["indicator_name"], inplace=True)

# Rename 'country_name' to 'Country' for merging consistency
inflation_df.rename(columns={"country_name": "Country"}, inplace=True)

# Merge global inflation data with train and test data
df_train = pd.merge(df_train, inflation_df, how='left', on=["Country", "Year"])
df_test = pd.merge(df_test, inflation_df, how='left', on=["Country", "Year"])

# Forward fill missing values in Inflation column
df_train["Inflation"].fillna(method="ffill", inplace=True)
df_test["Inflation"].fillna(method="ffill", inplace=True)

# Drop rows where Inflation is still missing after forward fill
df_train = df_train.dropna(subset=["Inflation"])
df_test = df_test.dropna(subset=["Inflation"])

# Clean up unnecessary columns (only drop if the column exists)
columns_to_drop = ['country_name', 'year']
df_train = df_train.drop(columns=[col for col in columns_to_drop if col in df_train.columns])
df_test = df_test.drop(columns=[col for col in columns_to_drop if col in df_test.columns])

# Save the merged data to CSV for checking purposes
df_train.to_csv('train_data_merged.csv', index=False)
df_test.to_csv('test_data_merged.csv', index=False)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train["Inflation"].fillna(method="ffill", inplace=True)
  df_train["Inflation"].fillna(method="ffill", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_test["Inflation"].fillna(method="ffill", inplace=True)
  df_test["Inflation"].fillna(method="ffill", inplace=

In [6]:
# Convert Month into cyclic features
df_train['Month_sin'] = np.sin(2 * np.pi * df_train['Month'] / 12)
df_train['Month_cos'] = np.cos(2 * np.pi * df_train['Month'] / 12)
df_test['Month_sin'] = np.sin(2 * np.pi * df_test['Month'] / 12)
df_test['Month_cos'] = np.cos(2 * np.pi * df_test['Month'] / 12)

# Create target variable: Total Traffic (Arrivals + Departures)
df_train['Total_Traffic'] = df_train['Arrivals'] + df_train['Departures']
df_test['Total_Traffic'] = df_test['Arrivals'] + df_test['Departures']

# Standardize numerical features like inflation and holidays
scaler = StandardScaler()
df_train[['Inflation', 'Total Holidays']] = scaler.fit_transform(df_train[['Inflation', 'Total Holidays']])
df_test[['Inflation', 'Total Holidays']] = scaler.transform(df_test[['Inflation', 'Total Holidays']])

# Save the processed data to CSV
df_train.to_csv('train_data_feature_engineered.csv', index=False)
df_test.to_csv('test_data_feature_engineered.csv', index=False)
