In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

# Read the training CSV into `df`; the path is relative to the notebook's
# working directory.
df = pd.read_csv(filepath_or_buffer="merged_flight_train_with_holidays.csv")

In [None]:
# Preprocess data
# Split the 'Month (YYYY-MM)' string column into integer 'Year' and 'Month'
# columns, then drop the original column.
#
# The guard makes this step idempotent: once the source column has been
# consumed, re-running the cell is a no-op instead of raising KeyError.
# A new frame is built (no inplace mutation), so a failure part-way through
# never leaves `df` half-transformed.
_month_col = 'Month (YYYY-MM)'
if _month_col in df.columns:
    # Vectorised split: column 0 holds the year part, column 1 the month part.
    _year_month = df[_month_col].str.split('-', expand=True)
    df = (
        df.assign(
            Year=_year_month[0].astype('int64'),
            Month=_year_month[1].astype('int64'),
        )
        .drop(columns=[_month_col])
    )

# Both models share the calendar/holiday columns and the one-hot encoded
# Country (first level dropped); they differ only in which traffic column is
# the extra predictor and which one is the target.
base_features = ['No of holidays', 'Year', 'Month']
country_dummies = pd.get_dummies(df['Country'], prefix='Country', drop_first=True)

# Model 1: Arrivals is the target, Departures is a predictor
X_arrivals = pd.concat([df[base_features + ['Departures']], country_dummies], axis=1)
y_arrivals = df['Arrivals']

# Model 2: Departures is the target, Arrivals is a predictor
X_departures = pd.concat([df[base_features + ['Arrivals']], country_dummies], axis=1)
y_departures = df['Departures']

# Hold-out configuration shared by both models: 30% of rows go to the test
# set, with a fixed seed so the split is reproducible.
split_options = {'test_size': 0.3, 'random_state': 42}

# Train/test partition for the Arrivals model
X_train_arr, X_test_arr, y_train_arr, y_test_arr = train_test_split(
    X_arrivals, y_arrivals, **split_options
)

# Train/test partition for the Departures model
X_train_dep, X_test_dep, y_train_dep, y_test_dep = train_test_split(
    X_departures, y_departures, **split_options
)

# Fit one linear regression per target on its own training partition.
# `fit` returns the estimator itself, so construction and fitting are chained.
lin_reg_arrivals = LinearRegression().fit(X_train_arr, y_train_arr)
lin_reg_departures = LinearRegression().fit(X_train_dep, y_train_dep)

# Predict on the held-out rows of each model
y_pred_arr = lin_reg_arrivals.predict(X_test_arr)
y_pred_dep = lin_reg_departures.predict(X_test_dep)

# Test-set metrics for the Arrivals model
mse_arr = mean_squared_error(y_true=y_test_arr, y_pred=y_pred_arr)
r2_arr = r2_score(y_true=y_test_arr, y_pred=y_pred_arr)

# Test-set metrics for the Departures model
mse_dep = mean_squared_error(y_true=y_test_dep, y_pred=y_pred_dep)
r2_dep = r2_score(y_true=y_test_dep, y_pred=y_pred_dep)

# Report the test-set metrics, one line per item. The leading "\n" on the
# second header leaves a blank line between the two reports.
print(
    "Arrivals Model:",
    f"Mean Squared Error: {mse_arr:.2f}",
    f"R² Score: {r2_arr:.4f}",
    sep="\n",
)

print(
    "\nDepartures Model:",
    f"Mean Squared Error: {mse_dep:.2f}",
    f"R² Score: {r2_dep:.4f}",
    sep="\n",
)

# Actual vs predicted values for both models, side by side
fig, (ax_arr, ax_dep) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: Arrivals. The dashed red diagonal (y = x) spans the observed
# range of the actual values; points on it are perfect predictions.
arr_span = [y_test_arr.min(), y_test_arr.max()]
ax_arr.scatter(y_test_arr, y_pred_arr, alpha=0.5)
ax_arr.plot(arr_span, arr_span, 'r--', lw=2)
ax_arr.set_title('Actual vs Predicted Arrivals')
ax_arr.set_xlabel('Actual Arrivals')
ax_arr.set_ylabel('Predicted Arrivals')
ax_arr.grid(True, linestyle='--', alpha=0.6)

# Right panel: Departures, same layout
dep_span = [y_test_dep.min(), y_test_dep.max()]
ax_dep.scatter(y_test_dep, y_pred_dep, alpha=0.5)
ax_dep.plot(dep_span, dep_span, 'r--', lw=2)
ax_dep.set_title('Actual vs Predicted Departures')
ax_dep.set_xlabel('Actual Departures')
ax_dep.set_ylabel('Predicted Departures')
ax_dep.grid(True, linestyle='--', alpha=0.6)

fig.tight_layout()
plt.show()

# Residuals (actual - predicted) against predicted values; visible structure
# around the zero line would indicate a pattern the linear fit is missing.
residuals_arr = y_test_arr - y_pred_arr
residuals_dep = y_test_dep - y_pred_dep

fig, (ax_arr, ax_dep) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: Arrivals residuals, with a dashed red zero-error reference line
ax_arr.scatter(y_pred_arr, residuals_arr, alpha=0.5)
ax_arr.axhline(y=0, color='r', linestyle='--')
ax_arr.set_title('Residuals for Arrivals Model')
ax_arr.set_xlabel('Predicted Arrivals')
ax_arr.set_ylabel('Residuals')
ax_arr.grid(True, linestyle='--', alpha=0.6)

# Right panel: Departures residuals, same layout
ax_dep.scatter(y_pred_dep, residuals_dep, alpha=0.5)
ax_dep.axhline(y=0, color='r', linestyle='--')
ax_dep.set_title('Residuals for Departures Model')
ax_dep.set_xlabel('Predicted Departures')
ax_dep.set_ylabel('Residuals')
ax_dep.grid(True, linestyle='--', alpha=0.6)

fig.tight_layout()
plt.show()

ModuleNotFoundError: No module named 'pandas'