Ensemble Methods for `clean_business_df` and `clean_economy_df`
- Bagging and Pasting
- Random Forest

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, RobustScaler, MinMaxScaler
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
# Import necessary libraries
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder


In [None]:
# Load the cleaned business- and economy-class tables (in that order).
business_df, economy_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../data/clean/clean_business_df.csv',
        '../../data/clean/clean_economy_df.csv',
    )
)

In [None]:
# Preview the first five business-class rows as a quick sanity check of the load.
business_df.head(n=5)

In [None]:
# Preview the first five economy-class rows as a quick sanity check of the load.
economy_df.head(n=5)

### 1. Data Preprocessing
- Drop the raw date/time and identifier columns ('flight_date', 'flight_code', 'departure_time', 'arrival_time') that are not used as features
- Encode categorical features
- Define Features (X) and Target (y)
- Split Data into Training and Testing sets

In [None]:
# List the dtype of every business-class column.
business_dtypes = business_df.dtypes
print(business_dtypes)

In [None]:
# List the dtype of every economy-class column.
economy_dtypes = economy_df.dtypes
print(economy_dtypes)

1.1. Drop columns

In [None]:
# Remove the raw date/time and flight-identifier columns from both tables.
columns_to_drop = ['flight_date', 'flight_code', 'departure_time', 'arrival_time']
business_df, economy_df = (
    frame.drop(columns=columns_to_drop) for frame in (business_df, economy_df)
)

1.2. Encode categorical features

In [None]:
# Integer-encode the time-of-day group columns of both tables.
# NOTE(review): LabelEncoder numbers the labels in sorted order, which is not
# necessarily the natural (chronological) order of the groups — confirm that
# the resulting codes are acceptable as an "ordinal" encoding.
label_encoder = LabelEncoder()

for time_group_col in ('departure_time_group', 'arrival_time_group'):
    for flights in (business_df, economy_df):
        # The single shared encoder is refit for every (table, column) pair.
        flights[time_group_col] = label_encoder.fit_transform(flights[time_group_col])

1.3. Define Features and Target

In [None]:
# business: model inputs (X) and the price target (y)
business_feature_columns = [
    'airline_name',
    'flight_duration',
    'stops',
    'departure_city',
    'arrival_city',
    'departure_time_group',
    'arrival_time_group',
]
X_business = business_df[business_feature_columns]
y_business = business_df['price']

In [None]:
# economy: model inputs (X) and the price target (y)
economy_feature_columns = [
    'airline_name',
    'flight_duration',
    'stops',
    'departure_city',
    'arrival_city',
    'departure_time_group',
    'arrival_time_group',
]
X_economy = economy_df[economy_feature_columns]
y_economy = economy_df['price']

1.4. Split Data into Training and Testing Sets

In [None]:
# business: hold out 30% of the rows for testing; the seed is fixed so the
# split is reproducible.
business_split = train_test_split(
    X_business,
    y_business,
    test_size=0.3,
    random_state=42,
)
X_business_train, X_business_test, y_business_train, y_business_test = business_split

In [None]:
# economy: hold out 30% of the rows for testing; the seed is fixed so the
# split is reproducible.
economy_split = train_test_split(
    X_economy,
    y_economy,
    test_size=0.3,
    random_state=42,
)
X_economy_train, X_economy_test, y_economy_train, y_economy_test = economy_split


## Bagging and Pasting for 'price'

### Business Class

In [None]:

# Integer-encode the string-valued (object dtype) feature columns of the
# business-class split. Each encoder is fitted on the training rows only.
categorical_columns = X_business_train.select_dtypes(include=['object']).columns

# Work on explicit copies: the split frames are derived from a slice of
# `business_df`, and assigning columns on them directly triggers pandas'
# SettingWithCopyWarning.
X_business_train = X_business_train.copy()
X_business_test = X_business_test.copy()

# One fitted encoder per column, kept so each mapping can be inspected or
# inverted later (a single shared encoder would only remember the last column).
label_encoders = {}

for col in categorical_columns:
    label_encoder = LabelEncoder()
    X_business_train[col] = label_encoder.fit_transform(X_business_train[col])
    # Map the test values through the training vocabulary. Seen categories get
    # the same code LabelEncoder assigned; a category that never occurs in the
    # training rows becomes -1 instead of making `transform` raise ValueError.
    X_business_test[col] = pd.Categorical(
        X_business_test[col], categories=label_encoder.classes_
    ).codes
    label_encoders[col] = label_encoder


In [None]:


# Fit and compare two decision-tree ensembles on the business-class split:
#   * bagging - every tree is trained on a bootstrap sample (with replacement)
#   * pasting - every tree is trained on a random subset drawn without replacement

# Base learner shared by both ensembles.
base_model = DecisionTreeRegressor(random_state=42)

# Pasting needs each tree to see a strict subset of the rows. With
# bootstrap=False and max_samples=1.0 every tree would be fitted on the same
# full training set, so the ensemble would add no sampling diversity.
# The fraction below is a tunable choice.
pasting_sample_fraction = 0.8
pasting_model = BaggingRegressor(
    base_model,
    n_estimators=100,
    bootstrap=False,
    max_samples=pasting_sample_fraction,
    random_state=42,
)

# Bagging keeps the defaults (bootstrap=True, max_samples=1.0).
bagging_model = BaggingRegressor(base_model, n_estimators=100, random_state=42)

# Train the bagging ensemble and predict on the held-out rows.
bagging_model.fit(X_business_train, y_business_train)
y_pred = bagging_model.predict(X_business_test)

# Train the pasting ensemble and predict on the held-out rows.
pasting_model.fit(X_business_train, y_business_train)
y_pred_pasting = pasting_model.predict(X_business_test)

# Report the same three metrics for both ensembles.
evaluations = (
    ("Bagging Regressor Evaluation for Business Class:", y_pred),
    ("\nSimulated Pasting Regressor Evaluation for Business Class:", y_pred_pasting),
)
for heading, predictions in evaluations:
    print(heading)
    print("Mean Absolute Error (MAE):", mean_absolute_error(y_business_test, predictions))
    print("Mean Squared Error (MSE):", mean_squared_error(y_business_test, predictions))
    print("R-squared (R2) Score:", r2_score(y_business_test, predictions))


In [None]:
# Scatter of actual vs. pasting-model predicted prices; the dashed red
# diagonal marks where a perfect prediction would fall.
price_lo, price_hi = min(y_business_test), max(y_business_test)

fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_business_test, y_pred_pasting, color='blue', alpha=0.6)
ax.plot([price_lo, price_hi], [price_lo, price_hi], color='red', linestyle='--')
ax.set_title("Actual vs Predicted Prices (Business Class)")
ax.set_xlabel("Actual Prices")
ax.set_ylabel("Predicted Prices")
ax.grid(True)
plt.show()

In [None]:
# Residual Plot: pasting-model residuals (actual - predicted) against the
# predicted price, with a LOWESS trend line and a zero reference line.
residuals = y_business_test - y_pred_pasting
plt.figure(figsize=(8, 6))
# `sns.residplot` fits its own regression of y on x and plots the residuals of
# *that* fit, so passing already-computed residuals as y would display
# residuals of residuals. `regplot` draws the given values as they are.
sns.regplot(
    x=y_pred_pasting,
    y=residuals,
    lowess=True,
    line_kws={'color': 'red'},
)
# Residuals of an unbiased model scatter around this line.
plt.axhline(0, color='black', linestyle='--', linewidth=1)
plt.title("Residuals Plot")
plt.xlabel("Predicted Prices")
plt.ylabel("Residuals (Actual - Predicted)")
plt.grid(True)
plt.show()

In [None]:
# Feature Importance Plot: impurity-based importances averaged over every tree
# of the pasting ensemble rather than read from the first tree only.
features = X_business.columns

importances = np.zeros(len(features))
for tree, feature_idx in zip(pasting_model.estimators_, pasting_model.estimators_features_):
    # `estimators_features_` holds the column indices each tree was fitted on,
    # in the order the tree saw them; use it to map the tree's importances back
    # to the original column positions instead of assuming the same order.
    np.add.at(importances, feature_idx, tree.feature_importances_)
importances /= len(pasting_model.estimators_)

plt.figure(figsize=(8, 6))
plt.barh(features, importances)
plt.title("Feature Importance from Decision Tree Base Learner (Business Class)")
plt.xlabel("Importance")
plt.ylabel("Features")
plt.grid(True)
plt.show()

In [None]:
# Histogram (with KDE overlay) of the business-class prediction errors stored
# in `residuals`.
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(residuals, kde=True, color='green', ax=ax)
ax.set_title("Prediction Error Distribution (Business Class)")
ax.set_xlabel("Residuals (Actual - Predicted)")
ax.set_ylabel("Frequency")
ax.grid(True)
plt.show()

### Economy Class

Outlier Detection using Z-Score

In [None]:
# Z-score outlier screening of the economy-class training features.
# `stats.zscore` needs numeric input, so only numeric columns are scored; any
# non-numeric columns (e.g. categorical features that are still unencoded at
# this point) are left out of the screening.
numeric_features = X_economy_train.select_dtypes(include=[np.number])
z_scores = np.abs(stats.zscore(numeric_features))

# After taking the absolute value, |z| > 3 flags values more than three
# standard deviations away from their column mean (in either direction).
outliers = (z_scores > 3)

# Collapse to one flag per row so a row with several extreme values is counted
# (and printed) once rather than once per offending column.
outlier_row_mask = np.asarray(outliers).any(axis=1)
outlier_indices = np.flatnonzero(outlier_row_mask)
outlier_rows = X_economy_train.iloc[outlier_indices]

print(f"Number of outliers: {len(outlier_rows)}")
print(outlier_rows)

# Optionally, remove outliers (features and target are filtered with the same mask)
X_economy_train_clean = X_economy_train[~outlier_row_mask]
y_economy_train_clean = y_economy_train[~outlier_row_mask]

print("Shape of data after removing outliers:", X_economy_train_clean.shape)


In [None]:


# Fit and compare bagging and pasting ensembles of decision trees on the
# economy-class data, then report MAE / MSE / R2 on the held-out rows.

# Split the data into training and testing sets
# NOTE(review): this re-split means the outlier-filtered
# `X_economy_train_clean` / `y_economy_train_clean` frames are not used for
# training — confirm whether that is intended.
X_economy_train, X_economy_test, y_economy_train, y_economy_test = train_test_split(
    X_economy, y_economy, test_size=0.3, random_state=42
)

# Integer-encode string-valued columns on copies, so the raw split frames stay
# untouched. Encoders are fitted on the training rows only.
X_economy_train_encoded = X_economy_train.copy()
X_economy_test_encoded = X_economy_test.copy()
for col in X_economy_train_encoded.select_dtypes(include=['object']).columns:
    encoder = LabelEncoder()
    X_economy_train_encoded[col] = encoder.fit_transform(X_economy_train_encoded[col])
    # Seen categories get the code LabelEncoder assigned; categories absent
    # from the training rows become -1 instead of raising.
    X_economy_test_encoded[col] = pd.Categorical(
        X_economy_test_encoded[col], categories=encoder.classes_
    ).codes

# Define the `*_scaled` inputs the models below are fitted on (they were never
# created before, so this cell raised NameError). The scaler is fitted on the
# training rows only. Tree splits do not depend on feature scale, so this step
# mainly keeps the variable names truthful.
scaler = StandardScaler()
X_economy_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_economy_train_encoded),
    columns=X_economy_train_encoded.columns,
    index=X_economy_train_encoded.index,
)
X_economy_test_scaled = pd.DataFrame(
    scaler.transform(X_economy_test_encoded),
    columns=X_economy_test_encoded.columns,
    index=X_economy_test_encoded.index,
)

# Initialize the base learner (DecisionTreeRegressor)
base_model = DecisionTreeRegressor(random_state=42)

# Pasting needs each tree to see a strict subset of the rows. With
# bootstrap=False and max_samples=1.0 every tree would be fitted on the same
# full training set. The fraction below is a tunable choice.
pasting_sample_fraction = 0.8
pasting_model = BaggingRegressor(
    base_model,
    n_estimators=100,
    bootstrap=False,
    max_samples=pasting_sample_fraction,
    random_state=42,
)

# Bagging keeps the defaults (bootstrap=True, max_samples=1.0).
bagging_model = BaggingRegressor(base_model, n_estimators=100, random_state=42)

# Train the Bagging Regressor and predict on the held-out rows
bagging_model.fit(X_economy_train_scaled, y_economy_train)
y_pred = bagging_model.predict(X_economy_test_scaled)

# Train the Pasting Regressor and predict on the held-out rows
pasting_model.fit(X_economy_train_scaled, y_economy_train)
y_pred_pasting = pasting_model.predict(X_economy_test_scaled)

# Report the same three metrics for both ensembles.
evaluations = (
    ("Bagging Regressor Evaluation for Economy Class:", y_pred),
    ("\nSimulated Pasting Regressor Evaluation for Economy Class:", y_pred_pasting),
)
for heading, predictions in evaluations:
    print(heading)
    print("Mean Absolute Error (MAE):", mean_absolute_error(y_economy_test, predictions))
    print("Mean Squared Error (MSE):", mean_squared_error(y_economy_test, predictions))
    print("R-squared (R2) Score:", r2_score(y_economy_test, predictions))


In [None]:
# # Actual vs Predicted Plot
# plt.figure(figsize=(8, 6))
# plt.scatter(y_economy_test, y_pred_pasting, color='blue', alpha=0.6)
# plt.plot([min(y_economy_test), max(y_economy_test)], [min(y_economy_test), max(y_economy_test)], color='red', linestyle='--')
# plt.title("Actual vs Predicted Prices (Economy Class)")
# plt.xlabel("Actual Prices")
# plt.ylabel("Predicted Prices")
# plt.grid(True)
# plt.show()

In [None]:
# # Residual Plot
# residuals = y_economy_test - y_pred_pasting
# plt.figure(figsize=(8, 6))
# sns.residplot(x=y_pred_pasting, y=residuals, lowess=True, line_kws={'color': 'red'})
# plt.title("Residuals Plot (Economy Class)")
# plt.xlabel("Predicted Prices")
# plt.ylabel("Residuals (Actual - Predicted)")
# plt.grid(True)
# plt.show()

In [None]:
# # Feature Importance Plot
# importances = pasting_model.estimators_[0].feature_importances_
# features = X_economy.columns

# plt.figure(figsize=(8, 6))
# plt.barh(features, importances)
# plt.title("Feature Importance from Decision Tree Base Learner (Economy Class)")
# plt.xlabel("Importance")
# plt.ylabel("Features")
# plt.grid(True)
# plt.show()

In [None]:
# # Prediction Error Distribution (Histogram of Residuals)
# plt.figure(figsize=(8, 6))
# sns.histplot(residuals, kde=True, color='green')
# plt.title("Prediction Error Distribution (Economy Class)")
# plt.xlabel("Residuals (Actual - Predicted)")
# plt.ylabel("Frequency")
# plt.grid(True)
# plt.show()