Ensemble Methods for `clean_business_df` and `clean_economy_df`
- Bagging and Pasting
- Random Forest

In [1]:
import pandas as pd
from scipy import stats
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor

In [2]:
# Load the pre-cleaned business- and economy-class flight tables in one pass;
# the generator yields the frames in the same order as the target names.
business_df, economy_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../data/clean/clean_business_df.csv',
        '../../data/clean/clean_economy_df.csv',
    )
)

In [3]:
# Preview the first rows of the business-class table to sanity-check the load and the column layout.
business_df.head()

Unnamed: 0,flight_date,airline_name,flight_code,departure_time,departure_city,arrival_time,arrival_city,flight_duration,stops,price,departure_time_group,arrival_time_group
0,2022-02-11,Air India,AI-868,18:00,Delhi,20:00,Mumbai,120,0,25612,Evening,Evening
1,2022-02-11,Air India,AI-624,19:00,Delhi,21:15,Mumbai,135,0,25612,Evening,Night
2,2022-02-11,Air India,AI-531,20:00,Delhi,20:45,Mumbai,1485,1,42220,Evening,Evening
3,2022-02-11,Air India,AI-839,21:25,Delhi,23:55,Mumbai,1590,1,44450,Night,Night
4,2022-02-11,Air India,AI-544,17:15,Delhi,23:55,Mumbai,400,1,46690,Afternoon,Night


In [4]:
# Preview the first rows of the economy-class table to sanity-check the load and the column layout.
economy_df.head()

Unnamed: 0,flight_date,airline_name,flight_code,departure_time,departure_city,arrival_time,arrival_city,flight_duration,stops,price,departure_time_group,arrival_time_group
0,2022-02-11,SpiceJet,SG-8709,18:55,Delhi,21:05,Mumbai,130,0,5953,Evening,Night
1,2022-02-11,SpiceJet,SG-8157,06:20,Delhi,08:40,Mumbai,140,0,5953,Morning,Morning
2,2022-02-11,Air Asia,I5-764,04:25,Delhi,06:35,Mumbai,130,0,5956,Early Morning,Morning
3,2022-02-11,Vistara,UK-995,10:20,Delhi,12:35,Mumbai,135,0,5955,Morning,Afternoon
4,2022-02-11,Vistara,UK-963,08:50,Delhi,11:10,Mumbai,140,0,5955,Morning,Morning


### 1. Data Preprocessing
- Drop columns that are not used as features (`flight_date`, `flight_code`, `departure_time`, `arrival_time`)
- Encode categorical features
- Define Features (X) and Target (y)
- Split Data into Training and Testing sets

In [5]:
# List the column dtypes to see which business-class columns are still strings (object) and need encoding.
print(business_df.dtypes)

flight_date             object
airline_name            object
flight_code             object
departure_time          object
departure_city          object
arrival_time            object
arrival_city            object
flight_duration          int64
stops                    int64
price                    int64
departure_time_group    object
arrival_time_group      object
dtype: object


In [6]:
# List the column dtypes to see which economy-class columns are still strings (object) and need encoding.
print(economy_df.dtypes)

flight_date             object
airline_name            object
flight_code             object
departure_time          object
departure_city          object
arrival_time            object
arrival_city            object
flight_duration          int64
stops                    int64
price                    int64
departure_time_group    object
arrival_time_group      object
dtype: object


1.1. Drop columns

In [7]:
# Columns that are not part of the model's feature list. Time-of-day
# information is still available through the *_time_group columns.
columns_to_drop = ['flight_date', 'flight_code', 'departure_time', 'arrival_time']

# errors='ignore' keeps this cell idempotent: because the frames are rebound
# under the same names, a second run would otherwise raise KeyError for
# columns that were already removed.
business_df = business_df.drop(columns=columns_to_drop, errors='ignore')
economy_df = economy_df.drop(columns=columns_to_drop, errors='ignore')

1.2. Encode categorical features

In [8]:
# Every string-valued column must be numeric before z-scoring, scaling or
# fitting tree models, so encode all of them -- not only the time groups.
categorical_columns = [
    'airline_name', 'departure_city', 'arrival_city',
    'departure_time_group', 'arrival_time_group',
]

# One fitted encoder per column, kept so codes can be mapped back to labels
# later with label_encoders[column].inverse_transform(...).
label_encoders = {}

for column in categorical_columns:
    encoder = LabelEncoder()
    # Fit on the union of both tables so a given label gets the same integer
    # code in business_df and economy_df.
    # NOTE: LabelEncoder assigns codes in sorted (alphabetical) label order;
    # the codes are identifiers, not a chronological/ordinal ranking.
    encoder.fit(pd.concat([business_df[column], economy_df[column]], ignore_index=True))
    business_df[column] = encoder.transform(business_df[column])
    economy_df[column] = encoder.transform(economy_df[column])
    label_encoders[column] = encoder

1.3. Define Features and Target

In [None]:
# business
# Features come from business_df, the frame the preprocessing cells above
# actually build. 'departure_hour' / 'arrival_hour' are not listed because no
# cell creates them (the raw time columns are dropped in 1.1); time of day is
# represented by the *_time_group columns.
business_feature_columns = ['airline_name', 'flight_duration', 'stops',
        'departure_city', 'arrival_city', 'departure_time_group', 'arrival_time_group']
X_business = business_df[business_feature_columns].copy()

# Safety net: any feature that is still non-numeric is converted to integer
# category codes, because the estimators and scipy/sklearn preprocessing used
# below require numeric input. Already-numeric columns are left untouched.
for column in X_business.columns:
    if not pd.api.types.is_numeric_dtype(X_business[column]):
        X_business[column] = X_business[column].astype('category').cat.codes

y_business = business_df['price']

In [None]:
# economy
# Features come from economy_df, the frame the preprocessing cells above
# actually build. 'departure_hour' / 'arrival_hour' are not listed because no
# cell creates them (the raw time columns are dropped in 1.1); time of day is
# represented by the *_time_group columns.
economy_feature_columns = ['airline_name', 'flight_duration', 'stops',
        'departure_city', 'arrival_city', 'departure_time_group', 'arrival_time_group']
X_economy = economy_df[economy_feature_columns].copy()

# Safety net: any feature that is still non-numeric is converted to integer
# category codes, because the estimators and scipy/sklearn preprocessing used
# below require numeric input. Already-numeric columns are left untouched.
for column in X_economy.columns:
    if not pd.api.types.is_numeric_dtype(X_economy[column]):
        X_economy[column] = X_economy[column].astype('category').cat.codes

y_economy = economy_df['price']

1.4. Split Data into Training and Testing Sets

In [None]:
# business: 70/30 train/test partition with a fixed seed for reproducibility
business_split = train_test_split(
    X_business,
    y_business,
    test_size=0.3,
    random_state=42,
)
X_business_train, X_business_test, y_business_train, y_business_test = business_split

In [None]:
# economy: 70/30 train/test partition with a fixed seed for reproducibility
economy_split = train_test_split(
    X_economy,
    y_economy,
    test_size=0.3,
    random_state=42,
)
X_economy_train, X_economy_test, y_economy_train, y_economy_test = economy_split


## Bagging and Pasting for 'price'

Business Class

In [None]:
# # init  base learner (DecisionTreeRegressor)
# base_model = DecisionTreeRegressor(random_state=42)

# # init Bagging Regressor with bootstrap=False to simulate pasting
# pasting_model = BaggingRegressor(base_model, n_estimators=100, bootstrap=False, max_samples=1.0, random_state=42)

# # init Bagging Regressor with base model
# bagging_model = BaggingRegressor(base_model, n_estimators=100, random_state=42)

# # train the model
# bagging_model.fit(X_business_train, y_business_train)

# # make predictions
# y_pred = bagging_model.predict(X_business_test)

# # train the model
# pasting_model.fit(X_business_train, y_business_train)

# # make predictions
# y_pred_pasting = pasting_model.predict(X_business_test)

# # evaluate
# print("Bagging Regressor Evaluation for Business Class:")
# print("Mean Absolute Error (MAE):", mean_absolute_error(y_business_test, y_pred))
# print("Mean Squared Error (MSE):", mean_squared_error(y_business_test, y_pred))
# print("R-squared (R2) Score:", r2_score(y_business_test, y_pred))

# print("\nSimulated Pasting Regressor Evaluation for Business Class:")
# print("Mean Absolute Error (MAE):", mean_absolute_error(y_business_test, y_pred_pasting))
# print("Mean Squared Error (MSE):", mean_squared_error(y_business_test, y_pred_pasting))
# print("R-squared (R2) Score:", r2_score(y_business_test, y_pred_pasting))

In [None]:
# # Actual vs Predicted Plot
# plt.figure(figsize=(8, 6))
# plt.scatter(y_business_test, y_pred_pasting, color='blue', alpha=0.6)
# plt.plot([min(y_business_test), max(y_business_test)], [min(y_business_test), max(y_business_test)], color='red', linestyle='--')
# plt.title("Actual vs Predicted Prices (Business Class)")
# plt.xlabel("Actual Prices")
# plt.ylabel("Predicted Prices")
# plt.grid(True)
# plt.show()

In [None]:
# # Residual Plot
# residuals = y_business_test - y_pred_pasting
# plt.figure(figsize=(8, 6))
# sns.residplot(x=y_pred_pasting, y=residuals, lowess=True, line_kws={'color': 'red'})
# plt.title("Residuals Plot")
# plt.xlabel("Predicted Prices")
# plt.ylabel("Residuals (Actual - Predicted)")
# plt.grid(True)
# plt.show()

In [None]:
# # Feature Importance Plot
# importances = pasting_model.estimators_[0].feature_importances_
# features = X_business.columns

# plt.figure(figsize=(8, 6))
# plt.barh(features, importances)
# plt.title("Feature Importance from Decision Tree Base Learner (Business Class)")
# plt.xlabel("Importance")
# plt.ylabel("Features")
# plt.grid(True)
# plt.show()

In [None]:
# # 4. Prediction Error Distribution (Histogram of Residuals)
# plt.figure(figsize=(8, 6))
# sns.histplot(residuals, kde=True, color='green')
# plt.title("Prediction Error Distribution (Business Class)")
# plt.xlabel("Residuals (Actual - Predicted)")
# plt.ylabel("Frequency")
# plt.grid(True)
# plt.show()

Economy Class

Outlier Detection using Z-Score

In [None]:
# Flag economy training rows whose numeric features lie far from the mean.
Z_THRESHOLD = 3  # a row is an outlier when any |z-score| exceeds this value

# Z-scores are only meaningful for genuinely numeric features; the encoded
# categorical columns carry arbitrary integer codes, so they are excluded.
zscore_columns = ['flight_duration', 'stops']

# Absolute z-score of every selected feature value
z_scores = np.abs(stats.zscore(X_economy_train[zscore_columns]))
outliers = (z_scores > Z_THRESHOLD)

# Collapse to ONE boolean per row. Taking np.where(outliers)[0] instead would
# list a row once per outlying feature and therefore overstate the count.
outlier_row_mask = np.asarray(outliers).any(axis=1)
outlier_indices = np.flatnonzero(outlier_row_mask)  # positional row indices
outlier_rows = X_economy_train.iloc[outlier_indices]

print(f"Number of outliers: {len(outlier_rows)}")
print(outlier_rows.head())

# Outlier-free copies of the training data. The mask is positional, so the
# same mask keeps X and y aligned.
# NOTE(review): the cells below still fit on the unfiltered training data;
# switch them to the *_clean variables if outlier removal should take effect.
X_economy_train_clean = X_economy_train[~outlier_row_mask]
y_economy_train_clean = y_economy_train[~outlier_row_mask]

print("Shape of data after removing outliers:", X_economy_train_clean.shape)


Feature Scaling

In [None]:
from sklearn.preprocessing import RobustScaler, StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to both splits so no test information leaks into it.
scaler = StandardScaler()  # or RobustScaler()
scaler.fit(X_economy_train)
X_economy_train_scaled = scaler.transform(X_economy_train)
X_economy_test_scaled = scaler.transform(X_economy_test)


Hyperparameter Tuning

In [None]:
# from sklearn.model_selection import GridSearchCV
# from sklearn.ensemble import BaggingRegressor
# from sklearn.tree import DecisionTreeRegressor

# # Initialize the base model (DecisionTreeRegressor)
# base_model = DecisionTreeRegressor(random_state=42)

# # Initialize the Bagging Regressor model
# bagging_model = BaggingRegressor(base_model, random_state=42)

# # Define a parameter grid for tuning
# # NOTE: scikit-learn >= 1.2 renamed BaggingRegressor's `base_estimator` parameter to
# # `estimator`; on those versions use the 'estimator__...' prefix for the keys below.
# param_grid = {
#     'base_estimator__max_depth': [3, 5, 10, None],  # Decision tree depth
#     'n_estimators': [50, 100, 200],  # Number of trees in the ensemble
#     'max_samples': [0.5, 0.75, 1.0],  # Fraction of samples used for each model
#     'base_estimator__min_samples_split': [2, 5, 10]  # Min samples for splitting nodes in decision tree
# }

# # Perform GridSearchCV
# grid_search = GridSearchCV(bagging_model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# # Fit the GridSearchCV to the training data
# grid_search.fit(X_economy_train_scaled, y_economy_train)

# # Print the best hyperparameters found
# print("Best hyperparameters found: ", grid_search.best_params_)


In [None]:
# Train and evaluate bagging vs. pasting ensembles of decision trees on the
# economy-class data.
#
# Inputs come from earlier cells: the scaled feature arrays from the
# "Feature Scaling" cell and the targets from the train/test split in 1.4.
# The split is intentionally NOT redone here, so the targets are guaranteed to
# belong to the same partition the scaled arrays were built from.

# Fraction of the training rows drawn (without replacement) for each pasted
# tree. It must be < 1.0: with bootstrap=False and max_samples=1.0 every tree
# would be fitted on the identical full training set, making the "ensemble"
# behave like a single decision tree.
PASTING_MAX_SAMPLES = 0.8

# Base learner shared by both ensembles
base_model = DecisionTreeRegressor(random_state=42)

# Pasting: each tree sees a random subset drawn WITHOUT replacement
pasting_model = BaggingRegressor(base_model, n_estimators=100, bootstrap=False,
                                 max_samples=PASTING_MAX_SAMPLES, random_state=42)

# Bagging: each tree sees a bootstrap sample drawn WITH replacement
bagging_model = BaggingRegressor(base_model, n_estimators=100, random_state=42)

# Fit both ensembles and predict on the held-out test set
bagging_model.fit(X_economy_train_scaled, y_economy_train)
y_pred = bagging_model.predict(X_economy_test_scaled)

pasting_model.fit(X_economy_train_scaled, y_economy_train)
y_pred_pasting = pasting_model.predict(X_economy_test_scaled)

# Report the same three metrics for each model
for heading, predictions in (
    ("Bagging Regressor Evaluation for Economy Class:", y_pred),
    ("\nSimulated Pasting Regressor Evaluation for Economy Class:", y_pred_pasting),
):
    print(heading)
    print("Mean Absolute Error (MAE):", mean_absolute_error(y_economy_test, predictions))
    print("Mean Squared Error (MSE):", mean_squared_error(y_economy_test, predictions))
    print("R-squared (R2) Score:", r2_score(y_economy_test, predictions))


In [None]:
# # Actual vs Predicted Plot
# plt.figure(figsize=(8, 6))
# plt.scatter(y_economy_test, y_pred_pasting, color='blue', alpha=0.6)
# plt.plot([min(y_economy_test), max(y_economy_test)], [min(y_economy_test), max(y_economy_test)], color='red', linestyle='--')
# plt.title("Actual vs Predicted Prices (Economy Class)")
# plt.xlabel("Actual Prices")
# plt.ylabel("Predicted Prices")
# plt.grid(True)
# plt.show()

In [None]:
# # Residual Plot
# residuals = y_economy_test - y_pred_pasting
# plt.figure(figsize=(8, 6))
# sns.residplot(x=y_pred_pasting, y=residuals, lowess=True, line_kws={'color': 'red'})
# plt.title("Residuals Plot (Economy Class)")
# plt.xlabel("Predicted Prices")
# plt.ylabel("Residuals (Actual - Predicted)")
# plt.grid(True)
# plt.show()

In [None]:
# # Feature Importance Plot
# importances = pasting_model.estimators_[0].feature_importances_
# features = X_economy.columns

# plt.figure(figsize=(8, 6))
# plt.barh(features, importances)
# plt.title("Feature Importance from Decision Tree Base Learner (Economy Class)")
# plt.xlabel("Importance")
# plt.ylabel("Features")
# plt.grid(True)
# plt.show()

In [None]:
# # Prediction Error Distribution (Histogram of Residuals)
# plt.figure(figsize=(8, 6))
# sns.histplot(residuals, kde=True, color='green')
# plt.title("Prediction Error Distribution (Economy Class)")
# plt.xlabel("Residuals (Actual - Predicted)")
# plt.ylabel("Frequency")
# plt.grid(True)
# plt.show()