Ensemble Methods for `clean_business_df` and `clean_economy_df`
- Bagging and Pasting
- Random Forest

In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score

In [19]:
# Read the two cleaned flight tables (business and economy) from the clean-data folder.
business_df = pd.read_csv(
    '../../data/clean/clean_business_df.csv'
)
economy_df = pd.read_csv(
    '../../data/clean/clean_economy_df.csv'
)

In [20]:
# Preview the first rows of the business table as a sanity check on the load.
business_df.head()

Unnamed: 0,flight_date,airline_name,flight_code,departure_time,departure_city,arrival_time,arrival_city,flight_duration,stops,price,departure_time_group,arrival_time_group
0,2022-02-11,Air India,AI-868,18:00,Delhi,20:00,Mumbai,120,0,25612,Evening,Evening
1,2022-02-11,Air India,AI-624,19:00,Delhi,21:15,Mumbai,135,0,25612,Evening,Night
2,2022-02-11,Air India,AI-531,20:00,Delhi,20:45,Mumbai,1485,1,42220,Evening,Evening
3,2022-02-11,Air India,AI-839,21:25,Delhi,23:55,Mumbai,1590,1,44450,Night,Night
4,2022-02-11,Air India,AI-544,17:15,Delhi,23:55,Mumbai,400,1,46690,Afternoon,Night


In [21]:
# Preview the first rows of the economy table as a sanity check on the load.
economy_df.head()

Unnamed: 0,flight_date,airline_name,flight_code,departure_time,departure_city,arrival_time,arrival_city,flight_duration,stops,price,departure_time_group,arrival_time_group
0,2022-02-11,SpiceJet,SG-8709,18:55,Delhi,21:05,Mumbai,130,0,5953,Evening,Night
1,2022-02-11,SpiceJet,SG-8157,06:20,Delhi,08:40,Mumbai,140,0,5953,Morning,Morning
2,2022-02-11,Air Asia,I5-764,04:25,Delhi,06:35,Mumbai,130,0,5956,Early Morning,Morning
3,2022-02-11,Vistara,UK-995,10:20,Delhi,12:35,Mumbai,135,0,5955,Morning,Afternoon
4,2022-02-11,Vistara,UK-963,08:50,Delhi,11:10,Mumbai,140,0,5955,Morning,Morning


### 1. Data Preprocessing
- Convert 'flight_date' to datetime and extract hour features from the departure and arrival times
- Encode categorical features
- Define Features (X) and Target (y)
- Split Data into Training and Testing sets

In [22]:
# List each column's dtype to see which columns are text (object) and still need converting or encoding.
print(business_df.dtypes)

flight_date             object
airline_name            object
flight_code             object
departure_time          object
departure_city          object
arrival_time            object
arrival_city            object
flight_duration          int64
stops                    int64
price                    int64
departure_time_group    object
arrival_time_group      object
dtype: object


In [23]:
# List each column's dtype to see which columns are text (object) and still need converting or encoding.
print(economy_df.dtypes)

flight_date             object
airline_name            object
flight_code             object
departure_time          object
departure_city          object
arrival_time            object
arrival_city            object
flight_duration          int64
stops                    int64
price                    int64
departure_time_group    object
arrival_time_group      object
dtype: object


1.1 - Convert 'flight_date' to datetime and extract departure/arrival hour features

In [24]:
# business
# Parse the flight date strings into datetime values.
business_df['flight_date'] = pd.to_datetime(business_df['flight_date'])
# Take the hour from the 'HH:MM' time strings with the vectorized .str accessor
# rather than a per-row Python lambda; the result is an int64 column, as before.
business_df['departure_hour'] = business_df['departure_time'].str.split(':').str[0].astype('int64')
business_df['arrival_hour'] = business_df['arrival_time'].str.split(':').str[0].astype('int64')

In [25]:
# economy
# Parse the flight date strings into datetime values.
economy_df['flight_date'] = pd.to_datetime(economy_df['flight_date'])
# Take the hour from the 'HH:MM' time strings with the vectorized .str accessor
# rather than a per-row Python lambda; the result is an int64 column, as before.
economy_df['departure_hour'] = economy_df['departure_time'].str.split(':').str[0].astype('int64')
economy_df['arrival_hour'] = economy_df['arrival_time'].str.split(':').str[0].astype('int64')

1.2 - Encode categorical features

In [26]:
def encode_categorical_features(df, encoding_method='label'):
    """
    Encode the categorical (object-dtype) columns of a DataFrame.

    The input frame is never modified: both encoding methods return a new
    DataFrame, so re-running a cell that calls this function always starts
    from the same un-encoded data.

    Parameters:
    - df: pandas DataFrame containing the dataset
    - encoding_method: 'label' (default) replaces every categorical column with
      the integer codes produced by a LabelEncoder fitted on that column;
      'onehot' expands every categorical column into dummy columns with
      pd.get_dummies (drop_first=True, so the first level of each column is dropped)

    Returns:
    - df_encoded: new pandas DataFrame with categorical features encoded

    Raises:
    - ValueError: if encoding_method is neither 'label' nor 'onehot'
    """
    # Validate up front so a bad argument fails before any work is done.
    if encoding_method not in ('label', 'onehot'):
        raise ValueError("encoding_method must be 'label' or 'onehot'")

    # Categorical columns are the object-dtype ones, so numeric columns such as
    # the 'price' target are never selected. 'flight_date' is skipped when it is
    # still stored as strings: it is a date, not a category.
    categorical_columns = [
        col for col in df.select_dtypes(include=['object']).columns
        if col != 'flight_date'
    ]

    if encoding_method == 'onehot':
        # pd.get_dummies builds and returns a new frame.
        return pd.get_dummies(df, columns=categorical_columns, drop_first=True)

    # Label encoding: work on a copy so the caller's DataFrame keeps its
    # original string values.
    df_encoded = df.copy()
    for col in categorical_columns:
        # Each column is fitted independently.
        df_encoded[col] = LabelEncoder().fit_transform(df_encoded[col])
    return df_encoded

In [27]:
# encode business_df
# Label-encode the categorical (object-dtype) columns of the business table,
# then preview the encoded frame.
business_encoded = encode_categorical_features(business_df, encoding_method='label')
business_encoded.head()

Unnamed: 0,flight_date,airline_name,flight_code,departure_time,departure_city,arrival_time,arrival_city,flight_duration,stops,price,departure_time_group,arrival_time_group,departure_hour,arrival_hour
0,2022-02-11,0,160,121,2,131,5,120,0,25612,2,2,18,20
1,2022-02-11,0,83,131,2,146,5,135,0,25612,2,5,19,21
2,2022-02-11,0,50,140,2,140,5,1485,1,42220,2,2,20,20
3,2022-02-11,0,154,154,2,175,5,1590,1,44450,5,5,21,23
4,2022-02-11,0,58,113,2,175,5,400,1,46690,0,5,17,23


In [28]:
# encode economy_df
# Label-encode the categorical (object-dtype) columns of the economy table,
# then preview the encoded frame.
economy_encoded = encode_categorical_features(economy_df, encoding_method='label')
economy_encoded.head()

Unnamed: 0,flight_date,airline_name,flight_code,departure_time,departure_city,arrival_time,arrival_city,flight_duration,stops,price,departure_time_group,arrival_time_group,departure_hour,arrival_hour
0,2022-02-11,4,1415,190,2,230,5,130,0,5953,2,5,18,21
1,2022-02-11,4,1394,40,2,81,5,140,0,5953,4,4,6,8
2,2022-02-11,0,1216,17,2,56,5,130,0,5956,1,4,4,6
3,2022-02-11,7,1566,88,2,128,5,135,0,5955,4,0,10,12
4,2022-02-11,7,1556,70,2,111,5,140,0,5955,4,4,8,11


1.3 - Define Features and Target

In [29]:
# business
# Predictor columns taken from the encoded business table.
business_feature_columns = [
    'airline_name',
    'flight_duration',
    'stops',
    'departure_hour',
    'arrival_hour',
    'departure_city',
    'arrival_city',
    'departure_time_group',
    'arrival_time_group',
]
X_business = business_encoded[business_feature_columns]
# Target: the 'price' column.
y_business = business_encoded['price']

In [30]:
# economy
# Predictor columns taken from the encoded economy table.
economy_feature_columns = [
    'airline_name',
    'flight_duration',
    'stops',
    'departure_hour',
    'arrival_hour',
    'departure_city',
    'arrival_city',
    'departure_time_group',
    'arrival_time_group',
]
X_economy = economy_encoded[economy_feature_columns]
# Target: the 'price' column.
y_economy = economy_encoded['price']

1.4 - Split Data into Training and Testing Sets

In [31]:
# business
# Hold out 30% of the rows for testing; the fixed random_state keeps the split reproducible.
(
    X_business_train,
    X_business_test,
    y_business_train,
    y_business_test,
) = train_test_split(
    X_business,
    y_business,
    test_size=0.3,
    random_state=42,
)

In [32]:
# economy
# Hold out 30% of the rows for testing; the fixed random_state keeps the split reproducible.
(
    X_economy_train,
    X_economy_test,
    y_economy_train,
    y_economy_test,
) = train_test_split(
    X_economy,
    y_economy,
    test_size=0.3,
    random_state=42,
)
