In [167]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import pickle
from sklearn.ensemble import GradientBoostingRegressor
import yaml
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import MinMaxScaler

In [335]:
# Load the config file
with open("../../config.yaml", "r") as file:
    config = yaml.safe_load(file)

In [337]:
# Get the path to the clean data
clean_data_path = config['output_data']['clean_data']

# Load the clean data
cleaned_df = pd.read_csv(clean_data_path)

# Inspect the data
print(cleaned_df.head())

  airline_name flight_code departure_city arrival_city  flight_duration  \
0     SpiceJet     SG-8709          Delhi       Mumbai              130   
1     SpiceJet     SG-8157          Delhi       Mumbai              140   
2     Air Asia      I5-764          Delhi       Mumbai              130   
3      Vistara      UK-995          Delhi       Mumbai              135   
4      Vistara      UK-963          Delhi       Mumbai              140   

   stops  price    class  days_left departure_time_group arrival_time_group  
0      0   5953  Economy          1              Evening              Night  
1      0   5953  Economy          1        Early Morning            Morning  
2      0   5956  Economy          1        Early Morning      Early Morning  
3      0   5955  Economy          1              Morning          Afternoon  
4      0   5955  Economy          1              Morning            Morning  


In [339]:
#drop flight_code column
cleaned_df = cleaned_df.drop(columns=['flight_code'])

In [341]:
cleaned_df

Unnamed: 0,airline_name,departure_city,arrival_city,flight_duration,stops,price,class,days_left,departure_time_group,arrival_time_group
0,SpiceJet,Delhi,Mumbai,130,0,5953,Economy,1,Evening,Night
1,SpiceJet,Delhi,Mumbai,140,0,5953,Economy,1,Early Morning,Morning
2,Air Asia,Delhi,Mumbai,130,0,5956,Economy,1,Early Morning,Early Morning
3,Vistara,Delhi,Mumbai,135,0,5955,Economy,1,Morning,Afternoon
4,Vistara,Delhi,Mumbai,140,0,5955,Economy,1,Morning,Morning
...,...,...,...,...,...,...,...,...,...,...
300148,Vistara,Chennai,Hyderabad,605,1,69265,Business,49,Morning,Evening
300149,Vistara,Chennai,Hyderabad,625,1,77105,Business,49,Afternoon,Night
300150,Vistara,Chennai,Hyderabad,830,1,79099,Business,49,Early Morning,Night
300151,Vistara,Chennai,Hyderabad,600,1,81585,Business,49,Early Morning,Evening


In [343]:
#define columns
categorical_columns = ['airline_name', 'departure_city', 'arrival_city', 'departure_time_group', 'arrival_time_group']
numerical_columns = ['flight_duration', 'days_left', 'stops']

In [345]:
# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# Scale the numerical columns
scaled_numerical = scaler.fit_transform(cleaned_df[numerical_columns])

# Replace the original numerical columns with the scaled values in the DataFrame
cleaned_df[numerical_columns] = scaled_numerical

In [347]:
cleaned_df

Unnamed: 0,airline_name,departure_city,arrival_city,flight_duration,stops,price,class,days_left,departure_time_group,arrival_time_group
0,SpiceJet,Delhi,Mumbai,0.027211,0.0,5953,Economy,0.0,Evening,Night
1,SpiceJet,Delhi,Mumbai,0.030612,0.0,5953,Economy,0.0,Early Morning,Morning
2,Air Asia,Delhi,Mumbai,0.027211,0.0,5956,Economy,0.0,Early Morning,Early Morning
3,Vistara,Delhi,Mumbai,0.028912,0.0,5955,Economy,0.0,Morning,Afternoon
4,Vistara,Delhi,Mumbai,0.030612,0.0,5955,Economy,0.0,Morning,Morning
...,...,...,...,...,...,...,...,...,...,...
300148,Vistara,Chennai,Hyderabad,0.188776,0.5,69265,Business,1.0,Morning,Evening
300149,Vistara,Chennai,Hyderabad,0.195578,0.5,77105,Business,1.0,Afternoon,Night
300150,Vistara,Chennai,Hyderabad,0.265306,0.5,79099,Business,1.0,Early Morning,Night
300151,Vistara,Chennai,Hyderabad,0.187075,0.5,81585,Business,1.0,Early Morning,Evening


In [353]:
#use OneHotEncoder to transform categorical columns to boolean
encoder = OneHotEncoder(sparse_output=False)
encoded_features = encoder.fit_transform(cleaned_df[categorical_columns])
encoded_feature_names = encoder.get_feature_names_out(categorical_columns)

In [355]:
# Create a new DataFrame with encoded features
encoded_df = pd.DataFrame(encoded_features, columns=encoded_feature_names)
encoded_df.isna().sum()

airline_name_Air Asia                 0
airline_name_Air India                0
airline_name_Go First                 0
airline_name_Indigo                   0
airline_name_SpiceJet                 0
airline_name_Vistara                  0
departure_city_Bangalore              0
departure_city_Chennai                0
departure_city_Delhi                  0
departure_city_Hyderabad              0
departure_city_Kolkata                0
departure_city_Mumbai                 0
arrival_city_Bangalore                0
arrival_city_Chennai                  0
arrival_city_Delhi                    0
arrival_city_Hyderabad                0
arrival_city_Kolkata                  0
arrival_city_Mumbai                   0
departure_time_group_Afternoon        0
departure_time_group_Early Morning    0
departure_time_group_Evening          0
departure_time_group_Late Night       0
departure_time_group_Morning          0
departure_time_group_Night            0
arrival_time_group_Afternoon          0


In [357]:
# Concatenate the 'price' and 'class' columns with the preprocessed DataFrame
preprocessed_df = pd.concat([cleaned_df[numerical_columns], encoded_df, cleaned_df[['price', 'class']]], axis=1)

# Display the updated DataFrame
preprocessed_df.head()

Unnamed: 0,flight_duration,days_left,stops,airline_name_Air Asia,airline_name_Air India,airline_name_Go First,airline_name_Indigo,airline_name_SpiceJet,airline_name_Vistara,departure_city_Bangalore,...,departure_time_group_Morning,departure_time_group_Night,arrival_time_group_Afternoon,arrival_time_group_Early Morning,arrival_time_group_Evening,arrival_time_group_Late Night,arrival_time_group_Morning,arrival_time_group_Night,price,class
0,0.027211,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,5953,Economy
1,0.030612,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,5953,Economy
2,0.027211,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,5956,Economy
3,0.028912,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,5955,Economy
4,0.030612,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,5955,Economy


In [359]:
preprocessed_df.isna().sum()

flight_duration                       0
days_left                             0
stops                                 0
airline_name_Air Asia                 0
airline_name_Air India                0
airline_name_Go First                 0
airline_name_Indigo                   0
airline_name_SpiceJet                 0
airline_name_Vistara                  0
departure_city_Bangalore              0
departure_city_Chennai                0
departure_city_Delhi                  0
departure_city_Hyderabad              0
departure_city_Kolkata                0
departure_city_Mumbai                 0
arrival_city_Bangalore                0
arrival_city_Chennai                  0
arrival_city_Delhi                    0
arrival_city_Hyderabad                0
arrival_city_Kolkata                  0
arrival_city_Mumbai                   0
departure_time_group_Afternoon        0
departure_time_group_Early Morning    0
departure_time_group_Evening          0
departure_time_group_Late Night       0


In [361]:
# Filter dataset for Economy class
economy_class_df = preprocessed_df[preprocessed_df['class'] == 'Economy']
business_class_df = preprocessed_df[preprocessed_df['class'] == 'Business']

# Drop columns that are not useful for prediction
economy_class_df = economy_class_df.drop(columns=['class'])
business_class_df = business_class_df.drop(columns=['class'])

In [363]:
economy_class_df

Index(['flight_duration', 'days_left', 'stops', 'airline_name_Air Asia',
       'airline_name_Air India', 'airline_name_Go First',
       'airline_name_Indigo', 'airline_name_SpiceJet', 'airline_name_Vistara',
       'departure_city_Bangalore', 'departure_city_Chennai',
       'departure_city_Delhi', 'departure_city_Hyderabad',
       'departure_city_Kolkata', 'departure_city_Mumbai',
       'arrival_city_Bangalore', 'arrival_city_Chennai', 'arrival_city_Delhi',
       'arrival_city_Hyderabad', 'arrival_city_Kolkata', 'arrival_city_Mumbai',
       'departure_time_group_Afternoon', 'departure_time_group_Early Morning',
       'departure_time_group_Evening', 'departure_time_group_Late Night',
       'departure_time_group_Morning', 'departure_time_group_Night',
       'arrival_time_group_Afternoon', 'arrival_time_group_Early Morning',
       'arrival_time_group_Evening', 'arrival_time_group_Late Night',
       'arrival_time_group_Morning', 'arrival_time_group_Night', 'price'],
      dtype

In [365]:
business_class_df

Unnamed: 0,flight_duration,days_left,stops,airline_name_Air Asia,airline_name_Air India,airline_name_Go First,airline_name_Indigo,airline_name_SpiceJet,airline_name_Vistara,departure_city_Bangalore,...,departure_time_group_Late Night,departure_time_group_Morning,departure_time_group_Night,arrival_time_group_Afternoon,arrival_time_group_Early Morning,arrival_time_group_Evening,arrival_time_group_Late Night,arrival_time_group_Morning,arrival_time_group_Night,price
206666,0.023810,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,25612
206667,0.028912,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,25612
206668,0.488095,0.0,0.5,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,42220
206669,0.523810,0.0,0.5,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,44450
206670,0.119048,0.0,0.5,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,46690
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
300148,0.188776,1.0,0.5,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,69265
300149,0.195578,1.0,0.5,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,77105
300150,0.265306,1.0,0.5,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,79099
300151,0.187075,1.0,0.5,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,81585


In [371]:
# Function to perform Linear Regression
def perform_linear_regression(df, target_column):
    # Split data into features (X) and target (y)
    X = df.drop(columns=[target_column])
    y = df[target_column]
    
    # Split into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    
    # Initialize and train the Linear Regression model
    model = LinearRegression()
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Evaluate the model
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    # Return the results and the trained model
    return {
        "MAE": mae,
        "MSE": mse,
        "R2 Score": r2
    }, model

In [377]:
# Perform Linear Regression for Economy class
economy_results, economy_model = perform_linear_regression(economy_class_df, target_column='price')

# Perform Linear Regression for Business class
business_results, business_model = perform_linear_regression(business_class_df, target_column='price')

# Print formatted results
print("Linear Regression Results:")
print(f"Economy Class:\n"
      f"Mean Absolute Error (MAE): {economy_results['MAE']:.2f}\n"
      f"Mean Squared Error (MSE): {economy_results['MSE']:.2f}\n"
      f"R2 Score: {economy_results['R2 Score']:.4f}\n")

print(f"Business Class:\n"
      f"Mean Absolute Error (MAE): {business_results['MAE']:.2f}\n"
      f"Mean Squared Error (MSE): {business_results['MSE']:.2f}\n"
      f"R2 Score: {business_results['R2 Score']:.4f}")

Linear Regression Results:
Economy Class:
Mean Absolute Error (MAE): 1892.64
Mean Squared Error (MSE): 6881336.02
R2 Score: 0.5081

Business Class:
Mean Absolute Error (MAE): 6936.64
Mean Squared Error (MSE): 81600017.84
R2 Score: 0.5135


In [379]:
# Function to perform Gradient Boosting
def perform_gradient_boosting(df, target_column):
    # Split data into features (X) and target (y)
    X = df.drop(columns=[target_column])
    y = df[target_column]
    
    # Split into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    
    # Initialize and train the Gradient Boosting model
    model = GradientBoostingRegressor(random_state=0)
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Evaluate the model
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    # Return the results and the trained model
    return {
        "MAE": mae,
        "MSE": mse,
        "R2 Score": r2
    }, model

In [387]:
# Perform Gradient Boosting for Economy class
gradient_results_economy, gradient_model_economy = perform_gradient_boosting(economy_class_df, target_column='price')

# Perform Gradient Boosting for Business class
gradient_results_business, gradient_model_business = perform_gradient_boosting(business_class_df, target_column='price')

# Print formatted results
print("Gradient Boosting Results:")
print(f"Economy Class:\n"
      f"Mean Absolute Error (MAE): {gradient_results_economy['MAE']:.2f}\n"
      f"Mean Squared Error (MSE): {gradient_results_economy['MSE']:.2f}\n"
      f"R2 Score: {gradient_results_economy['R2 Score']:.4f}\n")

print(f"Business Class:\n"
      f"Mean Absolute Error (MAE): {gradient_results_business['MAE']:.2f}\n"
      f"Mean Squared Error (MSE): {gradient_results_business['MSE']:.2f}\n"
      f"R2 Score: {gradient_results_business['R2 Score']:.4f}")

Gradient Boosting Results:
Economy Class:
Mean Absolute Error (MAE): 1315.51
Mean Squared Error (MSE): 4077258.47
R2 Score: 0.7085

Business Class:
Mean Absolute Error (MAE): 5818.16
Mean Squared Error (MSE): 59399758.97
R2 Score: 0.6459


In [383]:
# Function to perform K-Nearest Neighbors (KNN)
def perform_knn(df, target_column):
    # Split data into features (X) and target (y)
    X = df.drop(columns=[target_column])
    y = df[target_column]
    
    # Split into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    
    # Initialize and train the KNN model
    model = KNeighborsRegressor(n_neighbors=5)
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Evaluate the model
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    # Return the results and the trained model
    return {
        "MAE": mae,
        "MSE": mse,
        "R2 Score": r2
    }, model

In [389]:
# Perform KNN for Economy class
knn_results_economy, knn_model_economy = perform_knn(economy_class_df, target_column='price')

# Perform KNN for Business class
knn_results_business, knn_model_business = perform_knn(business_class_df, target_column='price')

# Print formatted results
print("KNN Results:")
print(f"Economy Class:\n"
      f"Mean Absolute Error (MAE): {knn_results_economy['MAE']:.2f}\n"
      f"Mean Squared Error (MSE): {knn_results_economy['MSE']:.2f}\n"
      f"R2 Score: {knn_results_economy['R2 Score']:.4f}\n")

print(f"Business Class:\n"
      f"Mean Absolute Error (MAE): {knn_results_economy['MAE']:.2f}\n"
      f"Mean Squared Error (MSE): {knn_results_economy['MSE']:.2f}\n"
      f"R2 Score: {knn_results_economy['R2 Score']:.4f}")

KNN Results:
Economy Class:
Mean Absolute Error (MAE): 902.21
Mean Squared Error (MSE): 2924854.43
R2 Score: 0.7909

Business Class:
Mean Absolute Error (MAE): 902.21
Mean Squared Error (MSE): 2924854.43
R2 Score: 0.7909
