In [1]:
#Importing the needed libraries and functions.
import os

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor,AdaBoostRegressor, GradientBoostingRegressor

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [2]:
#Importing the Dataset
# The default below is a machine-specific absolute path. To run the notebook elsewhere,
# set the FLIGHT_DATASET_CSV environment variable to the location of the cleaned CSV
# instead of editing this cell. (`url` holds a local file path, not a web URL.)
url = os.environ.get(
    'FLIGHT_DATASET_CSV',
    '/Users/amirrezakamkar/Desktop/Works_in_progress/Ironhack/7_7th_week/Project/machine-learning-project/data/clean/clean_dataset_df.csv',
)
df = pd.read_csv(url)

In [3]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,airline_name,flight_code,departure_city,arrival_city,flight_duration,stops,price,class,days_left,departure_time_group,arrival_time_group
0,SpiceJet,SG-8709,Delhi,Mumbai,130,0,5953,Economy,1,Evening,Night
1,SpiceJet,SG-8157,Delhi,Mumbai,140,0,5953,Economy,1,Early Morning,Morning
2,Air Asia,I5-764,Delhi,Mumbai,130,0,5956,Economy,1,Early Morning,Early Morning
3,Vistara,UK-995,Delhi,Mumbai,135,0,5955,Economy,1,Morning,Afternoon
4,Vistara,UK-963,Delhi,Mumbai,140,0,5955,Economy,1,Morning,Morning


In [4]:
#Removing the irrelevant columns.
# 'flight_code' is a per-flight identifier. 'Unnamed: 0' appears to be the old row index
# that was saved into the CSV; if it is left in place it is passed to the models as a
# feature, because the feature frames are built by dropping only 'price'.
# errors='ignore' keeps this cell safe to re-run once the columns are already gone.
df = df.drop(columns=['flight_code', 'Unnamed: 0'], errors='ignore')

In [6]:
#One-hot encoding the categorical columns: each category becomes its own indicator column.
categorical_cols = [
    'airline_name',
    'departure_city',
    'arrival_city',
    'departure_time_group',
    'arrival_time_group',
]
df2 = pd.get_dummies(df, columns=categorical_cols)

In [7]:
#Splitting the dataset into two sub-datasets: one for economy class and one for business class.
# Each subset gets a fresh 0..n-1 index so later positional work is not tied to df2's rows.
is_economy = df2['class'].eq('Economy')
is_business = df2['class'].eq('Business')

df_economy = df2.loc[is_economy].reset_index(drop=True)
df_business = df2.loc[is_business].reset_index(drop=True)

In [15]:
# Preview the first five rows of the economy subset.
df_economy.head(n=5)

Unnamed: 0,flight_duration,stops,price,class,days_left,airline_name_Air Asia,airline_name_Air India,airline_name_Go First,airline_name_Indigo,airline_name_SpiceJet,...,departure_time_group_Evening,departure_time_group_Late Night,departure_time_group_Morning,departure_time_group_Night,arrival_time_group_Afternoon,arrival_time_group_Early Morning,arrival_time_group_Evening,arrival_time_group_Late Night,arrival_time_group_Morning,arrival_time_group_Night
0,130,0,5953,Economy,1,False,False,False,False,True,...,True,False,False,False,False,False,False,False,False,True
1,140,0,5953,Economy,1,False,False,False,False,True,...,False,False,False,False,False,False,False,False,True,False
2,130,0,5956,Economy,1,True,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
3,135,0,5955,Economy,1,False,False,False,False,False,...,False,False,True,False,True,False,False,False,False,False
4,140,0,5955,Economy,1,False,False,False,False,False,...,False,False,True,False,False,False,False,False,True,False


In [34]:
#Casting the boolean indicator columns to integers (0/1) and removing 'class'.
# 'class' holds a single value in this subset after the economy/business split.
bool_cols = df_economy.select_dtypes(include=['bool']).columns
df_economy = (
    df_economy
    .astype({col: int for col in bool_cols})
    .drop(columns='class')
)

In [38]:
#Separating the target ('price') from the feature columns (everything else).
target_e = df_economy['price']
features_e = df_economy.drop('price', axis=1)

In [40]:
#Splitting the economy data into an 80% training set and a 20% test set.
# random_state is fixed so the same rows land in each set on every run.
X_train, X_test, y_train, y_test = train_test_split(
    features_e,
    target_e,
    test_size=0.2,
    random_state=0,
)

In [42]:
#Fitting a MinMaxScaler on the training features.
# Every column of X_train is scaled, and the min/max are learned from the training
# split only, so no test-set statistics are used.
normalizer = MinMaxScaler(feature_range=(0, 1))
normalizer.fit(X_train)

In [44]:
# Apply the scaling learned on the training split to both the training and test features.
X_train_norm, X_test_norm = (
    normalizer.transform(split) for split in (X_train, X_test)
)

In [48]:
#Converting the numpy arrays into Pandas dataframes.
# Reuse the original row index as well as the column names, so the normalised frames
# stay label-aligned with y_train / y_test (a default RangeIndex would not match them).
X_train_norm = pd.DataFrame(X_train_norm, columns=X_train.columns, index=X_train.index)
X_test_norm = pd.DataFrame(X_test_norm, columns=X_test.columns, index=X_test.index)

In [50]:
#Using AdaBoost model for prediction.
# 100 boosting rounds over decision trees limited to depth 20.
# random_state is fixed (same seed as the train/test split) so that repeated runs
# produce the same model and therefore the same reported metrics.
ada_reg = AdaBoostRegressor(DecisionTreeRegressor(max_depth=20),
                            n_estimators=100,
                            random_state=0)

In [52]:
# Train the boosted ensemble on the normalised economy training data.
ada_reg.fit(X=X_train_norm, y=y_train)

In [53]:
#Evaluating the performance on the held-out test set.
pred = ada_reg.predict(X_test_norm)

# scikit-learn metrics take their arguments as (y_true, y_pred).
print("MAE", mean_absolute_error(y_test, pred))
# The `squared=False` argument was deprecated in scikit-learn 1.4 and removed in 1.6,
# so the root is taken explicitly; this works on every version.
print("RMSE", np.sqrt(mean_squared_error(y_test, pred)))
# Same value as ada_reg.score(X_test_norm, y_test), but reuses `pred` instead of
# running a second prediction pass over the whole ensemble.
print("R2 score", r2_score(y_test, pred))

MAE 893.4615048728755
RMSE 1528.1726770951493




R2 score 0.8330645728894952


In [None]:
# The same process is now repeated for the business class:

In [56]:
#Converting the business frame's boolean columns into integer (0,1).
# The columns must be selected from df_business itself. df_economy's indicator columns
# were already cast to int earlier, so selecting from it finds no boolean columns on a
# fresh run and the cast below would silently do nothing.
bool_cols = df_business.select_dtypes(include=['bool']).columns  # Select boolean columns
df_business[bool_cols] = df_business[bool_cols].astype(int)
# 'class' holds a single value in this subset after the economy/business split.
df_business = df_business.drop(columns= 'class')

In [62]:
# Separate the target ('price') from the feature columns for the business subset.
target_b = df_business['price']
features_b = df_business.drop('price', axis=1)

In [64]:
# 80/20 train/test split of the business data with a fixed seed.
# NOTE: this rebinds X_train / X_test / y_train / y_test, replacing the economy splits.
X_train, X_test, y_train, y_test = train_test_split(
    features_b,
    target_b,
    test_size=0.2,
    random_state=0,
)

In [66]:
# Fit a new MinMaxScaler on the business training features only.
# This replaces the scaler that was fitted on the economy data.
normalizer = MinMaxScaler(feature_range=(0, 1))
normalizer.fit(X_train)

In [68]:
# Apply the scaling learned on the business training split to both splits.
X_train_norm, X_test_norm = (
    normalizer.transform(split) for split in (X_train, X_test)
)

In [70]:
# Wrap the scaled numpy arrays back into DataFrames.
# Reuse the original row index as well as the column names, so the normalised frames
# stay label-aligned with y_train / y_test (a default RangeIndex would not match them).
X_train_norm = pd.DataFrame(X_train_norm, columns=X_train.columns, index=X_train.index)
X_test_norm = pd.DataFrame(X_test_norm, columns=X_test.columns, index=X_test.index)

In [72]:
# AdaBoost for the business class: 100 boosting rounds over depth-20 decision trees.
# random_state is fixed (same seed as the train/test split) so that repeated runs
# produce the same model and therefore the same reported metrics.
ada_reg = AdaBoostRegressor(DecisionTreeRegressor(max_depth=20),
                            n_estimators=100,
                            random_state=0)

In [76]:
# Train the boosted ensemble on the normalised business training data.
ada_reg.fit(X=X_train_norm, y=y_train)

In [77]:
# Evaluate the business-class model on the held-out test set.
pred = ada_reg.predict(X_test_norm)

# scikit-learn metrics take their arguments as (y_true, y_pred).
print("MAE", mean_absolute_error(y_test, pred))
# The `squared=False` argument was deprecated in scikit-learn 1.4 and removed in 1.6,
# so the root is taken explicitly; this works on every version.
print("RMSE", np.sqrt(mean_squared_error(y_test, pred)))
# Same value as ada_reg.score(X_test_norm, y_test), but reuses `pred` instead of
# running a second prediction pass over the whole ensemble.
print("R2 score", r2_score(y_test, pred))

MAE 2815.9753815543936
RMSE 4963.25828011397




R2 score 0.853140059343147
