# Flight Fare Prediction Model

In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Reading Training Dataset

In [2]:
# Load the raw training data from the Excel workbook next to the notebook.
df_train = pd.read_excel(io='Data_Train.xlsx')

In [3]:
# Preview the first five rows (head() defaults to n=5).
df_train.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302


## Handling Missing Values

In [4]:
# Count missing values per column.
df_train.isna().sum()

Airline            0
Date_of_Journey    0
Source             0
Destination        0
Route              1
Dep_Time           0
Arrival_Time       0
Duration           0
Total_Stops        1
Additional_Info    0
Price              0
dtype: int64

In [5]:
# Show the rows whose Total_Stops value is missing.
df_train.loc[df_train['Total_Stops'].isna()]

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
9039,Air India,6/05/2019,Delhi,Cochin,,09:45,09:25 07 May,23h 40m,,No info,7480


In [6]:
# Remove, in place, every row that has at least one missing value.
df_train.dropna(axis=0, how='any', inplace=True)

In [7]:
# Re-count missing values per column after the drop.
df_train.isna().sum()

Airline            0
Date_of_Journey    0
Source             0
Destination        0
Route              0
Dep_Time           0
Arrival_Time       0
Duration           0
Total_Stops        0
Additional_Info    0
Price              0
dtype: int64

## Data Preprocessing

### Converting Date-Related Features to Datetime

In [8]:
# Work on an independent (deep) copy so df_train itself is left untouched.
df_train_copy = df_train.copy(deep=True)

In [9]:
## datatype conversion: object -> datetime
def change_to_datetime(col, dayfirst=True):
    """Convert one column of ``df_train_copy`` to datetime, in place.

    Parameters
    ----------
    col : str
        Name of the column in the module-level ``df_train_copy`` frame.
    dayfirst : bool, default True
        Forwarded to ``pd.to_datetime``. The journey dates in the data
        preview are written day/month/year (e.g. ``24/03/2019``), so an
        ambiguous value such as ``1/05/2019`` has to be read day-first
        instead of depending on how the first row happens to parse.

    Returns
    -------
    None
        The converted values overwrite ``df_train_copy[col]``.
    """
    df_train_copy[col] = pd.to_datetime(df_train_copy[col], dayfirst=dayfirst)

In [10]:
import warnings
from warnings import filterwarnings
filterwarnings('ignore')

In [11]:
## converting every date/time column through the helper
date_like_columns = ('Dep_Time', 'Arrival_Time', 'Date_of_Journey')
for feature in date_like_columns:
    change_to_datetime(feature)

In [12]:
# Display each column's dtype to check the result of the datetime conversion.
df_train_copy.dtypes

Airline                    object
Date_of_Journey    datetime64[ns]
Source                     object
Destination                object
Route                      object
Dep_Time           datetime64[ns]
Arrival_Time       datetime64[ns]
Duration                   object
Total_Stops                object
Additional_Info            object
Price                       int64
dtype: object

In [13]:
## deriving day, month, and year columns from the journey date
journey_date = df_train_copy['Date_of_Journey'].dt
df_train_copy['Day_of_Journey'] = journey_date.day
df_train_copy['Month_of_Journey'] = journey_date.month
df_train_copy['Year_of_Journey'] = journey_date.year

In [14]:
# Preview the first five rows with the new journey-date columns.
df_train_copy.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Day_of_Journey,Month_of_Journey,Year_of_Journey
0,IndiGo,2019-03-24,Banglore,New Delhi,BLR → DEL,2025-12-15 22:20:00,2025-03-22 01:10:00,2h 50m,non-stop,No info,3897,24,3,2019
1,Air India,2019-05-01,Kolkata,Banglore,CCU → IXR → BBI → BLR,2025-12-15 05:50:00,2025-12-15 13:15:00,7h 25m,2 stops,No info,7662,1,5,2019
2,Jet Airways,2019-06-09,Delhi,Cochin,DEL → LKO → BOM → COK,2025-12-15 09:25:00,2025-06-10 04:25:00,19h,2 stops,No info,13882,9,6,2019
3,IndiGo,2019-05-12,Kolkata,Banglore,CCU → NAG → BLR,2025-12-15 18:05:00,2025-12-15 23:30:00,5h 25m,1 stop,No info,6218,12,5,2019
4,IndiGo,2019-03-01,Banglore,New Delhi,BLR → NAG → DEL,2025-12-15 16:50:00,2025-12-15 21:35:00,4h 45m,1 stop,No info,13302,1,3,2019


### Cleaning Dep_Time and Arrival_Time

In [15]:
## splitting a datetime column into hour and minute columns
def extract_hrs_mins(df, col):
    """Add ``<col>_Hours`` and ``<col>_Minutes`` columns to ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column; it is modified in place.
    col : str
        Name of a column that supports the ``.dt`` accessor.

    Returns
    -------
    None
    """
    clock = df[col].dt
    df[f'{col}_Hours'] = clock.hour
    df[f'{col}_Minutes'] = clock.minute

In [16]:
# Add hour/minute columns for departure first, then arrival.
for time_col in ("Dep_Time", "Arrival_Time"):
    extract_hrs_mins(df_train_copy, time_col)

In [17]:
# Preview the first five rows with the new hour/minute columns.
df_train_copy.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Day_of_Journey,Month_of_Journey,Year_of_Journey,Dep_Time_Hours,Dep_Time_Minutes,Arrival_Time_Hours,Arrival_Time_Minutes
0,IndiGo,2019-03-24,Banglore,New Delhi,BLR → DEL,2025-12-15 22:20:00,2025-03-22 01:10:00,2h 50m,non-stop,No info,3897,24,3,2019,22,20,1,10
1,Air India,2019-05-01,Kolkata,Banglore,CCU → IXR → BBI → BLR,2025-12-15 05:50:00,2025-12-15 13:15:00,7h 25m,2 stops,No info,7662,1,5,2019,5,50,13,15
2,Jet Airways,2019-06-09,Delhi,Cochin,DEL → LKO → BOM → COK,2025-12-15 09:25:00,2025-06-10 04:25:00,19h,2 stops,No info,13882,9,6,2019,9,25,4,25
3,IndiGo,2019-05-12,Kolkata,Banglore,CCU → NAG → BLR,2025-12-15 18:05:00,2025-12-15 23:30:00,5h 25m,1 stop,No info,6218,12,5,2019,18,5,23,30
4,IndiGo,2019-03-01,Banglore,New Delhi,BLR → NAG → DEL,2025-12-15 16:50:00,2025-12-15 21:35:00,4h 45m,1 stop,No info,13302,1,3,2019,16,50,21,35


In [18]:
## removing the original Dep_Time and Arrival_Time columns
df_train_copy.drop(columns=['Dep_Time', 'Arrival_Time'], inplace=True)

In [19]:
# Print the remaining column labels, then display the frame's (rows, columns) shape.
print(df_train_copy.columns)
df_train_copy.shape

Index(['Airline', 'Date_of_Journey', 'Source', 'Destination', 'Route',
       'Duration', 'Total_Stops', 'Additional_Info', 'Price', 'Day_of_Journey',
       'Month_of_Journey', 'Year_of_Journey', 'Dep_Time_Hours',
       'Dep_Time_Minutes', 'Arrival_Time_Hours', 'Arrival_Time_Minutes'],
      dtype='object')


(10682, 16)

## Exploratory Data Analysis and Plots

### Analyzing when most flights take off according to the time of day

In [20]:
## mapping a departure hour to its time-of-day label
def dep_time(hour):
    """Return the time-of-day label for a departure hour.

    Each label covers the window ``lower < hour <= upper``. An hour that
    falls in none of the windows is labelled "Late Night".
    """
    windows = (
        (4, 8, "Early Morning"),
        (8, 12, "Morning"),
        (12, 16, "Noon/Afternoon"),
        (16, 20, "Evening"),
        (20, 24, "Night"),
    )
    for lower, upper, label in windows:
        if lower < hour <= upper:
            return label
    return "Late Night"

In [21]:
# Count flights per time-of-day label.
df_train_copy['Dep_Time_Hours'].map(dep_time).value_counts()

Dep_Time_Hours
Early Morning     2880
Evening           2357
Morning           2209
Noon/Afternoon    1731
Night             1040
Late Night         465
Name: count, dtype: int64

In [31]:
import plotly.express as px

In [32]:
## tabulating flights per time-of-day label for the plotly bar chart
vc_df = (
    df_train_copy['Dep_Time_Hours']
    .apply(dep_time)
    .value_counts()
    .rename_axis('Dep_Time')
    .reset_index(name='Count')
)

# Title styling applied on top of the title text given to px.bar.
centered_title = dict(
    text='Departure Time Distribution',
    x=0.5,  # center
    xanchor='center',
    font=dict(size=20, family='Arial', color='black'),
)

fig = px.bar(
    vc_df,
    x='Dep_Time',
    y='Count',
    title='Departure Time Distribution',
    color_discrete_sequence=['#2ca02c'],
)
fig.update_xaxes(title_text='Departure Time Category')
fig.update_yaxes(title_text='Number of Flights Take Off')
fig.update_layout(title=centered_title)
fig.show()