## Flight Price Prediction (EDA + Feature Engineering)

In [1]:
# Importing basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Train data
train_df = pd.read_excel('Data_Train.xlsx')

In [3]:
train_df.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302


In [4]:
# Test data
test_df = pd.read_excel('Test_set.xlsx')

In [5]:
test_df.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info
0,Jet Airways,6/06/2019,Delhi,Cochin,DEL → BOM → COK,17:30,04:25 07 Jun,10h 55m,1 stop,No info
1,IndiGo,12/05/2019,Kolkata,Banglore,CCU → MAA → BLR,06:20,10:20,4h,1 stop,No info
2,Jet Airways,21/05/2019,Delhi,Cochin,DEL → BOM → COK,19:15,19:00 22 May,23h 45m,1 stop,In-flight meal not included
3,Multiple carriers,21/05/2019,Delhi,Cochin,DEL → BOM → COK,08:00,21:00,13h,1 stop,No info
4,Air Asia,24/06/2019,Banglore,Delhi,BLR → DEL,23:55,02:45 25 Jun,2h 50m,non-stop,No info


In [6]:
# Combine the data. Append method is deprecated.
final_df = pd.concat([train_df, test_df])
final_df.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897.0
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662.0
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882.0
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218.0
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302.0


In [7]:
final_df.tail() # There some NaN values in Price col due to the Test Dataset.

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
2666,Air India,6/06/2019,Kolkata,Banglore,CCU → DEL → BLR,20:30,20:25 07 Jun,23h 55m,1 stop,No info,
2667,IndiGo,27/03/2019,Kolkata,Banglore,CCU → BLR,14:20,16:55,2h 35m,non-stop,No info,
2668,Jet Airways,6/03/2019,Delhi,Cochin,DEL → BOM → COK,21:50,04:25 07 Mar,6h 35m,1 stop,No info,
2669,Air India,6/03/2019,Delhi,Cochin,DEL → BOM → COK,04:00,19:15,15h 15m,1 stop,No info,
2670,Multiple carriers,15/06/2019,Delhi,Cochin,DEL → BOM → COK,04:55,19:15,14h 20m,1 stop,No info,


In [8]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13354 entries, 0 to 2670
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Airline          13354 non-null  object 
 1   Date_of_Journey  13354 non-null  object 
 2   Source           13354 non-null  object 
 3   Destination      13354 non-null  object 
 4   Route            13353 non-null  object 
 5   Dep_Time         13354 non-null  object 
 6   Arrival_Time     13354 non-null  object 
 7   Duration         13354 non-null  object 
 8   Total_Stops      13353 non-null  object 
 9   Additional_Info  13354 non-null  object 
 10  Price            10683 non-null  float64
dtypes: float64(1), object(10)
memory usage: 1.2+ MB


In [9]:
# For retrieve all the Dates in the "Date_of_Journey" col.
final_df['Date_of_Journey'].str.split('/').str[0]

0       24
1        1
2        9
3       12
4       01
        ..
2666     6
2667    27
2668     6
2669     6
2670    15
Name: Date_of_Journey, Length: 13354, dtype: object

In [10]:
## Feature Engineering Process:
# Date_of_Journey
# For retrieve the all the dates, months and year in respective variables
final_df['Date']=final_df['Date_of_Journey'].str.split('/').str[0]
final_df['Month']=final_df['Date_of_Journey'].str.split('/').str[1]
final_df['Year']=final_df['Date_of_Journey'].str.split('/').str[2]

In [11]:
# We got the specific Date, Month & Year cols. in the last of the data.
final_df.head(2)

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Date,Month,Year
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897.0,24,3,2019
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662.0,1,5,2019


In [12]:
# Alternative method:
# df["Date"]=df["Date_of_Journey"].apply(lambda x:x.split("/")[0])
# df["Month"]=df["Date_of_Journey"].apply(lambda x:x.split("/")[1])
# df["Year"]=df["Date_of_Journey"].apply(lambda x:x.split("/")[2])

In [13]:
# Typecasting Date, Month & Year to Int.
final_df['Date']=final_df['Date'].astype(int)
final_df['Month']=final_df['Month'].astype(int)
final_df['Year']=final_df['Year'].astype(int)

In [14]:
final_df.info() # Date, Month & Year are type of int32.

<class 'pandas.core.frame.DataFrame'>
Index: 13354 entries, 0 to 2670
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Airline          13354 non-null  object 
 1   Date_of_Journey  13354 non-null  object 
 2   Source           13354 non-null  object 
 3   Destination      13354 non-null  object 
 4   Route            13353 non-null  object 
 5   Dep_Time         13354 non-null  object 
 6   Arrival_Time     13354 non-null  object 
 7   Duration         13354 non-null  object 
 8   Total_Stops      13353 non-null  object 
 9   Additional_Info  13354 non-null  object 
 10  Price            10683 non-null  float64
 11  Date             13354 non-null  int32  
 12  Month            13354 non-null  int32  
 13  Year             13354 non-null  int32  
dtypes: float64(1), int32(3), object(10)
memory usage: 1.4+ MB


In [15]:
# Drop the 'Date_of_Journey' col.
final_df.drop('Date_of_Journey', axis=1, inplace=True)

In [16]:
final_df.head(1)

Unnamed: 0,Airline,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Date,Month,Year
0,IndiGo,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897.0,24,3,2019


In [17]:
# Arrival Time 
final_df['Arrival_Time'].str.split(' ').str[0]

0       01:10
1       13:15
2       04:25
3       23:30
4       21:35
        ...  
2666    20:25
2667    16:55
2668    04:25
2669    19:15
2670    19:15
Name: Arrival_Time, Length: 13354, dtype: object

In [18]:
# Alternative 
final_df['Arrival_Time'] = final_df['Arrival_Time'].apply(lambda x : x.split(' ')[0])

In [19]:
# Split the Arrival_Time into its hour & min
final_df['Arrival_hour'] = final_df['Arrival_Time'].str.split(':').str[0]
final_df['Arrival_min'] = final_df['Arrival_Time'].str.split(':').str[1]

In [20]:
# Typecasting to int
final_df['Arrival_hour']=final_df['Arrival_hour'].astype(int)
final_df['Arrival_min']=final_df['Arrival_min'].astype(int)

In [21]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13354 entries, 0 to 2670
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Airline          13354 non-null  object 
 1   Source           13354 non-null  object 
 2   Destination      13354 non-null  object 
 3   Route            13353 non-null  object 
 4   Dep_Time         13354 non-null  object 
 5   Arrival_Time     13354 non-null  object 
 6   Duration         13354 non-null  object 
 7   Total_Stops      13353 non-null  object 
 8   Additional_Info  13354 non-null  object 
 9   Price            10683 non-null  float64
 10  Date             13354 non-null  int32  
 11  Month            13354 non-null  int32  
 12  Year             13354 non-null  int32  
 13  Arrival_hour     13354 non-null  int32  
 14  Arrival_min      13354 non-null  int32  
dtypes: float64(1), int32(5), object(9)
memory usage: 1.4+ MB


In [22]:
final_df.head(1)

Unnamed: 0,Airline,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Date,Month,Year,Arrival_hour,Arrival_min
0,IndiGo,Banglore,New Delhi,BLR → DEL,22:20,01:10,2h 50m,non-stop,No info,3897.0,24,3,2019,1,10


In [23]:
# Drop Arrival_Time col.
final_df.drop('Arrival_Time', axis=1, inplace=True)

In [24]:
final_df.head(1)

Unnamed: 0,Airline,Source,Destination,Route,Dep_Time,Duration,Total_Stops,Additional_Info,Price,Date,Month,Year,Arrival_hour,Arrival_min
0,IndiGo,Banglore,New Delhi,BLR → DEL,22:20,2h 50m,non-stop,No info,3897.0,24,3,2019,1,10


In [25]:
# Departure Time
# Split the Dep_Time into its hour & min
final_df['Dept_hour'] = final_df['Dep_Time'].str.split(':').str[0]
final_df['Dept_min'] = final_df['Dep_Time'].str.split(':').str[1]

In [26]:
# Typecasting to int
final_df['Dept_hour']=final_df['Dept_hour'].astype(int)
final_df['Dept_min']=final_df['Dept_min'].astype(int)
# Drop Dep_Time
final_df.drop('Dep_Time', axis=1, inplace=True)

In [27]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13354 entries, 0 to 2670
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Airline          13354 non-null  object 
 1   Source           13354 non-null  object 
 2   Destination      13354 non-null  object 
 3   Route            13353 non-null  object 
 4   Duration         13354 non-null  object 
 5   Total_Stops      13353 non-null  object 
 6   Additional_Info  13354 non-null  object 
 7   Price            10683 non-null  float64
 8   Date             13354 non-null  int32  
 9   Month            13354 non-null  int32  
 10  Year             13354 non-null  int32  
 11  Arrival_hour     13354 non-null  int32  
 12  Arrival_min      13354 non-null  int32  
 13  Dept_hour        13354 non-null  int32  
 14  Dept_min         13354 non-null  int32  
dtypes: float64(1), int32(7), object(7)
memory usage: 1.3+ MB


In [28]:
# Unique values in the 'Total_Stops' col
final_df['Total_Stops'].unique()

array(['non-stop', '2 stops', '1 stop', '3 stops', nan, '4 stops'],
      dtype=object)

In [29]:
# Only one NaN value in Total_Stops.
final_df['Total_Stops'].isnull().sum() 

1

In [30]:
# That one specific NaN cell
final_df[final_df['Total_Stops'].isnull()]

Unnamed: 0,Airline,Source,Destination,Route,Duration,Total_Stops,Additional_Info,Price,Date,Month,Year,Arrival_hour,Arrival_min,Dept_hour,Dept_min
9039,Air India,Delhi,Cochin,,23h 40m,,No info,7480.0,6,5,2019,9,25,9,45


In [31]:
# Split the Total_Stops values to their respective integers.
final_df['Total_Stops']=final_df['Total_Stops'].map({'non-stop':0, '1 stop':1, '2 stops':2, '3 stops':3, '4 stops':4, 'nan':1})

In [32]:
final_df.head(1)

Unnamed: 0,Airline,Source,Destination,Route,Duration,Total_Stops,Additional_Info,Price,Date,Month,Year,Arrival_hour,Arrival_min,Dept_hour,Dept_min
0,IndiGo,Banglore,New Delhi,BLR → DEL,2h 50m,0.0,No info,3897.0,24,3,2019,1,10,22,20


In [33]:
# Null values in the dataset
final_df.isnull().sum()

Airline               0
Source                0
Destination           0
Route                 1
Duration              0
Total_Stops           1
Additional_Info       0
Price              2671
Date                  0
Month                 0
Year                  0
Arrival_hour          0
Arrival_min           0
Dept_hour             0
Dept_min              0
dtype: int64

In [34]:
# Drop 'Route' Col.
final_df.drop('Route', axis=1, inplace=True)

In [35]:
final_df.head(1)

Unnamed: 0,Airline,Source,Destination,Duration,Total_Stops,Additional_Info,Price,Date,Month,Year,Arrival_hour,Arrival_min,Dept_hour,Dept_min
0,IndiGo,Banglore,New Delhi,2h 50m,0.0,No info,3897.0,24,3,2019,1,10,22,20


In [36]:
# Additional Info
final_df['Additional_Info'].unique()

array(['No info', 'In-flight meal not included',
       'No check-in baggage included', '1 Short layover', 'No Info',
       '1 Long layover', 'Change airports', 'Business class',
       'Red-eye flight', '2 Long layover'], dtype=object)

In [37]:
# Duration
final_df[final_df['Duration']=='5m'] # Two records of '5m' duration time.

Unnamed: 0,Airline,Source,Destination,Duration,Total_Stops,Additional_Info,Price,Date,Month,Year,Arrival_hour,Arrival_min,Dept_hour,Dept_min
6474,Air India,Mumbai,Hyderabad,5m,2.0,No info,17327.0,6,3,2019,16,55,16,50
2660,Air India,Mumbai,Hyderabad,5m,2.0,No info,,12,3,2019,16,55,16,50


In [38]:
# Drop the above two records
final_df.drop(6474, axis=0, inplace=True)
final_df.drop(2660, axis=0, inplace=True)

In [39]:
# Split the Duration value to hour only without the suffix 'h'
final_df['duration_hour']=(final_df['Duration'].str.split(' ').str[0].str.split('h').str[0]).astype(int)

In [40]:
final_df.head()

Unnamed: 0,Airline,Source,Destination,Duration,Total_Stops,Additional_Info,Price,Date,Month,Year,Arrival_hour,Arrival_min,Dept_hour,Dept_min,duration_hour
0,IndiGo,Banglore,New Delhi,2h 50m,0.0,No info,3897.0,24,3,2019,1,10,22,20,2
1,Air India,Kolkata,Banglore,7h 25m,2.0,No info,7662.0,1,5,2019,13,15,5,50,7
2,Jet Airways,Delhi,Cochin,19h,2.0,No info,13882.0,9,6,2019,4,25,9,25,19
3,IndiGo,Kolkata,Banglore,5h 25m,1.0,No info,6218.0,12,5,2019,23,30,18,5,5
4,IndiGo,Banglore,New Delhi,4h 45m,1.0,No info,13302.0,1,3,2019,21,35,16,50,4


In [41]:
# Drop duration col.
final_df.drop('Duration', axis=1, inplace=True)

In [42]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13351 entries, 0 to 2670
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Airline          13351 non-null  object 
 1   Source           13351 non-null  object 
 2   Destination      13351 non-null  object 
 3   Total_Stops      13350 non-null  float64
 4   Additional_Info  13351 non-null  object 
 5   Price            10681 non-null  float64
 6   Date             13351 non-null  int32  
 7   Month            13351 non-null  int32  
 8   Year             13351 non-null  int32  
 9   Arrival_hour     13351 non-null  int32  
 10  Arrival_min      13351 non-null  int32  
 11  Dept_hour        13351 non-null  int32  
 12  Dept_min         13351 non-null  int32  
 13  duration_hour    13351 non-null  int32  
dtypes: float64(2), int32(8), object(4)
memory usage: 1.1+ MB


### Categorical Features
A categorical feature in data refers to a variable that can take on one of a limited, fixed number of possible values, which typically represent categories or groups. These features are non-numeric and often used to classify or group data points into distinct categories. Examples of categorical features include:

- **Gender**: Male, Female, Other
- **Color**: Red, Blue, Green, etc.
- **Day of the Week**: Monday, Tuesday, etc.
- **Type of Car**: Sedan, SUV, Truck, etc.

Categorical features can be further divided into:

1. **Nominal features**: These have categories without any intrinsic ordering. For example, the color of a car (Red, Blue, Green) doesn't have a natural order.
2. **Ordinal features**: These have categories with a meaningful order but without a fixed distance between them. For example, a satisfaction rating (Low, Medium, High) where "Medium" is between "Low" and "High".

In data processing and machine learning, categorical features often need to be encoded into a numerical format using techniques like one-hot encoding, label encoding, or ordinal encoding before they can be used in algorithms that require numerical input.

In [43]:
# Categorical Features
# Airlines
final_df['Airline'].unique()

array(['IndiGo', 'Air India', 'Jet Airways', 'SpiceJet',
       'Multiple carriers', 'GoAir', 'Vistara', 'Air Asia',
       'Vistara Premium economy', 'Jet Airways Business',
       'Multiple carriers Premium economy', 'Trujet'], dtype=object)

### Label Encoding
Label encoding is a technique used to convert categorical data into numerical format, making it easier for machine learning algorithms to process. This method assigns a unique integer to each category in a categorical feature.

For example, consider a categorical feature "Color" with the categories "Red," "Blue," and "Green." Label encoding would convert these categories into:

- Red: 0
- Blue: 1
- Green: 2

The resulting numerical values can be used as input for machine learning models. However, it's important to note that label encoding can sometimes introduce unintended ordinal relationships. For example, the algorithm might interpret "Green" (2) as being greater than "Blue" (1), even if the categories themselves don't have an inherent order.

In cases where the categorical data does have an ordinal relationship (like "Low," "Medium," "High"), label encoding can be appropriate. However, for purely nominal data without an inherent order, other encoding methods like one-hot encoding might be more suitable to avoid misleading the algorithm.

In [44]:
# Lablel Encoding
from sklearn.preprocessing import LabelEncoder
labelencoder=LabelEncoder()

In [45]:
final_df['Airline']=labelencoder.fit_transform(final_df['Airline'])
final_df['Source']=labelencoder.fit_transform(final_df['Source'])
final_df['Destination']=labelencoder.fit_transform(final_df['Destination']) 
final_df['Additional_Info']=labelencoder.fit_transform(final_df['Additional_Info']) 

In [46]:
final_df.shape

(13351, 14)

In [47]:
final_df.head()

Unnamed: 0,Airline,Source,Destination,Total_Stops,Additional_Info,Price,Date,Month,Year,Arrival_hour,Arrival_min,Dept_hour,Dept_min,duration_hour
0,3,0,5,0.0,8,3897.0,24,3,2019,1,10,22,20,2
1,1,3,0,2.0,8,7662.0,1,5,2019,13,15,5,50,7
2,4,2,1,2.0,8,13882.0,9,6,2019,4,25,9,25,19
3,3,3,0,1.0,8,6218.0,12,5,2019,23,30,18,5,5
4,3,0,5,1.0,8,13302.0,1,3,2019,21,35,16,50,4


### Series and DataFrame
In pandas, a popular data manipulation library in Python, a **Series** and a **DataFrame** are two primary data structures used for data analysis.

### Series
A pandas Series is a one-dimensional array-like object that can hold data of any type (integers, floats, strings, etc.). It is similar to a column in an Excel spreadsheet or a database table. Each element in a Series has an associated label, known as an index, which can be used to access specific elements. The index can be numerical or labeled with strings.

**Example:**

```python
import pandas as pd

# Creating a Series
data = pd.Series([10, 20, 30, 40], index=['a', 'b', 'c', 'd'])
print(data)
```

Output:
```
a    10
b    20
c    30
d    40
dtype: int64
```

In this example, `a`, `b`, `c`, and `d` are the indices, and 10, 20, 30, and 40 are the values.

### DataFrame
A pandas DataFrame is a two-dimensional, tabular data structure with labeled axes (rows and columns). It can be thought of as a collection of Series objects, sharing the same index. DataFrames are similar to spreadsheets or SQL tables and are one of the most commonly used data structures in data analysis.

A DataFrame can contain data of different types (numeric, string, boolean, etc.) in each column.

**Example:**

```python
import pandas as pd

# Creating a DataFrame
data = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, 30, 35],
    'City': ['New York', 'Los Angeles', 'Chicago']
}

df = pd.DataFrame(data)
print(df)
```

Output:
```
      Name  Age         City
0    Alice   25     New York
1      Bob   30  Los Angeles
2  Charlie   35      Chicago
```

In this DataFrame, "Name," "Age," and "City" are columns, and each row corresponds to a different individual with their respective data.

Both Series and DataFrame objects offer a wide range of functionalities for data manipulation, filtering, aggregation, and visualization, making them essential tools for data analysis in Python.

In [48]:
# Return in Series
final_df['Airline'] 

0       3
1       1
2       4
3       3
4       3
       ..
2666    1
2667    3
2668    4
2669    1
2670    6
Name: Airline, Length: 13351, dtype: int32

In [49]:
# Return in DataFrame
final_df[['Airline']] 

Unnamed: 0,Airline
0,3
1,1
2,4
3,3
4,3
...,...
2666,1
2667,3
2668,4
2669,1


### One-Hot Encoding
One-hot encoding is a technique used to convert categorical data into a numerical format that can be used in machine learning models. It involves creating a binary column for each category in a categorical feature, with each column representing one possible category. For each row, only one of these columns is set to 1 (indicating the presence of that category), and the rest are set to 0.

This method is especially useful when dealing with nominal categorical data (categories without an intrinsic order) because it avoids implying any ordinal relationship between categories, which could be misinterpreted by some algorithms if a different encoding method, like label encoding, were used.

### Example

Suppose we have a categorical feature "Color" with three categories: "Red," "Blue," and "Green." One-hot encoding would transform this feature into three new binary features, each representing one of the possible colors:

| Color  | Color_Red | Color_Blue | Color_Green |
|--------|-----------|------------|-------------|
| Red    | 1         | 0          | 0           |
| Blue   | 0         | 1          | 0           |
| Green  | 0         | 0          | 1           |
| Red    | 1         | 0          | 0           |

In this table, the "Color" column has been replaced by three new columns: "Color_Red," "Color_Blue," and "Color_Green." Each row now has a 1 in the column corresponding to the actual color and 0 in the others.

### Using One-Hot Encoding in Pandas

Pandas provides a convenient function, `pd.get_dummies()`, for performing one-hot encoding:

```python
import pandas as pd

# Sample data
data = pd.DataFrame({'Color': ['Red', 'Blue', 'Green', 'Red']})

# One-hot encoding
one_hot_encoded_data = pd.get_dummies(data, columns=['Color'])
print(one_hot_encoded_data)
```

Output:
```
   Color_Blue  Color_Green  Color_Red
0           0            0          1
1           1            0          0
2           0            1          0
3           0            0          1
```

In this example, the original "Color" column is replaced by three new columns, each representing a color, with binary indicators showing the presence of each color.

In [50]:
# Fill NaN values with 0 for numerical columns
final_df = final_df.fillna(0)

# Check if there are still any NaN values in the DataFrame
print(final_df.isnull().sum())

Airline            0
Source             0
Destination        0
Total_Stops        0
Additional_Info    0
Price              0
Date               0
Month              0
Year               0
Arrival_hour       0
Arrival_min        0
Dept_hour          0
Dept_min           0
duration_hour      0
dtype: int64


In [51]:
# One Hot Encoding
# Typecasting to int
pd.get_dummies(final_df, columns=['Airline', 'Source', 'Destination', 'Additional_Info'], drop_first=True).astype(int)

Unnamed: 0,Total_Stops,Price,Date,Month,Year,Arrival_hour,Arrival_min,Dept_hour,Dept_min,duration_hour,...,Destination_5,Additional_Info_1,Additional_Info_2,Additional_Info_3,Additional_Info_4,Additional_Info_5,Additional_Info_6,Additional_Info_7,Additional_Info_8,Additional_Info_9
0,0,3897,24,3,2019,1,10,22,20,2,...,1,0,0,0,0,0,0,0,1,0
1,2,7662,1,5,2019,13,15,5,50,7,...,0,0,0,0,0,0,0,0,1,0
2,2,13882,9,6,2019,4,25,9,25,19,...,0,0,0,0,0,0,0,0,1,0
3,1,6218,12,5,2019,23,30,18,5,5,...,0,0,0,0,0,0,0,0,1,0
4,1,13302,1,3,2019,21,35,16,50,4,...,1,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2666,1,0,6,6,2019,20,25,20,30,23,...,0,0,0,0,0,0,0,0,1,0
2667,0,0,27,3,2019,16,55,14,20,2,...,0,0,0,0,0,0,0,0,1,0
2668,1,0,6,3,2019,4,25,21,50,6,...,0,0,0,0,0,0,0,0,1,0
2669,1,0,6,3,2019,19,15,4,0,15,...,0,0,0,0,0,0,0,0,1,0
