# Problem Statement
# To Predict Flight Price.

**This can assist airlines in determining what rates they can keep.**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
sns.set()

## Importing dataset

1. Since data is in form of excel file we have to use pandas read_excel to load the data
2. After loading, it is important to check the complete information of the data, as it can reveal hidden information such as null values in a column or a row
3. Check whether any null values are there or not. if it is present then following can be done,
    1. Imputing data using Imputation method in sklearn
    2. Filling NaN values with mean, median and mode using fillna() method
4. Describe data --> which can give statistical analysis

In [2]:
train_data = pd.read_excel(r"Data_Train.xlsx")

In [3]:
pd.set_option('display.max_columns', None)

In [4]:
train_data

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302
...,...,...,...,...,...,...,...,...,...,...,...
10678,Air Asia,9/04/2019,Kolkata,Banglore,CCU → BLR,19:55,22:25,2h 30m,non-stop,No info,4107
10679,Air India,27/04/2019,Kolkata,Banglore,CCU → BLR,20:45,23:20,2h 35m,non-stop,No info,4145
10680,Jet Airways,27/04/2019,Banglore,Delhi,BLR → DEL,08:20,11:20,3h,non-stop,No info,7229
10681,Vistara,01/03/2019,Banglore,New Delhi,BLR → DEL,11:30,14:10,2h 40m,non-stop,No info,12648


In [5]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10683 entries, 0 to 10682
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Airline          10683 non-null  object
 1   Date_of_Journey  10683 non-null  object
 2   Source           10683 non-null  object
 3   Destination      10683 non-null  object
 4   Route            10682 non-null  object
 5   Dep_Time         10683 non-null  object
 6   Arrival_Time     10683 non-null  object
 7   Duration         10683 non-null  object
 8   Total_Stops      10682 non-null  object
 9   Additional_Info  10683 non-null  object
 10  Price            10683 non-null  int64 
dtypes: int64(1), object(10)
memory usage: 918.2+ KB


In [6]:
train_data.isnull().sum()

Airline            0
Date_of_Journey    0
Source             0
Destination        0
Route              1
Dep_Time           0
Arrival_Time       0
Duration           0
Total_Stops        1
Additional_Info    0
Price              0
dtype: int64

In [7]:
train_data.dropna(inplace = True)

In [8]:
train_data.shape

(10682, 11)

In [9]:
train_data.isnull().sum()

Airline            0
Date_of_Journey    0
Source             0
Destination        0
Route              0
Dep_Time           0
Arrival_Time       0
Duration           0
Total_Stops        0
Additional_Info    0
Price              0
dtype: int64

# Statistical Description

In [12]:
train_data.describe()

Unnamed: 0,Price
count,10682.0
mean,9087.214567
std,4611.54881
min,1759.0
25%,5277.0
50%,8372.0
75%,12373.0
max,79512.0


In [13]:
train_data.describe(include='all')

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
count,10682,10682,10682,10682,10682,10682,10682,10682,10682,10682,10682.0
unique,12,44,5,6,128,222,1343,368,5,10,
top,Jet Airways,18/05/2019,Delhi,Cochin,DEL → BOM → COK,18:55,19:00,2h 50m,1 stop,No info,
freq,3849,504,4536,4536,2376,233,423,550,5625,8344,
mean,,,,,,,,,,,9087.214567
std,,,,,,,,,,,4611.54881
min,,,,,,,,,,,1759.0
25%,,,,,,,,,,,5277.0
50%,,,,,,,,,,,8372.0
75%,,,,,,,,,,,12373.0


---

## EDA

**From the description we can see that Date_of_Journey is an object data type.\
Therefore, we have to convert it into a timestamp in order to use this column properly for prediction.**



**For this we use pandas `to_datetime` to convert the object data type to a datetime dtype.**


**.dt.day method will extract only day of that date**

**dt.month method will extract only month of that date**

In [14]:
# Day of the month of the journey, parsed from the dd/mm/yyyy Date_of_Journey string.
journey_dates = pd.to_datetime(train_data["Date_of_Journey"], format="%d/%m/%Y")
train_data["Journey_day"] = journey_dates.dt.day

In [15]:
# Month number of the journey, parsed from the dd/mm/yyyy Date_of_Journey string.
train_data["Journey_month"] = pd.to_datetime(
    train_data.Date_of_Journey,
    format="%d/%m/%Y",
).dt.month

In [16]:
train_data.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Journey_day,Journey_month
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897,24,3
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662,1,5
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882,9,6
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218,12,5
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302,1,3


In [None]:
# Since we have converted Date_of_Journey column into integers, Now we can drop as it is of no use.

train_data.drop(["Date_of_Journey"], axis = 1, inplace = True)

In [None]:
# Departure time is when a plane leaves the gate.
# Dep_Time is a clock string such as "22:20"; parse it once with an explicit
# format instead of letting pandas guess the layout twice.
# NOTE(review): assumes every Dep_Time value is "HH:MM" — confirm against the data.
dep_clock = pd.to_datetime(train_data["Dep_Time"], format="%H:%M")

# Extracting Hours
train_data["Dep_hour"] = dep_clock.dt.hour

# Extracting Minutes
train_data["Dep_min"] = dep_clock.dt.minute

In [None]:
# Now we can drop Dep_Time as it is of no use
train_data.drop(["Dep_Time"], axis = 1, inplace = True)

In [None]:
train_data

In [None]:
# Arrival time is when the plane pulls up to the gate.
# Arrival_Time mixes two layouts: a bare clock ("13:15") and a clock followed
# by a date ("01:10 22 Mar"). Only the hour and minute are needed, so keep the
# leading clock token and parse it once with an explicit format rather than
# relying on per-value format inference.
arrival_clock = pd.to_datetime(
    train_data["Arrival_Time"].str.split().str[0], format="%H:%M"
)

# Extracting Hours
train_data["Arrival_hour"] = arrival_clock.dt.hour

# Extracting Minutes
train_data["Arrival_min"] = arrival_clock.dt.minute

# Arrival_Time is no longer needed once hour and minute are extracted
train_data.drop(["Arrival_Time"], axis = 1, inplace = True)

In [None]:
train_data.head()

# transform 'Duration' column

In [None]:
duration = list(train_data["Duration"])

In [None]:
duration

In [None]:
# Duration is the flight time, written as "<H>h <M>m", "<H>h" or "<M>m".
# Pad the one-part forms in place so every entry has both an hour and a
# minute token: "19h" -> "19h 0m", "45m" -> "0h 45m".

for idx, raw in enumerate(duration):
    if len(raw.split()) == 2:
        continue                      # already "<H>h <M>m"
    if "h" in raw:
        duration[idx] = raw.strip() + " 0m"   # hours only: add zero minutes
    else:
        duration[idx] = "0h " + raw           # minutes only: add zero hours

In [None]:
# Spot-check the normalised duration strings. Printing every entry floods the
# notebook with thousands of lines, so only the first few are shown.
duration[:10]

In [None]:
# for i in range(len(duration)):
#     if len(duration[i].split()) != 2

In [None]:
train_data['Duration']

In [None]:
duration

In [None]:
# Split each "<H>h <M>m" string into integer hours and integer minutes.
# Hours: text before the "h". Minutes: last whitespace token before the "m".
duration_hours = [int(entry.split(sep = "h")[0]) for entry in duration]
duration_mins = [int(entry.split(sep = "m")[0].split()[-1]) for entry in duration]

In [None]:
duration[0].split(sep='h')
#index position 0 , integer part is extracted

In [None]:
duration[1].split(sep='m')[0].split()[-1]
#extract minutes 

In [None]:
train_data.columns

In [None]:
# Adding duration_hours and duration_mins list to train_data dataframe

train_data["Duration_hours"] = duration_hours
train_data["Duration_mins"] = duration_mins

In [None]:
train_data.columns

In [None]:
train_data.drop(["Duration"], axis = 1, inplace = True)

In [None]:
train_data.head(3)

In [None]:
train_data.shape

---

## Handling Categorical Data

There are many ways to handle categorical data. Two common kinds of categorical data are:
1. <span style="color: blue;">**Nominal data**</span> --> data are not in any order --> <span style="color: green;">**OneHotEncoder**</span> is used in this case
2. <span style="color: blue;">**Ordinal data**</span> --> data are in order --> <span style="color: green;">**LabelEncoder**</span> is used in this case

In [None]:
train_data["Airline"].value_counts()

In [None]:
train_data["Airline"].value_counts()

In [None]:
# train_data.loc["Airline"=='Trujet']

In [None]:
# Exclude every row whose Airline is Trujet from the training frame.
not_trujet = train_data["Airline"] != 'Trujet'
train_data = train_data[not_trujet]

In [None]:
train_data.shape

In [None]:
# reset_index returns a new frame; without assigning it back the call has no
# effect. Rebind so the index is renumbered 0..n-1 after rows were removed.
train_data = train_data.reset_index(drop=True)

In [None]:
train_data.shape

In [None]:
train_data.sort_values('Price',ascending=False)

# Plot of Airline vs Price

In [None]:
# Airline vs Price box plot, rows ordered by descending Price.
# Observations from the original run: Jet Airways Business reaches the highest
# Price (~80k); apart from that first airline the medians are broadly similar.
airline_price_data = train_data.sort_values("Price", ascending = False)
sns.catplot(
    data = airline_price_data,
    x = "Airline",
    y = "Price",
    kind = "box",
    height = 6,
    aspect = 3,
)
plt.show()

train data 10682,30 remove trujet

test 2671,28 = 'Price' is unavailable

Regression technique : assumptions: How did you assume and verify 

linearity = plot scatter

normality = plot distribution using distplot

homoscedasticity = lasso regression

multicollinearity .corr()

outlier available = linear regression will not work.

# Airline column

In [None]:
Airline = train_data[["Airline"]]
Airline

In [None]:
# Airline is nominal categorical data, so it is one-hot encoded.
# drop_first=True keeps k - 1 indicator columns for k categories; the
# alphabetically first level (Air Asia) becomes the implicit baseline.
# With Trujet removed there are 11 airlines, so 10 columns are generated.

Airline = pd.get_dummies(data = Airline, drop_first = True)

# Source

In [None]:
train_data["Source"].value_counts()

In [None]:
# Source vs Price boxen plot, rows ordered by descending Price.
# Observation from the original run: Delhi has a higher median Price than the
# other sources.
source_price_data = train_data.sort_values("Price", ascending = False)
sns.catplot(
    data = source_price_data,
    x = "Source",
    y = "Price",
    kind = "boxen",
    height = 4,
    aspect = 3,
)
plt.show()

In [None]:
Source = train_data[["Source"]]
Source

In [None]:
# Source is nominal categorical data: one-hot encode it, dropping the first level.
Source = pd.get_dummies(data = Source, drop_first = True)

Source.head()

# Destination column

In [None]:
train_data["Destination"].value_counts()

In [None]:
Destination = train_data[["Destination"]]
Destination

In [None]:
# Destination is nominal categorical data: one-hot encode the single-column
# frame (so columns are prefixed "Destination_"), dropping the first level.

Destination = pd.get_dummies(train_data[["Destination"]], drop_first = True)

Destination.head()

# Route column

In [None]:
train_data.columns

In [None]:
# train_data['Additional_Info'].value_counts()
#index position

In [None]:
Addition_no_info = train_data.loc[train_data['Additional_Info'] == 'No info']

In [None]:
# 78% data is not available as in No_info
len(Addition_no_info)/ len(train_data) * 100

In [None]:
# Additional_Info contains almost 80% no_info
# Route and Total_Stops are related to each other

train_data.drop(["Route", "Additional_Info"], axis = 1, inplace = True)

# Route and Additional info columns are dropped

# Total_Stops

In [None]:
train_data["Total_Stops"].value_counts()

In [None]:
train_data['Total_Stops'].unique()

In [None]:
# Total_Stops is ordinal, so each label is replaced by its number of stops.
# The replacement is applied to the Total_Stops column only, so an identical
# string in any other column can never be rewritten by accident, and the
# result is cast to int so the feature is numeric rather than object/float.
stop_counts = {'non-stop': 0, '1 stop': 1, '2 stops': 2, '3 stops': 3, '4 stops': 4}
train_data["Total_Stops"] = train_data["Total_Stops"].replace(stop_counts).astype(int)

In [None]:
train_data

# concatenate dataframes : train_data, Airline,Source,Destination

In [None]:
# Concatenate dataframe --> train_data + Airline + Source + Destination

data_train = pd.concat([train_data, Airline, Source, Destination], axis = 1)

In [None]:
data_train.head()

In [None]:
data_train.shape

In [None]:
data_train.drop(["Airline", "Source", "Destination"], axis = 1, inplace = True)

In [None]:
data_train.head()

In [None]:
data_train.shape

---

## Test set

In [None]:
test_data = pd.read_excel(r"Test_set.xlsx")

In [None]:
test_data.head()

In [None]:
test_data.shape

In [None]:
# Preprocessing

print("Test data Info")
print("-"*75)
print(test_data.info())

print()
print()

print("Null values :")
print("-"*75)

print(test_data.isnull().sum())

# EDA of Test data

**Date_of_Journey**

In [None]:
# Parse the dd/mm/yyyy journey date, keep day and month as integer features,
# then drop the raw string column.
journey_date_day = pd.to_datetime(test_data["Date_of_Journey"], format="%d/%m/%Y")
test_data["Journey_day"] = journey_date_day.dt.day
journey_date_month = pd.to_datetime(test_data["Date_of_Journey"], format="%d/%m/%Y")
test_data["Journey_month"] = journey_date_month.dt.month
test_data.drop(columns=["Date_of_Journey"], inplace=True)

In [None]:
test_data

**Departure_time**

In [None]:
# Dep_Time: parse the clock string once with an explicit format instead of
# letting pandas infer the layout twice, then keep hour and minute.
# NOTE(review): assumes every Dep_Time value is "HH:MM" — confirm against the data.
dep_clock_test = pd.to_datetime(test_data["Dep_Time"], format="%H:%M")
test_data["Dep_hour"] = dep_clock_test.dt.hour
test_data["Dep_min"] = dep_clock_test.dt.minute
test_data.drop(["Dep_Time"], axis = 1, inplace = True)

In [None]:
test_data

**Arrival Time**

In [None]:
# Arrival_Time: values may be a bare clock ("13:15") or a clock followed by a
# date ("01:10 22 Mar"), as seen in the training data. Keep only the leading
# clock token and parse it once with an explicit format.
arrival_clock_test = pd.to_datetime(
    test_data["Arrival_Time"].str.split().str[0], format="%H:%M"
)
test_data["Arrival_hour"] = arrival_clock_test.dt.hour
test_data["Arrival_min"] = arrival_clock_test.dt.minute
test_data.drop(["Arrival_Time"], axis = 1, inplace = True)

In [None]:
test_data

**Duration**

In [None]:
# Duration: copy the column into a list and pad one-part values so every
# entry reads "<H>h <M>m" ("19h" -> "19h 0m", "45m" -> "0h 45m").
duration = list(test_data["Duration"])

for idx, raw in enumerate(duration):
    if len(raw.split()) == 2:
        continue                      # already has hours and minutes
    if "h" in raw:
        duration[idx] = raw.strip() + " 0m"   # hours only: add zero minutes
    else:
        duration[idx] = "0h " + raw           # minutes only: add zero hours

In [None]:
# Integer hours (text before "h") and integer minutes (last token before "m")
# for each normalised duration string.
duration_hours = [int(entry.split(sep = "h")[0]) for entry in duration]
duration_mins = [int(entry.split(sep = "m")[0].split()[-1]) for entry in duration]

In [None]:
# Adding Duration column to test set
test_data["Duration_hours"] = duration_hours
test_data["Duration_mins"] = duration_mins
test_data.drop(["Duration"], axis = 1, inplace = True)

In [None]:
test_data.head()

**Converting Categorical data into Numeric data**

**Airline Column**

In [None]:
print("Airline")
print("-"*75)
print(test_data["Airline"].value_counts())
# Encode a one-column DataFrame (not a Series) so the indicator columns carry
# the "Airline_" prefix, matching the column names built for the training set.
Airline = pd.get_dummies(test_data[["Airline"]], drop_first= True)

In [None]:
Airline

**Source Column**

In [None]:
print("Source")
print("-"*75)
print(test_data["Source"].value_counts())
# Encode a one-column DataFrame (not a Series) so the indicator columns carry
# the "Source_" prefix; unprefixed city names would collide with the
# Destination indicators and would not match the training column names.
Source = pd.get_dummies(test_data[["Source"]], drop_first= True)

In [None]:
Source

**Destination**

In [None]:
print("Destination")
print("-"*75)
print(test_data["Destination"].value_counts())
# Encode a one-column DataFrame (not a Series) so the indicator columns carry
# the "Destination_" prefix; unprefixed city names would collide with the
# Source indicators and would not match the training column names.
Destination = pd.get_dummies(test_data[["Destination"]], drop_first = True)

In [None]:
Destination

In [None]:
test_data.columns

# Drop columns Route, Additional_Info

In [None]:
# Percentage of test rows whose Additional_Info is just "No info".
# The ratio is computed from the test-set objects; using the training-set
# variables here would only repeat the training figure.
Addition_no_info_test = test_data.loc[test_data['Additional_Info'] == 'No info']
len(Addition_no_info_test)/ len(test_data) * 100

In [None]:
# Additional_Info contains almost 80% no_info
# Route and Total_Stops are related to each other
test_data.drop(["Route", "Additional_Info"], axis = 1, inplace = True)

In [None]:
test_data.head(3)

In [None]:
# Replace each Total_Stops label with its number of stops. Only the
# Total_Stops column is touched, so identical strings in other columns are
# never rewritten.
stop_counts_test = {"non-stop": 0, "1 stop": 1, "2 stops": 2, "3 stops": 3, "4 stops": 4}
test_data["Total_Stops"] = test_data["Total_Stops"].replace(stop_counts_test)

In [None]:
test_data.head()

# Concatenate dataframe --> test_data + Airline + Source + Destination

In [None]:
data_test = pd.concat([test_data, Airline, Source, Destination], axis = 1)

In [None]:
data_test.head()

In [None]:
data_test.drop(["Airline", "Source", "Destination"], axis = 1, inplace = True)

In [None]:
data_test.head()

In [None]:
data_test.shape

In [None]:
data_train.shape

In [None]:
data_train.columns
#remove trujet

In [None]:
data_test.columns

---

## Feature Selection

Finding out the best feature which will contribute and have good relation with target variable.
Following are some of the feature selection methods,

**Feature selection methods**


1. <span style="color: purple;">**heatmap**</span>
2. <span style="color: purple;">**feature_importance_**</span>


In [None]:
data_train.shape

In [None]:
# train_data.shape

In [None]:
train_data.columns

In [None]:
data_train.columns

In [None]:
data_train.iloc[0:5,0:9]
#slicing

In [None]:
data_train

In [None]:
data_train.loc[:,'Price']

**Divide training data into Independent and dependent variables**

In [None]:
# Independent features (X), selected by column label; the dependent
# feature (y) is Price and is deliberately left out of this list.
feature_columns = [
    'Total_Stops', 'Journey_day', 'Journey_month',
    'Dep_hour', 'Dep_min', 'Arrival_hour', 'Arrival_min',
    'Duration_hours', 'Duration_mins',
    'Airline_Air India', 'Airline_GoAir', 'Airline_IndiGo',
    'Airline_Jet Airways', 'Airline_Jet Airways Business',
    'Airline_Multiple carriers',
    'Airline_Multiple carriers Premium economy',
    'Airline_SpiceJet', 'Airline_Vistara',
    'Airline_Vistara Premium economy',
    'Source_Chennai', 'Source_Delhi', 'Source_Kolkata', 'Source_Mumbai',
    'Destination_Cochin', 'Destination_Delhi', 'Destination_Hyderabad',
    'Destination_Kolkata', 'Destination_New Delhi',
]
X = data_train[feature_columns]
X.head()

In [None]:
X.shape

In [None]:
# Dependent variable: select Price by label rather than by integer position,
# so the target stays correct even if the column order of data_train changes.
y = data_train["Price"]
y.head()

In [None]:
y.shape

In [None]:
data_train.corr()

**0.7399 between Duration_hours and Total_Stops**

**0.603 between Price and total_stops**

**0.508 between Price and Duration Hours**

In [11]:
# Finds correlation between Independent and dependent attributes

# plt.figure(figsize = (10,10))
# sns.heatmap(train_data.corr(), annot = True, cmap = "RdYlGn")

# plt.show()

**Finding highly correlated independent variables**

In [None]:
# Number of flights per journey month, with the count written above each bar.
plt.figure(figsize = (10, 5))
ax = sns.countplot(x = 'Journey_month', data = train_data)
ax.set_title('Count of flights month wise')
ax.set_xlabel('Month')
ax.set_ylabel('Count of flights')
for bar in ax.patches:
    bar_height = bar.get_height()
    label_position = (bar.get_x() + 0.25, bar_height + 1)
    ax.annotate(int(bar_height), label_position, va='bottom', color= 'black')

In [None]:
import sklearn
# from sklearn.preprocessing import LabelEncoder
# le=LabelEncoder()
# from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
lr=LinearRegression()
from sklearn.metrics import mean_squared_error,r2_score

In [None]:
# Hold out 20% of the rows before fitting. The following cells score this
# model on X_test / y_test, so the split has to exist at this point and the
# model must not be trained on the rows it is evaluated on. The split uses the
# same parameters as the Random Forest section, so both models see the same
# train/test partition.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
lr.fit(X_train, y_train)

In [None]:
linear_y_pred = lr.predict(X_test)

In [None]:
linear_y_pred

In [None]:
# Mean squared error of the linear model on the hold-out rows.
mean_squared_error(y_true=y_test, y_pred=linear_y_pred)

In [None]:
# R^2 (as a percentage) of the linear model on the hold-out rows.
# r2_score is not symmetric: the true values must come first, predictions second.
r2_score(y_test, linear_y_pred) * 100

In [None]:
# Estimate feature importances with an ExtraTreesRegressor fitted on all of (X, y).
# A fixed random_state makes the importances reproducible across re-runs.

from sklearn.ensemble import ExtraTreesRegressor
selection = ExtraTreesRegressor(random_state = 42)
selection.fit(X, y)

In [None]:
selection.feature_importances_

In [None]:
print(selection.feature_importances_)
#are these values gini importance?

In [None]:
# Horizontal bar chart of the 20 largest feature importances from `selection`,
# labelled with the column names of X.

plt.figure(figsize = (12,8))
feat_importances = pd.Series(data=selection.feature_importances_, index=X.columns)
top_importances = feat_importances.nlargest(20)
top_importances.plot(kind='barh')
plt.show()
#cutoff 0.4

**we have found important features by using extratreeregressor.They are Total_stops, Airline_Jet Airways, Duration_hours, Airline_JetAirwaysBusiness**

---

## Fitting model using Random Forest

1. Split the dataset into train and test sets in order to predict w.r.t. X_test
2. If needed do scaling of data
    * Scaling is not done in Random forest
3. Import model
4. Fit the data
5. Predict w.r.t X_test
6. In regression check **RMSE** Score
7. Plot graph

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [None]:
data_train.shape
#10682 rows, 80% training data, 20% is testing data

In [None]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

In [None]:
y

In [None]:
# type(y_test)
# y_test[10]

In [None]:
# Baseline Random Forest fitted on the training split.
# A fixed random_state makes the fitted forest (and every score computed from
# it) reproducible across re-runs.
from sklearn.ensemble import RandomForestRegressor
reg_rf = RandomForestRegressor(random_state = 42)
reg_rf.fit(X_train, y_train)

In [None]:
y_test

In [None]:
y_pred = reg_rf.predict(X_test)

In [None]:
# y_pred_test = reg_rf.predict(data_test)

In [None]:
y_pred

In [None]:
reg_rf.score(X_train, y_train)
#Return the coefficient of determination of the prediction.
#best possible score of R square is 1.
#It can be negative if model is worse.

In [None]:
reg_rf.score(X_test, y_test)

In [None]:
# Distribution of the hold-out residuals (actual - predicted).
# sns.distplot is deprecated; histplot with kde=True draws the same histogram
# plus density curve.
sns.histplot(y_test-y_pred, kde=True)
plt.show()

**Distplot gives Gaussian Distribution**

In [None]:
plt.scatter(y_test, y_pred, alpha = 0.5)
plt.xlabel("y_test")
plt.ylabel("y_pred")
plt.show()

**scatter plot shows increasing order**

# Evaluation metrics on the hold-out (validation) split

In [None]:
from sklearn import metrics

In [None]:
# Error metrics of the Random Forest predictions against y_test.
mae = metrics.mean_absolute_error(y_test, y_pred)
# MSE: average of the squared differences between actual and predicted values.
mse = metrics.mean_squared_error(y_test, y_pred)
print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))

# Normalizing RMSE value

**This produces a value between 0 and 1, where values closer to 0 represent better fitting models.**

In [None]:
# Divides a hard-coded value by the range of y (max - min).
# NOTE(review): 169.5664 is a pasted-in constant; nothing in this notebook shows
# which metric it came from — confirm or replace it with a computed value.
print(169.5664/(max(y)-min(y)))

In [None]:
# Normalised RMSE = RMSE / (max(DV) - min(DV)).
# The RMSE is computed from the current hold-out predictions rather than
# pasted in as a constant, so the figure cannot go stale when the model or the
# split changes.
rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
price_range = y.max() - y.min()

print(rmse / price_range)   # normalised RMSE
print(price_range)          # range of the target
print(y.max())
print(y.min())

**RMSE value is 2100 and our range of price is 1759/- to 79512/-, Lets do hyperparameter tuning** 

In [None]:
metrics.r2_score(y_test, y_pred)
#high value of r2 means less difference between predicted and actual values

In [None]:
r2_score(y_test,y_pred) * 100

# Evaluation of model on test dataset

In [None]:
# `selection` is the ExtraTreesRegressor fitted on (X, y) for the feature-importance
# step, so these are ExtraTrees predictions rather than Random Forest ones.
# NOTE(review): presumably data_test must carry the same feature columns as X — confirm.
y_pred_test = selection.predict(data_test)

In [None]:
y_pred_test1 = reg_rf.predict(data_test)

In [None]:
y_pred_test1

In [None]:
y_pred_test

In [None]:
y_pred_test.shape

In [None]:
data_test.shape

---

## Hyperparameter Tuning


* Choose following method for hyperparameter tuning
    1. **RandomizedSearchCV** --> Fast
    2. **GridSearchCV**
* Assign hyperparameters in form of dictionary
* Fit the model
* Check best parameters and best score

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
#Randomized Search CV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1200, num = 12)]
# Number of features to consider at every split.
# 1.0 means "all features", which is what 'auto' meant for a forest regressor;
# the 'auto' string itself is no longer accepted by recent scikit-learn releases.
max_features = [1.0, 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 30, num = 6)]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15, 100]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 5, 10]

In [None]:
n_estimators

In [None]:
max_features

In [None]:
max_depth

In [None]:
# Candidate values for each Random Forest hyper-parameter, keyed by the
# estimator's parameter name.

random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)

In [None]:
# Randomised hyper-parameter search over random_grid: n_iter = 10 sampled
# combinations, each scored by 5-fold cross-validation on negative MSE.
rf_random = RandomizedSearchCV(
    estimator = reg_rf,
    param_distributions = random_grid,
    scoring = 'neg_mean_squared_error',
    n_iter = 10,
    cv = 5,
    verbose = 2,
    random_state = 42,
    n_jobs = 1,
)

In [None]:
rf_random.fit(X_train,y_train)

In [None]:
rf_random.best_params_

In [None]:
prediction = rf_random.predict(X_test)

In [None]:
prediction

In [None]:
y_test-prediction

In [None]:
# Distribution of the hold-out residuals (actual - predicted) for the tuned model.
# sns.distplot is deprecated; histplot with kde=True draws the same histogram
# plus density curve.
plt.figure(figsize = (8,8))
sns.histplot(y_test-prediction, kde=True)
plt.show()

In [None]:
plt.figure(figsize = (8,8))
plt.scatter(y_test, prediction, alpha = 0.5)
plt.xlabel("y_test")
plt.ylabel("y_pred")
plt.show()

In [None]:
# Error metrics of the tuned model's predictions against y_test.
tuned_mae = metrics.mean_absolute_error(y_test, prediction)
tuned_mse = metrics.mean_squared_error(y_test, prediction)
print('MAE:', tuned_mae)
print('MSE:', tuned_mse)
print('RMSE:', np.sqrt(tuned_mse))

In [None]:
r2_score(y_test,prediction)

---

# PLOTTING

In [None]:
train_data1 = pd.read_excel(r"Data_Train.xlsx")

In [None]:
train_data1

In [None]:
# Bar chart of the number of flights per airline.
# The bar heights are the value counts; passing a constant height would draw
# identical bars and make the "Count" axis meaningless.
airline_counts = train_data1['Airline'].value_counts()
plt.bar(airline_counts.index, airline_counts.values, color='orange')
plt.xlabel('Type of Airline')
plt.ylabel('Count')
plt.title('Distribution of Airline') # value counts
plt.xticks(rotation=40)

In [None]:
train_data1.columns

In [None]:
train_data1.info()

In [None]:
train_data1.isna().sum()

In [None]:
train_data1.dropna(inplace=True)

In [None]:
train_data1.isna().sum()

In [None]:
# Frequency table for each categorical column, separated by a blank line.
summary_columns = ['Airline', 'Source', 'Destination', 'Total_Stops', 'Additional_Info']
for position, column in enumerate(summary_columns):
    if position > 0:
        print()
    print(train_data1[column].value_counts())

In [None]:
# sns.barplot(value_count.index, value_count["Airline"])

In [None]:
# Flights per airline. The column is passed as x=...; recent seaborn releases
# treat the first positional argument as the data and reject a positional column.
sns.countplot(x='Airline',data=train_data1)
plt.xlabel('Type of Airline')
plt.ylabel('Count')
plt.title('Distribution of Airline')
plt.xticks(rotation=90)

In [None]:
# Flights per source city. The column is passed as x=...; recent seaborn releases
# treat the first positional argument as the data and reject a positional column.
sns.countplot(x='Source',data=train_data1)
plt.xlabel('Type of Source')
plt.ylabel('Count')
plt.title('Source Counts')
plt.xticks(rotation=90)

In [None]:
# Flights per destination city. The column is passed as x=...; recent seaborn releases
# treat the first positional argument as the data and reject a positional column.
sns.countplot(x='Destination',data=train_data1)
plt.xlabel('Type of Destination')
plt.ylabel('Count')
plt.title('Count of Destination')
plt.xticks(rotation=90)

In [None]:
# Flights per Additional_Info value. The column is passed as x=...; recent seaborn
# releases treat the first positional argument as the data and reject a positional column.
sns.countplot(x='Additional_Info',data=train_data1)
plt.xlabel('Type of Additional Info')
plt.ylabel('Count')
plt.title('Distribution of Additional Info')
plt.xticks(rotation=90)

In [None]:
# sns.pairplot(train_data1,hue='Price',kind='scatter')

# Measure of central tendency

In [None]:
train_data1['Airline'].mode()

In [None]:
train_data1['Source'].mode()

In [None]:
train_data1['Destination'].mode()

**One continuous and one categorical variable**

In [None]:
# Price distribution per airline (one categorical, one continuous variable).
# x and y are passed by keyword; recent seaborn releases do not accept them
# as positional arguments.
sns.boxplot(x='Airline', y='Price', data=train_data1)
plt.xticks(rotation=40)

In [None]:
# sns.catplot(data=train_data1, x="Total_Stops", y="Source", hue="Airline", kind="swarm")