## Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings

# Blanket-ignore every warning category for the rest of the session.
warnings.filterwarnings(action='ignore')

# A limit of None makes pandas render every row and every column of a frame.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error,mean_squared_error
from sklearn.model_selection import RandomizedSearchCV

## Defining MAPE (Mean Absolute Percentage Error)

In [None]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE) as a percentage.

    Both inputs are flattened to 1-D before the element-wise comparison.
    Without this, a column-shaped target of shape (n, 1) (for example a
    one-column DataFrame) minus a flat prediction vector of shape (n,)
    broadcasts to an (n, n) matrix, and the mean is taken over every
    true/predicted pairing instead of the n matching pairs.

    Parameters
    ----------
    y_true : array-like
        Observed target values. Entries equal to zero produce inf/nan terms
        because each error is divided by the true value.
    y_pred : array-like
        Predicted values, with the same number of elements as ``y_true``.

    Returns
    -------
    numpy.float64
        ``mean(|y_true - y_pred| / |y_true|) * 100``.
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

## Importing Data

In [None]:
# Forward slashes work on Windows, Linux and macOS alike. The previous
# backslash literals relied on invalid escape sequences (`\D`, `\T`) and only
# resolved to the intended files on Windows.
df_train = pd.read_excel('../Data/Data_Train.xlsx')
df_test = pd.read_excel('../Data/Test_set.xlsx')

## Data-Housekeeping

In [None]:
# Stack the training rows on top of the test rows so both receive identical
# feature engineering. `axis` is passed by keyword: pandas deprecated, and from
# 2.0 rejects, passing it positionally to concat.
df = pd.concat([df_train, df_test], axis=0, sort=False)

## Feature Engineering

In [None]:
# Split Date_of_Journey on '/' once; the first, second and third pieces become
# the Date, Month and Year columns (still text at this point).
journey_parts = df['Date_of_Journey'].str.split('/')
df['Date'] = journey_parts.str[0]
df['Month'] = journey_parts.str[1]
df['Year'] = journey_parts.str[2]

In [None]:
# The raw journey-date text is no longer needed once its pieces are extracted.
df = df.drop(columns=['Date_of_Journey'])

In [None]:
# Convert the three date pieces from text to integers in a single cast.
df = df.astype({'Date': int, 'Month': int, 'Year': int})

In [None]:
# Keep only the text before the first space of Arrival_Time.
arrival_tokens = df['Arrival_Time'].str.split(' ')
df['Arrival_Time'] = arrival_tokens.str.get(0)

In [None]:
# Missing stop counts default to '1 stop'. The filled column is assigned back
# explicitly: fillna(inplace=True) on a selected column is chained assignment,
# which under pandas' Copy-on-Write behaviour updates a temporary and leaves
# `df` unchanged.
df['Total_Stops'] = df['Total_Stops'].fillna('1 stop')

In [None]:
# Rewrite 'non-stop' as '0 stop' so every value starts with its stop count.
df['Total_Stops'] = df['Total_Stops'].replace({'non-stop': '0 stop'})

In [None]:
# The stop count is the token before the first space of Total_Stops.
stop_tokens = df['Total_Stops'].str.split(' ')
df['Stops'] = stop_tokens.str.get(0)

In [None]:
# Total_Stops has been replaced by the Stops column.
df = df.drop(columns=['Total_Stops'])

In [None]:
# Stops was extracted as text; store it as an integer.
df = df.astype({'Stops': int})

In [None]:
# For each clock-time column, split the text on ':' and store the first and
# second pieces as integer <column>_Hours / <column>_Minutes features.
for time_col in ('Arrival_Time', 'Dep_Time'):
    clock_parts = df[time_col].str.split(':')
    df[time_col + '_Hours'] = clock_parts.str[0].astype(int)
    df[time_col + '_Minutes'] = clock_parts.str[1].astype(int)

# The original text columns are redundant once the numeric parts exist.
df = df.drop(columns=['Dep_Time', 'Arrival_Time'])

In [None]:
# Split Route on '→ ' once and spread the first six pieces over 'Route 1' ..
# 'Route 6'; rows with fewer pieces get missing values in the extra columns.
route_legs = df['Route'].str.split('→ ')
for leg_no in range(1, 7):
    df[f'Route {leg_no}'] = route_legs.str[leg_no - 1]

In [None]:
# Discard rows whose Duration is exactly '5m'; that value has no 'h' part, so
# the integer cast of the hours text in a later cell would fail on it.
has_usable_duration = df['Duration'].ne('5m')
df = df.loc[has_usable_duration]

In [None]:
# Name the route-leg columns explicitly instead of relying on them being the
# last six columns of the frame.
route_columns = ['Route 1', 'Route 2', 'Route 3', 'Route 4', 'Route 5', 'Route 6']

# Legs that do not exist for a row become the literal label 'None'. The frame
# is filled and rebound in one step: fillna(inplace=True) on a selected column
# is chained assignment and does not write back under pandas' Copy-on-Write.
df = df.fillna({column: 'None' for column in route_columns})

In [None]:
# The hours component is whatever precedes the first 'h' in Duration.
hours_text = df['Duration'].str.split('h').str.get(0)
df['Duration_Hours'] = hours_text.astype(int)

In [None]:
# Take the text after the first 'h', then keep what precedes the 'm'.
# Durations without a minutes part yield an empty string here.
after_hours = df['Duration'].str.split('h').str.get(1)
df['Duration_Mins'] = after_hours.str.split('m').str.get(0)

In [None]:
# An empty minutes string means "no minutes", so it becomes 0. The result is
# assigned back: replace(inplace=True) on a selected column is chained
# assignment and does not update `df` under pandas' Copy-on-Write behaviour.
df['Duration_Mins'] = df['Duration_Mins'].replace('', 0)

In [None]:
# Minutes were extracted as text (plus the 0 placeholder); store as integers.
df = df.astype({'Duration_Mins': int})

In [None]:
# Route and Duration are now represented by the derived columns.
df = df.drop(columns=['Route', 'Duration'])

In [None]:
# Rows without a Price receive the mean of the known prices. The filled column
# is assigned back: fillna(inplace=True) on a selected column is chained
# assignment and does not update `df` under pandas' Copy-on-Write behaviour.
df['Price'] = df['Price'].fillna(df['Price'].mean())

In [None]:
le = LabelEncoder()

# Label-encode the first four columns of the frame; the encoder is refitted
# for each column, so codes are independent from one column to the next.
leading_columns = df.columns[:4]
for column_name in leading_columns:
    encoded_values = le.fit_transform(df[column_name])
    df[column_name] = encoded_values

In [None]:
# Label-encode each route-leg column, refitting the encoder per column.
for route_column in ['Route 1', 'Route 2', 'Route 3', 'Route 4', 'Route 5', 'Route 6']:
    df[route_column] = le.fit_transform(df[route_column])

## Feature Selection

In [None]:
# `df` is df_train stacked on df_test with the Duration == '5m' rows removed,
# so the number of training rows still present is counted from df_train. A
# hard-coded 10683 would pull test rows (whose Price is only a mean-imputed
# placeholder) into the training set whenever a training row was filtered out.
n_train_rows = int((df_train['Duration'] != '5m').sum())

# .copy() detaches each split from `df`, so later in-place edits on the splits
# neither alter `df` nor act on a slice of it.
train_data = df.iloc[:n_train_rows].copy()
test_data = df.iloc[n_train_rows:].copy()

In [None]:
# Show the first training row as a quick check of the engineered columns.
train_data.iloc[:1]

In [None]:
# Remove 'Route 6' and 'Year' from the training frame before modelling.
train_data = train_data.drop(columns=['Route 6', 'Year'])

In [None]:
# Feature matrix: every column except the target.
X = train_data.drop(columns=['Price'])

# Target as a 1-D Series rather than a one-column DataFrame. A column-shaped
# target makes scikit-learn warn about a column-vector y, and, when compared
# with 1-D predictions in NumPy, broadcasts to an (n, n) matrix and distorts
# the error metric.
Y = train_data['Price']

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(
    X,
    Y,
    random_state=0,
    test_size=0.2,
)

In [None]:
# Feature selector driven by a Lasso regression (alpha=0.05, fixed seed).
model = SelectFromModel(
    estimator=Lasso(random_state=0, alpha=0.05),
)

In [None]:
# Fit the Lasso-based selector on the training split.
model.fit(X=xtrain, y=ytrain)

## Decision Tree

In [None]:
# Baseline: a seeded decision tree fitted on the training split.
dt = DecisionTreeRegressor(random_state=0).fit(xtrain, ytrain)

# Score the held-out split; the cell's output is the MAPE in percent.
dt_predictions = dt.predict(xtest)
mean_absolute_percentage_error(y_true=ytest, y_pred=dt_predictions)

## Random Forest

In [None]:
# Seeded random forest with default hyperparameters, fitted on the training
# split.
rf = RandomForestRegressor(random_state=0).fit(xtrain, ytrain)

# Score the held-out split; the cell's output is the MAPE in percent.
rf_predictions = rf.predict(xtest)
mean_absolute_percentage_error(y_true=ytest, y_pred=rf_predictions)

## Tuning Random Forest Model

In [None]:
# Candidate values for each random-forest hyperparameter explored by the
# randomized search.

n_estimators = list(range(100, 1201, 100))  # trees: 100, 200, ..., 1200

# Number of features to consider at every split. 1.0 means "all features",
# which is what 'auto' meant for forest regressors; recent scikit-learn
# releases no longer accept the string 'auto'.
max_features = [1.0, 'sqrt']

max_depth = list(range(5, 31, 5))  # Maximum number of levels in tree: 5, 10, ..., 30

min_samples_split = [2, 5, 10, 15, 100]  # Minimum number of samples required to split a node

min_samples_leaf = [1, 2, 5, 10]  # Minimum number of samples required at each leaf node

In [None]:
# Map each RandomForestRegressor parameter name to its list of candidates.
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)

In [None]:
# Randomized search over `random_grid`: 10 sampled configurations, each scored
# with 3-fold cross-validation on negative mean squared error, run on a single
# worker with a fixed seed.
rf_tuned_model = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=10,
    cv=3,
    scoring='neg_mean_squared_error',
    random_state=0,
    n_jobs=1,
    verbose=2,
)

In [None]:
# Run the hyperparameter search on the training split.
rf_tuned_model.fit(X=xtrain, y=ytrain)

In [None]:
# Predict the held-out split with the fitted search object.
y_pred = rf_tuned_model.predict(X=xtest)

In [None]:
# MAPE of the tuned model on the held-out split, rounded to two decimals.
np.round(mean_absolute_percentage_error(y_true=ytest, y_pred=y_pred), 2)