#### Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

#### Data Sanity

In [2]:
flight_data = pd.read_csv('Clean_Dataset.csv')
flight_data.head()

Unnamed: 0.1,Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,0,SpiceJet,SG-8709,Delhi,Evening,zero,Night,Mumbai,Economy,2.17,1,5953
1,1,SpiceJet,SG-8157,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.33,1,5953
2,2,AirAsia,I5-764,Delhi,Early_Morning,zero,Early_Morning,Mumbai,Economy,2.17,1,5956
3,3,Vistara,UK-995,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.25,1,5955
4,4,Vistara,UK-963,Delhi,Morning,zero,Morning,Mumbai,Economy,2.33,1,5955


In [3]:
flight_data.shape

(300153, 12)

In [4]:
flight_data.isna().sum()

Unnamed: 0          0
airline             0
flight              0
source_city         0
departure_time      0
stops               0
arrival_time        0
destination_city    0
class               0
duration            0
days_left           0
price               0
dtype: int64

In [5]:
flight_data = flight_data.loc[(flight_data["source_city"] == "Delhi") & (flight_data["destination_city"] == "Mumbai")]
flight_data.shape

(15289, 12)

#### Data Prep

In [11]:
X = flight_data.drop(["Unnamed: 0", "price","source_city","destination_city","class"], axis=1)
y = flight_data["price"]

In [12]:
transformed_x = pd.get_dummies(flight_data[["airline", "flight","departure_time","stops","arrival_time","stops"]])
transformed_x.head()

Unnamed: 0,airline_AirAsia,airline_Air_India,airline_GO_FIRST,airline_Indigo,airline_SpiceJet,airline_Vistara,flight_6E-152,flight_6E-153,flight_6E-171,flight_6E-181,...,stops_zero,arrival_time_Afternoon,arrival_time_Early_Morning,arrival_time_Evening,arrival_time_Late_Night,arrival_time_Morning,arrival_time_Night,stops_one,stops_two_or_more,stops_zero.1
0,0,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,1
1,0,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,1
2,1,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,1
3,0,0,0,0,0,1,0,0,0,0,...,1,1,0,0,0,0,0,0,0,1
4,0,0,0,0,0,1,0,0,0,0,...,1,0,0,0,0,1,0,0,0,1


In [13]:
X_train, X_test, y_train, y_test = train_test_split(transformed_x, y, test_size=0.2)

In [14]:
models = {}
models["SVR_L"] = SVR(kernel='linear')
models["SVR_RBF"] = SVR(kernel='rbf')
models["RFR"] = RandomForestRegressor()

In [15]:
model_acc = {}
for key in models:
        model_r = models[key]
        dt = model_r.fit(X_train,y_train) # fit the model
        # YPred = dt.predict(X_test) # predict
        a = model_r.score(X_test,y_test) # compute accuracy
        model_acc[key] = a
print(model_acc) 


{'SVR_L': -0.28596535473152707, 'SVR_RBF': -0.3719088909467556, 'RFR': 0.34798544082598293}


In [16]:
df = pd.read_json('{"Title":"Hello","Year":"2008","Rated":"N/A","Released":"10 Oct 2008","Runtime":"129 min","Genre":"Drama, Romance","Director":"Atul Agnihotri","Writer":"Atul Agnihotri, Chetan Bhagat, Jalees Sherwani","Actors":"Sharman Joshi, Amrita Arora, Sohail Khan","Plot":"Call-center workers receive a phone call from God.","Language":"Hindi","Country":"India","Awards":"1 nomination","Poster":"https://m.media-amazon.com/images/M/MV5BZGM5NjliODgtODVlOS00OWZmLWIzYzMtMTI2OWIzMTM1ZGRhXkEyXkFqcGdeQXVyNDUzOTQ5MjY@._V1_SX300.jpg","Ratings":[{"Source":"Internet Movie Database","Value":"3.3/10"}],"Metascore":"N/A","imdbRating":"3.3","imdbVotes":"2,128","imdbID":"tt1087856","Type":"movie","DVD":"05 Apr 2018","BoxOffice":"N/A","Production":"N/A","Website":"N/A","Response":"True"}')
df

Unnamed: 0,Title,Year,Rated,Released,Runtime,Genre,Director,Writer,Actors,Plot,...,Metascore,imdbRating,imdbVotes,imdbID,Type,DVD,BoxOffice,Production,Website,Response
0,Hello,2008,,10 Oct 2008,129 min,"Drama, Romance",Atul Agnihotri,"Atul Agnihotri, Chetan Bhagat, Jalees Sherwani","Sharman Joshi, Amrita Arora, Sohail Khan",Call-center workers receive a phone call from ...,...,,3.3,2128,tt1087856,movie,05 Apr 2018,,,,True
