In [None]:
from google.colab import drive
# Mount Google Drive inside the Colab VM, then point at the project's input folder.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)
root_path = "/content/drive/My Drive/flight_ticket_price_prediction/Input"

Mounted at /content/drive


In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error

%matplotlib inline

In [None]:
# Load the labelled training workbook and the unlabelled test workbook.
train_path = os.path.join(root_path, 'Data_Train.xlsx')
test_path = os.path.join(root_path, 'Test_set.xlsx')
df_train = pd.read_excel(train_path)
df_test = pd.read_excel(test_path)

In [None]:
# Report (rows, columns) of both raw frames.
print(f'Training: {df_train.shape}', f'Testing: {df_test.shape}', sep='\n')

Training: (10682, 11)
Testing: (2671, 10)


In [None]:
# Log-transform the target to reduce its skew (predictions are inverted with np.expm1).
# NOTE: df_train is modified here; re-running this cell without reloading the data
# would apply log1p a second time.
df_train['Price'] = np.log1p(df_train['Price'])

# Drop the single NA row from train data (rebinding instead of inplace mutation).
df_train = df_train.dropna(axis=0)

# Remove the train row whose Duration is the bare '5m' by value rather than by the
# hard-coded index label 6474, and substitute a replacement duration in the test set.
df_train = df_train[df_train['Duration'] != '5m']
df_test.loc[df_test['Duration'] == '5m', 'Duration'] = '11h 50m'

# Number of training rows: used later to split the combined frame back apart.
train_count = df_train.shape[0]

# Combine train and test for shared preprocessing / feature engineering.
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
df_full = pd.concat([df_train, df_test], ignore_index=True)

In [None]:
# Merge duplicate category spellings. The result is assigned back to the column:
# calling .replace(..., inplace=True) on a selected column is chained mutation and
# does not update df_full when pandas Copy-on-Write is active.
df_full['Destination'] = df_full['Destination'].replace(to_replace='New Delhi', value='Delhi')
df_full['Additional_Info'] = df_full['Additional_Info'].replace(to_replace='No Info', value='No info')

In [None]:
import re

def change_to_min(stime):
  """Convert an ``"H"`` or ``"H:M"`` value to a total number of minutes.

  Args:
    stime: value whose string form is hours, optionally followed by
      ``:`` and minutes (anything after a second ``:`` is ignored).

  Returns:
    int: ``hours * 60 + minutes`` (minutes default to 0).

  Raises:
    ValueError: if the hour or minute part is not an integer literal.
  """
  parts = str(stime).split(':')
  hours = parts[0].strip()
  minutes = parts[1].strip() if len(parts) > 1 else 0
  return int(hours) * 60 + int(minutes)

def insert_min(stime):
  """Convert a duration such as ``"2h 50m"``, ``"19h"`` or ``"5m"`` to minutes.

  Args:
    stime: duration string.

  Returns:
    int: total duration in minutes.

  Raises:
    ValueError: if the string matches none of the supported shapes.
  """
  # "19h" -> "19"
  normalized = re.sub(r"^(\d+)h$", r"\1", stime)
  # "2h 50m" -> "2:50"
  normalized = re.sub(r"^(\d+)h\s(\d+)m", r"\1:\2", normalized)
  # "5m" -> "0:5"; minute-only values previously reached int("5m") and raised.
  normalized = re.sub(r"^(\d+)m$", r"0:\1", normalized)
  return change_to_min(normalized)

# Flight duration in minutes, parsed from the textual 'Duration' column.
df_full['duration_min'] = df_full['Duration'].apply(insert_min)

In [None]:
def changeDate(df):
  """Add journey-date feature columns to ``df`` in place.

  Reads ``df['Date_of_Journey']`` and writes:
    * ``doj``              - parsed timestamp
    * ``day_of_journey``   - day of month
    * ``month_of_journey`` - month number
    * ``week_of_journey``  - weekday number (Monday=0), despite the name
    * ``month_st_journey`` - 1 if the date is the first day of its month, else 0
    * ``month_ed_journey`` - 1 if the date is the last day of its month, else 0

  Args:
    df: DataFrame with a ``Date_of_Journey`` column of day-first date strings.

  Returns:
    None; ``df`` is modified in place.
  """
  # dayfirst=True: without it, values whose day is <= 12 are read month-first
  # (e.g. 1/05/2019 becomes 5 January), which corrupts every derived feature.
  journey = pd.to_datetime(df['Date_of_Journey'], dayfirst=True)
  df['doj'] = journey
  df['day_of_journey'] = journey.dt.day
  df['month_of_journey'] = journey.dt.month
  df['week_of_journey'] = journey.dt.weekday
  df['month_st_journey'] = journey.dt.is_month_start.astype('int')
  df['month_ed_journey'] = journey.dt.is_month_end.astype('int')

# Add the journey-date feature columns to the combined frame; changeDate
# writes its columns into df_full directly and returns nothing.
changeDate(df_full)

In [None]:
# Mean flight duration per airline (computed over the combined train+test frame),
# attached to every row as 'duration_mean'.
airline_duration_mean = (
    df_full.groupby('Airline')['duration_min']
    .mean()
    .rename('duration_mean')
    .to_frame()
)
df_full = df_full.merge(airline_duration_mean, on='Airline', how='left')

In [None]:
# Count how many times each stop appears across all routes.
stop_loc = {}
location = df_full['Route'].str.split(' → ')
for stops in location:
  for stop in stops:
    stop_loc[stop] = stop_loc.get(stop, 0) + 1

# Split each route on whitespace into positional tokens. The odd positions are
# discarded (with the ' → ' format used above they hold the arrow separators);
# the even positions become Route_1 .. Route_6, padded with 0 for shorter routes.
arrow_positions = [1, 3, 5, 7, 9]
route_labels = {2 * i: f'Route_{i + 1}' for i in range(6)}
route_tokens = df_full['Route'].str.split(expand=True)
df_routes = (
    route_tokens
    .drop(columns=arrow_positions)
    .rename(columns=route_labels)
    .fillna(0)
)

# Replace every stop with its overall occurrence count from stop_loc; values
# that are not keys of stop_loc (including the 0 padding) map to NaN and become 0.
for route_col in [f'Route_{k}' for k in range(1, 7)]:
  df_routes[route_col] = df_routes[route_col].map(stop_loc).fillna(0)

In [None]:
# Put the engineered route columns next to the main frame.
df_full_1 = pd.concat([df_full, df_routes], axis=1)

# Express the journey timestamp as a ratio to the earliest timestamp.
# np.int was removed in NumPy 1.24; cast with the explicit 'int64' dtype instead,
# and do the cast once rather than twice.
doj_as_int = df_full_1['doj'].astype('int64')
df_full_1['doj'] = doj_as_int / doj_as_int.min()

In [None]:
# Display the column index of the combined frame to check which raw and
# engineered columns are present before encoding.
df_full_1.columns

Index(['Airline', 'Date_of_Journey', 'Source', 'Destination', 'Route',
       'Dep_Time', 'Arrival_Time', 'Duration', 'Total_Stops',
       'Additional_Info', 'Price', 'duration_min', 'doj', 'day_of_journey',
       'month_of_journey', 'week_of_journey', 'month_st_journey',
       'month_ed_journey', 'duration_mean', 'Route_1', 'Route_2', 'Route_3',
       'Route_4', 'Route_5', 'Route_6'],
      dtype='object')

In [None]:
# Categorical columns that are one-hot encoded without any column-name prefix.
obj_cols = ['Airline', 'Total_Stops', 'Additional_Info']

# Source and Destination share city names, so they get distinct prefixes
# ('s_' / 'd_') to keep the dummy column names apart.
df_source, df_destination = (
    pd.get_dummies(df_full_1[column], prefix=tag)
    for column, tag in (('Source', 's'), ('Destination', 'd'))
)
df_obj_cols = pd.get_dummies(df_full_1[obj_cols], prefix="", prefix_sep="")

In [None]:
# Original raw input columns that are removed from the modelling frame in the
# next cell via .drop(columns=to_drop).
# NOTE: the name `to_drop` is reassigned further down by the correlation filter
# and again by the zero-importance filter.
to_drop = ['Airline', 'Date_of_Journey', 'Source', 'Destination', 'Route',
           'Dep_Time', 'Arrival_Time', 'Duration', 'Total_Stops',
           'Additional_Info']

In [None]:
# Join the engineered frame with all dummy blocks, then remove the raw columns.
encoded_parts = [df_full_1, df_source, df_destination, df_obj_cols]
df_full = pd.concat(encoded_parts, axis=1).drop(columns=to_drop)
df_full.head()

Unnamed: 0,Price,duration_min,doj,day_of_journey,month_of_journey,week_of_journey,month_st_journey,month_ed_journey,duration_mean,Route_1,Route_2,Route_3,Route_4,Route_5,Route_6,s_Banglore,s_Chennai,s_Delhi,s_Kolkata,s_Mumbai,d_Banglore,d_Cochin,d_Delhi,d_Hyderabad,d_Kolkata,Air Asia,Air India,GoAir,IndiGo,Jet Airways,Jet Airways Business,Multiple carriers,Multiple carriers Premium economy,SpiceJet,Trujet,Vistara,Vistara Premium economy,1 stop,2 stops,3 stops,4 stops,non-stop,1 Long layover,1 Short layover,2 Long layover,Business class,Change airports,In-flight meal not included,No check-in baggage included,No info,Red-eye flight
0,8.268219,170,1.00447,24,3,6,0,0,289.687988,6627,9414,0.0,0.0,0.0,0.0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0
1,8.944159,445,1.000112,5,1,5,0,0,948.146119,4135,69,158.0,6627.0,0.0,0.0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0
2,9.53842,1140,1.013744,6,9,4,0,0,847.831858,9414,72,7150.0,5795.0,0.0,0.0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0
3,8.735364,325,1.018772,5,12,3,0,0,289.687988,4135,132,6627.0,0.0,0.0,0.0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0
4,9.495745,285,1.0,3,1,3,0,0,289.687988,6627,132,9414.0,0.0,0.0,0.0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0


In [None]:
# Split the combined frame back into its train and test parts by position.
# Explicit copies are taken because these frames are modified in later cells;
# mutating a bare slice of df_full triggers pandas' SettingWithCopyWarning.
train = df_full.iloc[:train_count].copy()
test = df_full.iloc[train_count:].copy()

print(f'train shape {train.shape}')
print(f'test shape {test.shape}')

train shape (10681, 51)
test shape (2671, 51)


In [None]:
# Flag one column of every pair whose absolute correlation exceeds 0.95.
corr_matrix = train.corr()

# Keep only the strict upper triangle so each pair is inspected once.
# np.bool was removed in NumPy 1.24; the builtin bool is the replacement.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)

to_drop = [column for column in upper.columns if (upper[column].abs() > 0.95).any()]
to_drop

['month_of_journey',
 'd_Banglore',
 'd_Cochin',
 'd_Delhi',
 'd_Hyderabad',
 'd_Kolkata',
 '2 stops',
 '3 stops',
 '4 stops']

In [None]:
# Columns holding a single distinct value carry no information for the model.
unique_counts = df_full.nunique()
to_drop_unqiue = unique_counts.index[unique_counts.eq(1)].tolist()
to_drop_unqiue

['month_st_journey', 'month_ed_journey']

In [None]:
# Target, highly correlated columns and constant columns, listed once instead of
# being repeated for every frame.
initial_drop = ['Price', 'month_of_journey',
                'd_Banglore', 'd_Cochin',
                'd_Delhi', 'd_Hyderabad',
                'd_Kolkata', '2 stops',
                '3 stops', '4 stops',
                'month_st_journey', 'month_ed_journey']

X = train.drop(columns=initial_drop)
features = X.columns
y = train['Price']

# `test` is deliberately not modified here. It is not used until the final
# feature-selection cell, which drops a superset of these columns from it;
# dropping them here as well makes that later drop fail with KeyError on a
# clean top-to-bottom run.

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [None]:
import lightgbm as lgb

In [None]:
# Average LightGBM feature importances over several train/validation splits and
# collect the features whose average importance is exactly zero.
feature_names = list(X.columns)

features = np.array(X)
labels = np.array(y).reshape((-1, ))
n_iterations = 10

feature_importance_values = np.zeros(len(feature_names))

for split_seed in range(n_iterations):
  model = lgb.LGBMRegressor(n_estimators=1000, learning_rate = 0.05, verbose = -1)
  # A distinct but fixed seed per round keeps the splits varied and the
  # resulting drop list reproducible between runs.
  train_features, valid_features, train_labels, valid_labels = train_test_split(
      features, labels, test_size = 0.15, random_state = split_seed)
  # Early stopping is supplied as a callback: the early_stopping_rounds / verbose
  # keyword arguments of fit() were removed in LightGBM 4.
  model.fit(train_features, train_labels, eval_metric = 'rmse',
            eval_set = [(valid_features, valid_labels)],
            callbacks = [lgb.early_stopping(stopping_rounds = 100)])
  feature_importance_values += model.feature_importances_ / n_iterations

feature_importances = pd.DataFrame({'feature': feature_names, 'importance': feature_importance_values})

# Sort features according to importance
feature_importances = feature_importances.sort_values('importance', ascending = False).reset_index(drop = True)

# Extract the features with zero importance
record_zero_importance = feature_importances[feature_importances['importance'] == 0.0]

to_drop = list(record_zero_importance['feature'])
to_drop

Training until validation scores don't improve for 100 rounds.
Did not meet early stopping. Best iteration is:
[942]	valid_0's l2: 0.0195554	valid_0's rmse: 0.139841
Training until validation scores don't improve for 100 rounds.
Did not meet early stopping. Best iteration is:
[947]	valid_0's l2: 0.0165686	valid_0's rmse: 0.128719
Training until validation scores don't improve for 100 rounds.
Did not meet early stopping. Best iteration is:
[969]	valid_0's l2: 0.0206004	valid_0's rmse: 0.143528
Training until validation scores don't improve for 100 rounds.
Did not meet early stopping. Best iteration is:
[998]	valid_0's l2: 0.0159539	valid_0's rmse: 0.126309
Training until validation scores don't improve for 100 rounds.
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.017586	valid_0's rmse: 0.132612
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[732]	valid_0's l2: 0.0183958	valid_0's rmse: 0.135631
Training until va

['2 Long layover',
 's_Chennai',
 'Business class',
 'Change airports',
 '1 Long layover',
 '1 Short layover',
 'Trujet',
 'non-stop',
 'Vistara Premium economy',
 's_Delhi',
 'Multiple carriers Premium economy',
 'Jet Airways Business',
 'Route_6',
 'Air India',
 'Red-eye flight']

In [None]:
# Target + correlated + constant + zero-importance columns, pinned explicitly so
# the final feature set does not depend on a re-run of the selection steps.
final_drop = ['Price','month_of_journey','d_Banglore','d_Cochin','d_Delhi', 
              'd_Hyderabad','d_Kolkata', '2 stops','3 stops', '4 stops',
              'month_st_journey', 'month_ed_journey','2 Long layover',
              's_Chennai','Business class','Change airports',
              '1 Long layover','1 Short layover','Trujet','non-stop',
              'Vistara Premium economy','s_Delhi','Multiple carriers Premium economy',
              'Jet Airways Business','Route_6','Air India','Red-eye flight']
X = train.drop(columns=final_drop)
features = X.columns
y = train['Price']

# Keep exactly the training feature columns, in the same order. Selecting instead
# of dropping in place is safe to re-run and does not raise KeyError when some of
# the columns have already been removed from `test` by an earlier cell.
test = test.loc[:, features].copy()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the full training feature matrix (before the hold-out split
# below), then apply the same transform to train and test features.
sc = StandardScaler()
scaler = sc.fit(X)
X_sc, test_sc = (scaler.transform(frame) for frame in (X, test))

In [None]:
# Wrap the scaled arrays back into DataFrames carrying the feature names.
X_sc, test_sc = (
    pd.DataFrame(scaled, columns=features)
    for scaled in (X_sc, test_sc)
)

In [None]:
# Hold out 20% of the training rows for validation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X_sc,
    y,
    test_size=0.20,
    random_state=3,
)

In [None]:
# create dataset for lightgbm
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

# Booster settings: squared-error regression with gradient-boosted trees.
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'l2',
    'num_leaves': 31,
    'learning_rate': 0.1,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'max_depth': 6
    }

print('Starting training...')

# Early stopping is passed as a callback: the early_stopping_rounds keyword of
# lgb.train was removed in LightGBM 4, while the callback works on older
# releases as well.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=500,
                valid_sets=[lgb_eval],
                callbacks=[lgb.early_stopping(stopping_rounds=100)])

print('Starting predicting...')
# predict on the hold-out rows with the best iteration found by early stopping
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)

Starting training...
[1]	valid_0's l2: 0.225363
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's l2: 0.190734
[3]	valid_0's l2: 0.162614
[4]	valid_0's l2: 0.140514
[5]	valid_0's l2: 0.121601
[6]	valid_0's l2: 0.10607
[7]	valid_0's l2: 0.0935572
[8]	valid_0's l2: 0.0829584
[9]	valid_0's l2: 0.0746908
[10]	valid_0's l2: 0.0677585
[11]	valid_0's l2: 0.061842
[12]	valid_0's l2: 0.0567519
[13]	valid_0's l2: 0.0528176
[14]	valid_0's l2: 0.0495796
[15]	valid_0's l2: 0.0465887
[16]	valid_0's l2: 0.0441854
[17]	valid_0's l2: 0.0420432
[18]	valid_0's l2: 0.0403378
[19]	valid_0's l2: 0.0387865
[20]	valid_0's l2: 0.0374857
[21]	valid_0's l2: 0.0364216
[22]	valid_0's l2: 0.0354549
[23]	valid_0's l2: 0.0346092
[24]	valid_0's l2: 0.0337658
[25]	valid_0's l2: 0.0329831
[26]	valid_0's l2: 0.0323421
[27]	valid_0's l2: 0.031728
[28]	valid_0's l2: 0.0311925
[29]	valid_0's l2: 0.0308116
[30]	valid_0's l2: 0.0303925
[31]	valid_0's l2: 0.0300151
[32]	valid_0's l2: 0.0295852
[33]	v

In [None]:
# Hold-out RMSE. The target was log1p-transformed earlier, so this error is
# measured on the log scale, not in price units.
holdout_mse = mean_squared_error(y_test, y_pred)
print(f'rmse: {np.sqrt(holdout_mse)}')

rmse: 0.13539589473076769


In [None]:
# Predict on the scaled test features and undo the log1p target transform.
log_price_pred = gbm.predict(test_sc)
final_prediction = np.expm1(log_price_pred)

# Write the predictions as a one-column CSV next to the input data.
output_path = os.path.join(root_path, 'submit-lgb3.csv')
submission = pd.DataFrame({'Price': final_prediction})
submission.to_csv(output_path)