In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor

  from numpy.core.umath_tests import inner1d


In [2]:
def hour_bins(hour):
    '''Map an hour of the day to its flight-time window.

    The windows ('00-07', '08-09', '10-18', '19-23') were chosen based on
    observed average fares.

    Parameters
    ----------
    hour : str or int
        Hour of day. Zero-padded strings ('08'), unpadded strings ('8') and
        integers (8) are all accepted.

    Returns
    -------
    str or None
        The window label, or None when `hour` cannot be read as an integer
        or lies outside 0-23.
    '''
    # Compare numerically rather than against literal strings so that an
    # unpadded '8' or an integer 8 does not silently fall through to None.
    try:
        hour_number = int(hour)
    except (TypeError, ValueError, OverflowError):
        return None

    if 0 <= hour_number <= 7:
        return '00-07'
    if 8 <= hour_number <= 9:
        return '08-09'
    if 10 <= hour_number <= 18:
        return '10-18'
    if 19 <= hour_number <= 23:
        return '19-23'
    return None
        
train_url = '../train.csv'
test_url = '../test.csv'

# Columns stored as pandas categoricals once feature engineering is done.
CATEGORICAL_COLUMNS = ['From', 'To', 'Class', 'Title', 'From-To', 'Hour', 'Age_bin']

# Raw columns that only feed derived features and are removed afterwards.
RAW_COLUMNS_TO_DROP = ['Name', 'Date of Birth', 'Flight Date', 'Flight Time', 'Booking Date']


def age_bracket(age):
    '''Return the 10-year bracket label for an age, e.g. 34 -> '30-39'.'''
    decade_start = int(age / 10) * 10
    return str(decade_start) + '-' + str(decade_start + 9)


def engineer_features(raw):
    '''Build the model-ready feature frame from a raw bookings frame.

    The same transformation is applied to the train and the test data, so it
    addresses every column by name; the train frame carries an extra 'Fare'
    column, which shifts column positions but not column names.

    Parameters
    ----------
    raw : pandas.DataFrame
        Must contain 'Name', 'Date of Birth', 'From', 'To', 'Flight Date',
        'Flight Time', 'Booking Date' and 'Class'. Any other column
        (e.g. 'Fare') is passed through untouched.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) in which the raw columns
        listed in RAW_COLUMNS_TO_DROP are replaced by 'Title', 'Gender',
        'From-To', 'Days to Journey', 'Hour' and 'Age_bin'.
    '''
    features = raw.copy()

    #Business and Economy are binary. Business having a higher fare. Replacing with 1, 0
    # Only 'Class' is recoded: a whole-frame replace would also rewrite any
    # other cell that happens to hold the same text.
    features['Class'] = (
        features['Class'].replace({'Business': 1, 'Economy': 0}).infer_objects()
    )

    #Create New Features 'Title' and 'Gender' from the 'Name'
    # Title is the first space-separated token of the name; Gender is the part
    # of the second token that precedes its first 'G'.
    name_tokens = raw['Name'].str.split(' ')
    features['Title'] = name_tokens.str[0]
    gender_code = name_tokens.str[1].str.split('G').str[0]

    #Male and Female form a binary class. Males paying a higher fare. Replacing with 1, 0
    features['Gender'] = gender_code.replace({'M': 1, 'F': 0}).infer_objects()

    #Create feature 'Age' from the 'Date of Birth' and 'Booking Date'
    # Uses the first '-'-separated field of each date string (the year,
    # assuming the dates are written year-first -- TODO confirm against the CSV).
    booking_year = raw['Booking Date'].str.split('-').str[0].astype(int)
    birth_year = raw['Date of Birth'].str.split('-').str[0].astype(int)
    age = booking_year - birth_year

    #Create a feature called 'From-To' which is a measure of distance b/w cities in some sense
    features['From-To'] = raw['From'] + '-' + raw['To']

    #Create a feature 'Days to Journey' from 'Flight Date' and 'Booking Date'
    flight_date = pd.to_datetime(raw['Flight Date'])
    booking_date = pd.to_datetime(raw['Booking Date'])
    features['Days to Journey'] = (flight_date - booking_date).dt.days

    #Retain only the hour component of 'Flight Time' and divide it into
    #windows based on average fares
    departure_hour = raw['Flight Time'].str.split(':').str[0]
    features['Hour'] = departure_hour.map(hour_bins)

    #Divide people into age groups by creating Age-bins (span 10 years)
    features['Age_bin'] = age.map(age_bracket)

    #Drop unnecessary features
    features = features.drop(RAW_COLUMNS_TO_DROP, axis = 1)

    return features.astype(dict.fromkeys(CATEGORICAL_COLUMNS, 'category'))


#Read Train and Test Data
train_raw, test_raw = pd.read_csv(train_url), pd.read_csv(test_url)

# The raw frames are kept under their own names so this step can be re-run
# without reloading the CSV files.
train = engineer_features(train_raw)
test = engineer_features(test_raw)

#Prepare Data for Model
train_Y = train['Fare']
train_X = pd.get_dummies(train.drop('Fare', axis = 1))

# The train and test frames are one-hot encoded independently, so a category
# value that occurs in only one of them would give the two matrices different
# columns. Reindexing the test matrix to the training columns adds any missing
# indicator as all-zero, drops indicators the model never saw, and guarantees
# the same column order the model was fitted on.
test_X = pd.get_dummies(test).reindex(columns=train_X.columns, fill_value=0)

#Make Predictions
# random_state is fixed so that re-running the notebook reproduces the same
# model and predictions.
fin_model = GradientBoostingRegressor(n_estimators=100, max_depth=8, random_state=0)
fin_model.fit(train_X, train_Y)
Predictions = fin_model.predict(test_X)