## Import Libraries

In [1]:
# Matrix operations
import pandas as pd
import numpy as np


# Preprocessing
import category_encoders as ce 
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from corr_code import get_cat_correlated_cols,get_correlated_cols

# Metrics 
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_log_error,make_scorer

# Models
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from catboost import CatBoostRegressor
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV, ElasticNet, ElasticNetCV, Lars, LarsCV

In [27]:
import re

### Load Data 

In [2]:
train = pd.read_csv("data/Train.csv")
test = pd.read_csv("data/Test.csv")

# SHape
print("Train Shape ",train.shape)
print("Test Shape ",test.shape)

Train Shape  (21000, 15)
Test Shape  (9000, 14)


In [3]:
# Target Columns
target = "Per Person Price"
train.head(3)

Unnamed: 0,Uniq Id,Package Name,Package Type,Destination,Itinerary,Places Covered,Travel Date,Hotel Details,Start City,Airline,Flight Stops,Meals,Sightseeing Places Covered,Cancellation Rules,Per Person Price
0,e788ab76d9d8cf1e6ed2f139645ca5d1,Best of Shimla and Manali Holiday from Delhi,Standard,New Delhi|Shimla|Manali|Chandigarh,1N New Delhi . 2N Shimla . 2N Manali . 1N Chan...,New Delhi|Shimla|Manali|Chandigarh,30-07-2021,Not Available,Mumbai,Not Available,2,3,Not Available,Not Available,11509.0
1,178f892630ce3e335a5a41d5d83937fd,Kashmir Valley vacation,Premium,Srinagar|Pahalgam|Srinagar,1N Srinagar . 2N Pahalgam . 1N Srinagar,Srinagar|Pahalgam|Srinagar,08-12-2021,The Orchard Retreat & Spa:4.6|WelcomHotel Pine...,New Delhi,IndiGo|IndiGo,0,5,Dal Lake | Avantipura Ruins | Mughal Gardens ...,Cancellation any time after making the 1st pay...,22485.5
2,f060f2954840503cc2fdaf495357b7df,Might of Mewar- Udaipur and Chittorgarh,Luxury,Udaipur|Chittorgarh,2N Udaipur . 1N Chittorgarh,Udaipur|Chittorgarh,26-04-2021,The Ananta:4.4|juSTa Lake Nahargarh Palace:4,New Delhi,IndiGo,0,4,Lake Pichola | Jag Mandir Palace | Saheliyon ...,Cancellation any time after making the 1st pay...,12421.5


## Variables

In [4]:
#Drop Uniq Id
train.drop('Uniq Id',axis=1, inplace=True)
test.drop('Uniq Id',axis=1, inplace=True)

In [5]:
# Travel Date
train['Travel Date'] = pd.to_datetime(train['Travel Date'])
test['Travel Date'] = pd.to_datetime(test['Travel Date'])

In [6]:
# Feature Extraction 
train['td_year'] = train['Travel Date'].dt.year
train['td_month'] = train['Travel Date'].dt.month
train['td_weekday'] = train['Travel Date'].dt.weekday

test['td_year'] = test['Travel Date'].dt.year
test['td_month'] = test['Travel Date'].dt.month
test['td_weekday'] = test['Travel Date'].dt.weekday

In [7]:
train.drop(['Travel Date'],axis=1,inplace=True)
test.drop(['Travel Date'],axis=1,inplace=True)

### Feature Engineering

In [8]:
# Drop Correlated Columns

In [9]:
num_drop_cols = get_correlated_cols(train,.9)


In [11]:
num_drop_cols

[]

In [12]:

cat_cols = ['Package Name', 'Package Type', 'Destination', 'Itinerary',
       'Places Covered', 'Hotel Details', 'Start City',
       'Airline', 'Flight Stops', 'Meals', 'Sightseeing Places Covered',
       'Cancellation Rules']
corr_map,cat_drop_cols = get_cat_correlated_cols(train,cat_cols,.9)

In [13]:
cat_drop_cols

{'Destination',
 'Flight Stops',
 'Itinerary',
 'Meals',
 'Places Covered',
 'Sightseeing Places Covered'}

In [14]:
cat_drop_cols.remove('Itinerary')
cat_drop_cols.add('Destination')

In [15]:
cols_to_drop = num_drop_cols + list(cat_drop_cols)

print("Columns to Drop  ", cols_to_drop)

Columns to Drop   ['Sightseeing Places Covered', 'Destination', 'Flight Stops', 'Meals', 'Places Covered']


In [47]:
# Drop the correlated columns
train1 = train.drop(cat_drop_cols,axis=1)
test1 = test.drop(cat_drop_cols,axis=1)

train2 = train.drop(cat_drop_cols,axis=1)
test2 = test.drop(cat_drop_cols,axis=1)

In [48]:
train1.head()

Unnamed: 0,Package Name,Package Type,Itinerary,Hotel Details,Start City,Airline,Cancellation Rules,Per Person Price,td_year,td_month,td_weekday
0,Best of Shimla and Manali Holiday from Delhi,Standard,1N New Delhi . 2N Shimla . 2N Manali . 1N Chan...,Not Available,Mumbai,Not Available,Not Available,11509.0,2021,7,4
1,Kashmir Valley vacation,Premium,1N Srinagar . 2N Pahalgam . 1N Srinagar,The Orchard Retreat & Spa:4.6|WelcomHotel Pine...,New Delhi,IndiGo|IndiGo,Cancellation any time after making the 1st pay...,22485.5,2021,8,3
2,Might of Mewar- Udaipur and Chittorgarh,Luxury,2N Udaipur . 1N Chittorgarh,The Ananta:4.4|juSTa Lake Nahargarh Palace:4,New Delhi,IndiGo,Cancellation any time after making the 1st pay...,12421.5,2021,4,0
3,Colorful Kerala ( Romantic Getaway ),Premium,2N Munnar . 1N Kumarakom . 1N Allepey . 2N Kov...,Elixir Hills Suites Resort & Spa-MMT Holidays ...,New Delhi,IndiGo,Cancellation any time after making the 1st pay...,35967.0,2021,8,4
4,A Week In Bangkok & Pattaya,Premium,4N Pattaya . 3N Bangkok,Dusit Thani Pattaya - MMT Special:4.5|Amari Wa...,New Delhi,Spicejet|Go Air,Cancellation any time after making the 1st pay...,25584.0,2021,12,6


### Extract Features. 

In [49]:
train1['no_places'] = train1['Itinerary'].apply(lambda x : len(x.split(".")))
test1['no_places'] = test1['Itinerary'].apply(lambda x : len(x.split(".")))


In [50]:
train1['no_airline'] =  train1['Airline'].apply(lambda x : len(x.split("|")))
test1['no_airline'] = test1['Airline'].apply(lambda x : len(x.split("|")))

In [51]:
train1['ttl_nights'] =  train1['Itinerary'].apply(lambda x : sum(map(int,re.findall(r"[\d+]+",x))))
test1['ttl_nights'] = test1['Itinerary'].apply(lambda x : sum(map(int,re.findall(r"[\d+]+",x))))

In [52]:
destinations = set('|'.join(train['Destination']).split('|'))
train1[list(destinations)] = [0]*len(destinations)
test1[list(destinations)] = [0]*len(destinations)

for i in range(train1.shape[0]):
    dests = train1.loc[i,'Itinerary']
    for dest in dests.split(' . '):
        val,key = int(dest[0]), dest[3:]
        if key not in destinations:
            print(i)
        train1.loc[i,key] = val 
for i in range(test1.shape[0]):
    dests = test1.loc[i,'Itinerary']
    for dest in dests.split(' . '):
        val,key = int(dest[0]), dest[3:]
        if key not in destinations:
            print(i)
        test1.loc[i,key] = val 

In [55]:
airlines = set('|'.join(train1['Airline']).split('|'))

In [61]:
train1[list(airlines)] = [0]*len(airlines)
test1[list(airlines)] = [0]*len(airlines)

In [66]:
for i in range(train1.shape[0]):
    dests = train1.loc[i,'Airline']
    for dest in dests.split('|'):
        if dest not in airlines:
            print(i)
        train1.loc[i,dest] =1 
for i in range(test1.shape[0]):
    dests = test1.loc[i,'Airline']
    for dest in dests.split('|'):
        if dest not in airlines:
            print(i)
        test1.loc[i,dest] +=1 

In [76]:
train1['Aeroflot'].sum()

0

In [69]:
airlines

{'Aeroflot',
 'Air Austral',
 'Air Baltic',
 'Air India',
 'Air India Express',
 'Air Mauritius',
 'Air New Zealand',
 'Air Seychelles',
 'AirAsia',
 'AirAsia Indonesia',
 'AirAsia X',
 'Airconnect',
 'Bangkok Airways',
 'British Airways',
 'Brussels Airlines',
 'Cathay Pacific',
 'China Eastern',
 'China Southern',
 'EgyptAir',
 'Emirates',
 'Ethiopian Airlines',
 'Etihad Airways',
 'Finnair',
 'Garuda Indonesia',
 'Go Air',
 'Gulf Air',
 'IndiGo',
 'Jetstar Airways',
 'Jetstar Asia',
 'Kenya Airways',
 'Kuwait Airways',
 'Lufthansa',
 'Malaysia Airlines',
 'Malindo Air',
 'Norwegian',
 'Not Available',
 'Oman Air',
 'Philippine Airlines',
 'Qantas Airways',
 'Qatar Airways',
 'Saudia',
 'Scoot',
 'Sichuan Airlines',
 'Silkair',
 'Singapore Airlines',
 'South African Airways',
 'Spicejet',
 'Srilankan Airlines',
 'Swiss',
 'Thai AirAsia',
 'Thai Airways',
 'Thai Vietjet Air',
 'Turkish Airlines',
 'Uzbekistan Airways',
 'Virgin Atlantic',
 'Vistara'}

In [71]:
train1['Scoot'].sum()

0

In [29]:
train1.columns

Index(['Package Name', 'Package Type', 'Itinerary', 'Hotel Details',
       'Start City', 'Airline', 'Cancellation Rules', 'Per Person Price',
       'td_year', 'td_month', 'td_weekday', 'no_places', 'no_airline',
       'ttl_nights'],
      dtype='object')

In [30]:
train2 = train1.drop(['Package Name','Itinerary','Hotel Details','Airline','Cancellation Rules'],axis=1)
test2 = test1.drop(['Package Name','Itinerary','Hotel Details','Airline','Cancellation Rules'],axis=1)

## Modeling 1 

In [31]:
train2.columns

Index(['Package Type', 'Start City', 'Per Person Price', 'td_year', 'td_month',
       'td_weekday', 'no_places', 'no_airline', 'ttl_nights'],
      dtype='object')

In [32]:
train2.head()

Unnamed: 0,Package Type,Start City,Per Person Price,td_year,td_month,td_weekday,no_places,no_airline,ttl_nights
0,Standard,Mumbai,11509.0,2021,7,4,4,1,6
1,Premium,New Delhi,22485.5,2021,8,3,3,2,4
2,Luxury,New Delhi,12421.5,2021,4,0,2,1,3
3,Premium,New Delhi,35967.0,2021,8,4,4,1,6
4,Premium,New Delhi,25584.0,2021,12,6,2,2,7


In [33]:

cat_cols = ['Package Type', 'Start City','td_year','td_weekday','td_month']

In [38]:
X = train2.drop(['Per Person Price'],axis=1)
y = train2['Per Person Price']
Xt = test2[X.columns]

In [35]:
cat_idx = []
for col in cat_cols:
    cat_idx.append(X.columns.get_loc(col))

print("Category Indexes ",cat_idx)

Category Indexes  [0, 1, 2, 4, 3]


In [36]:
scoring = "neg_mean_squared_log_error"

mdl = CatBoostRegressor(cat_features=cat_idx,verbose=300)
cv_score = cross_val_score(mdl,X,y,scoring = scoring,cv=3)

Learning rate set to 0.062433
0:	learn: 11336.8441525	total: 202ms	remaining: 3m 21s
300:	learn: 7192.8007920	total: 12.8s	remaining: 29.8s
600:	learn: 7000.2679821	total: 25.3s	remaining: 16.8s
900:	learn: 6854.2468806	total: 37.9s	remaining: 4.16s
999:	learn: 6818.6436449	total: 41.9s	remaining: 0us
Learning rate set to 0.062433
0:	learn: 11476.9686488	total: 41ms	remaining: 41s
300:	learn: 7497.9175769	total: 12.3s	remaining: 28.6s
600:	learn: 7311.8917599	total: 25.3s	remaining: 16.8s
900:	learn: 7175.5721115	total: 38.3s	remaining: 4.21s
999:	learn: 7130.8929966	total: 42.9s	remaining: 0us
Learning rate set to 0.062433
0:	learn: 11128.0345375	total: 43.2ms	remaining: 43.2s
300:	learn: 7264.9035078	total: 13s	remaining: 30.1s
600:	learn: 7075.0871794	total: 26.7s	remaining: 17.7s
900:	learn: 6951.2363422	total: 47.8s	remaining: 5.25s
999:	learn: 6915.8895656	total: 52.3s	remaining: 0us


In [37]:
print(" Catboost model")
print("cv_score " ,cv_score)
print("cv meann",cv_score.mean())
print("cv std",cv_score.std())

 Catboost model
cv_score  [-0.11571721 -0.11315707 -0.12021861]
cv meann -0.11636429844729296
cv std 0.0029189444518810796


In [41]:
mdl.fit(X,y)



Learning rate set to 0.067105
0:	learn: 11300.5265390	total: 50.7ms	remaining: 50.6s
300:	learn: 7376.2100009	total: 13.9s	remaining: 32.3s
600:	learn: 7222.9643387	total: 26.9s	remaining: 17.9s
900:	learn: 7130.3168595	total: 41.8s	remaining: 4.6s
999:	learn: 7108.3382145	total: 48s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x22d72be18c8>

In [42]:
dict(zip(mdl.feature_names_,mdl.feature_importances_))

{'Package Type': 24.349856419052678,
 'Start City': 1.03787792507083,
 'td_year': 0.20808718997150058,
 'td_month': 2.241642897455258,
 'td_weekday': 1.8156819629341627,
 'no_places': 9.706238907667492,
 'no_airline': 30.037888848395603,
 'ttl_nights': 30.602725849452433}

In [43]:
res = pd.DataFrame()
res[target]=mdl.predict(Xt)
res.to_csv("cat_pipe.csv",index=False)