In [18]:
import pandas as pd
import numpy as np
# reference: https://www.kaggle.com/willkoehrsen/introduction-to-feature-selection

In [2]:
# Read the training orders and keep only the rows used for training, in one chained expression.
full_data = (
    pd.read_csv('../train.csv')
    # remove outliers: keep rows whose adr lies strictly between -100 and 1000
    .loc[lambda orders: (orders['adr'] < 1000) & (orders['adr'] > -100)]
    # only use the uncanceled orders to train
    .loc[lambda orders: orders['is_canceled'] == 0]
)

In [9]:
# Numerical columns picked by hand from the raw data.
num_features = [
    "lead_time",
    "arrival_date_week_number",
    "arrival_date_day_of_month",
    "stays_in_weekend_nights",
    "stays_in_week_nights",
    "adults",
    "children",
    "babies",
    "is_repeated_guest",
    "previous_cancellations",
    "previous_bookings_not_canceled",
    "agent",
    "company",
    "required_car_parking_spaces",
    "total_of_special_requests",
]

# Categorical columns picked by hand from the raw data.
cat_features = [
    "hotel",
    "arrival_date_month",
    "meal",
    "market_segment",
    "distribution_channel",
    "reserved_room_type",
    "deposit_type",
    "customer_type",
]

# Report how many features were selected in total.
n_selected = len(num_features) + len(cat_features)
print("Total number of features manually extracted: " + str(n_selected))

Total number of features manually extracted: 23


In [14]:
# Keep only the hand-picked columns, numerical ones first and categorical ones after.
selected_columns = [*num_features, *cat_features]
train_x = full_data[selected_columns]
train_x.shape

(58766, 23)

# Remove Collinear Variables
However, we found that none of the numerical columns are strongly correlated with each other, so we did not remove any of them.

In [16]:
# Threshold for removing correlated variables
threshold = 0.9

# Absolute value correlation matrix.
# train_x also holds the string-valued categorical columns; recent pandas versions raise
# when corr() meets non-numeric data instead of silently skipping it, so restrict the
# frame to numeric/bool columns explicitly (the same columns older pandas kept).
numeric_columns = train_x.select_dtypes(include=['number', 'bool'])
corr_matrix = numeric_columns.corr().abs()
corr_matrix.head()

Unnamed: 0,lead_time,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,agent,company,required_car_parking_spaces,total_of_special_requests
lead_time,1.0,0.134553,0.025681,0.186958,0.285071,0.12515,0.026734,0.015798,0.154862,0.040152,0.079817,0.029541,0.156951,0.085576,0.006941
arrival_date_week_number,0.134553,1.0,0.07819,0.031746,0.035357,0.032238,0.003772,0.006699,0.055524,0.03271,0.031342,0.005286,0.093608,0.002509,0.036335
arrival_date_day_of_month,0.025681,0.07819,1.0,0.024729,0.025252,0.002223,0.018744,0.000902,0.005066,0.006765,0.001018,0.00232,0.032566,0.009199,0.006716
stays_in_weekend_nights,0.186958,0.031746,0.024729,1.0,0.520067,0.131191,0.016072,0.013282,0.113629,0.024726,0.053062,0.099975,0.102474,0.025387,0.064818
stays_in_week_nights,0.285071,0.035357,0.025252,0.520067,1.0,0.132806,0.012354,0.011631,0.126741,0.026698,0.058881,0.149081,0.176029,0.028805,0.060801


In [19]:
# Upper triangle of correlations.
# k=1 excludes the diagonal, so each pair of columns appears once and self-correlations
# are masked to NaN. The builtin `bool` is used because the `np.bool` alias was removed
# from NumPy and raises AttributeError on current releases.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)
upper.head()

Unnamed: 0,lead_time,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,agent,company,required_car_parking_spaces,total_of_special_requests
lead_time,,0.134553,0.025681,0.186958,0.285071,0.12515,0.026734,0.015798,0.154862,0.040152,0.079817,0.029541,0.156951,0.085576,0.006941
arrival_date_week_number,,,0.07819,0.031746,0.035357,0.032238,0.003772,0.006699,0.055524,0.03271,0.031342,0.005286,0.093608,0.002509,0.036335
arrival_date_day_of_month,,,,0.024729,0.025252,0.002223,0.018744,0.000902,0.005066,0.006765,0.001018,0.00232,0.032566,0.009199,0.006716
stays_in_weekend_nights,,,,,0.520067,0.131191,0.016072,0.013282,0.113629,0.024726,0.053062,0.099975,0.102474,0.025387,0.064818
stays_in_week_nights,,,,,,0.132806,0.012354,0.011631,0.126741,0.026698,0.058881,0.149081,0.176029,0.028805,0.060801


In [20]:
# Flag each column that has at least one correlation above the threshold
# (NaN entries of the masked matrix compare as False and are ignored).
exceeds_threshold = (upper > threshold).any(axis=0)
to_drop = exceeds_threshold[exceeds_threshold].index.tolist()

print('There are %d columns to remove.' % len(to_drop))

There are 0 columns to remove.


# Feature Selection through Feature Importances

In [31]:
import sys
DATA_UTIL_PATH = "../"
# Put the relative path "../" on the import search path so `datautil` can be imported;
# the guard keeps sys.path from growing every time this cell is re-run.
if DATA_UTIL_PATH not in sys.path:
    sys.path.append(DATA_UTIL_PATH)
import datautil

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder

from matplotlib import pyplot as plt
# scikit-learn exposes no top-level `svr` name (`from sklearn import svr` raises ImportError);
# the support-vector regressor class is sklearn.svm.SVR.
from sklearn.svm import SVR

In [38]:
# Fetch the feature matrix and target that are already processed
# (imputed and one-hot encoded) by the datautil helper.
train_x, train_y = datautil.get_preprocessed_xy()
print(f"shape of train_x is {train_x.shape}")
print(f"shape of train_y is {train_y.shape}")

shape of train_x is (58766, 62)
shape of train_y is (58766,)


In [37]:
from joblib import dump, load

# Start with one zero-valued importance slot per column of train_x
n_features = train_x.shape[1]
feature_importances = np.zeros(n_features)

# Restore the best model saved before.
# NOTE(review): joblib files are pickle-based, so only load 'svm.model' from a trusted source.
model = load('svm.model')

In [None]:
# Find the features with zero importance.
# `feature_importances` is created as a plain 1-D NumPy array, which cannot be indexed by
# the 'importance' / 'feature' labels used below. Normalise it to a two-column table first;
# the isinstance guard keeps the cell safe to re-run and leaves an existing table untouched.
# NOTE(review): no visible cell fills in the importances, so until that step is added every
# feature is reported as having 0.0 importance.
if not isinstance(feature_importances, pd.DataFrame):
    # Use the column labels of train_x when it has them, otherwise positional indices.
    feature_names = list(getattr(train_x, 'columns', range(len(feature_importances))))
    feature_importances = pd.DataFrame({
        'feature': feature_names,
        'importance': np.asarray(feature_importances, dtype=float),
    })

is_zero = feature_importances['importance'] == 0.0
zero_features = feature_importances.loc[is_zero, 'feature'].tolist()
print('There are %d features with 0.0 importance' % len(zero_features))
feature_importances.tail()