# Modeling

<h3> Prepare data 

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib

from scipy.stats import skew
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
import scipy.stats as stats
from scipy.stats import chi2_contingency

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

<h4> Sample data into smaller size

In [3]:
all_data = pd.read_csv('../expedia_data/all_data_fixed.csv')

In [11]:
all_data['date_time'] = pd.to_datetime(all_data.date_time)
all_data.sort_values(by=['date_time'],inplace=True)
all_data = all_data.reset_index(drop=True)
all_data.head()

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,...,new_comp5_rate_percent_diff,new_comp6_rate,new_comp6_inv,new_comp6_rate_percent_diff,new_comp7_rate,new_comp7_inv,new_comp7_rate_percent_diff,new_comp8_rate,new_comp8_inv,new_comp8_rate_percent_diff
0,365278,2012-11-01 00:01:37,24,216,,,225,24202,3,0.0,...,0,0,0,0,0,0,0,0,0,0
1,365278,2012-11-01 00:01:37,24,216,,,225,106786,4,5.0,...,0,0,0,0,0,0,0,0,0,0
2,365278,2012-11-01 00:01:37,24,216,,,225,52429,4,4.0,...,0,0,0,0,0,0,0,0,0,0
3,365278,2012-11-01 00:01:37,24,216,,,225,140423,3,4.0,...,0,0,0,0,0,0,0,0,0,0
4,365278,2012-11-01 00:01:37,24,216,,,225,135609,4,4.0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
def get_sampled_data(size, data):
    '''
    This function takes in the whole dataset, 
    and outout a sampled subset with data size specified
    '''
    interval_range = len(data)//size
    mid_idx_lst = []
    for i in range(1,size+1):
        mid_idx = (interval_range*(i-1) + interval_range*i)//2
        mid_idx_lst.append(mid_idx)

    print(mid_idx_lst[0],mid_idx_lst[-1])
    data_sampled = data.iloc[mid_idx_lst]
    return data_sampled
    
all_data_sampled = get_sampled_data(1000, all_data)

454 907546


In [13]:
# replace old column NA values to median value
all_data_sampled_non_NA = all_data_sampled.fillna(all_data_sampled.median())

# check NA values
# all_data_sampled_non_NA.isna().sum()

<h3> Split data into training, validation and test set

In [14]:
def split_data(data):
    '''
    This function takes in the whole data set and divide it into training, validation and test data;
    split into 0.8 and 0.2 first; second split using the same ratio into training and valiation
    '''
    training_size_large = int(len(data) * 0.8)   
    validation_size = int(training_size_large * 0.2)
    training_size = training_size_large - validation_size
    test_size = int(len(data) * 0.2)
    
    print('training size: %d'%training_size)
    print('validation size: %d'%validation_size)
    print('test size: %d'%test_size)
    
    # split data manually
    training_data = data[0: training_size]
    validation_data = data[training_size:(training_size + validation_size)]
    test_data = data[(training_size + validation_size): (training_size + validation_size + test_size)]
    
    return training_data, validation_data, test_data
    
training_data, validation_data, test_data = split_data(all_data_sampled_non_NA)

training size: 640
validation size: 160
test size: 200


<h3> LSTM + RNN:

TODO: entity embedding for categorical variables

In [30]:
import time 
start_time  = int(time.time())
import pandas as pd 
import numpy as np
from keras.layers import *
from keras.models import Sequential
import pickle

In [31]:
'''
df_train为去掉target column和unnecessary column得到的training set;
同理，df_validation.
'''
df_train = training_data.drop(columns = ['srch_id','price_usd'])
df_validation = validation_data.drop(columns = ['srch_id','price_usd'])

In [26]:
### get all columns
all_columns = training_data.columns.unique().tolist()
# all_columns

In [23]:
categorical_variables = ['site_id', 'visitor_location_country_id', 'prop_country_id','srch_destination_id','prop_id']
binary_variables = ['random_bool','srch_saturday_night_bool', 'promotion_flag', 'prop_brand_bool',\
                   'comp1_inv','comp2_inv','comp3_inv','comp4_inv','comp5_inv','comp6_inv','comp7_inv','comp8_inv',\
                   ] + [col for col in training_data if col.startswith('new')]

continuous_variables = ['visitor_hist_starrating','visitor_hist_adr_usd','prop_starrating','prop_review_score','prop_location_score1','prop_location_score2',\
                     'prop_log_historical_price','srch_length_of_stay','srch_booking_window','srch_adults_count','srch_children_count','srch_room_count','srch_query_affinity_score',\
                     'orig_destination_distance','comp1_rate','comp1_rate_percent_diff','comp2_rate','comp2_rate_percent_diff','comp3_rate','comp3_rate_percent_diff','comp4_rate',\
                     'comp4_rate_percent_diff','comp5_rate','comp5_rate_percent_diff','comp6_rate','comp6_rate_percent_diff','comp7_rate','comp7_rate_percent_diff','comp8_rate',\
                     'comp8_rate_percent_diff']

print('there are %d categorical variables in dataset'%len(categorical_variables))
print('there are %d binary variables in dataset'%len(binary_variables))
print('there are %d continuous variables in dataset'%len(continuous_variables))


there are 5 categorical variables in dataset
there are 42 binary variables in dataset
there are 30 continuous variables in dataset


*Notice: 'srch_id' not necessary for prediction; leave it out

In [24]:
# check number of unique values in each category in training set
for cat_var in categorical_variables:
    print (cat_var, df_train[cat_var].nunique())    

site_id 23
visitor_location_country_id 48
prop_country_id 46
srch_destination_id 468
prop_id 634


In [41]:
### get all the other columns except for categorical variables
other_cols = binary_variables + continuous_variables
# other_cols

In [42]:
def preproc(X_train) : 

    input_list_train = []
    
    #the cols to be embedded: rescaling to range [0, # values)
    for c in categorical_variables :
        
        """
        vals = np.asarray(X_train[c].tolist() )
        input_list_train.append( np.asarray( vals ))
        this fails as keras Expect 0,1,2,3.. as cat and not 1,2,3,5 if there are 4 categories.
        Using below method instead from https://stackoverflow.com/a/45988584 
        
        """
        vals = np.asarray(X_train[c].tolist())
        vals = pd.factorize( vals )[0]
        input_list_train.append( np.asarray(vals)  )
       
    #the rest of the columns
    input_list_train.append(X_train[other_cols].values)
    return input_list_train

In [43]:
df_train_modified = preproc( df_train )
df_validation  = preproc( df_validation  )

In [46]:
len(df_train_modified)
'''
we have 5 categorical variables an 1 for the other variable
'''

6

In [48]:
len(df_train_modified[0]) 
'''
共有640笔training data
'''

640

In [51]:
df_train_modified[0]
df_train_modified[0].ndim

1

In [56]:
'''
将除categorical variables之外的放入一个variable,say "other"
'''
print(df_train_modified[5])
df_train_modified[5].ndim

[[ 0.  1.  1. ... 12.  0. 11.]
 [ 0.  1.  0. ... 12.  0. 11.]
 [ 0.  1.  0. ... 12.  0. 11.]
 ...
 [ 0.  0.  0. ... 12.  0. 11.]
 [ 1.  0.  1. ... 12.  0. 11.]
 [ 0.  1.  0. ... 18.  0. 11.]]


2

In [63]:
'''
The below code adds a embedding network for each of the catgeoriacal variable. 
Each model is appending to a list named models.
'''
models = []

for categorical_var in categorical_variables :
    print ("------------------------------------------------------------------")
    print ("for categorical column ", categorical_var)
    model = Sequential()
    no_of_unique_cat  = df_train[categorical_var].nunique()   # 用包含training set和test set的data?
    print ("number of unique cat",no_of_unique_cat)
    embedding_size = min(np.ceil((no_of_unique_cat)/2), 50 )
    embedding_size = int(embedding_size)
    print ("embedding_size set as ", embedding_size)
    model.add(  Embedding( no_of_unique_cat+1, embedding_size, input_length = 1 ) )
    
    model.add(Reshape(target_shape=( [embedding_size] )))

    print (model.summary())
    
    models.append( model )

------------------------------------------------------------------
for categorical column  site_id
number of unique cat 23
embedding_size set as  12
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1, 12)             288       
_________________________________________________________________
reshape_1 (Reshape)          (None, 12)                0         
Total params: 288
Trainable params: 288
Non-trainable params: 0
_________________________________________________________________
None
------------------------------------------------------------------
for categorical column  visitor_location_country_id
number of unique cat 48
embedding_size set as  24
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 1, 24)             1176      
___________________

In [64]:
'''
Once the categorical columns are made, 
we add another single model for all the continous variables and add it to the models list .
'''

model_rest = Sequential()
model_rest.add(Dense(16, input_shape = [3] ))  #Question: how to set the "input_shape"?
model_rest.summary() 
models.append(model_rest)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 16)                64        
Total params: 64
Trainable params: 64
Non-trainable params: 0
_________________________________________________________________


In [65]:
'''
Our models list will contain N-cat+1 models. 
( N-cat models for each of the categorical columns and one model for all other columns.)
'''
models

[<keras.engine.sequential.Sequential at 0x1a4089d8d0>,
 <keras.engine.sequential.Sequential at 0x1a4089dd68>,
 <keras.engine.sequential.Sequential at 0x1a408b19e8>,
 <keras.engine.sequential.Sequential at 0x1a408f4da0>,
 <keras.engine.sequential.Sequential at 0x1a40919c88>,
 <keras.engine.sequential.Sequential at 0x41ad20978>]

In [110]:
models[0].summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1, 12)             288       
_________________________________________________________________
reshape_1 (Reshape)          (None, 12)                0         
Total params: 288
Trainable params: 288
Non-trainable params: 0
_________________________________________________________________


In [97]:
'''
Finally we merge all the models present into a single model. Concat places the model one after the other
'''
from keras.layers.merge import concatenate
from keras.models import Model, Sequential
from keras.layers import Dense, Input


# model1_in = Input(shape=(27, 27, 1))
# model1_out = Dense(300, input_dim=40, activation='relu', name='layer_1')(model1_in)
# model1 = Model(model1_in, model1_out)

# model2_in = Input(shape=(27, 27, 1))
# model2_out = Dense(300, input_dim=40, activation='relu', name='layer_2')(model2_in)
# model2 = Model(model2_in, model2_out)


concatenated = concatenate([model1_out, model2_out])
out = Dense(1, activation='softmax', name='output_layer')(concatenated)

merged_model = Model([model1_in, model2_in], out)
# merged_model.compile(loss='binary_crossentropy', optimizer='adam', 
# metrics=['accuracy'])
full_model.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy'])

# checkpoint = ModelCheckpoint('weights.h5', monitor='val_acc',
# save_best_only=True, verbose=2)
# early_stopping = EarlyStopping(monitor="val_loss", patience=5)

merged_model.fit([x1, x2], y=y, batch_size=384, epochs=200,
             verbose=1, validation_split=0.1, shuffle=True, 
# callbacks=[early_stopping, checkpoint])

AttributeError: Layer sequential_3 has multiple inbound nodes, hence the notion of "layer input" is ill-defined. Use `get_input_at(node_index)` instead.

In [None]:
full_model.add(Dense(1024))
full_model.add(Activation('relu'))
full_model.add(Dense(512))
full_model.add(Activation('relu'))
full_model.add(Dense(256))
full_model.add(Activation('sigmoid'))

full_model.add(Dense(2))
full_model.add(Activation('sigmoid'))
full_model.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy'])

In [9]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout

Using TensorFlow backend.


In [15]:
# X is the housing price at given time (t), Y isthe housing price at given time (t+1)
# convert an array of values into a dataset matrix

def create_dataset(dataset,look_back=1):
    dataX, dataY = [],[]
    for i in range(len(dataset)-look_back-1):
        a = dataset[i:(i+look_back),0]
        dataX.append(a)
        dataY.append(dataset[i+look_back,0])
    
    return np.array(dataX), np.array(dataY)

np.random.seed(7)

In [16]:
# normalize the dataset
from sklearn.preprocessing import MinMaxScaler
sc = MinMaxScaler(feature_range = (0, 1))
all_data_sampled_non_NA = sc.fit_transform(all_data_sampled_non_NA)   
training_data, validation_data, test_data = split_data(all_data_sampled_non_NA)

TypeError: float() argument must be a string or a number, not 'Timestamp'

In [132]:
all_data_sampled_non_NA.shape
training_data.shape

(6400, 79)

In [116]:
# prepare the train and test datasets for modeling
look_back = 1
trainX,trainY = create_dataset(training_data,look_back)
validationX,validationY = create_dataset(validation_data,look_back) 

In [117]:
trainX.shape

(6398, 1)

In [119]:
# reshape input to be [samples,time steps, features]
trainX = np.reshape(trainX,(trainX.shape[0],1,trainX.shape[1]))
validationX = np.reshape(validationX,(validationX.shape[0],1,validationX.shape[1]))

print(trainX.shape)
print(validationX.shape)

(6398, 1, 1)
(1598, 1, 1)


In [109]:
# create and fit LSTM network

model = Sequential()
model.add(LSTM(4, input_shape=(1,look_back)))
model.add(Dense(1))
model.compile(loss= 'mean_squared_error',optimizer='adam')
model.fit(trainX,trainY,epochs = 100,batch_size=1,verbose=2)

Epoch 1/100


KeyboardInterrupt: 

In [125]:
# make prediction
trainPredict = model.predict(trainX)
validationPredict = model.predict(validationX)

print(trainPredict.shape)
print(validationPredict.shape)

(6398, 1)
(1598, 1)


In [129]:
trainY.shape
validationY.shape

(1598,)

In [130]:
# inverted predictions
trainPredict = sc.inverse_transform(trainPredict)
trainY = sc.inverse_transform([trainY])
validationPredict = sc.inverse_transform(validationPredict)
validationY = sc.inverse_transform([validationY])

### broadcasting error: problem maybe in scaler() part => using (6398,79) to fit the min_max scaler, so (6398,1) cannot be converted back

ValueError: non-broadcastable output operand with shape (6398,1) doesn't match the broadcast shape (6398,79)

In [None]:
# TODO: compute RMSE score

In [None]:
# TODO: plot result

In [26]:
# feats = [col for col in training_data_.columns.unique().tolist() if col != 'price_usd' ]
# target = 'price_usd'

# X_train_LSTM = training_data_[feats]
# y_train_LSTM = training_data_[target]