In [4]:
import pandas as pd
import numpy as np
from scipy.stats import skew
from sklearn.model_selection import train_test_split
import scipy.stats as stats
from scipy.stats import chi2_contingency
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

import os
# os.chdir('/Users/jingwang/Desktop/winter2019/si699/si699codes')
os.chdir('/Users/liyuan/desktop/SI699/codes')

# Put this when it's called
from sklearn.model_selection import learning_curve
from sklearn.model_selection import validation_curve
from sklearn.model_selection import cross_val_score

from sklearn import tree
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import ElasticNet

class modeling_pipeline():
    def __init__(self,data,model,variables):
        self.data = data.iloc[:]
        self.train_data = pd.DataFrame()
        self.val_data = pd.DataFrame()
        self.test_data = pd.DataFrame()
        self.model = model
        self.variables = variables
        self.X_train = pd.DataFrame()
        self.X_test = pd.DataFrame()
        self.X_val = pd.DataFrame()
        self.y_train = pd.DataFrame()
        self.y_test = pd.DataFrame()
        self.y_val = pd.DataFrame()

        self.categorical_vars = ['srch_id','site_id','visitor_location_country_id','visitor_hist_starrating','prop_country_id','prop_id',
        'srch_destination_id']

        self.categorical_binary_vars = []
        self.continuous_vars = []

        self.X_train_normalized = pd.DataFrame()
        self.X_val_normalized = pd.DataFrame()
        self.X_test_normalized = pd.DataFrame()

        self.X_train_standardized = pd.DataFrame()
        self.X_val_standardized = pd.DataFrame()
        self.X_test_standardized = pd.DataFrame()
 
    def split_data(self):
        training_size_large = int(len(self.data) * 0.8)   
        validation_size = int(training_size_large * 0.2)
        training_size = training_size_large - validation_size
        test_size = int(len(self.data) * 0.2)
        print('training size: %d'%training_size)
        print('validation size: %d'%validation_size)
        print('test size: %d'%test_size)
        # split data by temporal order
        self.train_data = self.data.iloc[0: training_size]
        self.val_data = self.data.iloc[training_size:(training_size + validation_size)]
        # self.test_data = self.data.iloc[(training_size + validation_size): (training_size + validation_size + test_size)]
        self.test_data = self.data.iloc[(training_size + validation_size):]
        return self.train_data, self.val_data, self.test_data
    
    def divide_variables(self):
        # divide variables into categories
        # get categorical variables
        other_cols = [col for col in self.variables if col not in self.categorical_vars]
        # get categorical binary variables
        self.categorical_binary_vars += ['promotion_flag']
        self.categorical_binary_vars += [col for col in self.data if col.startswith('new')]
        self.categorical_binary_vars += [col for col in self.data if col.endswith('inv')]
        self.categorical_binary_vars += [col for col in self.data if col.endswith('bool')]
        # get continous variables
        self.continuous_vars += [ col for col in self.variables if (col not in self.categorical_binary_vars) & (col not in self.categorical_vars )]
        print ("categorical binary vars: ", len(self.categorical_binary_vars))
        print ("categorical non binary vars: ", len(self.categorical_vars))
        print ("continues vars: ", len(self.continuous_vars))
        return self.categorical_vars, self.categorical_binary_vars, self.continuous_vars

    def get_X_y(self):
        # TODO: need to handle 'date_time' properly
        # for now, leave out "date_time" from modeling
        # self.variables += [col for col in self.data.columns.unique().tolist() if col not in ['price_usd','date_time']]
        self.X_train = self.train_data[self.variables]
        self.y_train = self.train_data['price_usd']
        self.X_val = self.val_data[self.variables]
        self.y_val = self.val_data['price_usd']
        self.X_test = self.test_data[self.variables]
        self.y_test = self.test_data['price_usd']
        return self.X_train, self.y_train, self.X_val, self.y_val, self.X_test, self.y_test

    def get_normalized_X_y(self):
        normalizer = Normalizer().fit(self.X_train) 
        self.X_train_normalized = normalizer.transform(self.X_train)
        self.X_val_normalized = normalizer.transform(self.X_val)
        self.X_test_normalized = normalizer.transform(self.X_test)
        return self.X_train_normalized, self.X_val_normalized, self.X_test_normalized
    
    def get_standardized_X_y(self):
        # usign min-max scaler to standardized
        scaler = MinMaxScaler().fit(self.X_train)
        self.X_train_standardized = scaler.transform(self.X_train)
        self.X_val_standardized = scaler.transform(self.X_val)
        return self.X_train_standardized, self.X_val_standardized, self.X_test_standardized 

    def get_RMSE(self,y_pred,y_true,data):
        return np.sqrt(sum((y_pred - y_true)**2)/len(data))
    
    # updated: adding y_pred_test and test_RMSE
    def get_modeling_result(self):
        reg = self.model.fit(self.X_train, self.y_train)
        y_pred_val = reg.predict(self.X_val)
        y_pred_train = reg.predict(self.X_train)
        y_pred_test = reg.predict(self.X_test)
        val_RMSE = self.get_RMSE(y_pred_val, self.y_val, self.val_data)
        train_RMSE = self.get_RMSE(y_pred_train ,self.y_train, self.train_data)
        test_RMSE = self.get_RMSE(y_pred_test ,self.y_test, self.test_data)
        print('training RMSE:',train_RMSE)
        print('valiation RMSE:',val_RMSE)
        print('test RMSE:',test_RMSE)
        return y_pred_train, y_pred_val, y_pred_test

    def get_normalized_modeling_result(self):
        reg = self.model.fit(self.X_train_normalized, self.y_train)
        y_pred_val = reg.predict(self.X_val_normalized)
        y_pred_train = reg.predict(self.X_train_normalized)
        val_RMSE = self.get_RMSE(y_pred_val, self.y_val, self.val_data)
        train_RMSE = self.get_RMSE(y_pred_train ,self.y_train, self.train_data)
        print('training RMSE:',train_RMSE)
        print('valiation RMSE:',val_RMSE)
        return train_RMSE, val_RMSE

    def get_standardized_modeling_result(self):
        reg = self.model.fit(self.X_train_standardized, self.y_train)
        y_pred_val = reg.predict(self.X_val_standardized)
        y_pred_train = reg.predict(self.X_train_standardized)
        val_RMSE = self.get_RMSE(y_pred_val, self.y_val, self.val_data)
        train_RMSE = self.get_RMSE(y_pred_train ,self.y_train, self.train_data)
        print('training RMSE:',train_RMSE)
        print('valiation RMSE:',val_RMSE)
        return train_RMSE, val_RMSE

class PropModeling():
    def __init__(self):
        pass
    
    def load_data(self):
        train = pd.read_csv('../expediadata/train.csv')
        test = pd.read_csv('../expediadata/test.csv')
        cols_train_only = [col for col in train.columns.unique().tolist() if col not in test.columns.unique().tolist()]
        train = train.drop(columns = cols_train_only)
        all_data = pd.concat([train, test], ignore_index=True)
        # get features
        comp_features = ['comp1_rate','comp1_inv','comp1_rate_percent_diff','comp2_inv','comp2_rate','comp2_rate_percent_diff','comp3_rate','comp3_inv','comp3_rate_percent_diff','comp4_rate','comp4_inv','comp4_rate_percent_diff','comp5_rate','comp5_inv','comp5_rate_percent_diff','comp6_rate','comp6_inv','comp6_rate_percent_diff','comp7_rate','comp7_inv','comp7_rate_percent_diff','comp8_rate','comp8_inv','comp8_rate_percent_diff']
        user_features = ['visitor_hist_starrating','visitor_hist_adr_usd','srch_query_affinity_score','orig_destination_distance','site_id','visitor_location_country_id','srch_id']
        other_features = ['random_bool']
        time_features = ['date_time']
        all_data = all_data.drop(columns = comp_features)
        all_data = all_data.drop(columns = user_features)
        all_data = all_data.drop(columns = other_features)
        all_data['date_time'] = pd.to_datetime(all_data.date_time)
        all_data.sort_values(by=['date_time'],inplace=True)
        all_data = all_data.reset_index(drop=True)
        
        # change id to popularity
        
        # handle country
        country_counts = all_data['prop_country_id'].value_counts()

        d = {}
        for ID in country_counts.index:
            d[ID] = country_counts[ID]
        country_id = all_data['prop_country_id'].tolist()
        country_pop = []
        for ID in country_id:
            country_pop.append(d[ID])
        all_data['country_value_counts'] = country_pop
        city_counts = all_data['srch_destination_id'].value_counts()

        # handle city
        city = {}
        for ID in city_counts.index:
            city[ID] = city_counts[ID]
        city_id = all_data['srch_destination_id'].tolist()
        city_pop = []
        for ID in city_id:
            city_pop.append(city[ID])
        all_data['city_value_counts'] = city_pop

        all_data = all_data.drop(columns = 'prop_country_id')
        all_data = all_data.drop(columns = 'srch_destination_id')        
        return all_data
    
    def prop_modeling(self, all_data, prop_id):

        prop = all_data[all_data['prop_id']==prop_id]

        all_data_t = prop.set_index('date_time')
        prop_day = all_data_t.resample('D').median()
        prop_day.count()

        prop_day = prop_day.drop(columns = 'prop_id')
        prop_day = prop_day.fillna(prop_day.median())

        
        variables = [col for col in prop_day.columns.unique().tolist() if col not in ['price_usd']]
        model_prop_e = modeling_pipeline(prop_day, ElasticNet(), variables)
        train_data, val_data, test_data = model_prop_e.split_data()
        X_train,y_train,X_val,y_val,X_test,y_test = model_prop_e.get_X_y()
        y_pred_train, y_pred_val, y_pred_test = model_prop_e.get_modeling_result()
        
        return y_pred_train, y_pred_val, y_pred_test

        

In [5]:
p = PropModeling()
p_data = p.load_data()
y_pred_train, y_pred_val, y_pred_test=p.prop_modeling(p_data, 134232)
print('y_pred_train:',y_pred_train,'y_pred_val:',y_pred_val,'y_pred_test',y_pred_test)

FileNotFoundError: File b'../expediadata/train.csv' does not exist

In [16]:
p_data.columns

Index(['date_time', 'prop_id', 'prop_starrating', 'prop_review_score',
       'prop_brand_bool', 'prop_location_score1', 'prop_location_score2',
       'prop_log_historical_price', 'price_usd', 'promotion_flag',
       'srch_length_of_stay', 'srch_booking_window', 'srch_adults_count',
       'srch_children_count', 'srch_room_count', 'srch_saturday_night_bool',
       'country_value_counts', 'city_value_counts'],
      dtype='object')

In [8]:
y_pred_train, y_pred_val, y_pred_test=p.prop_modeling(p_data, 104517)
print('y_pred_train:',y_pred_train,'y_pred_val:',y_pred_val,'y_pred_test',y_pred_test)

training size: 155
validation size: 38
test size: 48
training RMSE: 17.518946066940984
valiation RMSE: 21.317406545873936
test RMSE: 15.633332183156323
y_pred_train: [ 94.9118055   87.75956599  89.23358553  92.38855685  87.12784333
  86.91726911  87.33841755  93.55035705  92.8060633   96.17525082
  92.39219884  91.54625997  95.54352816  89.23358553  89.76002108
  90.91089532  88.18071443  87.54899177  94.07315061  89.65473397
  90.39174374  95.54352816  89.65473397  89.65473397  90.07588241
  89.65473397  90.70760507  95.43824105  89.02301131  90.28645663
  88.9177242   89.86530819  88.28600154  89.02301131  89.02301131
  89.44415975  87.97014021  88.18071443  87.54899177  88.70714998
  89.23358553  88.81243709  87.44370466  87.12784333  89.76002108
  86.07497223  87.75956599  87.02255622  93.22721174  86.49612067
  85.65382379  85.44324957  84.81152691  85.33796246  86.91726911
  90.48974688  85.96968512  91.12146954  89.76002108  85.23267535
  88.18071443  96.80697348  92.60277306  9

In [9]:
y_pred_train, y_pred_val, y_pred_test=p.prop_modeling(p_data, 124342)
print('y_pred_train:',y_pred_train,'y_pred_val:',y_pred_val,'y_pred_test',y_pred_test)

training size: 155
validation size: 38
test size: 48
training RMSE: 20.39471671636198
valiation RMSE: 15.02179412127364
test RMSE: 12.792545865148934
y_pred_train: [105.18742434  98.01342469  99.26091888 104.35576154  97.59759329
  96.55801479  97.59759329 103.73110642 103.31618305 106.64283424
 102.58757007 102.22417161 107.37053919  99.46883458  99.88466598
 101.3409839   98.42925608  98.63717178 102.37965437 100.92424448
 100.71632878 106.01908714  99.78070813  99.98862383 100.30049738
  99.98862383 100.82028663 105.81117144  99.26091888  98.84508748
  99.05300318 100.09258168  98.63717178  99.05300318  99.26091888
  99.67675028  98.22134039  98.42925608  97.90946684  98.22134039
  99.57279243 101.80834021  97.70155114  97.38967759 100.50841308
  96.35009909  98.01342469  97.38967759 103.73201445  97.28571974
  95.93426769  95.51843629  95.10260489  98.37773117  96.76593049
 101.02911035  95.93426769 101.65285745  99.98862383  96.76593049
  98.01342469 107.57845489 102.69152792 101.

In [11]:
lt = [38837,78866,107212,31962,9812,103937,54906,105449,29633,135030,51653,55225,27982,78642,132817,25583,59632,109545,38898,9959,118206,29559,48625,107721,18380
]
 

In [13]:

for i in lt:
    y_pred_train, y_pred_val, y_pred_test=p.prop_modeling(p_data, i)
    print('----------------------')

training size: 155
validation size: 38
test size: 48
training RMSE: 173.73743528574335
valiation RMSE: 18.71778637864807
test RMSE: 17.536148732118
----------------------
training size: 155
validation size: 38
test size: 48
training RMSE: 21.61374370947528
valiation RMSE: 20.344562915152064
test RMSE: 17.60478938289083
----------------------
training size: 155
validation size: 38
test size: 48
training RMSE: 18.85413768487824
valiation RMSE: 22.43610197696458
test RMSE: 53.4267132609482
----------------------
training size: 155
validation size: 38
test size: 48
training RMSE: 12.162693221053607
valiation RMSE: 9.737198229231595
test RMSE: 13.07908190742812
----------------------
training size: 155
validation size: 38
test size: 48
training RMSE: 12.362533785396057
valiation RMSE: 27.4579839919458
test RMSE: 26.233341260090818
----------------------
training size: 155
validation size: 38
test size: 48
training RMSE: 24.29936888033349
valiation RMSE: 58.84888895991391
test RMSE: 39.97310