In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib

from scipy.stats import skew
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
import scipy.stats as stats
from scipy.stats import chi2_contingency

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.linear_model import Ridge

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

import os
os.chdir('/Users/liyuan/desktop/SI699')

In [2]:
class Data_preprocess():
    
    def __init__(self, train_file_path, test_file_path):
        self.train_file_path = train_file_path
        self.test_file_path = test_file_path
        self.all_data = pd.DataFrame()
        self.sampled_data = pd.DataFrame()
    
    def load_data(self):
        train = pd.read_csv(self.train_file_path)
        test = pd.read_csv(self.test_file_path)
        print('training data has %d records'%len(train))
        print('test data has %d records'%len(test))
        
        # drop columns in training data that are not available in test data set, including :'position', 'click_bool', 'gross_bookings_usd', 'booking_bool'
        cols_train_only = [col for col in train.columns.unique().tolist() if col not in test.columns.unique().tolist()]
        print('Columns only available in training data:',cols_train_only)
        train = train.drop(columns = cols_train_only)

        # combine train and test data
        self.all_data = pd.concat([train, test], ignore_index=True)
        print('Whole dataset has %d records' % len(self.all_data))
        
        # convert 'date_time' to datatime object
        self.all_data['date_time'] = pd.to_datetime(self.all_data.date_time)
        self.all_data.sort_values(by=['date_time'],inplace=True)
        self.all_data = self.all_data.reset_index(drop=True)
        return self.all_data
    
    def clean_data(self,data, output_file_name):
        # handle NA values
        NA_columns = []
        for col in data.columns.unique().tolist():
            if data[col].isna().values.any() == True:
                NA_columns.append(col)
        for col in NA_columns:
            # create binary columns
            new_col = 'new_'+ col
            data[new_col] = data[col].apply(lambda x: 1 if x >= 0 else 0)
        # replace old column NA values to median value
        data = data.fillna(data.median())
        # output to csv file
        data.to_csv(output_file_name +'.csv',index = False, encoding = 'utf-8')
        return data
    
    def sample_data(self,sample_size):
        interval_range = len(self.all_data)//sample_size
        mid_idx_lst = []
        for i in range(1,sample_size+1):
            mid_idx = (interval_range*(i-1) + interval_range*i)//2
            mid_idx_lst.append(mid_idx)
        self.sampled_data = self.all_data.iloc[mid_idx_lst]
        return self.sampled_data


In [3]:
data_p = Data_preprocess('./expedia_data/train.csv','./expedia_data/test.csv')
all_data = data_p.load_data()
sampled_data = data_p.sample_data(5000)
# cleaned_data = data_p.clean_data(all_data,'res/cleaned_data')
cleaned_sampled_data = data_p.clean_data(sampled_data,'res/cleaned_sampled_data')

training data has 9917530 records
test data has 6622629 records
Columns only available in training data: ['position', 'click_bool', 'gross_bookings_usd', 'booking_bool']
Whole dataset has 16540159 records


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [None]:
class modeling_pipeline():
    def __init__(self,data,model,variables):
        self.data = data.iloc[:]
        self.train_data = pd.DataFrame()
        self.val_data = pd.DataFrame()
        self.test_data = pd.DataFrame()
        self.model = model
        self.variables = variables
        self.X_train = pd.DataFrame()
        self.X_test = pd.DataFrame()
        self.X_val = pd.DataFrame()
        self.y_train = pd.DataFrame()
        self.y_test = pd.DataFrame()
        self.y_val = pd.DataFrame()
    
    def split_data(self):
        training_size_large = int(len(self.data) * 0.8)   
        validation_size = int(training_size_large * 0.2)
        training_size = training_size_large - validation_size
        test_size = int(len(self.data) * 0.2)
        print('training size: %d'%training_size)
        print('validation size: %d'%validation_size)
        print('test size: %d'%test_size)
        # split data by temporal order
        self.train_data = self.data.iloc[0: training_size]
        self.val_data = self.data.iloc[training_size:(training_size + validation_size)]
        self.test_data = self.data.iloc[(training_size + validation_size): (training_size + validation_size + test_size)]
        return self.train_data, self.val_data, self.test_data
    
    def get_X_y(self):
        # TODO: need to handle 'date_time' properly
        # for now, leave out "date_time" from modeling
        #variables = [col for col in self.data.columns.unique().tolist() if col not in ['price_usd','date_time']]
        self.X_train = self.train_data[self.variables]
        self.y_train = self.train_data['price_usd']
        self.X_val = self.val_data[variables]
        self.y_val = self.val_data['price_usd']
        self.X_test = self.test_data[self.variables]
        self.y_test = self.test_data['price_usd']
        return self.X_train, self.y_train, self.X_val, self.y_val, self.X_test, self.y_test
    
    
    def get_RMSE(self,y_pred,y_true,data):
        return np.sqrt(sum((y_pred - y_true)**2)/len(data))
    
    def get_modeling_result(self):
        reg = self.model.fit(self.X_train, self.y_train)
        y_pred_val = reg.predict(self.X_val)
        y_pred_train = reg.predict(self.X_train)
        val_RMSE = self.get_RMSE(y_pred_val, self.y_val, self.val_data)
        train_RMSE = self.get_RMSE(y_pred_train ,self.y_train, self.train_data)
        print('training RMSE:',train_RMSE)
        print('valiation RMSE:',val_RMSE)

In [None]:
variables = [col for col in all_data.columns.unique().tolist() if col not in ['price_usd','date_time']]
model_p = modeling_pipeline(cleaned_sampled_data, LinearRegression(), variables)
train_data, val_data, test_data = model_p.split_data()
X_train, y_train, X_val, y_val, X_test, y_test = model_p.get_X_y()
model_p.get_modeling_result()

In [None]:
# divide variables into categories
# get categorical variables
categorical_vars = ['srch_id','site_id','visitor_location_country_id','visitor_hist_starrating','prop_country_id','prop_id','prop_starrating',
'srch_destination_id']
other_cols = [col for col in variables if col not in categorical_vars]
# get categorical binary variables
categorical_binary_vars = []
categorical_binary_vars += ['promotion_flag']
categorical_binary_vars += [col for col in sampled_data if col.startswith('new')]
categorical_binary_vars += [col for col in sampled_data if col.endswith('inv')]
categorical_binary_vars += [col for col in sampled_data if col.endswith('bool')]
# get continous variables
continous_vars = [ col for col in variables if (col not in categorical_binary_vars) & (col not in categorical_vars )]
print ("categorical binary vars: ", len(categorical_binary_vars))
print ("categorical non binary vars: ", len(categorical_vars))
print ("continues vars: ", len(continous_vars))