In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib

from scipy.stats import skew
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
import scipy.stats as stats
from scipy.stats import chi2_contingency

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.linear_model import Ridge

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

import os
os.chdir('/Users/liyuan/desktop/SI699')

In [2]:
class Data_preprocess():
    
    def __init__(self, train_file_path, test_file_path):
        self.train_file_path = train_file_path
        self.test_file_path = test_file_path
        self.all_data = pd.DataFrame()
        self.sampled_data = pd.DataFrame()
    
    def load_data(self):
        train = pd.read_csv(self.train_file_path)
        test = pd.read_csv(self.test_file_path)
        print('training data has %d records'%len(train))
        print('test data has %d records'%len(test))
        
        # drop columns in training data that are not available in test data set, including :'position', 'click_bool', 'gross_bookings_usd', 'booking_bool'
        cols_train_only = [col for col in train.columns.unique().tolist() if col not in test.columns.unique().tolist()]
        print('Columns only available in training data:',cols_train_only)
        train = train.drop(columns = cols_train_only)

        # combine train and test data
        self.all_data = pd.concat([train, test], ignore_index=True)
        print('Whole dataset has %d records' % len(self.all_data))
        
        # convert 'date_time' to datatime object
        self.all_data['date_time'] = pd.to_datetime(self.all_data.date_time)
        self.all_data.sort_values(by=['date_time'],inplace=True)
        self.all_data = self.all_data.reset_index(drop=True)
        return self.all_data    
    
    def clean_data(self,data, output_file_name):
        # handle NA values
        NA_columns = []
        for col in data.columns.unique().tolist():
            if data[col].isna().values.any() == True:
                NA_columns.append(col)
        for col in NA_columns:
            # create binary columns
            new_col = 'new_'+ col
            data[new_col] = data[col].apply(lambda x: 1 if x >= 0 else 0)
        # replace old column NA values to median value
        data = data.fillna(data.median())
        # output to csv file
        data.to_csv(output_file_name +'.csv',index = False, encoding = 'utf-8')
        return data
    
    def sample_data(self,sample_size):
        interval_range = len(self.all_data)//sample_size
        mid_idx_lst = []
        for i in range(1,sample_size+1):
            mid_idx = (interval_range*(i-1) + interval_range*i)//2
            mid_idx_lst.append(mid_idx)
        self.sampled_data = self.all_data.iloc[mid_idx_lst]
        return self.sampled_data


In [3]:
data_p = Data_preprocess('./expedia_data/train.csv','./expedia_data/test.csv')
all_data = data_p.load_data()
sampled_data = data_p.sample_data(5000)
# cleaned_data = data_p.clean_data(all_data,'res/cleaned_data')
cleaned_sampled_data = data_p.clean_data(sampled_data,'res/cleaned_sampled_data')

training data has 9917530 records
test data has 6622629 records
Columns only available in training data: ['position', 'click_bool', 'gross_bookings_usd', 'booking_bool']
Whole dataset has 16540159 records


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [142]:
class Reformat_data():
    def __init__(self):
        self.data = pd.DataFrame()
        self.variables = []
        self.categorical_vars = []
        self.categorical_binary_vars = []
        self.continuous_vars = []
        self.dest_data_list = {}
        self.datetime_range = []
        # list destination id with enough data
        self.valid_dest_list = []
        self.daily_price_data = []
        # self.all_daily_price = pd.DataFrame()
        self.all_daily_price = {}
    
    def load_data(self, data_file_path):
        self.data = pd.read_csv(data_file_path, encoding = 'utf-8')
        return self.data
        
    def divide_variables(self):
        self.variables += [col for col in self.data.columns.unique().tolist() if col not in ['price_usd','date_time']]
        self.categorical_vars += ['srch_id','site_id','visitor_location_country_id','visitor_hist_starrating','prop_country_id','prop_id','prop_starrating',
        'srch_destination_id']
        other_cols = [col for col in self.variables if col not in self.categorical_vars]
        # get categorical binary variables
        self.categorical_binary_vars += ['promotion_flag']
        self.categorical_binary_vars += [col for col in self.data if col.startswith('new')]
        self.categorical_binary_vars += [col for col in self.data if col.endswith('inv')]
        self.categorical_binary_vars += [col for col in self.data if col.endswith('bool')]
        # get continous variables
        self.continuous_vars += [ col for col in variables if (col not in categorical_binary_vars) & (col not in categorical_vars )]
        return self.categorical_vars,self.categorical_binary_vars,self.continuous_vars
    
    def get_data_by_dest(self):
        '''separate entire dataset by destinations; append to a list'''
        srch_destination_ids = self.data['srch_destination_id'].unique().tolist()
        for srch_destination_id in srch_destination_ids:
            destination_data = self.data[self.data['srch_destination_id'] == srch_destination_id]
            self.dest_data_list[srch_destination_id] = destination_data
        return self.dest_data_list
    
    def get_datetime_range(self):
        # covert 'date_time' to datetime object
        self.data['date_time'] = pd.to_datetime(self.data.date_time)
        # resample by day
        data = self.data.set_index('date_time')
        price_data = data['price_usd'].resample('D').median()
        self.datetime_range += price_data.index.tolist()
        return self.datetime_range
    
    def get_daily_price(self, dest_id):
        dest_data = self.dest_data_list[dest_id]
        dest_data['date_time'] = pd.to_datetime(dest_data.date_time)
        dest_data = dest_data.set_index('date_time')
        dest_daily_price = dest_data['price_usd'].resample('D').median()
        return dest_daily_price
    
    def get_valid_dests(self):
        '''get a list of destination ids covering at least 50% of the datetime range'''
        for dest_id in self.dest_data_list.keys():
            # after remove na, if num of available records exceed 50% percentile of all date range, then keep it; otherwise, remove destination from dataset
            if len(self.get_daily_price(dest_id).dropna()) >= len(self.datetime_range)*0.5:
                self.valid_dest_list.append(dest_id)
        return self.valid_dest_list
    
#     def concatenate_daily_price(self):
#         '''concatenate daily price data of all destinations'''
#         # TODO: need to merge multiple 
#         for dest_id in self.dest_data_list.keys():
#             self.daily_price_data.append(self.get_daily_price(dest_id))
#         self.all_daily_price = pd.concat(self.daily_price_data, ignore_index=True) 
#         return self.all_daily_price  
    
    def get_all_daily_price(self):
        for dest_id in self.dest_data_list.keys():
            self.all_daily_price[dest_id] = self.get_daily_price(dest_id)
        return self.all_daily_price

                

In [143]:
data_r = Reformat_data()
sampled = data_r.load_data('res/sampled_data_5000.csv')
categorical_vars,categorical_binary_vars,continuous_vars = data_r.divide_variables()
dest_data_list = data_r.get_data_by_dest()
print('there are %d unique destinations' % len(dest_data_list))
datetime_range = data_r.get_datetime_range()
print('there are %d days'%len(datetime_range))
# valid_dest_list = data_r.get_valid_dests()
# all_daily_price = data_r.concatenate_daily_price()
all_daily_price = data_r.get_all_daily_price()

there are 2247 unique destinations
there are 242 days


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [153]:
print(list(all_daily_price.keys())[:10])
# sort price data by destination, based on their valid record number
print('sorted list of destinations rankded in desceding of valid record number (top10):')
print(sorted(all_daily_price.keys(), key = lambda x: len(all_daily_price[x].dropna()), reverse = True)[:10])
print('largest number of valid records among all destinations: %d'%len(all_daily_price[8192].dropna()))

[17894, 19274, 4453, 11485, 3404, 11299, 9265, 23904, 24844, 20834]
sorted list of destinations rankded in desceding of valid record number (top10):
[8192, 4562, 10979, 9402, 23904, 13292, 8347, 6948, 13870, 13216]
largest number of valid records among all destinations: 59


In [217]:
class seq2seq():
    def __init__(self):
        self.data = pd.DataFrame()
        # subset of data divided by week
        self.sequence_list = []
        # number of weeks cover by data
        self.weeks = 0
        self.extra_day = 0
    
    def load_data(self, file_path):
        self.data = pd.read_csv(file_path)
        return self.data
            
    def get_sequence_data(self):
        '''process data into seven days as a sequence'''
        # TODO: predict next 7 days' price based on previous 7 days
        self.weeks += len(self.data)//7
        self.extra_day += len(self.data) % 7
        i = 1
        while 1 <= i <= self.weeks:
            subset = self.data[(i-1)*7:i*7]
            self.sequence_list.append(subset)
            i+=1
        return self.sequence_list

    def implement_seq2seq():
        pass
    

In [233]:
ss = seq2seq()
data = ss.load_data('res/price_data_8192.csv')
sequence_list = ss.get_sequence_data()

sequence_list[0]['price'].values

array([ 80.,  nan,  nan,  nan, 299., 215.,  nan])

In [223]:
import numpy as np
import tensorflow as tf
import helpers

tf.reset_default_graph()
sess = tf.InteractiveSession()

In [224]:
tf.__version__

'1.12.0'

In [225]:
PAD = 0
EOS = 1

vocab_size = 10
input_embedding_size = 20

encoder_hidden_units = 20
decoder_hidden_units = encoder_hidden_units

encoder_inputs = tf.placeholder(shape=(None, None), dtype=tf.int32, name='encoder_inputs')
decoder_targets = tf.placeholder(shape=(None, None), dtype=tf.int32, name='decoder_targets')

decoder_inputs = tf.placeholder(shape=(None, None), dtype=tf.int32, name='decoder_inputs')