In [141]:
%matplotlib inline

import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# ================================================= #
# Configuration
# ================================================= #
# Relative path to the raw challenge data (.tsv file)
data_path = '../TH_data_challenge.tsv'

In [11]:
# ================================================= #
# Load data
# ================================================= #
# pd.DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0;
# pd.read_csv is the supported reader. The file is tab-separated and no
# column is used as the index, so rows get a default integer index.
df_data = pd.read_csv(data_path, sep='\t', index_col=None)

In [14]:
# Number of rows in the loaded table
df_data.shape[0]

184279

In [192]:
# ================================================= #
# Build the numerical feature table
# ================================================= #
# NOTE: encode_column_onehot and encode_column_cyclic must already be defined
# in the kernel before this cell is executed.

# --- Work on a copy and split the target ("y") off from the features --- #
df_data_num = df_data.copy(deep=True)
s_dim_is_requested = df_data_num.pop('dim_is_requested')

# --- Drop the columns that are not used as features --- #
# (ds_night, ds, id_listing_anon, id_user_anon)
df_data_num = df_data_num.drop(
    ['ds_night', 'ds', 'id_listing_anon', 'id_user_anon'], axis=1)

# --- Encode the categorical features --- #
# Each encoder call removes its source column from df_data_num, so the
# encoded columns are collected here and only the continuous features are
# left behind for the standardization step further down.
list_encoded = []

# dim_market -> one-hot
df = encode_column_onehot(
    df_data_num,
    'dim_market',
    ['Los Angeles', 'Paris', 'San Francisco'],
)
list_encoded.append(df)

# dim_room_type -> one-hot
df = encode_column_onehot(
    df_data_num,
    'dim_room_type',
    ['Private room', 'Shared room', 'Entire home/apt'],
)
list_encoded.append(df)

# dim_is_instant_bookable: boolean -> int
s = df_data_num.pop('dim_is_instant_bookable').astype(int)
list_encoded.append(s)

# cancel_policy -> one-hot over the policy codes 3..9
df = encode_column_onehot(df_data_num, 'cancel_policy', list(range(3, 10)))
list_encoded.append(df)

# dim_has_wireless_internet: moved over unchanged (kept as 0/1)
s = df_data_num.pop('dim_has_wireless_internet')
list_encoded.append(s)

# ds_night_day_of_week -> cyclic (sin/cos) encoding
df = encode_column_cyclic(df_data_num, 'ds_night_day_of_week', [0, 7])
list_encoded.append(df)
# Optional illustration of the encoded ds_night_day_of_week:
# fig = plt.figure()
# plt.plot(df['ds_night_day_of_week_1'], df['ds_night_day_of_week_2'], '.')

# ds_night_day_of_year -> cyclic (sin/cos) encoding
df = encode_column_cyclic(df_data_num, 'ds_night_day_of_year', [0, 365])
list_encoded.append(df)

# --- Standardize what is left (the continuous features) --- #
# Column-wise zero mean and unit variance; the statistics are kept in
# s_feature_mean / s_feature_std.
s_feature_mean = df_data_num.mean(axis=0)
s_feature_std = df_data_num.std(axis=0)
df_data_num = df_data_num.sub(s_feature_mean, axis=1).div(s_feature_std, axis=1)

# --- Reassemble: "y" first, then continuous, then encoded features --- #
df_data_num = pd.concat(
    [s_dim_is_requested, df_data_num] + list_encoded, axis=1)

In [162]:
def encode_column_cyclic(df, column_name, circle_range):
    ''' Delete a column in a DataFrame and return a list of columns (in a df)
    with cyclic encoded numerical values.

    Every value x of the column is mapped to the pair
    (sin(2*pi*x / w), cos(2*pi*x / w)) with
    w = circle_range[1] - circle_range[0], so two values that differ by a
    multiple of w receive the same encoding.

    Parameters
    ----------
    df: <pd.DataFrame>
        Original df. It is modified in place: column_name is deleted from it.
    column_name: <str>
        Name of the column to be replaced
    circle_range: <list>
        A list of two numbers indicating the range of the orig. data. These
        two numbers are essentially representing the same point in a circle.
        E.g., if weekdays are represented by 0, 1, ..., 6, then circle_range
        = [0, 7]. Only the width of the range enters the encoding.

    Return
    ----------
    df_cyclic: <pd.DataFrame>
        The new df, with column names "column_name_1" (sine component) and
        "column_name_2" (cosine component)

    Raises
    ----------
    ValueError
        If both entries of circle_range are equal (zero-width circle)
    KeyError
        If column_name is not a column of df
    '''

    period = circle_range[1] - circle_range[0]
    if period == 0:
        raise ValueError(
            'circle_range must span a non-zero width, got {}'.format(circle_range))

    # Generate cyclically encoded columns.
    # Read from the df that was passed in -- not from a notebook-level
    # variable -- so the function works on any DataFrame.
    angle = 2 * np.pi * df[column_name] / period
    s_sin = np.sin(angle)
    s_cos = np.cos(angle)
    # Delete the original column
    del df[column_name]
    # Generate new df
    new_column_names = ['{}_{}'.format(column_name, i + 1) for i in range(2)]
    df_cyclic = pd.concat([s_sin, s_cos], axis=1, keys=new_column_names)

    return df_cyclic

In [133]:
def encode_column_onehot(df, column_name, list_categ):
    ''' Delete a column in a DataFrame and return a list of columns (in a df)
    with one-hot encoded numerical values.

    Parameters
    ----------
    df: <pd.DataFrame>
        Original df. It is modified in place: column_name is deleted from it.
    column_name: <str>
        Name of the column to be replaced
    list_categ: <list>
        List of categories in the column. The i-th category (counting from 1)
        produces the indicator column "column_name_i".

    Return
    ----------
    df_onehot: <pd.DataFrame>
        The new df, with column names "column_name_1", "column_name_2", ...
        Each column holds 1 where the original value equals the
        corresponding category and 0 elsewhere.

    Notes
    ----------
    A warning is printed when the indicator columns do not add up to the
    number of rows, i.e. when some rows match none of the listed categories.
    '''

    # Generate one-hot encoded columns
    list_s = [(df[column_name] == categ).astype(int) for categ in list_categ]
    # Check if all categories sum up to the total number of data points
    # (vectorised Series.sum() instead of iterating every element in Python)
    sum_num = sum(s.sum() for s in list_s)
    if sum_num != len(df):
        print('Warning: {} has missing data!'.format(column_name))
    # Delete the original column
    del df[column_name]
    # Generate new df
    new_column_names = [
        '{}_{}'.format(column_name, i + 1) for i in range(len(list_categ))]
    df_onehot = pd.concat(list_s, axis=1, keys=new_column_names)

    return df_onehot