In [22]:
import pandas as pd
import numpy as np
import seaborn as sns

import ot
from sklearn.preprocessing import LabelEncoder

import torch

# Show every column when a DataFrame is rendered (None removes the column cap).
pd.options.display.max_columns = None


In [53]:
# `df_` keeps the CSV exactly as read; `df` is the working copy that the
# preprocessing cells below overwrite column by column.
df_ = pd.read_csv('hotel_bookings.csv')
df = df_.copy(deep=True)

# Cancellation label, taken from the working copy before any transformation.
target = df.loc[:, 'is_canceled']

In [54]:
# Encode categorical columns as integers and impute missing numeric values.
#
# `label_mappings[column]` records, for every encoded column, the
# {original value -> integer code} dictionary so the codes can be decoded later.
label_encoder = LabelEncoder()
label_mappings = {}

# Convert object-dtype columns to integer codes. The encoder is re-fitted on
# every column, so the mapping must be captured right after each fit.
for column in df.columns:
    if df[column].dtype == 'object':
        # Missing values become their own 'Unknown' category before encoding
        df[column] = df[column].fillna('Unknown')
        df[column] = label_encoder.fit_transform(df[column])
        label_mappings[column] = dict(zip(label_encoder.classes_, range(len(label_encoder.classes_))))

# Remaining NaNs can only be in numeric columns: impute them with the column median.
# The result is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on `df[column]`: that in-place call acts on a
# column selection (chained assignment), which pandas warns about and which
# does not update `df` when Copy-on-Write is enabled.
for column in df.columns:
    if df[column].isna().any():
        median_val = df[column].median()
        df[column] = df[column].fillna(median_val)

# Display the first few rows of the transformed dataframe
df.head()


Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,1,0,342,2015,5,27,1,0,0,2,0.0,0,0,135,3,1,0,0,0,2,2,3,0,14.0,179.0,0,2,0.0,0,0,1,121
1,1,0,737,2015,5,27,1,0,0,2,0.0,0,0,135,3,1,0,0,0,2,2,4,0,14.0,179.0,0,2,0.0,0,0,1,121
2,1,0,7,2015,5,27,1,0,1,1,0.0,0,0,59,3,1,0,0,0,0,2,0,0,14.0,179.0,0,2,75.0,0,0,1,122
3,1,0,13,2015,5,27,1,0,1,1,0.0,0,0,59,2,0,0,0,0,0,0,0,0,304.0,179.0,0,2,75.0,0,0,1,122
4,1,0,14,2015,5,27,1,0,2,2,0.0,0,0,59,6,3,0,0,0,0,0,0,0,240.0,179.0,0,2,98.0,0,1,1,123


In [72]:
# Standardise every column to zero mean and unit (sample) standard deviation.
# A constant column has std == 0, and 0/0 would turn it into NaN; dividing by 1
# instead leaves such a column at 0 after centring. Columns with non-zero std
# are unaffected by the guard.
df_norm = (df - df.mean()) / df.std().replace(0, 1)

# The label must stay in its original form, so restore it over its z-scored version.
df_norm['is_canceled'] = target

# Feature matrix (everything except the label) and the label itself.
df_X = df_norm.drop(columns=['is_canceled']).copy()
df_y = df_norm['is_canceled']

In [125]:
def sliced_wasserstein_distance(X_s, X_t, a=None, b=None, metric='sqeuclidean', n_projections=50):
    """
    Monte-Carlo estimate of the sliced Wasserstein discrepancy between X_s and X_t.

    Both sample sets are projected onto `n_projections` random unit directions;
    for each direction the 1D optimal-transport cost is computed with
    `ot.wasserstein_1d(..., p=2)` and the costs are averaged.

    Parameters:
    X_s : array-like, shape (n_samples_a, dim)
        samples in the source domain (ndarray, DataFrame, nested list, ...)
    X_t : array-like, shape (n_samples_b, dim)
        samples in the target domain
    a : array-like, shape (n_samples_a,), optional
        weights of each sample of X_s, in the same row order as X_s;
        None is forwarded to `ot.wasserstein_1d` unchanged
    b : array-like, shape (n_samples_b,), optional
        weights of each sample of X_t, in the same row order as X_t;
        None is forwarded to `ot.wasserstein_1d` unchanged
    metric : str, optional
        Accepted for backward compatibility only; it is not used. The 1D cost
        is always computed with p=2.
    n_projections : int, optional
        number of random projection directions (must be >= 1)

    Returns:
    swd : float
        Mean over the projections of the value returned by
        `ot.wasserstein_1d(..., p=2)`. NOTE(review): POT documents that value
        as the p-Wasserstein distance raised to the power p, so this would be
        a squared-distance scale; take the square root if a distance on the
        scale of the data is needed - confirm against the installed POT version.

    Raises:
    ValueError
        if n_projections < 1, or if X_s / X_t are not 2-D arrays with the same
        number of columns.

    Notes:
    Directions are drawn from NumPy's global random state; call
    `np.random.seed(...)` beforehand for reproducible results.
    """
    if n_projections < 1:
        # Previously surfaced as an opaque ZeroDivisionError at the final division
        raise ValueError("n_projections must be a positive integer")

    # Work on plain float arrays so DataFrames and lists behave like ndarrays
    X_s = np.asarray(X_s, dtype=float)
    X_t = np.asarray(X_t, dtype=float)
    if X_s.ndim != 2 or X_t.ndim != 2 or X_s.shape[1] != X_t.shape[1]:
        raise ValueError("X_s and X_t must be 2-D with the same number of columns")
    if a is not None:
        a = np.asarray(a, dtype=float)
    if b is not None:
        b = np.asarray(b, dtype=float)

    # Generate random projection vectors on the unit sphere
    thetas = np.random.randn(n_projections, X_s.shape[1])
    thetas /= np.linalg.norm(thetas, axis=1, keepdims=True)

    swd = 0
    for theta in thetas:
        # Project data onto the vector theta
        proj_X_s = X_s.dot(theta)
        proj_X_t = X_t.dot(theta)

        # Do NOT sort the projections here: the weights are given in sample
        # order, so sorting only the values would attach each weight to the
        # wrong sample. ot.wasserstein_1d is relied on to order values and
        # weights together.
        swd += ot.wasserstein_1d(proj_X_s, proj_X_t, a, b, p=2)

    return swd / n_projections

In [150]:
# Split the bookings into two domains using the raw (un-normalised) data:
# source = bookings with no changes, target = bookings with four or more changes.
indice_source = df_.loc[df_['booking_changes'].eq(0)].index

indice_target = df_.loc[df_['booking_changes'].ge(4)].index

In [156]:
# Normalised feature rows for the source and target domains, looked up by index label.
X_s, X_t = df_X.loc[indice_source], df_X.loc[indice_target]

In [157]:
# Fix NumPy's global seed so the random projection directions are reproducible.
np.random.seed(42)

# Uniform sample weights (a and b left at their None defaults), 50 random directions.
sliced_wasserstein_distance(X_s, X_t, n_projections=50)

3.1300010143025867