In [2]:
import pandas as pd
import numpy as np
import seaborn as sns

import ot
from sklearn.preprocessing import LabelEncoder

import torch

from models.mlp import BlackBoxModel

pd.set_option('display.max_columns', None)


In [3]:
df_ = pd.read_csv('hotel_bookings.csv')
df = df_.copy()
target = df['is_canceled']

In [4]:
# Initialize a label encoder
label_encoder = LabelEncoder()
label_mappings = {}


# Convert categorical columns to numerical representations using label encoding
for column in df.columns:
    if df[column].dtype == 'object':
        # Handle missing values by filling with a placeholder and then encoding
        df[column] = df[column].fillna('Unknown')
        df[column] = label_encoder.fit_transform(df[column])
        label_mappings[column] = dict(zip(label_encoder.classes_, range(len(label_encoder.classes_))))


# For columns with NaN values that are numerical, we will impute them with the median of the column
for column in df.columns:
    if df[column].isna().any():
        median_val = df[column].median()
        df[column].fillna(median_val, inplace=True)

# Display the first few rows of the transformed dataframe
df.head()


Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,1,0,342,2015,5,27,1,0,0,2,0.0,0,0,135,3,1,0,0,0,2,2,3,0,14.0,179.0,0,2,0.0,0,0,1,121
1,1,0,737,2015,5,27,1,0,0,2,0.0,0,0,135,3,1,0,0,0,2,2,4,0,14.0,179.0,0,2,0.0,0,0,1,121
2,1,0,7,2015,5,27,1,0,1,1,0.0,0,0,59,3,1,0,0,0,0,2,0,0,14.0,179.0,0,2,75.0,0,0,1,122
3,1,0,13,2015,5,27,1,0,1,1,0.0,0,0,59,2,0,0,0,0,0,0,0,0,304.0,179.0,0,2,75.0,0,0,1,122
4,1,0,14,2015,5,27,1,0,2,2,0.0,0,0,59,6,3,0,0,0,0,0,0,0,240.0,179.0,0,2,98.0,0,1,1,123


In [5]:
df_norm = (df - df.mean()) / df.std()

df_X = df_norm.drop(columns=['is_canceled', 'reservation_status', 'reservation_status_date']).copy()
df_y = target

In [6]:
sample_num = 25

indice_source = (df_[
    df_['booking_changes']==3
].sample(sample_num)).index

indice_target = (df_[
    df_['booking_changes']==4
].sample(sample_num)).index

In [7]:
X_s = df_X.loc[indice_source].values
X_t = df_X.loc[indice_target].values

y_s = df_y.loc[indice_source].values
y_t = df_y.loc[indice_target].values

In [51]:
seed = 42

np.random.seed(seed)  # for reproducibility

n_projections = 50
thetas = np.random.randn(n_projections, X_s.shape[1])
thetas /= np.linalg.norm(thetas, axis=1)[:, None]

In [53]:
df_[df_['is_canceled']==0]

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,C,C,3,No Deposit,,,0,Transient,0.00,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,C,C,4,No Deposit,,,0,Transient,0.00,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,0.0,0,BB,GBR,Direct,Direct,0,0,0,A,C,0,No Deposit,,,0,Transient,75.00,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,0.0,0,BB,GBR,Corporate,Corporate,0,0,0,A,A,0,No Deposit,304.0,,0,Transient,75.00,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,0.0,0,BB,GBR,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,240.0,,0,Transient,98.00,0,1,Check-Out,2015-07-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119385,City Hotel,0,23,2017,August,35,30,2,5,2,0.0,0,BB,BEL,Offline TA/TO,TA/TO,0,0,0,A,A,0,No Deposit,394.0,,0,Transient,96.14,0,0,Check-Out,2017-09-06
119386,City Hotel,0,102,2017,August,35,31,2,5,3,0.0,0,BB,FRA,Online TA,TA/TO,0,0,0,E,E,0,No Deposit,9.0,,0,Transient,225.43,0,2,Check-Out,2017-09-07
119387,City Hotel,0,34,2017,August,35,31,2,5,2,0.0,0,BB,DEU,Online TA,TA/TO,0,0,0,D,D,0,No Deposit,9.0,,0,Transient,157.71,0,4,Check-Out,2017-09-07
119388,City Hotel,0,109,2017,August,35,31,2,5,2,0.0,0,BB,GBR,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,89.0,,0,Transient,104.40,0,0,Check-Out,2017-09-07


In [78]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split

# Select first 10 columns of the dataset as features and use 'is_canceled' as target
X = df_X.values
y = df_y.values

# Split the dataset into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

# Convert to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train)
y_train_tensor = torch.FloatTensor(y_train).view(-1, 1)
X_test_tensor = torch.FloatTensor(X_test)
y_test_tensor = torch.FloatTensor(y_test).view(-1, 1)

# Initialize the model, loss function, and optimizer
model = BlackBoxModel(input_dim=X_train.shape[1])
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Training loop
num_epochs = 100
for epoch in range(num_epochs):
    # Forward pass
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    
    # Backward pass and optimization
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# Evaluate on test set
model.eval()
with torch.no_grad():
    test_outputs = model(X_test_tensor)
    test_loss = criterion(test_outputs, y_test_tensor)

    # Convert outputs to binary using 0.5 as threshold
    y_pred_tensor = (test_outputs > 0.5).float()
    correct_predictions = (y_pred_tensor == y_test_tensor).float().sum()
    accuracy = correct_predictions / y_test_tensor.shape[0]

accuracy.item()


0.8295921087265015

In [55]:
y_pred_s = model(torch.FloatTensor(X_s))
y_pred_t = model(torch.FloatTensor(X_t))

In [56]:
pd.Series(y_pred_s.reshape(-1).detach().numpy() > 0.5).value_counts()

False    24
True      1
Name: count, dtype: int64

In [57]:
pd.Series(y_pred_t.reshape(-1).detach().numpy() > 0.5).value_counts()

False    24
True      1
Name: count, dtype: int64

In [58]:
pd.Series(y_s).value_counts()

0    22
1     3
Name: count, dtype: int64

In [59]:
pd.Series(y_t).value_counts()

0    19
1     6
Name: count, dtype: int64

In [60]:
M_x_list = []
mu_list = []

for theta in thetas:
    # Project data onto the vector theta
    proj_X_s = X_s.dot(theta)
    proj_X_t = X_t.dot(theta)

    proj_X_s_dist_mass = np.ones(len(proj_X_s)) / len(proj_X_s)
    proj_X_t_dist_mass = np.ones(len(proj_X_t)) / len(proj_X_t)

    M_x = ot.dist(
        proj_X_s.reshape(proj_X_s.shape[0], 1),
        proj_X_t.reshape(proj_X_t.shape[0], 1),
        metric="sqeuclidean",
    )

    mu = ot.emd(proj_X_s_dist_mass, proj_X_t_dist_mass, M_x)

    M_x_list.append(M_x)
    mu_list.append(mu)

In [61]:
M_y = ot.dist(
    y_pred_s.reshape(y_pred_s.shape[0], 1),
    y_pred_t.reshape(y_pred_t.shape[0], 1),
    metric="sqeuclidean",
).detach().numpy()

In [62]:
y_pred_s_dist_mass = np.ones(len(y_pred_s)) / len(y_pred_s)
y_pred_t_dist_mass = np.ones(len(y_pred_t)) / len(y_pred_t)

nu = ot.emd(y_pred_s_dist_mass, y_pred_t_dist_mass, M_y)

In [63]:
import torch

def compute_gradient(K, X, X_prime, mu, nu, lambda_val, b):
    """
    Compute the gradient of Q(X) with respect to each x_i in X.

    Parameters:
    - K: List of theta vectors.
    - X: Tensor of shape (n, d) where n is the number of x_i vectors and d is the dimension of each vector.
    - X_prime: Tensor of shape (n, d) representing the x'_j vectors.
    - mu: Tensor of shape (len(K), n, n) representing mu values for each theta, i, and j.
    - nu: Tensor of shape (n, n) representing nu values for each i and j.
    - lambda_val: Scalar lambda value.
    - b: Blackbox model (assumed to be a PyTorch model).

    Returns:
    - Gradient tensor of shape (n, d).
    """

    X = torch.from_numpy(X).float()
    X_prime = torch.from_numpy(X_prime).float()

    # lambda_val = torch.Tensor(lambda_val).double()
    
    # Ensure X requires gradient for autograd
    X.requires_grad_(True)
    
    # Initialize Q value to 0
    Q = torch.tensor(0.0, dtype=torch.float)
    
    n,m = X.shape[0], X_prime.shape[0]
    
    for k, theta in enumerate(K):
        theta = torch.from_numpy(theta).float()
        for i in range(n):
            for j in range(m):
                term1 = mu[k][i, j] * (torch.dot(theta, X[i]) - torch.dot(theta, X_prime[j]))**2
                term2 = lambda_val * nu[i, j] * (b(X[i]) - b(X_prime[j]))**2
                Q += term1 + term2.item()
    
    # Compute gradient
    Q.backward()
    
    return X.grad


lambda_val = 0.5  # Example lambda value



gradient = compute_gradient(K=thetas, X=X_s, X_prime=X_t, mu=mu_list, nu=nu, lambda_val=lambda_val, b=model)
print(gradient)


tensor([[-3.7565e-02,  3.6776e-02,  1.1424e-01, -9.3313e-02,  8.7008e-02,
         -6.1771e-03, -1.0845e-02,  5.0540e-02,  4.3261e-02, -4.8628e-02,
          5.1286e-02, -1.7298e-02, -3.0842e-02,  3.2328e-02,  4.8873e-03,
         -3.4482e-03, -1.3199e-02, -1.4767e-02, -1.7235e-02, -5.6811e-03,
         -2.5928e-01, -7.6134e-03,  2.3347e-02,  1.6987e-02,  5.1463e-02,
         -8.2277e-03,  3.1944e-03, -3.0534e-02, -5.3875e-02],
        [-1.8106e-02,  6.6354e-02,  1.0546e-01, -9.0604e-02,  7.2059e-02,
          1.9818e-02, -5.1123e-02,  5.8986e-02,  6.1062e-02, -1.0126e-01,
         -4.8487e-02, -2.0371e-02, -2.9199e-02, -3.0062e-02,  3.5791e-02,
          8.3945e-04, -4.3052e-03,  2.2157e-02, -1.3780e-02,  4.8283e-02,
         -1.8736e-01, -3.9386e-02,  3.3716e-02, -1.8081e-02, -1.2185e-03,
          1.9874e-02,  3.7039e-02, -4.7739e-02, -5.9474e-02],
        [ 3.6951e-03, -1.4501e-01,  5.4511e-02, -8.1205e-02,  3.0128e-02,
          1.3611e-01,  2.4762e-03,  7.4202e-03, -3.5427e-04, -