### **Step 0:** Import necessary libraries and connect to Google Drive for file access and output storage.

In [None]:
# Adjusting current working directory to parent directory
from pathlib import Path
from os import chdir
from platform import system

# `current_directory` only exists once this cell has run in the current kernel,
# so a NameError on the bare reference below means "first run".
try:
    current_directory
except NameError:  # First run - initialize current_directory
    current_directory = Path.cwd()
    # NOTE(review): every Linux host is treated as Colab here — confirm that is intended.
    if system() == "Linux":  # Colab
        from google.colab import drive
        drive.mount('/content/drive')
        current_directory = f"{current_directory}/drive/MyDrive/Colab Notebooks/RecTour2024Challenge"
    else:
        current_directory = current_directory.parent
finally:
    # Runs on first and subsequent executions, so re-running the cell is idempotent.
    chdir(current_directory)



# External imports
import pandas as pd
import numpy as np

from random import randint

import torch
import torch.nn as nn

from sentence_transformers import SentenceTransformer

import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Lambda
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from collections import defaultdict


# Internal imports
from src.data.csv_tools import csv_to_dataframe, dataframe_to_csv, save_submission
from src.data.pickle_tools import save_to_pickle, load_pickle
from src.data.keras_tools import save_keras_model_weights, load_keras_model_weights
from src.utils.preprocessing_tools import *

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### **Step 1:** Aggregate reviews for each accommodation: compile all relevant reviews into a single dataset for each accommodation.

In [None]:
# Aggregate reviews for each accommodation as a list, one split at a time.
# Each split is loaded from its pickle when possible and created otherwise.
# NOTE(review): `accommodation_reviews` is rebound on every iteration, so only the
# "test" result stays bound after the loop — confirm that is intended.
for set_name in ["train", "val", "test"]:
    try:
        accommodation_reviews = load_pickle(f"{set_name}_reviews_grouped_by_accommodation")
    except Exception:  # pickle missing/unreadable; KeyboardInterrupt is no longer swallowed
        accommodation_reviews = create_accommodation_reviews(set_name)

### **Step 2:** Match users by userid and merge with reviews using reviewid, then create dictionaries where the key is the userid and the value is the reviewid.

In [None]:
def _load_or_create_processed_set(set_name):
    """Return the processed DataFrame for `set_name`.

    Tries `csv_to_dataframe(set_name)` first and falls back to
    `create_concatenated_set(set_name)` when loading fails.
    """
    try:
        return csv_to_dataframe(set_name)
    except Exception:  # narrower than a bare except: Ctrl-C still interrupts the cell
        return create_concatenated_set(set_name)


def _load_or_build_user_review_dict(set_name, processed_set):
    """Return the {user_id: review_id} mapping for `set_name`.

    The mapping is loaded from the "<set_name>_user_review_dict" pickle when
    possible; otherwise it is built from `processed_set` and pickled.
    If a user_id occurs more than once, the last row wins.
    """
    pickle_name = f"{set_name}_user_review_dict"
    try:
        return load_pickle(pickle_name)
    except Exception:
        # Column-wise zip yields the same mapping as a row-by-row iterrows loop,
        # without constructing one Series per row.
        user_review_dict = dict(zip(processed_set["user_id"], processed_set["review_id"]))
        save_to_pickle(user_review_dict, pickle_name)
        return user_review_dict


# Load (or create) the processed train and val sets
train_processed_set = _load_or_create_processed_set("train")
val_processed_set = _load_or_create_processed_set("val")

# Map each user_id to its review_id
train_user_review_dict = _load_or_build_user_review_dict("train", train_processed_set)
val_user_review_dict = _load_or_build_user_review_dict("val", val_processed_set)

### **STEP 3:** Handling missing values in review data

In [None]:
# Replace missing review text with empty strings in train and val
_review_text_columns = ["review_title", "review_positive", "review_negative"]
for _frame in (train_processed_set, val_processed_set):
    _frame[_review_text_columns] = _frame[_review_text_columns].fillna("")

# The test reviews are read separately and get the same treatment
test_reviews = csv_to_dataframe("test", "reviews")
test_reviews[_review_text_columns] = test_reviews[_review_text_columns].fillna("")

# Validate that no missing values remain in any split
for _header, _frame in (
    ("Missing values in train:", train_processed_set),
    ("Missing values in val:", val_processed_set),
    ("Missing values in test:", test_reviews),
):
    print(_header)
    print(_frame[_review_text_columns].isna().sum())

Missing values in train:
review_title       0
review_positive    0
review_negative    0
dtype: int64
Missing values in val:
review_title       0
review_positive    0
review_negative    0
dtype: int64
Missing values in test:
review_title       0
review_positive    0
review_negative    0
dtype: int64


### **Throughout our work, we explored a total of 18 models. Some featured minor adjustments, while others took us in entirely different directions. In this notebook, we will showcase three main approaches that guided our experimentation.**


# ***First + Second approach - Ensemble of three models: Titles, Positive Reviews, and Negative Reviews***

In this section, you will see shared code snippets applicable to both approaches until we reach the point where the two approaches diverge.


### Sanity Check: Encoding Review Data with SentenceTransformer

In [None]:
# Sentence encoder used for the review text sections
model = SentenceTransformer("all-MiniLM-L6-v2")

# Sanity check: encode each text section of the first validation rows
# and print the shape of every resulting array.
sanity_dataset = val_processed_set.head()
title_vectors, positive_vectors, negative_vectors = (
    model.encode(sanity_dataset[section_column].values)
    for section_column in ("review_title", "review_positive", "review_negative")
)
for section_vectors in (title_vectors, positive_vectors, negative_vectors):
    print(section_vectors.shape)

### Encoding and Saving Review Data Vectors for Train, Validation, and Test Sets

In [None]:
# Get pairs for each set
set_name_data_pair = [("train", train_processed_set), ("val", val_processed_set), ("test", test_reviews)]

# For every split and every review section, make sure a pickled
# {review_id: vector} dictionary exists; encode and save it when it does not.
for set_name, set_data in set_name_data_pair:
    print("Starting", set_name)

    review_ids = set_data["review_id"].values
    for section in ("title", "positive", "negative"):
        pickle_name = f"{set_name}_{section}_vectors_dict"
        try:
            # Used only as an existence check; the loaded value is discarded.
            load_pickle(pickle_name)
        except Exception:  # narrower than a bare except: Ctrl-C still interrupts encoding
            section_vectors = model.encode(set_data[f"review_{section}"].values, show_progress_bar=True)
            save_to_pickle(dict(zip(review_ids, section_vectors)), pickle_name)
        print(f"Finished {section}.")

Starting train
Finished title.
Finished positive.
Finished negative.
Starting val
Finished title.
Finished positive.
Finished negative.
Starting test
Finished title.
Finished positive.
Finished negative.


### Create country embeddings for users

In [None]:
# Map every guest country seen in train and val to a fixed vector.
# The mapping is loaded from its pickle when possible, otherwise created and saved.
users_countries_embedding_dict = {}
try:
    users_countries_embedding_dict = load_pickle("users_countries_embedding_dict")
except Exception:  # narrower than a bare except: Ctrl-C is not swallowed
    train_countries = train_processed_set["guest_country"].unique()
    val_countries = val_processed_set["guest_country"].unique()
    all_countries = np.unique(np.concatenate((train_countries, val_countries)))
    # We chose an embedding size of 8 because the unique values for guest_country
    # can be effectively represented using a vector of size 8.
    # The vectors are the layer's initial weights (detached, no training happens here);
    # no seed is set, so they differ between runs unless the pickle is reused.
    embedding = nn.Embedding(len(all_countries), 8)
    users_countries_embedding_dict = {
        country: embedding(torch.tensor(country_index)).detach().numpy()
        for country_index, country in enumerate(all_countries)
    }
    save_to_pickle(users_countries_embedding_dict, "users_countries_embedding_dict")

### Create country embeddings for accommodations

In [None]:
# Map every accommodation country seen in train, val and test to a fixed vector.
# The mapping is loaded from its pickle when possible, otherwise created and saved.
accommodation_countries_embedding_dict = {}
try:
    accommodation_countries_embedding_dict = load_pickle("accommodation_countries_embedding_dict")
except Exception:  # narrower than a bare except: Ctrl-C is not swallowed
    train_countries = train_processed_set["accommodation_country"].unique()
    val_countries = val_processed_set["accommodation_country"].unique()
    test_users = csv_to_dataframe("test", "users")
    test_countries = test_users["accommodation_country"].unique()
    all_countries = np.unique(np.concatenate((train_countries, val_countries, test_countries)))
    # We chose an embedding size of 8 because the unique values for accommodation_country
    # can be effectively represented using a vector of size 8.
    # The vectors are the layer's initial weights (detached, no training happens here);
    # no seed is set, so they differ between runs unless the pickle is reused.
    embedding = nn.Embedding(len(all_countries), 8)
    accommodation_countries_embedding_dict = {
        country: embedding(torch.tensor(country_index)).detach().numpy()
        for country_index, country in enumerate(all_countries)
    }
    save_to_pickle(accommodation_countries_embedding_dict, "accommodation_countries_embedding_dict")

### Create guest type embeddings

In [None]:
# Map every guest type seen in train and val to a fixed vector.
# The mapping is loaded from its pickle when possible, otherwise created and saved.
guest_types_embedding_dict = {}
try:
    guest_types_embedding_dict = load_pickle("guest_types_embedding_dict")
except Exception:  # narrower than a bare except: Ctrl-C is not swallowed
    train_guest_types = train_processed_set["guest_type"].unique()
    val_guest_types = val_processed_set["guest_type"].unique()
    all_guest_types = np.unique(np.concatenate((train_guest_types, val_guest_types)))
    # We chose an embedding size of 2 because the unique values for guest_type
    # can be effectively represented using a vector of size 2.
    # The vectors are the layer's initial weights (detached, no training happens here);
    # no seed is set, so they differ between runs unless the pickle is reused.
    embedding = nn.Embedding(len(all_guest_types), 2)
    guest_types_embedding_dict = {
        guest_type: embedding(torch.tensor(guest_type_index)).detach().numpy()
        for guest_type_index, guest_type in enumerate(all_guest_types)
    }
    save_to_pickle(guest_types_embedding_dict, "guest_types_embedding_dict")

### Create accommodation type embeddings

In [None]:
# Map every accommodation type seen in train, val and test to a fixed vector.
# The mapping is loaded from its pickle when possible, otherwise created and saved.
accommodation_type_embedding_dict = {}
try:
    accommodation_type_embedding_dict = load_pickle("accommodation_type_embedding_dict")
except Exception:  # narrower than a bare except: Ctrl-C is not swallowed
    train_acc_types = train_processed_set["accommodation_type"].unique()
    val_acc_types = val_processed_set["accommodation_type"].unique()
    test_users = csv_to_dataframe("test", "users")
    test_acc_types = test_users["accommodation_type"].unique()
    all_acc_types = np.unique(np.concatenate((train_acc_types, val_acc_types, test_acc_types)))
    # We chose an embedding size of 5 because the unique values for accommodation_type
    # can be effectively represented using a vector of size 5.
    # The vectors are the layer's initial weights (detached, no training happens here);
    # no seed is set, so they differ between runs unless the pickle is reused.
    embedding = nn.Embedding(len(all_acc_types), 5)
    accommodation_type_embedding_dict = {
        acc_type: embedding(torch.tensor(acc_type_index)).detach().numpy()
        for acc_type_index, acc_type in enumerate(all_acc_types)
    }
    save_to_pickle(accommodation_type_embedding_dict, "accommodation_type_embedding_dict")

### Create user embeddings for train, validation, and test

In [None]:
def get_processed_test_users():
    """Load the test users table and replace missing `guest_country` values with "EMPTY"."""
    users_frame = csv_to_dataframe("test", "users")
    missing_country = users_frame['guest_country'].isna()
    users_frame.loc[missing_country, 'guest_country'] = "EMPTY"
    return users_frame

# We performed normalization by dividing by the maximum value.
MAX_MONTH = 12
MAX_ROOM_NIGHTS = 112


def _embed_user_row(row):
    """Concatenate the feature vectors of one user row into a single 1-D array.

    Order: guest country, guest type, [month, room_nights] (each divided by its
    maximum), accommodation country, accommodation type.
    Raises KeyError when a categorical value has no entry in its embedding dict.
    """
    # NOTE(review): test users get "EMPTY" for a missing guest_country, while
    # users_countries_embedding_dict is built from train/val values only —
    # confirm that "EMPTY" is one of its keys.
    return np.concatenate((
        users_countries_embedding_dict[row["guest_country"]],
        guest_types_embedding_dict[row["guest_type"]],
        np.array([row["month"] / MAX_MONTH, row["room_nights"] / MAX_ROOM_NIGHTS]),
        accommodation_countries_embedding_dict[row["accommodation_country"]],
        accommodation_type_embedding_dict[row["accommodation_type"]],
    ))


# For the test split the user features come from the users table, not the reviews table.
set_name_data_pair[2] = ("test", get_processed_test_users())
for set_name, set_df in set_name_data_pair:
    pickle_name = f"{set_name}_user_embeddings_dict"
    try:
        current_users_embeddings_dict = load_pickle(pickle_name)
    except Exception:  # narrower than a bare except: Ctrl-C still interrupts the loop
        current_users_embeddings_dict = {
            row["user_id"]: _embed_user_row(row) for _, row in set_df.iterrows()
        }
        save_to_pickle(current_users_embeddings_dict, pickle_name)

### Sanity check to validate embedding size

In [None]:
# Show the shape of the first stored user embedding (from the last split processed above)
stored_user_embeddings = list(current_users_embeddings_dict.values())
stored_user_embeddings[0].shape

(25,)

### Create NumPy arrays in which each row contains a user embedding

In [None]:
def _pickled_dict_values_as_array(pickle_name):
    """Load a pickled dict and stack its values, in insertion order, into one array."""
    return np.array(list(load_pickle(pickle_name).values()))


# NOTE(review): later cells pair these rows positionally with the review vectors,
# which presumably relies on both dicts sharing the same row order — confirm.
train_user_embeddings_array = _pickled_dict_values_as_array("train_user_embeddings_dict")
print(train_user_embeddings_array.shape)
val_user_embeddings_array = _pickled_dict_values_as_array("val_user_embeddings_dict")
print(val_user_embeddings_array.shape)

(1628989, 25)
(203787, 25)


### Load all section reviews embeddings

In [None]:
# Load the pickled {review_id: vector} dictionaries of every review section, for train and val
(
    train_title_vectors_dict,
    train_positive_vectors_dict,
    train_negative_vectors_dict,
    val_title_vectors_dict,
    val_positive_vectors_dict,
    val_negative_vectors_dict,
) = [
    load_pickle(f"{split_name}_{section}_vectors_dict")
    for split_name in ("train", "val")
    for section in ("title", "positive", "negative")
]

### Create embedding arrays for train and validation datasets

In [None]:
def _split_embedding_arrays(user_array, title_dict, positive_dict, negative_dict):
    """Bundle the arrays of one split: users, the three review sections, and labels.

    Labels are one 1 per entry of `positive_dict`, followed by three times as many 0s.
    """
    pair_count = len(positive_dict)
    return {
        "users": user_array,
        "title": np.array(list(title_dict.values())),
        "positive": np.array(list(positive_dict.values())),
        "negative": np.array(list(negative_dict.values())),
        "labels": np.concatenate([np.ones(pair_count), np.zeros(pair_count * 3)], axis=0),
    }


reviews_embeddings_arrays = {
    "train": _split_embedding_arrays(
        train_user_embeddings_array,
        train_title_vectors_dict,
        train_positive_vectors_dict,
        train_negative_vectors_dict,
    ),
    "val": _split_embedding_arrays(
        val_user_embeddings_array,
        val_title_vectors_dict,
        val_positive_vectors_dict,
        val_negative_vectors_dict,
    ),
}

### Helper functions

In [None]:
def get_correct_pairs(set_name, section_name):
    """Return `(users, section)` arrays of `reviews_embeddings_arrays[set_name]`.

    `section_name` is the key of the review section to return
    (the dict above defines "title", "positive" and "negative").
    """
    split_arrays = reviews_embeddings_arrays[set_name]
    return split_arrays["users"], split_arrays[section_name]

def get_set_labels(section_name):
    """Return the "labels" array of one split.

    Despite its name, `section_name` is used as the top-level key of
    `reviews_embeddings_arrays`, i.e. the split name ("train" or "val").
    """
    split_arrays = reviews_embeddings_arrays[section_name]
    return split_arrays["labels"]

**First approach** – We proposed a contrastive learning method where negative samples are selected based on a similarity threshold with the positive sample, rather than randomly. This approach aims to provide the model with more challenging negative samples for improved training effectiveness.

In our experiments, we tested different thresholds and varied the number of negative samples between 2 and 4 to evaluate their impact on the model's performance. The presented code snippet is one example of the implementation.

### Generating hard negative review samples based on cosine similarity

In [None]:
def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between `vec1` and `vec2`.

    Computed as dot(vec1, vec2) / (||vec1|| * ||vec2||). No guard is applied
    for zero-length vectors, so the division is left to NumPy in that case.
    """
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    return np.dot(vec1, vec2) / norm_product

def create_negative_lists(coorect_pairs, reviews_embeddings_users_dict, embeddings_list, set_name, section_name, similarity_threshold=0.35):
    """
    Sample three hard negative review embeddings for every (user, review) pair (approach 1).

    A candidate is drawn uniformly at random from `embeddings_list` and redrawn
    until its cosine similarity to the pair's own review is at least
    `similarity_threshold`, i.e. the negatives are reviews similar to the positive.

    Parameters:
    - coorect_pairs: List of tuples containing user embeddings and their associated review embeddings.
    - reviews_embeddings_users_dict: Unused by this variant; kept so both variants share one signature.
    - embeddings_list: Indexable collection of all review embeddings in the dataset.
      Indices are drawn from range(len(coorect_pairs)).
    - set_name: Name of the dataset (e.g., "train", "val"); used in progress messages only.
    - section_name: Specific section of the dataset being processed; used in progress messages only.
    - similarity_threshold: Minimum cosine similarity to the positive review for a candidate to be accepted.

    Returns:
    - Three numpy arrays of negative review embeddings, each with one row per pair.

    Notes:
    - NOTE(review): the pair's own review (similarity 1.0) and exact duplicates of it
      satisfy the threshold and can therefore be returned as negatives — confirm intended.
    - Sampling never terminates for a pair if no embedding reaches the threshold.
    - A same-named function for approach 2 is defined later in the notebook and
      replaces this one once its cell runs.
    """
    amount_of_reviews = len(coorect_pairs)
    last_review_index = amount_of_reviews - 1
    # max(1, ...) prevents a modulo-by-zero when there are fewer than 20 pairs.
    print_steps = max(1, amount_of_reviews // 20)
    negatives_per_pair = 3
    negative_reviews = [[] for _ in range(negatives_per_pair)]

    for current_review_index, (user_embedding, review) in enumerate(coorect_pairs, start=1):

        if current_review_index % print_steps == 0:
            # Report the share of pairs actually processed so far.
            percent_done = round(100 * current_review_index / amount_of_reviews)
            print(f"{set_name}-{section_name}: {percent_done}% completed")

        for negatives in negative_reviews:
            # Redraw while the candidate is not similar enough to the positive review.
            candidate = embeddings_list[randint(0, last_review_index)]
            while cosine_similarity(candidate, review) < similarity_threshold:
                candidate = embeddings_list[randint(0, last_review_index)]
            negatives.append(candidate)

    # Return the generated negative review embeddings as numpy arrays
    return tuple(np.array(negatives) for negatives in negative_reviews)

### Proposed architecture in approach 1

In [None]:
from tensorflow.keras.layers import Input, Dense, Concatenate, Dropout

def create_contrastive_model():
    """Build the approach-1 classifier over a (user, review) pair.

    Two dense towers (user: 25 -> 128 -> 64, review: 384 -> 256 -> 64) are
    concatenated and passed through 128 -> dropout(0.3) -> 64 -> 1 (sigmoid).
    Returns an uncompiled Keras `Model` named 'contrastive_model'.
    """
    # User tower
    user_input = Input(shape=(25,), name='user')
    user_features = user_input
    for units, layer_name in ((128, 'user_dense_1'), (64, 'user_dense_2')):
        user_features = Dense(units, activation='relu', name=layer_name)(user_features)

    # Review tower
    review_input = Input(shape=(384,), name='review')
    review_features = review_input
    for units, layer_name in ((256, 'review_dense_1'), (64, 'review_dense_2')):
        review_features = Dense(units, activation='relu', name=layer_name)(review_features)

    # Joint head on the concatenated tower outputs
    joint = Concatenate(name='concatenate_layer')([user_features, review_features])
    joint = Dense(128, activation='relu', name='combined_dense_1')(joint)
    joint = Dropout(0.3, name='dropout_layer')(joint)
    joint = Dense(64, activation='relu', name='combined_dense_2')(joint)
    match_probability = Dense(1, activation='sigmoid', name='output_layer')(joint)

    return Model([user_input, review_input], match_probability, name='contrastive_model')

**Second approach** - We proposed a contrastive learning method where negative samples are selected to ensure they do not belong to the same user group as the positive sample and are distinct from each other. The model uses a dot product to measure the relationship between user and review embeddings, and training is guided by a custom contrastive loss function.

In our experiments, we varied the number of negative samples between 2 and 4 to evaluate their impact on the model's performance. The presented code snippet is one example of the implementation.



### Generating negative review samples

In [None]:
def create_negative_lists(coorect_pairs, reviews_embeddings_users_dict, embeddings_list, set_name, section_name):
    """
    Sample three distinct negative review embeddings for every (user, review) pair (approach 2).

    A candidate is drawn uniformly at random from `embeddings_list` and redrawn while
    it equals a negative already chosen for this pair, or while the pair's user is
    among the users recorded for that review embedding.

    Parameters:
    - coorect_pairs: List of tuples containing user embeddings and their associated review embeddings.
    - reviews_embeddings_users_dict: Dictionary mapping a review embedding (as a tuple) to the
      collection of user embeddings (as tuples) associated with it.
    - embeddings_list: Indexable collection of all review embeddings in the dataset.
      Indices are drawn from range(len(coorect_pairs)).
    - set_name: Name of the dataset (e.g., "train", "val"); used in progress messages only.
    - section_name: Specific section of the dataset being processed; used in progress messages only.

    Returns:
    - Three numpy arrays of negative review embeddings, each with one row per pair.

    Notes:
    - Sampling never terminates for a pair with fewer than three distinct eligible embeddings.
    - This definition shares its name with the approach-1 variant defined earlier in
      the notebook and replaces it once this cell runs.
    """
    amount_of_reviews = len(coorect_pairs)
    last_review_index = amount_of_reviews - 1
    # max(1, ...) prevents a modulo-by-zero when there are fewer than 20 pairs.
    print_steps = max(1, amount_of_reviews // 20)
    negative_reviews1, negative_reviews2, negative_reviews3 = [], [], []

    for current_review_index, (user_embedding, review) in enumerate(coorect_pairs, start=1):

        if current_review_index % print_steps == 0:
            # Report the share of pairs actually processed so far.
            percent_done = round(100 * current_review_index / amount_of_reviews)
            print(f"{set_name}-{section_name}: {percent_done}% completed")

        user_embedding = tuple(user_embedding)

        chosen = []
        for _ in range(3):
            candidate = embeddings_list[randint(0, last_review_index)]
            # Reject duplicates of earlier picks first, then reviews tied to this user.
            while (
                any(np.all(previous == candidate) for previous in chosen)
                or user_embedding in reviews_embeddings_users_dict[tuple(candidate)]
            ):
                candidate = embeddings_list[randint(0, last_review_index)]
            chosen.append(candidate)

        negative_reviews1.append(chosen[0])
        negative_reviews2.append(chosen[1])
        negative_reviews3.append(chosen[2])

    return np.array(negative_reviews1), np.array(negative_reviews2), np.array(negative_reviews3)

### Proposed architecture and loss function in approach 2

In [None]:
def contrastive_loss(y_true, y_pred, margin=1.0):
    """Margin-based loss on a similarity score.

    Per sample: (1 - y_true) * y_pred**2 + y_true * max(margin - y_pred, 0)**2,
    averaged over all samples. With y_true == 1 only the hinge term is active
    (zero once y_pred reaches `margin`); with y_true == 0 only the squared
    score is active (zero when y_pred is 0).
    """
    # Active for y_true == 0: squared score
    squared_score = tf.square(y_pred)
    # Active for y_true == 1: squared shortfall of the score below the margin
    squared_shortfall = tf.square(tf.maximum(margin - y_pred, 0))
    per_sample_loss = (1 - y_true) * squared_score + y_true * squared_shortfall
    return tf.reduce_mean(per_sample_loss)

def create_contrastive_model():
    """Build the approach-2 scorer: dot product of a projected user vector and a review vector.

    The 25-d user input goes through one ReLU dense layer of width 384 so that it
    matches the 384-d review input; the output is their dot product with shape (batch, 1).
    Returns an uncompiled Keras `Model` named 'contrastive_model'. This definition
    shares its name with the approach-1 model builder defined earlier in the notebook.
    """
    review_dim = 384

    user_input = Input(shape=(25,), name='user')
    projected_user = Dense(review_dim, activation='relu', name='user_dense')(user_input)

    review_input = Input(shape=(review_dim,), name='review')

    # Element-wise product summed over the last axis == dot product per sample
    dot_product_layer = Lambda(
        lambda tensors: tf.reduce_sum(tensors[0] * tensors[1], axis=-1, keepdims=True)
    )
    score = dot_product_layer([projected_user, review_input])

    return Model([user_input, review_input], score, name='contrastive_model')

### Training the model – shared for both approaches (difference in loss function)

In [None]:
def _build_contrastive_dataset(set_name, section_name):
    """Return `(users, reviews, labels)` for one split: the correct pairs followed by three negative blocks."""
    users, section_reviews = get_correct_pairs(set_name, section_name)
    coorect_pairs = list(zip(users, section_reviews))

    # Map each review embedding to the set of user embeddings it is paired with.
    reviews_embeddings_users_dict = defaultdict(set)
    for user, review in coorect_pairs:
        reviews_embeddings_users_dict[tuple(review)].add(tuple(user))

    negative1, negative2, negative3 = create_negative_lists(
        coorect_pairs, reviews_embeddings_users_dict, section_reviews, set_name, section_name
    )
    # Block order (positives, then the three negative blocks) must match the
    # ones-then-zeros layout of the labels returned by get_set_labels.
    all_users = np.concatenate([users, users, users, users], axis=0)
    all_section_reviews = np.concatenate([section_reviews, negative1, negative2, negative3], axis=0)
    return all_users, all_section_reviews, get_set_labels(set_name)


def create_section_model(section_name, batch_size=512, epochs=15, loss='binary_crossentropy'):
    """Train and return a contrastive model for one review section.

    Parameters:
    - section_name: Review section to train on (passed to get_correct_pairs).
    - batch_size: Mini-batch size passed to `model.fit`.
    - epochs: Maximum number of epochs; early stopping may end training sooner.
    - loss: Loss passed to `model.compile`. The default suits the sigmoid model of
      approach 1. For approach 2 pass the function object (`loss=contrastive_loss`);
      the bare string 'contrastive_loss' is not a built-in Keras loss name.

    Returns:
    - The fitted Keras model (best weights restored by early stopping).
    """
    print("Creating training data")
    train_all_users, train_all_section_reviews, train_all_labels = _build_contrastive_dataset("train", section_name)

    print("Creating validation data")
    val_all_users, val_all_section_reviews, val_all_labels = _build_contrastive_dataset("val", section_name)

    model = create_contrastive_model()
    model.compile(optimizer='adam', loss=loss)

    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3)

    model.fit(
        [train_all_users, train_all_section_reviews], train_all_labels,
        batch_size=batch_size,
        epochs=epochs,
        validation_data=([val_all_users, val_all_section_reviews], val_all_labels),
        callbacks=[early_stopping, lr_scheduler]
    )
    return model

### Creating the 3 models - shared for both approaches

In [None]:
# Train one model per review section and save each one under out/models/
for section_name in ("title", "negative", "positive"):
    section_model = create_section_model(section_name)
    model_path = f"out/models/{section_name}_3_negative_with_embedded_of_acc_with_binary_crossentropy.h5"
    section_model.save(model_path)

Creating training data
train-positive: 0% completed
train-positive: 5% completed
train-positive: 10% completed
train-positive: 15% completed
train-positive: 20% completed
train-positive: 25% completed
train-positive: 30% completed
train-positive: 35% completed
train-positive: 40% completed
train-positive: 45% completed
train-positive: 50% completed
train-positive: 55% completed
train-positive: 60% completed
train-positive: 65% completed
train-positive: 70% completed
train-positive: 75% completed
train-positive: 80% completed
train-positive: 85% completed
train-positive: 90% completed
train-positive: 95% completed
Creating validation data
val-positive: 0% completed
val-positive: 5% completed
val-positive: 10% completed
val-positive: 15% completed
val-positive: 20% completed
val-positive: 25% completed
val-positive: 30% completed
val-positive: 35% completed
val-positive: 40% completed
val-positive: 45% completed
val-positive: 50% completed
val-positive: 55% completed
val-positive: 60% co

# ***Third approach***

In this approach, we leverage a pre-trained transformer model to create embeddings for user and review descriptions. Using these embeddings, we train the model with a Multiple Negatives Ranking Loss, which optimizes the model to associate the correct user-review pairs while minimizing similarity to negative examples. The training process involves generating structured input examples from the dataset, pairing user and review embeddings, and fine-tuning the transformer with a ranking objective.

In our experiments, we tested various combinations of features to describe the user and the review. The code snippet here demonstrates one example of these combinations.








### Importing libraries and installing dependencies

In [None]:
!pip install sentence-transformers torch pandas numpy
!pip install datasets
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
import pandas as pd
import numpy as np
from datasets import Dataset

### Preparing training examples with user and review descriptions

In [None]:
def _row_to_input_example(row):
    """Turn one training row into an InputExample whose texts are [review description, user description]."""
    user_desc = f"{row['guest_country']} {row['guest_type']} {row['month']} {row['room_nights']} {row['accommodation_country']} {row['accommodation_type']}"
    review_desc = f"{row['review_title']} {row['review_positive']} {row['review_negative']}"
    return InputExample(texts=[review_desc , user_desc])


# One example per training row, in row order
train_examples = [_row_to_input_example(row) for _, row in train_processed_set.iterrows()]

### Training the model with Multiple Negatives Ranking Loss

In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"

# Training hyper-parameters, named once so the dataloader and the warm-up
# computation cannot drift apart.
BATCH_SIZE = 64
EPOCHS = 3
WARMUP_RATIO = 0.05  # share of all optimisation steps used for learning-rate warm-up

train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=BATCH_SIZE)
# NOTE: rebinds `model`, which earlier cells use for the review-section encoder.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
train_loss = losses.MultipleNegativesRankingLoss(model)

# len(train_dataloader) is the number of batches per epoch; the product is
# truncated to an int so `fit` receives a whole number of steps.
warmup_steps = int(len(train_dataloader) * EPOCHS * WARMUP_RATIO)

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=EPOCHS,
    optimizer_params={'lr': 2e-5, 'weight_decay': 0.01 },
    warmup_steps=warmup_steps,
    show_progress_bar=True
)

model.save("out/models/user_review_matching_model2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss
500,3.8218
1000,3.3811
1500,3.1226
2000,2.9511
2500,2.8054
3000,2.6517
3500,2.5269
4000,2.4483
4500,2.3512
5000,2.305


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]