STEP 0: Imports

In [1]:
# Adjusting current working directory to parent directory
from pathlib import Path
from os import chdir
from platform import system

try:
    current_directory
except: # First  run - initialize current_directory
    current_directory = Path.cwd()
    if system() == "Linux": # Colab
        from google.colab import drive
        drive.mount('/content/drive')
        current_directory = f"{current_directory}/drive/MyDrive/Colab Notebooks/RecTour2024Challenge"
    else:
        current_directory = current_directory.parent
finally:
    chdir(current_directory)



# External imports
import pandas as pd
import numpy as np

from random import randint

import torch
import torch.nn as nn

from sentence_transformers import SentenceTransformer

import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Lambda
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau



# Internal imports
from src.data.csv_tools import csv_to_dataframe, dataframe_to_csv, save_submission
from src.data.pickle_tools import save_to_pickle, load_pickle
from src.utils.preprocessing_tools import *

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


STEP 1: Merge and aggregate the raw data:
    for each accommodation get a list of all the relevant reviews to it
    create a single dataset for each set of the train and val - first merge users to match by userid (they are unique), then merge to reviews by review id

In [2]:
# aggregate reviews for each accommodation as a list
for set_name in ["train", "val", "test"]:
    try:
        accommodation_reviews = load_pickle(f"{set_name}_reviews_grouped_by_accommodation")
    except:
        accommodation_reviews = create_accommodation_reviews(set_name)

In [3]:
# Concatenate train and val sets
train_processed_set, val_processed_set = None, None
try:
    train_processed_set = csv_to_dataframe("train")
except:
    train_processed_set = create_concatenated_set("train")
try:
    val_processed_set = csv_to_dataframe("val")
except:
    val_processed_set = create_concatenated_set("val")

print("train size, val size")
print(len(train_processed_set), len(val_processed_set))

train_user_review_dict = {}
try:
    train_user_review_dict = load_pickle("train_user_review_dict")
except:
    for index, row in train_processed_set.iterrows():
        user_id = row["user_id"]
        review_id = row["review_id"]
        train_user_review_dict[user_id] = review_id
    save_to_pickle(train_user_review_dict, "train_user_review_dict")

val_user_review_dict = {}
try:
    val_user_review_dict = load_pickle("val_user_review_dict")
except:
    for index, row in val_processed_set.iterrows():
        user_id = row["user_id"]
        review_id = row["review_id"]
        val_user_review_dict[user_id] = review_id
    save_to_pickle(val_user_review_dict, "val_user_review_dict")

train size, val size
1628989 1628989


STEP 2: Create vector for each text of the review

In [4]:
# Fill na with empty strings
train_processed_set[["review_title", "review_positive", "review_negative"]] = train_processed_set[["review_title", "review_positive", "review_negative"]].fillna("")
val_processed_set[["review_title", "review_positive", "review_negative"]] = val_processed_set[["review_title", "review_positive", "review_negative"]].fillna("")
# For test, get reviews data
test_reviews = csv_to_dataframe("test", "reviews")
test_reviews[["review_title", "review_positive", "review_negative"]] = test_reviews[["review_title", "review_positive", "review_negative"]].fillna("")
# Validate that there are no missing values
print("Missing values in train:")
print(train_processed_set[["review_title", "review_positive", "review_negative"]].isna().sum())
print("Missing values in val:")
print(val_processed_set[["review_title", "review_positive", "review_negative"]].isna().sum())
print("Missing values in test:")
print(test_reviews[["review_title", "review_positive", "review_negative"]].isna().sum())

Missing values in train:
review_title       0
review_positive    0
review_negative    0
dtype: int64
Missing values in val:
review_title       0
review_positive    0
review_negative    0
dtype: int64
Missing values in test:
review_title       0
review_positive    0
review_negative    0
dtype: int64


In [5]:
# Initialize the model
model = SentenceTransformer("all-MiniLM-L6-v2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


If you want to check if it's working on a small dataset run the below cell, otherwise you can skip it

In [6]:
# Sanity check
sanity_dataset = val_processed_set.head()
title_vectors = model.encode(sanity_dataset["review_title"].values)
positive_vectors = model.encode(sanity_dataset["review_positive"].values)
negative_vectors = model.encode(sanity_dataset["review_negative"].values)
print(title_vectors.shape)
print(positive_vectors.shape)
print(negative_vectors.shape)

(5, 384)
(5, 384)
(5, 384)


In [7]:
# Get pairs for each set
set_name_data_pair = [("train", train_processed_set), ("val", val_processed_set), ("test", test_reviews)]

In [8]:
for set_name, set_data in set_name_data_pair:
    print("Starting", set_name)

    review_ids = set_data["review_id"].values
    try:
      load_pickle(f"{set_name}_title_vectors_dict")
    except:
      title_vectors = model.encode(set_data["review_title"].values, show_progress_bar=True)
      title_vectors_dict = dict(zip(review_ids, title_vectors))
      save_to_pickle(title_vectors_dict, f"{set_name}_title_vectors_dict")
    print(f"Finished title.")

    try:
      load_pickle(f"{set_name}_positive_vectors_dict")
    except:
      positive_vectors = model.encode(set_data["review_positive"].values, show_progress_bar=True)
      positive_review_vectors_dict = dict(zip(review_ids, positive_vectors))
      save_to_pickle(positive_review_vectors_dict, f"{set_name}_positive_vectors_dict")
    print(f"Finished positive.")

    try:
      load_pickle(f"{set_name}_negative_vectors_dict")
    except:
      negative_vectors = model.encode(set_data["review_negative"].values, show_progress_bar=True)
      negative_review_vectors_dict = dict(zip(review_ids, negative_vectors))
      save_to_pickle(negative_review_vectors_dict, f"{set_name}_negative_vectors_dict")
    print(f"Finished negative.")

Starting train
Finished title.
Finished positive.
Finished negative.
Starting val
Finished title.
Finished positive.
Finished negative.
Starting test
Finished title.
Finished positive.
Finished negative.


STEP 3 - Create embeddings for countries

In [9]:
# Get all unique countries from train and val
countries_embedding_dict = {}
try:
    countries_embedding_dict = load_pickle("countries_embedding_dict")
except:
    train_countries = train_processed_set["guest_country"].unique()
    val_countries = val_processed_set["guest_country"].unique()
    all_countries = np.unique(np.concatenate((train_countries, val_countries)))
    embedding = nn.Embedding(len(all_countries), 8)
    for country_index, country in enumerate(all_countries):
        country_embedding = embedding(torch.tensor(country_index)).detach().numpy()
        countries_embedding_dict[country] = country_embedding
    save_to_pickle(countries_embedding_dict, "countries_embedding_dict")

STEP 4 - Encode the guest types

In [10]:
# Get unique guest_types
guest_types_embedding_dict = {}
try:
    guest_types_embedding_dict = load_pickle("guest_types_embedding_dict")
except:
    train_guest_types = train_processed_set["guest_type"].unique()
    val_guest_types = val_processed_set["guest_type"].unique()
    all_guest_types = np.unique(np.concatenate((train_guest_types, val_guest_types)))
    embedding = nn.Embedding(len(all_guest_types), 2)
    for guest_type_index, guest_type in enumerate(all_guest_types):
        guest_type_embedding = embedding(torch.tensor(guest_type_index)).detach().numpy()
        guest_types_embedding_dict[guest_type] = guest_type_embedding

STEP 5 - Create user embedding for train and validation

In [11]:
for set_name, set_df in set_name_data_pair[: 2]:
  try:
      current_users_embeddings_dict = load_pickle(f"{set_name}_user_embeddings_dict")
  except:
    current_users_embeddings_dict = {}
    for _, row in set_df.iterrows():
        user_id = row["user_id"]
        embedded_guest_country = countries_embedding_dict[row["guest_country"]]
        embedded_guest_type = guest_types_embedding_dict[row["guest_type"]]
        embedded_month = row["month"] / 12
        embedded_room_nights = row["room_nights"] / 112
        current_users_embeddings_dict[user_id] = np.concatenate((embedded_guest_country, embedded_guest_type, np.array([embedded_month, embedded_room_nights])))
    save_to_pickle(current_users_embeddings_dict, f"{set_name}_user_embeddings_dict")

STEP 6 - Training and validation preprocessing

In [12]:
# Create np array that each cell contains user embedding
train_user_embeddings_array = np.array(list(load_pickle("train_user_embeddings_dict").values()))
print(train_user_embeddings_array.shape)
val_user_embeddings_array = np.array(list(load_pickle("val_user_embeddings_dict").values()))
print(val_user_embeddings_array.shape)

(1628989, 12)
(203787, 12)


In [13]:
# Load all section reviews embeddigns
train_title_vectors_dict = load_pickle("train_title_vectors_dict")
train_positive_vectors_dict = load_pickle("train_positive_vectors_dict")
train_negative_vectors_dict = load_pickle("train_negative_vectors_dict")
val_title_vectors_dict = load_pickle("val_title_vectors_dict")
val_positive_vectors_dict = load_pickle("val_positive_vectors_dict")
val_negative_vectors_dict = load_pickle("val_negative_vectors_dict")

In [14]:
reviews_embeddings_arrays = {
    "train": {
        "users": train_user_embeddings_array,
        "title": np.array(list(train_title_vectors_dict.values())),
        "positive": np.array(list(train_positive_vectors_dict.values())),
        "negative": np.array(list(train_negative_vectors_dict.values())),
        "labels": np.concatenate([np.ones(len(train_user_embeddings_array)), np.zeros(len(train_user_embeddings_array) * 2)], axis=0)
    },
    "val": {
        "users": val_user_embeddings_array,
        "title": np.array(list(val_title_vectors_dict.values())),
        "positive": np.array(list(val_positive_vectors_dict.values())),
        "negative": np.array(list(val_negative_vectors_dict.values())),
        "labels": np.concatenate([np.ones(len(val_user_embeddings_array)), np.zeros(len(val_user_embeddings_array) * 2)], axis=0)
    }
}

In [15]:
# Helper functions
def get_correct_pairs(set_name, section_name):
    return reviews_embeddings_arrays[set_name]["users"], reviews_embeddings_arrays[set_name][section_name]

def get_set_labels(section_name):
    return reviews_embeddings_arrays[section_name]["labels"]

def create_double_negative_lists(embeddings_list, set_name="", section_name=""):
    amount_of_reviews = len(embeddings_list)
    last_review_index = amount_of_reviews - 1
    print_steps = amount_of_reviews // 20
    current_print = 0
    negative_reviews1, negative_reviews2 = [], []
    for current_review_index, current_embedding in enumerate(embeddings_list, start=1):
        if current_review_index % print_steps == 0:
            print(f"{set_name}-{section_name}: {current_print}% completed")
            current_print += 5
        negative_review1 = embeddings_list[randint(0, last_review_index)]
        while np.all(negative_review1 == current_embedding):
            # print("repeated1")
            negative_review1 = embeddings_list[randint(0, last_review_index)]
        negative_review2 = embeddings_list[randint(0, last_review_index)]
        while np.all(negative_review2 == current_embedding) or np.all(negative_review1 == negative_review2):
            # print("repeated2")
            negative_review2 = embeddings_list[randint(0, last_review_index)]
        negative_reviews1.append(negative_review1)
        negative_reviews2.append(negative_review2)
    return np.array(negative_reviews1), np.array(negative_reviews2)

STEP 7 - Creating the model

In [16]:
def contrastive_loss(y_true, y_pred, margin=1.0):
    square_pred = tf.square(y_pred)
    margin_square = tf.square(tf.maximum(margin - y_pred, 0))
    return tf.reduce_mean((1 - y_true) * square_pred + y_true * margin_square)

def create_contrastive_model():
    user_input = Input(shape=(12,), name='user')
    user_dense = Dense(384, activation='relu', name='user_dense')(user_input)

    review_input = Input(shape=(384,), name='review')

    cosine_similarity = Lambda(lambda tensors: tf.reduce_sum(tensors[0] * tensors[1], axis=-1, keepdims=True)) \
                              ([user_dense, review_input])

    return Model([user_input, review_input], cosine_similarity, name='contrastive_model')

early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3)

def create_section_model(section_name, batch_size=512, epochs=15):
    print("Creating training data")
    train_users, train_section_reviews = get_correct_pairs("train", section_name)
    train_negative1, train_negative2 = create_double_negative_lists(train_section_reviews, "train", section_name)
    train_all_users = np.concatenate([train_users, train_users, train_users], axis=0)
    train_all_section_reviews = np.concatenate([train_section_reviews, train_negative1, train_negative2], axis=0)
    train_all_labels = get_set_labels("train")
    print("Creating validation data")
    val_users, val_section_reviews = get_correct_pairs("val", section_name)
    val_negative1, val_negative2 = create_double_negative_lists(val_section_reviews, "val", section_name)
    val_all_users = np.concatenate([val_users, val_users, val_users], axis=0)
    val_all_section_reviews = np.concatenate([val_section_reviews, val_negative1, val_negative2], axis=0)
    val_all_labels = get_set_labels("val")


    model = create_contrastive_model()

    model.compile(optimizer='adam', loss=contrastive_loss)

    history = model.fit(
        [train_all_users, train_all_section_reviews], train_all_labels,
        batch_size=batch_size,
        epochs=epochs,
        validation_data=([val_all_users, val_all_section_reviews], val_all_labels),
        callbacks=[early_stopping, lr_scheduler]
    )
    return model

In [17]:
for section_name in ["title", "positive", "negative"]:
    section_model = create_section_model(section_name)
    section_model.save(f"{section_name}_model.h5")

Creating training data
train-title: 0% completed
train-title: 5% completed
train-title: 10% completed
train-title: 15% completed
train-title: 20% completed
train-title: 25% completed
train-title: 30% completed
train-title: 35% completed
train-title: 40% completed
train-title: 45% completed
train-title: 50% completed
train-title: 55% completed
train-title: 60% completed
train-title: 65% completed
train-title: 70% completed
train-title: 75% completed
train-title: 80% completed
train-title: 85% completed
train-title: 90% completed
train-title: 95% completed
Creating validation data
val-title: 0% completed
val-title: 5% completed
val-title: 10% completed
val-title: 15% completed
val-title: 20% completed
val-title: 25% completed
val-title: 30% completed
val-title: 35% completed
val-title: 40% completed
val-title: 45% completed
val-title: 50% completed
val-title: 55% completed
val-title: 60% completed
val-title: 65% completed
val-title: 70% completed
val-title: 75% completed
val-title: 80% c



Creating training data
train-positive: 0% completed
train-positive: 5% completed
train-positive: 10% completed
train-positive: 15% completed
train-positive: 20% completed
train-positive: 25% completed
train-positive: 30% completed
train-positive: 35% completed
train-positive: 40% completed
train-positive: 45% completed
train-positive: 50% completed
train-positive: 55% completed
train-positive: 60% completed
train-positive: 65% completed
train-positive: 70% completed
train-positive: 75% completed
train-positive: 80% completed
train-positive: 85% completed
train-positive: 90% completed
train-positive: 95% completed
Creating validation data
val-positive: 0% completed
val-positive: 5% completed
val-positive: 10% completed
val-positive: 15% completed
val-positive: 20% completed
val-positive: 25% completed
val-positive: 30% completed
val-positive: 35% completed
val-positive: 40% completed
val-positive: 45% completed
val-positive: 50% completed
val-positive: 55% completed
val-positive: 60% co



Creating training data
train-negative: 0% completed
train-negative: 5% completed
train-negative: 10% completed
train-negative: 15% completed
train-negative: 20% completed
train-negative: 25% completed
train-negative: 30% completed
train-negative: 35% completed
train-negative: 40% completed
train-negative: 45% completed
train-negative: 50% completed
train-negative: 55% completed
train-negative: 60% completed
train-negative: 65% completed
train-negative: 70% completed
train-negative: 75% completed
train-negative: 80% completed
train-negative: 85% completed
train-negative: 90% completed
train-negative: 95% completed
Creating validation data
val-negative: 0% completed
val-negative: 5% completed
val-negative: 10% completed
val-negative: 15% completed
val-negative: 20% completed
val-negative: 25% completed
val-negative: 30% completed
val-negative: 35% completed
val-negative: 40% completed
val-negative: 45% completed
val-negative: 50% completed
val-negative: 55% completed
val-negative: 60% co

