### **Step 0:** Import necessary libraries and connect to Google Drive for file access and output storage.

In [None]:
# Adjusting current working directory to parent directory
from pathlib import Path
from os import chdir
from platform import system

# Resolve the project directory once per kernel session and make it the working directory.
# `current_directory` only exists after this cell has already run in the current kernel,
# so a NameError on the bare name lookup below is the "first run" signal.
try:
    current_directory
except NameError:  # First run - initialize current_directory
    current_directory = Path.cwd()
    if system() == "Linux":  # Colab -- NOTE(review): this is also true on any local Linux machine
        from google.colab import drive
        drive.mount('/content/drive')
        current_directory = f"{current_directory}/drive/MyDrive/Colab Notebooks/RecTour2024Challenge"
    else:
        # Local run: move up to the parent of the directory the notebook was started in.
        current_directory = current_directory.parent
finally:
    # Executed on the first and on every later run, so re-running the cell
    # never climbs one more directory level.
    chdir(current_directory)



# External imports
import pandas as pd
import numpy as np

from random import randint

import torch
import torch.nn as nn

from sentence_transformers import SentenceTransformer

import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Lambda
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.models import load_model



# Internal imports
from src.data.csv_tools import csv_to_dataframe, dataframe_to_csv, save_submission
from src.data.pickle_tools import save_to_pickle, load_pickle
from src.data.keras_tools import save_keras_model_weights, load_keras_model_weights
from src.utils.preprocessing_tools import *

Mounted at /content/drive


# ***Prediction process for the third approach***

### Loading the trained SentenceTransformer model

In [None]:
from sentence_transformers import SentenceTransformer
# Instantiate the trained model from the name/path "user_review_matching_model2".
# NOTE(review): presumably a local folder relative to the working directory set in Step 0 -- confirm.
model = SentenceTransformer("user_review_matching_model2")

### Preparing test data and splitting for evaluation

In [None]:
# accommodation_id -> list of review ids belonging to that accommodation.
accommodation_reviews_dict = load_pickle("test_reviews_grouped_by_accommodation")
test_users_df = csv_to_dataframe("test", "users")
test_review_df = csv_to_dataframe("test", "reviews")
# Submission layout: the two key columns followed by the ten ranked review slots.
columns = ["accommodation_id", "user_id"] + [f"review_{i}" for i in range(1, 11)]
# Split the users into 20 contiguous chunks (one submission part per chunk).
# Row positions are split and then sliced with .iloc: this yields the same chunks as
# np.array_split(test_users_df, 20) but avoids the deprecated DataFrame.swapaxes path
# that np.array_split takes when it is handed a DataFrame directly.
split_dfs = [
    test_users_df.iloc[chunk_positions]
    for chunk_positions in np.array_split(np.arange(len(test_users_df)), 20)
]

  return bound(*args, **kwds)


### Precomputing and saving review embedding vectors

In [None]:
# review_id -> embedding vector. Reuse the cached pickle when it can be loaded,
# otherwise encode every test review once and cache the result.
reviews_reviews_vectors_dict = None
try:
    reviews_reviews_vectors_dict = load_pickle("test_hf_reviews_vectors_dict")
except Exception:
    # Cache could not be loaded: recompute. `except Exception` (not a bare `except`) so that
    # KeyboardInterrupt / SystemExit are not mistaken for a missing cache.
    review_ids = test_review_df["review_id"].values
    # One text per review: title, positive part and negative part joined by single spaces.
    combined_reviews = [
        f"{title} {positive} {negative}"
        for title, positive, negative in zip(
            test_review_df['review_title'].values,
            test_review_df['review_positive'].values,
            test_review_df['review_negative'].values)]
    reviews_vectors = model.encode(combined_reviews, show_progress_bar=True)
    reviews_reviews_vectors_dict = dict(zip(review_ids, reviews_vectors))
    save_to_pickle(reviews_reviews_vectors_dict, "test_hf_reviews_vectors_dict")
len(reviews_reviews_vectors_dict)

Batches:   0%|          | 0/6224 [00:00<?, ?it/s]

Object saved to test_hf_reviews_all_featurs_vectors_dict.pickle


199138

## Generating recommendations and creating submission files (split into 20 parts as checkpoints)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# accommodation_id -> list of candidate review embeddings, shared across all chunks so each
# accommodation's embeddings are gathered only once.
cache = {}
for index, split_df in enumerate(split_dfs):
    # Rows are collected in a plain list and turned into a DataFrame once per chunk;
    # appending to a DataFrame row by row copies the whole frame on every iteration.
    submission_rows = []
    for current_user_index, row in split_df.iterrows():
        if current_user_index % 2000 == 0:
            print(f"Current index: {current_user_index}")
        user_id = row["user_id"]
        accommodation_id = row["accommodation_id"]

        # The user is represented by a single space-separated string of its features.
        user_embedding = model.encode(f"{row['guest_country']} {row['guest_type']} {row['month']} {row['room_nights']} {row['accommodation_country']} {row['accommodation_type']} {row['accommodation_score']} {row['accommodation_star_rating']} {row['location_is_ski']} {row['location_is_beach']} {row['location_is_city_center']}")

        accommodation_reviews_ids = accommodation_reviews_dict[accommodation_id]
        if accommodation_id in cache:
            candidate_reviews_embeddings = cache[accommodation_id]
        else:
            candidate_reviews_embeddings = [reviews_reviews_vectors_dict[review_id] for review_id in accommodation_reviews_ids]
            cache[accommodation_id] = candidate_reviews_embeddings

        # Similarity of every candidate review to this user, as a flat vector.
        cosine_scores = cosine_similarity(candidate_reviews_embeddings, user_embedding.reshape(1, -1)).flatten()
        # Positions of the ten highest scores, best first.
        top_10_reviews = np.argsort(cosine_scores)[-10:][::-1]
        reviews = [accommodation_reviews_ids[i] for i in top_10_reviews]
        # Assumes every accommodation has at least 10 reviews, as the original
        # per-row Series construction (12 values for 12 columns) already did.
        submission_rows.append([accommodation_id, user_id] + reviews)

    submission_df = pd.DataFrame(submission_rows, columns=columns)
    dataframe_to_csv(submission_df, f"SubmissionPart{index}.csv")

Current index: 0
Current index: 2000
Current index: 4000
Current index: 6000
Current index: 8000
Current index: 10000
Current index: 12000
Current index: 14000
Current index: 16000
Current index: 18000
Current index: 20000
Current index: 22000
Current index: 24000
Current index: 26000
Current index: 28000
Current index: 30000
Current index: 32000
Current index: 34000
Current index: 36000
Current index: 38000
Current index: 40000
Current index: 42000
Current index: 44000
Current index: 46000
Current index: 48000
Current index: 50000
Current index: 52000
Current index: 54000
Current index: 56000
Current index: 58000
Current index: 60000
Current index: 62000
Current index: 64000
Current index: 66000
Current index: 68000
Current index: 70000
Current index: 72000
Current index: 74000
Current index: 76000
Current index: 78000
Current index: 80000
Current index: 82000
Current index: 84000
Current index: 86000
Current index: 88000
Current index: 90000
Current index: 92000
Current index: 94000


# ***Prediction process for the first and second approaches***

### **STEP 1:** Loading test data and precomputed embeddings

In [None]:
# Test users table: one row per (user, accommodation) pair to predict for.
test_users_df = csv_to_dataframe("test", "users")
# user_id -> precomputed user embedding (looked up by user_id in get_model_input).
users_embeddings_dict = load_pickle("test_user_embeddings_dict")
# review_id -> precomputed vector, one dict per review part (title / positive / negative).
title_vectors_dict = load_pickle("test_title_vectors_dict")
positive_vectors_dict = load_pickle("test_positive_vectors_dict")
negative_vectors_dict = load_pickle("test_negative_vectors_dict")
# accommodation_id -> list of review ids belonging to that accommodation.
accommodation_reviews_dict = load_pickle("test_reviews_grouped_by_accommodation")

### **STEP 2:** Load models

### To use the saved model from approach 2, we needed to redefine its architecture before loading the weights. This is because only the training weights were saved, not the architecture, due to the use of the Lambda layer, which requires explicit redefinition.

In [None]:
def create_contrastive_model():
    """Build the two-input dot-product scoring model whose weights are loaded for approach 2.

    The 'user' input (25 features) is projected to 384 dimensions by a ReLU Dense
    layer and scored against the 'review' input (384 features) with a dot product.

    Returns:
        An uncompiled Keras ``Model`` named 'contrastive_model' that maps
        ``[user, review]`` to one score per row.
    """
    user_features = Input(shape=(25,), name='user')
    projected_user = Dense(384, activation='relu', name='user_dense')(user_features)

    review_embedding = Input(shape=(384,), name='review')

    # Row-wise dot product; keepdims=True keeps a trailing axis of size 1.
    dot_product_layer = Lambda(
        lambda pair: tf.reduce_sum(pair[0] * pair[1], axis=-1, keepdims=True)
    )
    similarity = dot_product_layer([projected_user, review_embedding])

    return Model([user_features, review_embedding], similarity, name='contrastive_model')

# Approach 2: build one fresh architecture per review part, then restore its saved weights.
# NOTE(review): load_keras_model_weights presumably loads the weights into the given model
# in place (its return value is not used here) -- confirm against src.data.keras_tools.
positive_model = create_contrastive_model()
load_keras_model_weights(positive_model, "positive_model")
negative_model = create_contrastive_model()
load_keras_model_weights(negative_model, "negative_model")
title_model = create_contrastive_model()
load_keras_model_weights(title_model, "title_model")

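### To load the models for approach 1, we used the `load_model` method from Keras. This method directly loads the saved model, including both the architecture and the trained weights. Note that this cell assigns the same variable names as the approach 2 cell above, so whichever of the two cells runs last determines the models used in the following steps.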

In [None]:
from tensorflow.keras.models import load_model
# Approach 1: load the three fully saved models (one per review part) from .h5 files.
# NOTE: these assignments rebind positive_model / negative_model / title_model, replacing
# the approach 2 models if that cell was run earlier in the same kernel.
positive_model = load_model("out/models/positive_3_negative_with_embedded_of_acc_with_binary_crossentropy.h5")
negative_model = load_model("out/models/negative_3_negative_with_embedded_of_acc_with_binary_crossentropy.h5")
title_model = load_model("out/models/title_3_negative_with_embedded_of_acc_with_binary_crossentropy.h5")



### **STEP 3:** Prepare the input to the models

In [None]:
# Finding reviews that are likely to match a user given the accommodation
def get_model_input(user_id, accommodation_id):
    """Assemble the (user, review-part) input pairs for every review of an accommodation.

    Reads the module-level lookup dicts ``users_embeddings_dict``,
    ``accommodation_reviews_dict``, ``title_vectors_dict``,
    ``positive_vectors_dict`` and ``negative_vectors_dict``.

    Args:
        user_id: Key into ``users_embeddings_dict``.
        accommodation_id: Key into ``accommodation_reviews_dict``.

    Returns:
        A 3-tuple ``(title_input, positive_input, negative_input)``. Each item is a
        two-element list ``[user_batch, part_vectors]`` where ``user_batch`` repeats the
        user's embedding once per review and ``part_vectors`` stacks that review part's
        vectors in the same order as the accommodation's review ids.

    Raises:
        KeyError: If the user, the accommodation or one of its review ids is missing
            from the corresponding dict.
    """
    single_user_embedding = users_embeddings_dict[user_id]
    review_ids = accommodation_reviews_dict[accommodation_id]

    def stack_vectors(vectors_by_review_id):
        # One row per review, in the accommodation's review order.
        return np.array([vectors_by_review_id[review_id] for review_id in review_ids])

    title_matrix = stack_vectors(title_vectors_dict)
    positive_matrix = stack_vectors(positive_vectors_dict)
    negative_matrix = stack_vectors(negative_vectors_dict)

    # The same user embedding is paired with every candidate review.
    user_batch = np.array([single_user_embedding for _ in review_ids])

    return (
        [user_batch, title_matrix],
        [user_batch, positive_matrix],
        [user_batch, negative_matrix],
    )

### **STEP 4:** Preparing columns and splitting test data into chunks

In [None]:
# Submission layout: the two key columns followed by the ten ranked review slots.
columns = ["accommodation_id", "user_id"] + [f"review_{i}" for i in range(1, 11)]
# Split the users into 20 contiguous chunks (one submission part per chunk).
# Row positions are split and then sliced with .iloc: this yields the same chunks as
# np.array_split(test_users_df, 20) but avoids the deprecated DataFrame.swapaxes path
# that np.array_split takes when it is handed a DataFrame directly.
split_dfs = [
    test_users_df.iloc[chunk_positions]
    for chunk_positions in np.array_split(np.arange(len(test_users_df)), 20)
]

  return bound(*args, **kwds)


### **STEP 5:** Predict

In [None]:
# For every chunk of test users: score all reviews of the user's accommodation with the three
# models, blend the scores, keep the ten best reviews and write the chunk as one CSV part.
for index, split_df in enumerate(split_dfs):
    # Rows are collected in a plain list and turned into a DataFrame once per chunk;
    # appending to a DataFrame row by row copies the whole frame on every iteration.
    submission_rows = []
    for current_user_index, row in split_df.iterrows():
        if current_user_index % 2000 == 0:
            print(f"Current index: {current_user_index}")
        user_id = row["user_id"]
        accommodation_id = row["accommodation_id"]
        title_input, positive_input, negative_input = get_model_input(user_id, accommodation_id)

        # Prediction for each of the three models (one score per candidate review, in column 0)
        title_prediction = title_model.predict(title_input, verbose=0)
        positive_prediction = positive_model.predict(positive_input, verbose=0)
        negative_prediction = negative_model.predict(negative_input, verbose=0)

        # Ensemble: weighted sum of the three per-review scores (title 0.2, positive 0.4, negative 0.4)
        ensemble_scores = (
            title_prediction[:, 0] * 0.2
            + positive_prediction[:, 0] * 0.4
            + negative_prediction[:, 0] * 0.4
        )

        # Positions of the ten highest blended scores, best first
        top_10_reviews = np.argsort(ensemble_scores)[-10:][::-1]
        candidate_review_ids = accommodation_reviews_dict[accommodation_id]
        reviews = [candidate_review_ids[i] for i in top_10_reviews]
        # Assumes every accommodation has at least 10 reviews, as the original
        # per-row Series construction (12 values for 12 columns) already did.
        submission_rows.append([accommodation_id, user_id] + reviews)

    submission_df = pd.DataFrame(submission_rows, columns=columns)
    dataframe_to_csv(submission_df, f"SubmissionPart{index}.csv")


Current index: 0




Current index: 2000
Current index: 4000
Current index: 6000
Current index: 8000
Current index: 10000
Current index: 12000
Current index: 14000
Current index: 16000
Current index: 18000
Current index: 20000
Current index: 22000
Current index: 24000
Current index: 26000
Current index: 28000
Current index: 30000
Current index: 32000
Current index: 34000
Current index: 36000
Current index: 38000
Current index: 40000
Current index: 42000
Current index: 44000
Current index: 46000
Current index: 48000
Current index: 50000
Current index: 52000
Current index: 54000
Current index: 56000
Current index: 58000
Current index: 60000
Current index: 62000
Current index: 64000
Current index: 66000
Current index: 68000
Current index: 70000
Current index: 72000
Current index: 74000
Current index: 76000
Current index: 78000
Current index: 80000
Current index: 82000
Current index: 84000
Current index: 86000
Current index: 88000
Current index: 90000
Current index: 92000
Current index: 94000
Current index: 96

### Finally, we combine all 20 parts into a single submission file. This applies to all three approaches.

In [None]:
import os
import pandas as pd

# Read the 20 per-chunk CSV parts from the current working directory, in part order,
# and stack them into one frame with a fresh 0..n-1 index.
folder_path = os.getcwd()
file_paths = [
    f"{folder_path}/SubmissionPart{part_number}.csv"
    for part_number in range(20)
]
merged_df = pd.concat(
    (pd.read_csv(part_path) for part_path in file_paths),
    ignore_index=True,
)


In [None]:
# Hand the merged frame to the project's submission writer (src.data.csv_tools.save_submission).
save_submission(merged_df)

# ***In this section, we present the experiments we conducted on the validation set to select appropriate ensemble weights for the three models in approaches 1 and 2.***

### We started by loading and preparing all the necessary components for validation

In [None]:
import pandas as pd

# The file was written together with its row index, which is stored as an unnamed first
# column; index_col=0 reads it back as the index instead of a stray "Unnamed: 0" column.
val_top_accomodations_df = pd.read_csv("out/top_accommodations.csv", index_col=0)
val_top_accomodations_df.head()

Unnamed: 0,accommodation_id
0,937920089
1,359671908
2,1395491826
3,-978565784
4,365136640


In [None]:
# Keep only the validation rows whose accommodation is one of the selected top accommodations.
val_top_accomodations_set = set(val_top_accomodations_df["accommodation_id"].tolist())
val_proccessed_df = csv_to_dataframe("val")
filtered_rows_df = val_proccessed_df.loc[
    val_proccessed_df["accommodation_id"].isin(val_top_accomodations_set)
]
filtered_rows_df.shape

(35092, 19)

In [None]:
# Rebind the lookup dicts used by get_model_input to their validation counterparts.
users_embeddings_dict = load_pickle("val_user_embeddings_dict")
title_vectors_dict = load_pickle("val_title_vectors_dict")
positive_vectors_dict = load_pickle("val_positive_vectors_dict")
negative_vectors_dict = load_pickle("val_negative_vectors_dict")
accommodation_reviews_dict = load_pickle("val_reviews_grouped_by_accommodation")

# Split the filtered validation rows into 5 contiguous chunks (one checkpoint per chunk).
# Row positions are split and then sliced with .iloc: this yields the same chunks as
# np.array_split(filtered_rows_df, 5) but avoids the deprecated DataFrame.swapaxes path
# that np.array_split takes when it is handed a DataFrame directly.
split_dfs = [
    filtered_rows_df.iloc[chunk_positions]
    for chunk_positions in np.array_split(np.arange(len(filtered_rows_df)), 5)
]

  return bound(*args, **kwds)


### Weights we tried

In [None]:
# Candidate ensemble weights as (title, positive, negative) tuples, each mapped to a
# score accumulator that starts at 0.
weights_dict = dict.fromkeys(
    [
        (0.5, 0.3, 0.2),
        (0.4, 0.4, 0.2),
        (0.6, 0.3, 0.1),
        (0.7, 0.2, 0.1),
        (0.3, 0.5, 0.2),
        (0.3, 0.6, 0.1),
        (0.2, 0.7, 0.1),
        (0.4, 0.3, 0.3),
        (0.2, 0.4, 0.4),
        (0.1, 0.3, 0.6),
        (0.1, 0.2, 0.7),
        (0.2, 0.3, 0.5),
        (0.1, 0.4, 0.5),
        (0.3, 0.2, 0.5),
        (0.5, 0.2, 0.3),
        (0.4, 0.1, 0.5),
    ],
    0,
)

### Prediction on validation for all weight combinations

In [None]:
from statistics import mean


# Only a window of four weight combinations is evaluated per run; start_index selects the
# window and is also part of the checkpoint file name.
start_index = 0

my_weights = list(weights_dict.keys())[start_index: start_index + 4]

# Reset the accumulators of this window so that re-running the cell starts from zero
# instead of adding a second pass on top of the previous totals.
for weights in my_weights:
    weights_dict[weights] = 0

for index, split_df in enumerate(split_dfs):
    print(index)
    for current_user_index, row in split_df.iterrows():
        if current_user_index % 2000 == 0:
            print(f"Current index: {current_user_index}")
        user_id = row["user_id"]
        accommodation_id = row["accommodation_id"]
        true_review = row["review_id"]
        title_input, positive_input, negative_input = get_model_input(user_id, accommodation_id)

        # The model predictions do not depend on the weights, so they are computed once per
        # row; column 0 holds one score per candidate review.
        title_scores = title_model.predict(title_input, verbose=0)[:, 0]
        positive_scores = positive_model.predict(positive_input, verbose=0)[:, 0]
        negative_scores = negative_model.predict(negative_input, verbose=0)[:, 0]
        candidate_review_ids = accommodation_reviews_dict[accommodation_id]

        for title_weight, positive_weight, negative_weight in my_weights:
            # Get the top 10 reviews where the weighted score is the highest, best first
            ensemble_scores = (
                title_scores * title_weight
                + positive_scores * positive_weight
                + negative_scores * negative_weight
            )
            top_10_reviews = np.argsort(ensemble_scores)[-10:][::-1]
            reviews = [candidate_review_ids[i] for i in top_10_reviews]
            if true_review not in reviews:
                # A true review outside the top 10 contributes nothing.
                continue
            # Reciprocal rank of the true review within the top 10 (rank 1 -> 1.0).
            true_review_rank = reviews.index(true_review) + 1
            weights_dict[(title_weight, positive_weight, negative_weight)] += 1 / true_review_rank
    # Checkpoint the running totals after each chunk.
    save_to_pickle(weights_dict, f"weights_dict{start_index}")



0




Current index: 12000
Current index: 20000
Current index: 30000
Current index: 40000
Object saved to weights_dict0.pickle
1
Current index: 52000
Current index: 54000
Current index: 58000
Current index: 62000
Current index: 76000
Object saved to weights_dict0.pickle
2
Current index: 82000
Current index: 100000
Object saved to weights_dict0.pickle
3
Current index: 124000
Current index: 132000
Current index: 148000
Object saved to weights_dict0.pickle
4
Current index: 194000
Object saved to weights_dict0.pickle


In [None]:
# Report, per weight combination, the accumulated reciprocal-rank total divided by the
# number of filtered validation rows. Combinations that were not evaluated stay at 0.
validation_row_count = len(filtered_rows_df)
for key, value in weights_dict.items():
    print(f"{key}: {value / validation_row_count}")

(0.5, 0.3, 0.2): 0.02027823915006181
(0.4, 0.4, 0.2): 0.020443270206749083
(0.6, 0.3, 0.1): 0.019449962818821852
(0.7, 0.2, 0.1): 0.01894927211737312
(0.3, 0.5, 0.2): 0.0
(0.3, 0.6, 0.1): 0.0
(0.2, 0.7, 0.1): 0.0
(0.4, 0.3, 0.3): 0.0
(0.2, 0.4, 0.4): 0.0
(0.1, 0.3, 0.6): 0.0
(0.1, 0.2, 0.7): 0.0
(0.2, 0.3, 0.5): 0.0
(0.1, 0.4, 0.5): 0.0
(0.3, 0.2, 0.5): 0.0
(0.5, 0.2, 0.3): 0.0
(0.4, 0.1, 0.5): 0.0
