In [1]:
# Adjusting current working directory to parent directory
from pathlib import Path
from os import chdir
from platform import system

try:
    current_directory
except: # First  run - initialize current_directory
    current_directory = Path.cwd()
    if system() == "Linux": # Colab
        from google.colab import drive
        drive.mount('/content/drive')
        current_directory = f"{current_directory}/drive/MyDrive/Colab Notebooks/RecTour2024Challenge"
    else:
        current_directory = current_directory.parent
finally:
    chdir(current_directory)



# External imports
import pandas as pd
import numpy as np

from random import randint

import torch
import torch.nn as nn

from sentence_transformers import SentenceTransformer

import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Lambda
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.models import load_model



# Internal imports
from src.data.csv_tools import csv_to_dataframe, dataframe_to_csv, save_submission
from src.data.pickle_tools import save_to_pickle, load_pickle
from src.data.keras_tools import save_keras_model, load_keras_model
from src.utils.preprocessing_tools import *

Mounted at /content/drive


STEP 1 - Load data

In [2]:
test_users_df = csv_to_dataframe("test", "users")
users_embeddings_dict = load_pickle("test_user_embeddings_dict")
title_vectors_dict = load_pickle("test_title_vectors_dict")
positive_vectors_dict = load_pickle("test_positive_vectors_dict")
negative_vectors_dict = load_pickle("test_negative_vectors_dict")
accomodation_reviews_dict = load_pickle("test_reviews_grouped_by_accommodation")

STEP 2 - Load models

In [3]:
def create_contrastive_model():
    user_input = Input(shape=(12,), name='user')
    user_dense = Dense(384, activation='relu', name='user_dense')(user_input)

    review_input = Input(shape=(384,), name='review')

    cosine_similarity = Lambda(lambda tensors: tf.reduce_sum(tensors[0] * tensors[1], axis=-1, keepdims=True)) \
                              ([user_dense, review_input])

    return Model([user_input, review_input], cosine_similarity, name='contrastive_model')

In [4]:
title_model = create_contrastive_model()
title_model.load_weights("title_model.h5")
positive_model = create_contrastive_model()
positive_model.load_weights("positive_model.h5")
negative_model = create_contrastive_model()
negative_model.load_weights("negative_model.h5")

STEP 3 - Prepare the input to the models

In [5]:
def get_model_input(user_id, accommodation_id):
    user_embedding = users_embeddings_dict[user_id]
    accomodation_reviews_ids = accomodation_reviews_dict[accommodation_id]
    titles_embeddings = [title_vectors_dict[review_id] for review_id in accomodation_reviews_ids]
    positive_embeddings = [positive_vectors_dict[review_id] for review_id in accomodation_reviews_ids]
    negative_embeddings = [negative_vectors_dict[review_id] for review_id in accomodation_reviews_ids]
    user_embedding = np.array([user_embedding] * len(titles_embeddings))
    titles_embeddings = np.array(titles_embeddings)
    positive_embeddings = np.array(positive_embeddings)
    negative_embeddings = np.array(negative_embeddings)
    return [user_embedding, titles_embeddings], [user_embedding, positive_embeddings], [user_embedding, negative_embeddings]

In [6]:
# Create df - Structure: accommodation_id,user_id,review_1,review_2,review_3,...,review_10
columns = ["accommodation_id", "user_id"] + [f"review_{i}" for i in range(1, 11)]
submission_df = pd.DataFrame(columns=columns)

In [7]:
split_dfs = np.array_split(test_users_df, 20)

  return bound(*args, **kwds)


In [8]:
type(test_users_df)

In [None]:
submission_df = pd.DataFrame(columns=columns)
for current_user_index, row in test_users_df.iterrows():
    if current_user_index % 2000 == 0:
        print(f"Current index: {current_user_index}")
    user_id = row["user_id"]
    accomodation_id = row["accommodation_id"]
    title_input, positive_input, negative_input = get_model_input(user_id, accomodation_id)
    title_prediction = title_model.predict(title_input, verbose = 0)
    positive_prediction = positive_model.predict(positive_input, verbose = 0)
    negative_prediction = negative_model.predict(negative_input, verbose = 0)
    # Get the top 10 reviews where the average is the highest
    top_10_reviews = []
    for i in range(len(title_prediction)):
        top_10_reviews.append((title_prediction[i] + positive_prediction[i] + negative_prediction[i]).mean())
    top_10_reviews = np.argsort(top_10_reviews)[-10:]
    reviews = [accomodation_reviews_dict[accomodation_id][i] for i in top_10_reviews]
    submission_df = submission_df._append(pd.Series([accomodation_id, user_id] + reviews, index=columns), ignore_index=True)

save_submission(submission_df)


Current index: 0




Current index: 2000
Current index: 4000
Current index: 6000
Current index: 8000
Current index: 10000
Current index: 12000
Current index: 14000
Current index: 16000
Current index: 18000
Current index: 20000
Current index: 22000
Current index: 24000
Current index: 26000
Current index: 28000
Current index: 30000
Current index: 32000
Current index: 34000
Current index: 36000
Current index: 38000
Current index: 40000
Current index: 42000
Current index: 44000
Current index: 46000
Current index: 48000
Current index: 50000
Current index: 52000
Current index: 54000
Current index: 56000
Current index: 58000
Current index: 60000
Current index: 62000
Current index: 64000
Current index: 66000
Current index: 68000
Current index: 70000
Current index: 72000
Current index: 74000
Current index: 76000
Current index: 78000
Current index: 80000
Current index: 82000
Current index: 84000
Current index: 86000
Current index: 88000
Current index: 90000
Current index: 92000
Current index: 94000
Current index: 96