STEP 0: Imports

In [1]:
# Adjusting current working directory to parent directory
from pathlib import Path
from os import chdir
# `current_directory` stays in the kernel namespace between executions of this
# cell. Re-running the cell therefore resolves the parent of the ORIGINAL
# working directory instead of climbing one level higher on every run.
try:
    current_directory
except NameError:  # First run - the name does not exist yet, so initialize it
    # Only NameError is expected here; a bare `except:` would also swallow
    # KeyboardInterrupt and unrelated failures.
    current_directory = Path.cwd()

# Switch to the parent directory so the project-relative `src.*` imports that
# follow can be resolved.
parent_directory = current_directory.parent
chdir(parent_directory)


# External imports
from sentence_transformers import SentenceTransformer

# Internal imports
from src.data.csv_tools import csv_to_dataframe
from src.data.pickle_tools import save_to_pickle
from src.utils.preprocessing_tools import *

STEP 1: Merge and aggregate the raw data:
    For each accommodation, collect a list of all the reviews relevant to it.
    Create a single dataset for each of the train and val sets: first merge in the users by user id (user ids are unique), then merge with the reviews by review id.

In [None]:
# Aggregate the reviews of every accommodation into a list, one data split at a
# time, and print one sample entry per split as a quick visual check.
# NOTE(review): `create_accommodation_reviews` comes from the star import of
# src.utils.preprocessing_tools; it is presumably a dict-like object keyed by
# accommodation id - confirm against its definition.
# `accommodation_reviews` is overwritten on every iteration, so after the loop
# it only holds the result for the last split.
for set_name in ["train", "val", "test"]:
    accommodation_reviews = create_accommodation_reviews(set_name)
    if len(accommodation_reviews) == 0:
        # Nothing to preview; avoid failing on the first-key lookup below.
        print(f"No accommodations found for {set_name}")
        continue
    print("Example: first accommodation reviews")
    # Take the first key lazily instead of materializing every key in a list.
    first_accommodation_id = next(iter(accommodation_reviews.keys()))
    print(first_accommodation_id, accommodation_reviews[first_accommodation_id])

In [None]:
# Build the concatenated dataset for the train split and for the val split
train_processed_set, val_processed_set = (
    create_concatenated_set(split_name) for split_name in ("train", "val")
)

# Report the length of each resulting set, in the order named by the header line
print("Sizes: train, val")
print(*map(len, (train_processed_set, val_processed_set)))

In [None]:
# The three free-text review fields; missing entries in them are replaced by
# empty strings in every split.
text_columns = ["review_title", "review_positive", "review_negative"]

# Train and val were already built in an earlier cell - fill them in place
for labelled_set in (train_processed_set, val_processed_set):
    labelled_set[text_columns] = labelled_set[text_columns].fillna("")

# The test split is read straight from its reviews CSV, then filled the same way
test_processed_set = csv_to_dataframe("test", "reviews")
test_processed_set[text_columns] = test_processed_set[text_columns].fillna("")

# Validation: print the per-column count of remaining missing values per split
for header, frame in (
    ("Missing values in train:", train_processed_set),
    ("Missing values in val:", val_processed_set),
    ("Missing values in test:", test_processed_set),
):
    print(header)
    print(frame[text_columns].isna().sum())

In [6]:
# Name of the sentence-transformers checkpoint used to embed the review texts
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"

# Instantiate the embedding model once; the encoding cells below reuse `model`
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

To verify that the model works on a small sample of the data, run the cell below; otherwise, you can skip it.

In [None]:
# Sanity check: encode the three text fields of the first few validation rows
# and print the shape of each resulting embedding array.
sanity_dataset = val_processed_set.head()
title_vectors, positive_vectors, negative_vectors = (
    model.encode(sanity_dataset[column_name].values)
    for column_name in ("review_title", "review_positive", "review_negative")
)
for sample_vectors in (title_vectors, positive_vectors, negative_vectors):
    print(sample_vectors.shape)

In [8]:
# Pair every split name with its processed dataset: a tuple of (name, data) tuples
set_name_data_pair = tuple(
    zip(
        ("train", "val", "test"),
        (train_processed_set, val_processed_set, test_processed_set),
    )
)

In [None]:
# For every split, embed each free-text review field with the sentence model,
# map every review id to its vector, and pickle one dictionary per
# (split, field) pair under the name "<split>_<field>_vectors_dict".
# The field name drives the source column ("review_<field>"), the progress
# message and the pickle name.
review_text_fields = ("title", "positive", "negative")

for set_name, set_data in set_name_data_pair:
    review_ids = set_data["review_id"].values
    # Used only to print one sample vector per field as a visual check
    first_review_id = review_ids[0]

    for field in review_text_fields:
        field_vectors = model.encode(set_data[f"review_{field}"].values, show_progress_bar=True)
        # zip relies on encode returning one vector per input text, in input order
        # NOTE(review): duplicate review ids would collapse to a single entry - confirm ids are unique
        field_vectors_dict = dict(zip(review_ids, field_vectors))
        print(f"Finished {field}. Check for {set_name}: review {first_review_id}, vector:{field_vectors_dict[first_review_id]}")
        save_to_pickle(field_vectors_dict, f"{set_name}_{field}_vectors_dict")