In [54]:
import h5py
import configparser
import pandas as pd
import numpy as np
import torch
import os, sys
import pickle
import json
from sentence_transformers import SentenceTransformer
from nltk.tokenize import sent_tokenize

In [55]:
device = "cuda" if torch.cuda.is_available() else "cpu"

## Similarity with Document Embeddings

In [57]:
#precomputed_text_features_path = "/home/lcur0220/mma_2023_uva_group_1/dataloading/Funda/real_estate_text_features.pkl"
precomputed_text_features_path = "dataloading/Funda/real_estate_sentence_features.pkl"
with open(precomputed_text_features_path, "rb") as f:
    text_df = pickle.load(f)
    text_df.set_index(["funda_id", "text_id"], inplace=True, drop=True)
    text_features = np.array(text_df["text_features"].tolist())
    


In [58]:
def cosine_similarity(a, b):
        return (np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))).squeeze(0)
    
def aggregate_scores(embedding_df, scores):
        embedding_df["scores"] = scores
        indices = embedding_df.groupby("funda_id")["scores"].idxmax()
        return indices

## Encoding of query

In [56]:
csv_path = "dataloading/Funda/dataset.csv"
df = pd.read_csv(csv_path, dtype=str, index_col=0)
df = df.set_index('funda_identifier', drop=True)
df.index = df.index.astype(int)

In [77]:
model = SentenceTransformer('krlng/sts-GBERT-bi-encoder')
query = "a princess castle"
q_embedding = model.encode(query)
q_embedding = np.expand_dims(q_embedding, axis=0)

In [78]:
# Calculate cosine similarity
similarity_scores = cosine_similarity(q_embedding, text_features.T).round(4)

In [79]:
# Max aggregation of scores
max_scores_idxs = aggregate_scores(text_df, similarity_scores)
max_scores_ids = max_scores_idxs.apply(lambda x: x[1])
max_scores_ids.name = "max_id"
max_scores = text_df.loc[max_scores_idxs].scores
max_scores.name = "max_score"
max_features = text_df.loc[max_scores_idxs].text_features
max_features.name = "max_feature"

In [80]:
# Merge to main df
max_scores.index = max_scores.index.droplevel(1)
df_merged = df.merge(max_scores, how="left", left_index=True, right_index=True) # scores
df_merged = df_merged.merge(max_scores_ids, how="left", left_index=True, right_index=True) # which sentence

In [81]:
# Extract n'th sentence from description
def get_nth_sentence(data):
    n = data["max_id"]
    description = data["description"]
    return sent_tokenize(description)[n]

df_merged["max_sentence"] = df_merged.apply(get_nth_sentence, axis=1)

In [82]:
df_sorted = df_merged.sort_values("max_score", ascending=False)

In [83]:
df_sorted.head(5)[["max_score", "max_sentence"]]

Unnamed: 0_level_0,max_score,max_sentence
funda_identifier,Unnamed: 1_level_1,Unnamed: 2_level_1
42111216,0.0009,Would you like to live like a prince in an...
42101358,0.0008,Achter de woning bevindt zich namelijk een roy...
42127917,0.0008,Enter your own Palace with this unique split l...
42109838,0.0008,Living in a monumental warehouse on the fai...
42117848,0.0008,Welcome to your future Palace!


In [84]:
df_sorted.tail(5)[["max_score", "max_sentence"]]

Unnamed: 0_level_0,max_score,max_sentence
funda_identifier,Unnamed: 1_level_1,Unnamed: 2_level_1
42128486,0.0002,Eureka!
42125526,0.0002,"A perfect base; ready to move in, sleekly styl..."
42114116,0.0002,Sold immediately!
42128671,0.0002,SQUARE BUILDING NUMBER 5.30 • Nice corner apar...
42126717,0.0001,Oranjelaan 39 in 3843 AB Harderwijk
