In [33]:
import h5py
import configparser
import pandas as pd
import numpy as np
import torch
import os, sys
import pickle
import json
from sentence_transformers import SentenceTransformer
from nltk.tokenize import sent_tokenize

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"

## Encoding of query

In [51]:
model = SentenceTransformer('krlng/sts-GBERT-bi-encoder')
query = "Big open kitchen"
q_embedding = model.encode(query)
q_embedding = np.expand_dims(q_embedding, axis=0)

In [58]:
csv_path = "/home/lcur0220/mma_2023_uva_group_1/dataloading/Funda/dataset.csv"
df = pd.read_csv(csv_path, dtype=str, index_col=0)
df = df.set_index('funda_identifier', drop=True)
df.index = df.index.astype(int)

## Similarity with Document Embeddings

In [59]:
#precomputed_text_features_path = "/home/lcur0220/mma_2023_uva_group_1/dataloading/Funda/real_estate_text_features.pkl"
precomputed_text_features_path = "/home/lcur0220/mma_2023_uva_group_1/dataloading/Funda/real_estate_sentence_features.pkl"
with open(precomputed_text_features_path, "rb") as f:
    text_df = pickle.load(f)
    text_df.set_index(["funda_id", "text_id"], inplace=True, drop=True)
    text_features = np.array(text_df["text_features"].tolist())
    


In [60]:
def cosine_similarity(a, b):
        return (np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))).squeeze(0)
    
def aggregate_scores(embedding_df, scores):
    embedding_df["scores"] = scores
    indices = embedding_df.groupby("funda_id")["umap_x"].idxmax()
    return indices

In [61]:
# Calculate cosine similarity
similarity_scores = cosine_similarity(q_embedding, text_features.T).round(4)

In [62]:
# Max aggregation of scores
max_scores_idxs = aggregate_scores(text_df, similarity_scores)
max_scores_ids = max_scores_idxs.apply(lambda x: x[1])
max_scores_ids.name = "max_id"
max_scores = text_df.loc[max_scores_idxs].scores
max_scores.name = "max_score"
max_features = text_df.loc[max_scores_idxs].text_features
max_features.name = "max_feature"

In [63]:
# Merge to main df
max_scores.index = max_scores.index.droplevel(1)
df = df.merge(max_scores, how="left", left_index=True, right_index=True)
df = df.merge(max_scores_ids, how="left", left_index=True, right_index=True)

In [64]:
# Extract n'th sentence from description
def get_nth_sentence(data):
    n = data["max_id"]
    description = data["description"]
    return sent_tokenize(description)[n]

df["max_sentence"] = df.apply(get_nth_sentence, axis=1)

In [65]:
df = df.sort_values("max_score", ascending=False)

In [66]:
df.head(5)[["max_score", "max_sentence"]]

Unnamed: 0_level_0,max_score,max_sentence
funda_identifier,Unnamed: 1_level_1,Unnamed: 2_level_1
42127234,0.0012,Open kitchen with various equipment.
42108354,0.0012,"On the ground floor, the semi-open kitchen are..."
42107124,0.0011,Open kitchen with simple kitchen unit where yo...
42100590,0.0011,The simple semi-open kitchen offers possibilit...
42103363,0.0011,The living room with open kitchen is wonderful...


In [68]:
df.tail(5)[["max_score", "max_sentence"]]

Unnamed: 0_level_0,max_score,max_sentence
funda_identifier,Unnamed: 1_level_1,Unnamed: 2_level_1
42109297,0.0,There are two access roads towards Amsterdam: ...
42116892,0.0,Located just 200 meters walking distance from ...
42100753,0.0,B.jr.
42106040,0.0,The viewing days are on Friday 19-05-2023 and ...
42108163,0.0,Within 20 minutes you are in Amsterdam.
