### Import Libraries

In [1]:
import pandas as pd
import os
from sentence_transformers import SentenceTransformer, util
import torch
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import string
import re
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer

pd.set_option("max_colwidth", 500)

### Read

In [133]:
print("Reading data...")

##
## Read
##

## Listings
listings_folder_path = '..\\data\\raw\\listings'

# Initialize an empty DataFrame for listings
df_listings = pd.DataFrame()

# Iterate through each listing file in the folder
for listing_file in os.listdir(listings_folder_path):
    listing_file_path = os.path.join(listings_folder_path, listing_file)

    # Read the CSV file with gzip compression
    df = pd.read_csv(listing_file_path, compression='gzip')

    # Concatenate the current DataFrame with the overall listings DataFrame
    df_listings = pd.concat([df, df_listings])

print("Listings data read successfully.")

## Reviews
reviews_folder_path = '..\\data\\raw\\reviews'

# Initialize an empty DataFrame for reviews
df_reviews = pd.DataFrame()

# Iterate through each review file in the folder
for review_file in os.listdir(reviews_folder_path):
    review_file_path = os.path.join(reviews_folder_path, review_file)

    # Read the CSV file with gzip compression
    df = pd.read_csv(review_file_path, compression='gzip')

    # Concatenate the current DataFrame with the overall reviews DataFrame
    df_reviews = pd.concat([df, df_reviews])

print("Reviews data read successfully.")

print("Transforming data...")

##
## Transform
##

# Listings
# Split name into actual name and summary
df_listings['subtext'] = df_listings['name'].str.split(' · ').str[1:].apply(lambda x: ' · '.join(x))
df_listings['subtext'] = df_listings['subtext'].str.replace('·', '•')
df_listings['name'] = df_listings['name'].str.split(' · ').str[0]

# Rename columns for clarity
df_listings.rename(columns={'listing_url': 'link', 'picture_url': 'photo', 'neighbourhood': 'location','review_scores_rating':'starRating','latitude':'lat','longitude':'long'}, inplace=True)

listings_id_column = 'id'
listings_nlp_columns = [
    'amenities',
    'accommodates',
    'name',
    'subtext',
    'property_type',
    'room_type',
    'location',
    'neighbourhood_cleansed',
    'description'
]

# Other columns to save in the DataFrame
cols_aux_final = ['id', 'name', 'subtext', 'description', 'link', 'photo', 'price', 'location','starRating','lat','long']

# Combine specified columns into a new column 'corpus_text_host'
df_listings.loc[:, 'corpus_text_host'] = ''
for nlp_col in listings_nlp_columns:
    df_listings.loc[:, 'corpus_text_host'] += ' ' + df_listings.loc[:, nlp_col].fillna('').astype(str) + '. '

# Select final columns for the listings DataFrame
df_listings = df_listings[cols_aux_final + ['corpus_text_host']]

# Rename the 'comments' column to 'corpus_text_reviews'
df_reviews.rename(columns={'comments': 'corpus_text_reviews'}, inplace=True)

Reading data...
Listings data read successfully.
Reviews data read successfully.
Transforming data...


In [134]:
df_reviews = df_reviews.loc[:500]

### Preprocess for NLP

In [135]:
# Download nltk
nltk.download('stopwords') 
nltk.download('wordnet')

# Function to perform all cleaning steps
def clean_text(text):
        
    # Remove punctuation
    text = "".join([char for char in text if char not in string.punctuation])
    
    # Lowercase the text
    text = text.lower()
    
    # Tokenization
    tokens = re.split(r'\W+', text)
    
    # Remove stopwords
    tokens = [word for word in tokens if word not in stopwords]
    
    # Stemming
    tokens = [porter_stemmer.stem(word) for word in tokens]
    
    # Lemmatization
    tokens = [wordnet_lemmatizer.lemmatize(word) for word in tokens]

    return tokens

# Set of English stopwords
stopwords = set(nltk.corpus.stopwords.words('english'))

# Initialize stemmer and lemmatizer
porter_stemmer = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\anton\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\anton\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [136]:
df_reviews['corpus_text_reviews'] = df_reviews['corpus_text_reviews'].apply(clean_text)

In [137]:
df_reviews = df_reviews[df_reviews['corpus_text_reviews'].str.len()>1].reset_index(drop=True)

In [138]:
df_reviews

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,corpus_text_reviews
0,8941071,68391055,2016-04-04,10164333,Smruti,"[daniel, great, host, extrem, respons, question, flexibl, extend, stay, kind, apart, exactli, look, pictur, cute, clean, comfort, locat, amaz, quiet, block, close, restaur, whole, food, stay, 2, month, start, job, realli, felt, like, home, thank, daniel]"
1,8941071,153719836,2017-05-21,97944097,Rob,"[apart, great, u, spend, weekend, weho, clean, quiet, night, street, away, good, restaur, cafe, bar, u, thing, miss, iron, cloth, underground, park, good, bmw, x5, big, space, would, fit, ]"
2,8941071,147589354,2017-04-27,4123723,Widya,"[daniel, great, host, concern, guest, wellb, easi, reach, fast, repli, apart, perfectli, locat, walk, access, suppli, shop, hip, place, jog, neighborhood, cant, better, apart, spaciou, bright, comfort, thing, miss, like, toaster, bit, darkli, lit, night, definit, great, valu, moneybrthank, daniel]"
3,8941071,145742425,2017-04-19,1459499,Darian,"[great, locat, spaciou, daniel, place, home, away, home, also, flexibl, check, inout, time, realli, appreci]"
4,8941071,144400833,2017-04-15,98494277,Charlie,"[daniel, place, expect, realli, good, locat, calm, cosybrdaniel, help, make, stress, free, experi, brwould, recommend, ]"
...,...,...,...,...,...,...
489,9260889,57583165,2015-12-27,1965583,Iqbal,"[home, describ, tast, decor, famili, friendli, quiet, neighborhood, clean, well, furnish, kitchen, well, stock, pan, dinnerwar, flatwar, backyard, nice, outdoor, din, home, close, univers, hollywood, blvd, sara, hous, manag, quick, respond, queri, easili, reachabl, text, phone, even, stock, kitchen, basic, perish, nice, gestur, kid, enjoy, home, much, comfort, well, entertain, thank, vast, collect, toy, book, dvd, roku, top, well, decor, christma, porch, light, christma, tree, thank, sara, l..."
490,8987171,263744451,2018-05-12,78791844,Jhj931220@Nate.Com,"[1파머스마켓, 그로브몰라끄마, 큰, 마켓, 가까워서, 걸어다니며, 구경하기, 좋았음, br2숙소의, 안전이, 보장된, 곳, 아파트, 단지, 내에, 거리도, 깨끗하고, 경비가, 철저함br3방에서, 보이는, 뷰가, 멋짐, 매일, 아침, 뷰를, 보며, 일어날때, 설렘, br4우리가, 영어를, 못하여, 많은, 대화가없었지만, 배려를, 해주시는게, 느껴졌음, br5방과, 욕실은, 매우깨끗했고, 침대가, 편했음, br6우버를, 이용하여, 다녔지만, 모든, 관광지를, 다니기에, 적절한, 위치였다]"
491,8987171,253214879,2018-04-13,99804734,Johnny,"[小区很新, 房主也很友善, 女主人是一位很友善会讲中文的漂亮小姐姐, 男主人整天也是对房客笑呵呵的, 唯一的问题是对于自驾的朋友, 因为小区停车位很少, 晚归的话停车不是很方便, ]"
492,8987171,240928049,2018-03-06,174145199,Heuidong,"[큰, 단지, 안에, 있는, 아파트입니다, 치안이, 좋고, 근처에, 쇼핑몰도, 있습니다, 비버리힐즈, 할리우드와, 가까우며, 방에서, 보는, 뷰가, 굉장히, 좋습니다, 호스트는, 굉장히, 친절하며, 편안함을, 줍니다, 완전, 강추]"


### Model

In [140]:
# SBERT model name
model_name = 'multi-qa-MiniLM-L6-cos-v1'

# Initialize SBERT model
print('##### INITIALIZING SBERT MODEL #####')
model = SentenceTransformer(model_name)

# Cached Embeddings Path (changes according to model)
embedding_cache_path = f'cache\\cached-embeddings-{model_name}_mean_average_clean.pkl'

# Current corpus texts
current_corpus_texts_reviews = df_reviews['corpus_text_reviews']

# Encode ALL the current corpus texts into embeddings
print('##### ENCODING ALL CORPUS TEXTS #####')
storage_dict = {}
for corpus_name, corpus_text in zip(['embeddings_reviews'],[current_corpus_texts_reviews]):
    print(f'> {corpus_name}')
    corpus_embeddings = model.encode(corpus_text,show_progress_bar=True,convert_to_tensor=True)

    storage_dict['text_'+corpus_name.split('_')[1]] = corpus_text 
    storage_dict[corpus_name] = corpus_embeddings
    
    for col in ['listing_id']:
        storage_dict[col] = df_reviews[col].to_list()
    
        
# Update & export complete text and embeddings as pkl for future executions
print('##### EXPORTING  #####')
with open(embedding_cache_path, "wb") as fOut:
    pickle.dump(storage_dict, fOut)

##### INITIALIZING SBERT MODEL #####
##### ENCODING ALL CORPUS TEXTS #####
> embeddings_reviews


Batches:   0%|          | 0/16 [00:00<?, ?it/s]

##### EXPORTING  #####


#### Weight Encodings

In [159]:
df = pd.DataFrame.from_dict(storage_dict)
df['embeddings_reviews'] = df['embeddings_reviews'].apply(lambda x: list(x))

In [160]:
def prod(x):
    print(x)
    return torch.prod(torch.tensor(x),dim=0)

In [168]:
df.loc[[30,31],'embeddings_reviews']

30    [tensor(-0.0024), tensor(0.0206), tensor(0.0077), tensor(0.0575), tensor(-0.1284), tensor(0.0862), tensor(0.0500), tensor(-0.0066), tensor(-0.0383), tensor(-0.0341), tensor(0.0182), tensor(-0.0815), tensor(0.0348), tensor(-0.0187), tensor(-0.0028), tensor(0.0786), tensor(0.1157), tensor(-0.0105), tensor(-0.0589), tensor(0.0636), tensor(-0.0822), tensor(-0.0003), tensor(0.0335), tensor(0.0440), tensor(-0.0271), tensor(-0.0251), tensor(0.0467), tensor(0.0090), tensor(0.0279), tensor(0.0114), t...
31    [tensor(0.0765), tensor(0.0222), tensor(-0.0268), tensor(0.0626), tensor(-0.0333), tensor(-0.0570), tensor(-0.0566), tensor(-0.0922), tensor(0.0163), tensor(-0.0328), tensor(0.0463), tensor(-0.0725), tensor(-0.0354), tensor(0.0587), tensor(0.0654), tensor(-0.1209), tensor(0.0185), tensor(-0.0587), tensor(-0.0003), tensor(-0.0067), tensor(-0.0058), tensor(-0.0511), tensor(-0.0405), tensor(0.0499), tensor(-0.0481), tensor(-0.0501), tensor(-0.0258), tensor(0.0455), tensor(-0.0692), tens

In [167]:
torch.prod(torch.tensor(df.loc[[30,31],'embeddings_reviews']),dim=0)

ValueError: could not determine the shape of object type 'Series'

In [161]:
df.groupby(['listing_id'])['embeddings_reviews'].apply(prod).reset_index()

30    [tensor(-0.0024), tensor(0.0206), tensor(0.0077), tensor(0.0575), tensor(-0.1284), tensor(0.0862), tensor(0.0500), tensor(-0.0066), tensor(-0.0383), tensor(-0.0341), tensor(0.0182), tensor(-0.0815), tensor(0.0348), tensor(-0.0187), tensor(-0.0028), tensor(0.0786), tensor(0.1157), tensor(-0.0105), tensor(-0.0589), tensor(0.0636), tensor(-0.0822), tensor(-0.0003), tensor(0.0335), tensor(0.0440), tensor(-0.0271), tensor(-0.0251), tensor(0.0467), tensor(0.0090), tensor(0.0279), tensor(0.0114), t...
Name: 8925296, dtype: object


ValueError: could not determine the shape of object type 'Series'

In [35]:
import numpy as np

# Create a dictionary to store mean averaged embeddings for each listing
mean_embeddings_by_listing = {}

# Iterate over each listing
for idx, listing_id in enumerate(storage_dict['listing_id']):
    # Extract embeddings for the current listing
    embeddings_for_listing = storage_dict['embeddings_reviews'][idx]

    # Calculate mean of embeddings for the current listing
    mean_embeddings = np.mean(embeddings_for_listing, axis=0)

    # Store mean embeddings in the dictionary
    mean_embeddings_by_listing[listing_id] = mean_embeddings

# Now, mean_embeddings_by_listing contains the mean embeddings for each listing


TypeError: mean() received an invalid combination of arguments - got (axis=int, dtype=NoneType, out=NoneType, ), but expected one of:
 * (*, torch.dtype dtype)
 * (tuple of ints dim, bool keepdim, *, torch.dtype dtype)
 * (tuple of names dim, bool keepdim, *, torch.dtype dtype)


In [32]:
# Create a weight tensor
weights = torch.tensor([1])
embeddings = ['embeddings_reviews']
corpus_embeddings = torch.zeros_like(storage_dict[embeddings[0]])  # Initialize an empty tensor

for i, corpus in enumerate(embeddings):
    
    # Weight the vectors with the specified weights
    weighted_embeddings = storage_dict[corpus] * weights[i]
    
    # Add the weighted embeddings to the corpus_embeddings
    corpus_embeddings += weighted_embeddings

In [33]:
corpus_embeddings

tensor([[ 0.0187,  0.0276,  0.0005,  ..., -0.0364,  0.0141, -0.0065],
        [ 0.0339, -0.0022, -0.0277,  ..., -0.0818,  0.0386,  0.0328],
        [ 0.0187,  0.0276,  0.0005,  ..., -0.0364,  0.0141, -0.0065],
        ...,
        [ 0.0078,  0.0219, -0.0255,  ..., -0.0012, -0.0163, -0.0233],
        [-0.0458,  0.0403,  0.0209,  ...,  0.0179,  0.0692, -0.0112],
        [-0.0419, -0.0097, -0.0258,  ..., -0.0459,  0.0498,  0.0442]])

#### Encode Query

In [17]:
# Encode the query
query = "Cozy cabin close to beach"
clean_query = pd.Series(query).apply(clean_text)
query_embedding = model.encode(query,show_progress_bar=True,convert_to_tensor=True)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

#### Apply Semantic Search

In [21]:
top_k = 10

search_results = util.semantic_search(
    query_embedding, corpus_embeddings, top_k=top_k
)

# Extract the indices of the most similar sentences
similar_indices = search_results[0][0:top_k]

# Extract the actual sentences
similar_sentences = df.loc[[item['corpus_id'] for item in similar_indices],['name','description','corpus_text_host','corpus_text_reviews']]
similar_sentences['score'] = [item['score'] for item in similar_indices]
similar_sentences

KeyError: "None of [Index(['name', 'description', 'corpus_text_host', 'corpus_text_reviews'], dtype='object')] are in the [columns]"

### Test optimal weight

0.1

In [107]:
# Create a weight tensor

for j in range(10):
    
    weights = torch.tensor([j/10, 1-(j/10)])
    embeddings = ['embeddings_host','embeddings_reviews']
    corpus_embeddings = torch.zeros_like(storage_dict[embeddings[0]])  # Initialize an empty tensor

    for i, corpus in enumerate(embeddings):

        # Weight the vectors with the specified weights
        weighted_embeddings = storage_dict[corpus] * weights[i]


        # Add the weighted embeddings to the corpus_embeddings
        corpus_embeddings += weighted_embeddings
        
    # Encode the query
    query = "Romantic for couple in mountains"
    clean_query = pd.Series(query).apply(clean_text)
    query_embedding = model.encode(query,show_progress_bar=False,convert_to_tensor=True)
    search_results = util.semantic_search(
        query_embedding, corpus_embeddings, top_k=1
    )

    # Extract the indices of the most similar sentences
    score = search_results[0][0]['score']
    
    print('Weights: [{:,.2f}, {:,.2f}]'.format(j/10, 1-(j/10)), '| Max Score: {:,.3f}'.format(score))

Weights: [0.00, 1.00] | Max Score: 0.488
Weights: [0.10, 0.90] | Max Score: 0.487
Weights: [0.20, 0.80] | Max Score: 0.483
Weights: [0.30, 0.70] | Max Score: 0.474
Weights: [0.40, 0.60] | Max Score: 0.462
Weights: [0.50, 0.50] | Max Score: 0.448
Weights: [0.60, 0.40] | Max Score: 0.438
Weights: [0.70, 0.30] | Max Score: 0.424
Weights: [0.80, 0.20] | Max Score: 0.407
Weights: [0.90, 0.10] | Max Score: 0.390
