### Import Libraries

In [44]:
import pandas as pd
import os
from sentence_transformers import SentenceTransformer, util
import torch
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import string
import re
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer

pd.set_option("max_colwidth", 500)

### Read

#### Listings

In [45]:
listings_folder_path = '..\\data\\raw\\listings'

df_listings = pd.DataFrame()
for listing_file in os.listdir(listings_folder_path):
    listing_file_path = os.path.join(listings_folder_path,listing_file) 
    df = pd.read_csv(listing_file_path,compression='gzip')
    df_listings = pd.concat([df,df_listings])

In [46]:
# Split name into actual name and summary
df_listings['subtext'] = df_listings['name'].str.split(' · ').str[1:].apply(lambda x: ' · '.join(x))
df_listings['subtext'] = df_listings['subtext'].str.replace('·','•')
df_listings['name'] = df_listings['name'].str.split(' · ').str[0]

df_listings.rename(columns={'listing_url':'link','picture_url':'photo','neighbourhood':'location'},inplace=True)

#### Reviews

In [47]:
reviews_folder_path = '..\\data\\raw\\reviews'

df_reviews = pd.DataFrame()
for review_file in os.listdir(reviews_folder_path):
    review_file_path = os.path.join(reviews_folder_path,review_file) 
    df = pd.read_csv(review_file_path,compression='gzip')
    df_reviews = pd.concat([df,df_reviews])

### Join Data

In [48]:
# Listings
listings_id_column = 'id'
listings_nlp_columns = [
    'amenities',
    'accommodates',
    'name',
    'subtext',
    'property_type',
    'room_type',
    'location',
    'neighbourhood_cleansed',
    'description'
]

# Other columns to save
cols_aux_final = ['id','name','subtext','description','link','photo','price','location']

df_listings.loc[:,'corpus_text_host'] = ''
for nlp_col in listings_nlp_columns:
    df_listings.loc[:,'corpus_text_host'] += ' ' + df_listings.loc[:,nlp_col].fillna('').astype(str)+ '. '
df_listings = df_listings[cols_aux_final+['corpus_text_host']]    

# Reviews
df_reviews_grouped_id = df_reviews.groupby(
    by='listing_id',
    as_index=False
).agg(
    {'comments': lambda review: ' '.join(review.fillna(''))}
)

# Final
df = pd.merge(
    left=df_listings,
    right=df_reviews_grouped_id,
    left_on='id',
    right_on='listing_id',
    how='left'
)

df.rename(columns={'comments':'corpus_text_reviews'},inplace=True)
df.drop(['listing_id'],axis=1,inplace=True)

### Preprocess for NLP

In [49]:
# Download nltk
nltk.download('stopwords') 
nltk.download('wordnet')

# Function to perform all cleaning steps
def clean_text(text):
        
    # Remove punctuation
    text = "".join([char for char in text if char not in string.punctuation])
    
    # Lowercase the text
    text = text.lower()
    
    # Tokenization
    tokens = re.split(r'\W+', text)
    
    # Remove stopwords
    tokens = [word for word in tokens if word not in stopwords]
    
    # Stemming
    tokens = [porter_stemmer.stem(word) for word in tokens]
    
    # Lemmatization
    tokens = [wordnet_lemmatizer.lemmatize(word) for word in tokens]

    return tokens

# Set of English stopwords
stopwords = set(nltk.corpus.stopwords.words('english'))

# Initialize stemmer and lemmatizer
porter_stemmer = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\anton\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\anton\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Model

In [50]:
df = df

#### Encode Corpus

In [51]:
# SBERT model name
model_name = 'multi-qa-MiniLM-L6-cos-v1'

# Initialize SBERT model
print('##### INITIALIZING SBERT MODEL #####')
model = SentenceTransformer(model_name)

# Cached Embeddings Path (changes according to model)
embedding_cache_path = f'cache\\cached-embeddings-{model_name}_weighted_clean.pkl'

# Current corpus texts
current_corpus_texts_host = df['corpus_text_host'].fillna('')
current_corpus_texts_reviews = df['corpus_text_reviews'].fillna('')
   
# If cache pkl file path exists
if os.path.exists(embedding_cache_path):
    None
#     print('##### CACHED EMBEDDINGS PICKLE FOUND #####')

#     # Read cached embeddings
#     with open(embedding_cache_path, "rb") as fIn:
#         cache_data = pickle.load(fIn)
    
#     # Extract corpus text and embeddings from cache pkl
#     cache_corpus_texts_host = cache_data['text_embeddings_host']
#     cache_corpus_texts_reviews = cache_data['text_embeddings_reviews']
#     cache_corpus_embeddings_host = cache_data['embeddings_host']
#     cache_corpus_embeddings_reviews = cache_data['embeddings_reviews']
    
#     print('##### IDENTIFYING CORPUS TEXTS NOT IN CACHE #####')
#     corpus_text_host_not_in_cache = []
#     corpus_text_reviews_not_in_cache = []
#     for text_host, text_reviews in zip(current_corpus_texts_host, current_corpus_texts_reviews):
#         if text_reviews not in cache_corpus_texts_reviews:
#             print('> TEXT NO. {:,.0f} ({:,.0%} OF TOTAL DATASET)'.format(i, len(corpus_text_host_not_in_cache)/len(current_corpus_texts_reviews)))
#             corpus_text_host_not_in_cache.append(text_host)
#             corpus_text_reviews_not_in_cache.append(text_reviews)
    
#     if corpus_text_reviews_not_in_cache != []:
        
#         # Apply the cleaning function to the 'corpus_text' column
#         print('##### CLEANING NEW CORPUS TEXTS #####')
#         corpus_text_reviews_not_in_cache = pd.Series(corpus_text_reviews_not_in_cache).apply(clean_text).to_list()
#         corpus_text_host_not_in_cache = pd.Series(corpus_text_host_not_in_cache).apply(clean_text).to_list()

#         # Encode ONLY the current corpus texts that aren't in cache into embeddings
#         print('##### ENCODING IDENTIFIED CORPUS TEXTS #####')
#         remaining_corpus_embeddings_reviews = model.encode(corpus_text_reviews_not_in_cache,show_progress_bar=True,convert_to_tensor=True)
#         remaining_corpus_embeddings_host = model.encode(corpus_text_host_not_in_cache,show_progress_bar=True,convert_to_tensor=True)
        
#     else:
#         print('> NO NEW CORPUS TEXTS')
#         remaining_corpus_embeddings_reviews = torch.empty(0)
#         remaining_corpus_embeddings_host = torch.empty(0)
        
#     # Joining corpus data into single objects for export later
#     corpus_embeddings_host = torch.cat((cache_corpus_embeddings_host,remaining_corpus_embeddings_host), dim=0)
#     corpus_embeddings_reviews = torch.cat((cache_corpus_embeddings_reviews,remaining_corpus_embeddings_reviews), dim=0)
    
#     corpus_texts_host = cache_corpus_texts_host + corpus_text_host_not_in_cache
#     corpus_texts_reviews = cache_corpus_texts_reviews + corpus_text_reviews_not_in_cache
    
#     # Update & export complete text and embeddings as pkl for future executions
#     print('##### EXPORTING  #####')
#     with open(embedding_cache_path, "wb") as fOut:
#         pickle.dump(storage_dict, fOut)
    
else:
    print('##### CACHED EMBEDDINGS PICKLE NOT FOUND #####')
    corpus_texts_host = current_corpus_texts_host
    corpus_texts_reviews = current_corpus_texts_reviews

    # Encode ALL the current corpus texts into embeddings
    print('##### ENCODING ALL CORPUS TEXTS #####')
    storage_dict = {}
    for corpus_name, corpus_text in zip(['embeddings_host','embeddings_reviews'],[corpus_texts_host, corpus_texts_reviews]):
        print(f'> {corpus_name}')
        corpus_embeddings = model.encode(corpus_text,show_progress_bar=True,convert_to_tensor=True)
        
        storage_dict['text_'+corpus_name.split('_')[1]] = corpus_text 
        storage_dict[corpus_name] = corpus_embeddings
        
    for col in cols_aux_final:
        storage_dict[col] = df[col].to_list()
        
# Update & export complete text and embeddings as pkl for future executions
print('##### EXPORTING  #####')
with open(embedding_cache_path, "wb") as fOut:
    pickle.dump(storage_dict, fOut)

##### INITIALIZING SBERT MODEL #####
##### CACHED EMBEDDINGS PICKLE NOT FOUND #####
##### ENCODING ALL CORPUS TEXTS #####
> embeddings_host


Batches:   0%|          | 0/1394 [00:00<?, ?it/s]

KeyboardInterrupt: 

#### Weight Encodings

In [None]:
# Create a weight tensor
weights = torch.tensor([0.5, 0.5])
embeddings = ['embeddings_host','embeddings_reviews']
corpus_embeddings = torch.zeros_like(storage_dict[embeddings[0]])  # Initialize an empty tensor

for i, corpus in enumerate(embeddings):
    
    # Weight the vectors with the specified weights
    weighted_embeddings = storage_dict[corpus] * weights[i]
    
    
    # Add the weighted embeddings to the corpus_embeddings
    corpus_embeddings += weighted_embeddings

#### Encode Query

In [None]:
# Encode the query
query = "Cozy cabin close to beach"
clean_query = pd.Series(query).apply(clean_text)
query_embedding = model.encode(query,show_progress_bar=True,convert_to_tensor=True)

#### Apply Semantic Search

In [None]:
top_k = 10

search_results = util.semantic_search(
    query_embedding, corpus_embeddings, top_k=top_k
)

# Extract the indices of the most similar sentences
similar_indices = search_results[0][0:top_k]

In [84]:
# Extract the actual sentences
df_result = pd.DataFrame()

for col in ['id', 'name', 'description', 'listing_url', 'picture_url', 'price', 'neighbourhood']:
    for indice in [similar_indices[i]['corpus_id'] for i in range(len(similar_indices))]:
        df_result.loc[indice,col] = storage_dict[col][i]
    
df_result['Score'] = [item['score'] for item in similar_indices]

df_result

Unnamed: 0,id,name,description,listing_url,picture_url,price,neighbourhood,Score
33100,35910806.0,Nature lodge in Avalon · ★4.20 · 8 beds · 0 shared baths,"Our Safari tents provide you with the basics. You will have a large tent cabin that sleeps up to 8 ppl, 4 bunk beds w/mattresses and a nice deck in front. Your tent is just steps from the beach, bathrooms, and the camp store. Bring a fitted sheet, a sleeping bag, and toiletries. At night when all is quiet, sit on your deck, gaze up at the stars, listen to the waves crashing on the beach. You are on island time!<br /><br /><b>The space</b><br />Whites Landing is a secluded cove 3 miles east ...",https://www.airbnb.com/rooms/35910806,https://a0.muscache.com/pictures/f9c0763e-9446-4d85-87b1-3a14cfab7cf3.jpg,$152.00,"Avalon, California, United States",0.665866
21813,35910806.0,Nature lodge in Avalon · ★4.20 · 8 beds · 0 shared baths,"Our Safari tents provide you with the basics. You will have a large tent cabin that sleeps up to 8 ppl, 4 bunk beds w/mattresses and a nice deck in front. Your tent is just steps from the beach, bathrooms, and the camp store. Bring a fitted sheet, a sleeping bag, and toiletries. At night when all is quiet, sit on your deck, gaze up at the stars, listen to the waves crashing on the beach. You are on island time!<br /><br /><b>The space</b><br />Whites Landing is a secluded cove 3 miles east ...",https://www.airbnb.com/rooms/35910806,https://a0.muscache.com/pictures/f9c0763e-9446-4d85-87b1-3a14cfab7cf3.jpg,$152.00,"Avalon, California, United States",0.662432
14317,35910806.0,Nature lodge in Avalon · ★4.20 · 8 beds · 0 shared baths,"Our Safari tents provide you with the basics. You will have a large tent cabin that sleeps up to 8 ppl, 4 bunk beds w/mattresses and a nice deck in front. Your tent is just steps from the beach, bathrooms, and the camp store. Bring a fitted sheet, a sleeping bag, and toiletries. At night when all is quiet, sit on your deck, gaze up at the stars, listen to the waves crashing on the beach. You are on island time!<br /><br /><b>The space</b><br />Whites Landing is a secluded cove 3 miles east ...",https://www.airbnb.com/rooms/35910806,https://a0.muscache.com/pictures/f9c0763e-9446-4d85-87b1-3a14cfab7cf3.jpg,$152.00,"Avalon, California, United States",0.661356
24282,35910806.0,Nature lodge in Avalon · ★4.20 · 8 beds · 0 shared baths,"Our Safari tents provide you with the basics. You will have a large tent cabin that sleeps up to 8 ppl, 4 bunk beds w/mattresses and a nice deck in front. Your tent is just steps from the beach, bathrooms, and the camp store. Bring a fitted sheet, a sleeping bag, and toiletries. At night when all is quiet, sit on your deck, gaze up at the stars, listen to the waves crashing on the beach. You are on island time!<br /><br /><b>The space</b><br />Whites Landing is a secluded cove 3 miles east ...",https://www.airbnb.com/rooms/35910806,https://a0.muscache.com/pictures/f9c0763e-9446-4d85-87b1-3a14cfab7cf3.jpg,$152.00,"Avalon, California, United States",0.655653
3610,35910806.0,Nature lodge in Avalon · ★4.20 · 8 beds · 0 shared baths,"Our Safari tents provide you with the basics. You will have a large tent cabin that sleeps up to 8 ppl, 4 bunk beds w/mattresses and a nice deck in front. Your tent is just steps from the beach, bathrooms, and the camp store. Bring a fitted sheet, a sleeping bag, and toiletries. At night when all is quiet, sit on your deck, gaze up at the stars, listen to the waves crashing on the beach. You are on island time!<br /><br /><b>The space</b><br />Whites Landing is a secluded cove 3 miles east ...",https://www.airbnb.com/rooms/35910806,https://a0.muscache.com/pictures/f9c0763e-9446-4d85-87b1-3a14cfab7cf3.jpg,$152.00,"Avalon, California, United States",0.654913
22119,35910806.0,Nature lodge in Avalon · ★4.20 · 8 beds · 0 shared baths,"Our Safari tents provide you with the basics. You will have a large tent cabin that sleeps up to 8 ppl, 4 bunk beds w/mattresses and a nice deck in front. Your tent is just steps from the beach, bathrooms, and the camp store. Bring a fitted sheet, a sleeping bag, and toiletries. At night when all is quiet, sit on your deck, gaze up at the stars, listen to the waves crashing on the beach. You are on island time!<br /><br /><b>The space</b><br />Whites Landing is a secluded cove 3 miles east ...",https://www.airbnb.com/rooms/35910806,https://a0.muscache.com/pictures/f9c0763e-9446-4d85-87b1-3a14cfab7cf3.jpg,$152.00,"Avalon, California, United States",0.649561
2465,35910806.0,Nature lodge in Avalon · ★4.20 · 8 beds · 0 shared baths,"Our Safari tents provide you with the basics. You will have a large tent cabin that sleeps up to 8 ppl, 4 bunk beds w/mattresses and a nice deck in front. Your tent is just steps from the beach, bathrooms, and the camp store. Bring a fitted sheet, a sleeping bag, and toiletries. At night when all is quiet, sit on your deck, gaze up at the stars, listen to the waves crashing on the beach. You are on island time!<br /><br /><b>The space</b><br />Whites Landing is a secluded cove 3 miles east ...",https://www.airbnb.com/rooms/35910806,https://a0.muscache.com/pictures/f9c0763e-9446-4d85-87b1-3a14cfab7cf3.jpg,$152.00,"Avalon, California, United States",0.649321
29443,35910806.0,Nature lodge in Avalon · ★4.20 · 8 beds · 0 shared baths,"Our Safari tents provide you with the basics. You will have a large tent cabin that sleeps up to 8 ppl, 4 bunk beds w/mattresses and a nice deck in front. Your tent is just steps from the beach, bathrooms, and the camp store. Bring a fitted sheet, a sleeping bag, and toiletries. At night when all is quiet, sit on your deck, gaze up at the stars, listen to the waves crashing on the beach. You are on island time!<br /><br /><b>The space</b><br />Whites Landing is a secluded cove 3 miles east ...",https://www.airbnb.com/rooms/35910806,https://a0.muscache.com/pictures/f9c0763e-9446-4d85-87b1-3a14cfab7cf3.jpg,$152.00,"Avalon, California, United States",0.647911
17401,35910806.0,Nature lodge in Avalon · ★4.20 · 8 beds · 0 shared baths,"Our Safari tents provide you with the basics. You will have a large tent cabin that sleeps up to 8 ppl, 4 bunk beds w/mattresses and a nice deck in front. Your tent is just steps from the beach, bathrooms, and the camp store. Bring a fitted sheet, a sleeping bag, and toiletries. At night when all is quiet, sit on your deck, gaze up at the stars, listen to the waves crashing on the beach. You are on island time!<br /><br /><b>The space</b><br />Whites Landing is a secluded cove 3 miles east ...",https://www.airbnb.com/rooms/35910806,https://a0.muscache.com/pictures/f9c0763e-9446-4d85-87b1-3a14cfab7cf3.jpg,$152.00,"Avalon, California, United States",0.647903
1434,35910806.0,Nature lodge in Avalon · ★4.20 · 8 beds · 0 shared baths,"Our Safari tents provide you with the basics. You will have a large tent cabin that sleeps up to 8 ppl, 4 bunk beds w/mattresses and a nice deck in front. Your tent is just steps from the beach, bathrooms, and the camp store. Bring a fitted sheet, a sleeping bag, and toiletries. At night when all is quiet, sit on your deck, gaze up at the stars, listen to the waves crashing on the beach. You are on island time!<br /><br /><b>The space</b><br />Whites Landing is a secluded cove 3 miles east ...",https://www.airbnb.com/rooms/35910806,https://a0.muscache.com/pictures/f9c0763e-9446-4d85-87b1-3a14cfab7cf3.jpg,$152.00,"Avalon, California, United States",0.646953


### Test optimal weight

0.1

In [107]:
# Create a weight tensor

for j in range(10):
    
    weights = torch.tensor([j/10, 1-(j/10)])
    embeddings = ['embeddings_host','embeddings_reviews']
    corpus_embeddings = torch.zeros_like(storage_dict[embeddings[0]])  # Initialize an empty tensor

    for i, corpus in enumerate(embeddings):

        # Weight the vectors with the specified weights
        weighted_embeddings = storage_dict[corpus] * weights[i]


        # Add the weighted embeddings to the corpus_embeddings
        corpus_embeddings += weighted_embeddings
        
    # Encode the query
    query = "Romantic for couple in mountains"
    clean_query = pd.Series(query).apply(clean_text)
    query_embedding = model.encode(query,show_progress_bar=False,convert_to_tensor=True)
    search_results = util.semantic_search(
        query_embedding, corpus_embeddings, top_k=1
    )

    # Extract the indices of the most similar sentences
    score = search_results[0][0]['score']
    
    print('Weights: [{:,.2f}, {:,.2f}]'.format(j/10, 1-(j/10)), '| Max Score: {:,.3f}'.format(score))

Weights: [0.00, 1.00] | Max Score: 0.488
Weights: [0.10, 0.90] | Max Score: 0.487
Weights: [0.20, 0.80] | Max Score: 0.483
Weights: [0.30, 0.70] | Max Score: 0.474
Weights: [0.40, 0.60] | Max Score: 0.462
Weights: [0.50, 0.50] | Max Score: 0.448
Weights: [0.60, 0.40] | Max Score: 0.438
Weights: [0.70, 0.30] | Max Score: 0.424
Weights: [0.80, 0.20] | Max Score: 0.407
Weights: [0.90, 0.10] | Max Score: 0.390
