### Import Libraries

In [1]:
import pandas as pd
import os
from sentence_transformers import SentenceTransformer, util
import torch
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import string
import re
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer

pd.set_option("max_colwidth", 500)

### Read

#### Listings

In [2]:
listings_folder_path = '..\\data\\raw\\listings'

df_listings = pd.DataFrame()
for listing_file in os.listdir(listings_folder_path):
    listing_file_path = os.path.join(listings_folder_path,listing_file) 
    df = pd.read_csv(listing_file_path,compression='gzip')
    df_listings = pd.concat([df,df_listings])

#### Reviews

In [3]:
reviews_folder_path = '..\\data\\raw\\reviews'

df_reviews = pd.DataFrame()
for review_file in os.listdir(reviews_folder_path):
    review_file_path = os.path.join(reviews_folder_path,review_file) 
    df = pd.read_csv(review_file_path,compression='gzip')
    df_reviews = pd.concat([df,df_reviews])

In [4]:
df_reviews.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,8941071,68391055,2016-04-04,10164333,Smruti,"Danielle was a great host, she was extremely responsive to any questions I had, was flexible when I had to extend my stay and was very kind. The apartment is exactly how it looks in the pictures - very cute, clean and comfortable. The location is amazing on a quiet block but very close to restaurants and a Whole Foods. I stayed here for 2 months when I started my job and it really felt like home. Thanks Danielle!"
1,8941071,153719836,2017-05-21,97944097,Rob,"The apartment was great for us to spend the weekend in WeHo, very clean and quiet at night but just a street away from good restaurants, cafes and bars. For us the only thing missing was an iron for our clothes. The underground parking is good but our BMW X5 was as big as the space would fit."
2,8941071,147589354,2017-04-27,4123723,Widya,"Danielle is a great host, very concerned with her guests well-being, easy to reach and fast to reply. The apartment is perfectly located, walking access to supplies, shopping and hip places, or just for jogging the neighborhood:-) Can't be better! The apartment is spacious and bright, very comfortable. Few things are missing, like a toaster, and it is a bit darkly lit at night. But it's definitely great value for money.<br/>Thank you, Danielle!"
3,8941071,145742425,2017-04-19,1459499,Darian,"Great location and spacious. Danielle's place was a home away from home. She was also very flexible with check in/out times, which I really appreciated."
4,8941071,144400833,2017-04-15,98494277,Charlie,"Danielle's place was as expected, really good location, calm and cosy.<br/>Danielle was very helpful to make it a stress free experience !<br/>Would recommend"


### Preprocess for NLP

In [5]:
# Download nltk
nltk.download('stopwords') 
nltk.download('wordnet')

# Function to perform all cleaning steps
def clean_text(text):
        
    # Remove punctuation
    text = "".join([char for char in text if char not in string.punctuation])
    
    # Lowercase the text
    text = text.lower()
    
    # Tokenization
    tokens = re.split(r'\W+', text)
    
    # Remove stopwords
    tokens = [word for word in tokens if word not in stopwords]
    
    # Stemming
    tokens = [porter_stemmer.stem(word) for word in tokens]
    
    # Lemmatization
    tokens = [wordnet_lemmatizer.lemmatize(word) for word in tokens]

    return tokens

# Set of English stopwords
stopwords = set(nltk.corpus.stopwords.words('english'))

# Initialize stemmer and lemmatizer
porter_stemmer = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\anton\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\anton\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Model

In [41]:
df = df_reviews.loc[:1000,:]

#### Encode Corpus

In [42]:
text_cleaned = pd.Series(df['comments'].fillna('')).apply(clean_text)

In [43]:
corpus_df = pd.DataFrame(text_cleaned)

In [44]:
corpus_df = corpus_df[corpus_df['comments'].apply(lambda x: bool(x))]

def join_non_empty(lst):
    non_empty_lst = [item for item in lst if item]
    return ', '.join(non_empty_lst)

corpus_df['corpus'] = corpus_df['comments'].apply(join_non_empty)

In [45]:
corpus_df = corpus_df.reset_index(drop=True)

In [46]:
# SBERT model name
model_name = 'multi-qa-MiniLM-L6-cos-v1'

# Initialize SBERT model
print('##### INITIALIZING SBERT MODEL #####')
model = SentenceTransformer(model_name)

# Cached Embeddings Path (changes according to model)
embedding_cache_path = f'cache\\cached-embeddings-{model_name}_mean_average_clean.pkl'

# Current corpus texts
current_corpus_texts_reviews = corpus_df['corpus']
   
# If cache pkl file path exists
if os.path.exists(embedding_cache_path):
    None
#     print('##### CACHED EMBEDDINGS PICKLE FOUND #####')

#     # Read cached embeddings
#     with open(embedding_cache_path, "rb") as fIn:
#         cache_data = pickle.load(fIn)
    
#     # Extract corpus text and embeddings from cache pkl  
#     cache_corpus_texts = cache_data['text']
#     cache_corpus_embeddings = cache_data['embeddings']

#     print('##### IDENTIFYING CORPUS TEXTS NOT IN CACHE #####')
#     corpus_text_not_in_cache = []
#     for i, text in enumerate(current_corpus_texts):
#         if text not in cache_corpus_texts:
#             print('> TEXT NO. {:,.0f} ({:,.0%} OF TOTAL DATASET)'.format(i, len(corpus_text_not_in_cache)/len(current_corpus_texts)))
#             corpus_text_not_in_cache.append(text)
    
#     if corpus_text_not_in_cache != []:
        
#         # Apply the cleaning function to the 'corpus_text' column
#         print('##### CLEANING NEW CORPUS TEXTS #####')
#         corpus_text_not_in_cache = pd.Series(corpus_text_not_in_cache).apply(clean_text).to_list()

#         # Encode ONLY the current corpus texts that aren't in cache into embeddings
#         print('##### ENCODING IDENTIFIED CORPUS TEXTS #####')
#         remaining_corpus_embeddings = model.encode(corpus_text_not_in_cache,show_progress_bar=True,convert_to_tensor=True)
#     else:
#         print('> NO NEW CORPUS TEXTS')
#         remaining_corpus_embeddings = torch.empty(0)
        
#     # Joining corpus data into single objects for export later
#     corpus_embeddings = torch.cat((cache_corpus_embeddings,remaining_corpus_embeddings), dim=0)
#     corpus_texts = cache_corpus_texts + corpus_text_not_in_cache

else:
    print('##### CACHED EMBEDDINGS PICKLE NOT FOUND #####')
    corpus_texts_reviews = current_corpus_texts_reviews

    # Encode ALL the current corpus texts into embeddings
    print('##### ENCODING ALL CORPUS TEXTS #####')
    storage_dict = {}
    for corpus_name, corpus_text in zip(['embeddings_reviews'],[corpus_texts_reviews]):
        print(f'> {corpus_name}')
        text_cleaned = corpus_text
        corpus_embeddings = model.encode(text_cleaned,show_progress_bar=True,convert_to_tensor=True)
        
        storage_dict['text_'+corpus_name.split('_')[1]] = corpus_text 
        storage_dict[corpus_name] = corpus_embeddings
        
# Update & export complete text and embeddings as pkl for future executions
print('##### EXPORTING  #####')
with open(embedding_cache_path, "wb") as fOut:
    pickle.dump(storage_dict, fOut)

##### INITIALIZING SBERT MODEL #####
##### CACHED EMBEDDINGS PICKLE NOT FOUND #####
##### ENCODING ALL CORPUS TEXTS #####
> embeddings_reviews


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

##### EXPORTING  #####


#### Weight Encodings

In [48]:
# Create a weight tensor
weights = torch.tensor([1])
embeddings = ['embeddings_reviews']
corpus_embeddings = torch.zeros_like(storage_dict[embeddings[0]])  # Initialize an empty tensor

for i, corpus in enumerate(embeddings):
    
    # Weight the vectors with the specified weights
    weighted_embeddings = storage_dict[corpus] * weights[i]
    
    
    # Add the weighted embeddings to the corpus_embeddings
    corpus_embeddings += weighted_embeddings

#### Encode Query

In [49]:
# Encode the query
query = "Cozy cabin close to beach"
clean_query = pd.Series(query).apply(clean_text)
query_embedding = model.encode(query,show_progress_bar=True,convert_to_tensor=True)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

#### Apply Semantic Search

In [50]:
top_k = 10

search_results = util.semantic_search(
    query_embedding, corpus_embeddings, top_k=top_k
)

# Extract the indices of the most similar sentences
similar_indices = search_results[0][0:top_k]

# Extract the actual sentences
similar_sentences = df.loc[[item['corpus_id'] for item in similar_indices],['name','description','corpus_text_host','corpus_text_reviews']]
similar_sentences['score'] = [item['score'] for item in similar_indices]
similar_sentences

KeyError: "None of [Index(['name', 'description', 'corpus_text_host', 'corpus_text_reviews'], dtype='object')] are in the [columns]"

### Test optimal weight

0.1

In [107]:
# Create a weight tensor

for j in range(10):
    
    weights = torch.tensor([j/10, 1-(j/10)])
    embeddings = ['embeddings_host','embeddings_reviews']
    corpus_embeddings = torch.zeros_like(storage_dict[embeddings[0]])  # Initialize an empty tensor

    for i, corpus in enumerate(embeddings):

        # Weight the vectors with the specified weights
        weighted_embeddings = storage_dict[corpus] * weights[i]


        # Add the weighted embeddings to the corpus_embeddings
        corpus_embeddings += weighted_embeddings
        
    # Encode the query
    query = "Romantic for couple in mountains"
    clean_query = pd.Series(query).apply(clean_text)
    query_embedding = model.encode(query,show_progress_bar=False,convert_to_tensor=True)
    search_results = util.semantic_search(
        query_embedding, corpus_embeddings, top_k=1
    )

    # Extract the indices of the most similar sentences
    score = search_results[0][0]['score']
    
    print('Weights: [{:,.2f}, {:,.2f}]'.format(j/10, 1-(j/10)), '| Max Score: {:,.3f}'.format(score))

Weights: [0.00, 1.00] | Max Score: 0.488
Weights: [0.10, 0.90] | Max Score: 0.487
Weights: [0.20, 0.80] | Max Score: 0.483
Weights: [0.30, 0.70] | Max Score: 0.474
Weights: [0.40, 0.60] | Max Score: 0.462
Weights: [0.50, 0.50] | Max Score: 0.448
Weights: [0.60, 0.40] | Max Score: 0.438
Weights: [0.70, 0.30] | Max Score: 0.424
Weights: [0.80, 0.20] | Max Score: 0.407
Weights: [0.90, 0.10] | Max Score: 0.390
