In [20]:
def semantic_search_airbnb(query):
    
    # Libraries
    import string
    import re
    import nltk
    from nltk.stem import PorterStemmer, WordNetLemmatizer
    import pickle
    import torch
    import pandas as pd
    from sentence_transformers import SentenceTransformer, util
    
    # Download nltk
    nltk.download('stopwords') 
    nltk.download('wordnet')

    ##
    ## Cleaning
    ##
    
    # Function to perform all cleaning steps
    def clean_text(text):

        # Remove punctuation
        text = "".join([char for char in text if char not in string.punctuation])

        # Lowercase the text
        text = text.lower()

        # Tokenization
        tokens = re.split(r'\W+', text)

        # Remove stopwords
        tokens = [word for word in tokens if word not in stopwords]

        # Stemming
        tokens = [porter_stemmer.stem(word) for word in tokens]

        # Lemmatization
        tokens = [wordnet_lemmatizer.lemmatize(word) for word in tokens]

        return tokens

    # Set of English stopwords
    stopwords = set(nltk.corpus.stopwords.words('english'))

    # Initialize stemmer and lemmatizer
    porter_stemmer = PorterStemmer()
    wordnet_lemmatizer = WordNetLemmatizer()
    
    ##
    ## Semantic Search
    ##
    
    # SBERT model name
    model_name = 'multi-qa-MiniLM-L6-cos-v1'

    # Initialize SBERT model
    model = SentenceTransformer(model_name)

    # Cached Embeddings Path (changes according to model)
    embedding_cache_path = f'cache\\cached-embeddings-{model_name}_weighted_clean.pkl'

    with open(embedding_cache_path, "rb") as fIn:
            cache_data = pickle.load(fIn)
    
    # Create a weight tensor
    weights = torch.tensor([0.5, 0.5])
    embeddings = ['embeddings_host','embeddings_reviews']
    corpus_embeddings = torch.zeros_like(cache_data[embeddings[0]])  # Initialize an empty tensor

    for i, corpus in enumerate(embeddings):

        # Weight the vectors with the specified weights
        weighted_embeddings = cache_data[corpus] * weights[i]

        # Add the weighted embeddings to the corpus_embeddings
        corpus_embeddings += weighted_embeddings

    # Encode the query
    clean_query = pd.Series(query).apply(clean_text)
    query_embedding = model.encode(query,show_progress_bar=True,convert_to_tensor=True)

    top_k = 10
    search_results = util.semantic_search(
        query_embedding, corpus_embeddings, top_k=top_k
    )

    # Extract the indices of the most similar sentences
    similar_indices = search_results[0][0:top_k]

    # Extract the actual sentences
    df_result = pd.DataFrame()
    for col in ['id', 'name', 'description', 'listing_url', 'picture_url', 'price', 'neighbourhood']:
        for indice in [similar_indices[i]['corpus_id'] for i in range(len(similar_indices))]:
            df_result.loc[indice,col] = cache_data[col][indice]
    df_result['Score'] = [item['score'] for item in similar_indices]

    return df_result.to_json()

In [21]:
df = semantic_search_airbnb('romantic cabin by the beach')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\anton\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\anton\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [25]:
df

'{"id":{"28644":44047433.0,"32057":7.346902248e+17,"15639":38890851.0,"31616":53852504.0,"1464":18433456.0,"1671":6.732050553e+17,"18675":30882750.0,"17401":45257193.0,"3186":3563679.0,"15686":38890624.0},"name":{"28644":"Boutique hotel in Hermosa Beach \\u00b7 \\u26055.0 \\u00b7 1 bedroom \\u00b7 1 bed \\u00b7 1 private bath","32057":"Home in Los Angeles \\u00b7 1 bedroom \\u00b7 1 bed \\u00b7 2 baths","15639":"Bed and breakfast in Long Beach \\u00b7 \\u26054.83 \\u00b7 2 bedrooms \\u00b7 2 beds \\u00b7 1 private bath","31616":"Boat in Long Beach \\u00b7 \\u26054.40 \\u00b7 2 bedrooms \\u00b7 3 beds \\u00b7 Half-bath","1464":"Cottage in Malibu \\u00b7 \\u26054.95 \\u00b7 2 bedrooms \\u00b7 2 beds \\u00b7 1.5 baths","1671":"Boat in Marina del Rey \\u00b7 \\u26054.92 \\u00b7 2 bedrooms \\u00b7 2 beds \\u00b7 1.5 baths","18675":"Cabin in Los Angeles \\u00b7 \\u26054.89 \\u00b7 1 bedroom \\u00b7 1 bed \\u00b7 1.5 baths","17401":"Nature lodge in Avalon \\u00b7 \\u26055.0 \\u00b7 12 beds \\