In [9]:
def semantic_search_airbnb(query):
    
    # Libraries
    import string
    import re
    import nltk
    from nltk.stem import PorterStemmer, WordNetLemmatizer
    import pickle
    import torch
    import pandas as pd
    from sentence_transformers import SentenceTransformer, util
    
    # Download nltk
    nltk.download('stopwords') 
    nltk.download('wordnet')

    ##
    ## Cleaning
    ##
    
    # Function to perform all cleaning steps
    def clean_text(text):

        # Remove punctuation
        text = "".join([char for char in text if char not in string.punctuation])

        # Lowercase the text
        text = text.lower()

        # Tokenization
        tokens = re.split(r'\W+', text)

        # Remove stopwords
        tokens = [word for word in tokens if word not in stopwords]

        # Stemming
        tokens = [porter_stemmer.stem(word) for word in tokens]

        # Lemmatization
        tokens = [wordnet_lemmatizer.lemmatize(word) for word in tokens]

        return tokens

    # Set of English stopwords
    stopwords = set(nltk.corpus.stopwords.words('english'))

    # Initialize stemmer and lemmatizer
    porter_stemmer = PorterStemmer()
    wordnet_lemmatizer = WordNetLemmatizer()
    
    ##
    ## Semantic Search
    ##
    
    # SBERT model name
    model_name = 'multi-qa-MiniLM-L6-cos-v1'

    # Initialize SBERT model
    model = SentenceTransformer(model_name)

    # Cached Embeddings Path (changes according to model)
    embedding_cache_path = f'cache\\cached-embeddings-{model_name}_weighted_clean.pkl'

    with open(embedding_cache_path, "rb") as fIn:
            cache_data = pickle.load(fIn)
    
    # Create a weight tensor
    weights = torch.tensor([0.5, 0.5])
    embeddings = ['embeddings_host','embeddings_reviews']
    corpus_embeddings = torch.zeros_like(cache_data[embeddings[0]])  # Initialize an empty tensor

    for i, corpus in enumerate(embeddings):

        # Weight the vectors with the specified weights
        weighted_embeddings = cache_data[corpus] * weights[i]

        # Add the weighted embeddings to the corpus_embeddings
        corpus_embeddings += weighted_embeddings

    # Encode the query
    clean_query = pd.Series(query).apply(clean_text)
    query_embedding = model.encode(query,show_progress_bar=True,convert_to_tensor=True)

    top_k = 10
    search_results = util.semantic_search(
        query_embedding, corpus_embeddings, top_k=top_k
    )

    # Extract the indices of the most similar sentences
    similar_indices = search_results[0][0:top_k]

    # Extract the actual sentences
    df_result = pd.DataFrame()
    for col in ['id', 'name', 'description', 'listing_url', 'picture_url', 'price', 'neighbourhood']:
        for indice in [similar_indices[i]['corpus_id'] for i in range(len(similar_indices))]:
            df_result.loc[indice,col] = cache_data[col][indice]
    df_result['Score'] = [item['score'] for item in similar_indices]

    return df_result

In [10]:
semantic_search_airbnb('romantic cabin by the beach')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\anton\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\anton\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,id,name,description,listing_url,picture_url,price,neighbourhood,Score
28644,44047430.0,Boutique hotel in Hermosa Beach · ★5.0 · 1 bed...,The Ocean View Pacific King is a place where y...,https://www.airbnb.com/rooms/44047433,https://a0.muscache.com/pictures/7f572725-8ee8...,$373.00,"Hermosa Beach, California, United States",0.658417
32057,7.346902e+17,Home in Los Angeles · 1 bedroom · 1 bed · 2 baths,Kick back and relax with your special person i...,https://www.airbnb.com/rooms/734690224784390246,https://a0.muscache.com/pictures/miso/Hosting-...,"$2,000.00",,0.648842
15639,38890850.0,Bed and breakfast in Long Beach · ★4.83 · 2 be...,Sleep on your own private yacht for the night....,https://www.airbnb.com/rooms/38890851,https://a0.muscache.com/pictures/ca4f83ce-a0a1...,$318.00,,0.644609
31616,53852500.0,Boat in Long Beach · ★4.40 · 2 bedrooms · 3 be...,The boat is at marina is a luxury yacht that’s...,https://www.airbnb.com/rooms/53852504,https://a0.muscache.com/pictures/miso/Hosting-...,$285.00,,0.637674
1464,18433460.0,Cottage in Malibu · ★4.95 · 2 bedrooms · 2 bed...,"Stay at our newly listed ""Bu ""Seaside Cabin"". ...",https://www.airbnb.com/rooms/18433456,https://a0.muscache.com/pictures/b8148fd4-0b15...,$159.00,"Malibu, California, United States",0.628764
1671,6.732051e+17,Boat in Marina del Rey · ★4.92 · 2 bedrooms · ...,"A beautiful yacht located in Marina del Ray, w...",https://www.airbnb.com/rooms/673205055256289539,https://a0.muscache.com/pictures/miso/Hosting-...,$319.00,,0.626211
18675,30882750.0,Cabin in Los Angeles · ★4.89 · 1 bedroom · 1 b...,"Welcome to a charming, rustic, cozy & peaceful...",https://www.airbnb.com/rooms/30882750,https://a0.muscache.com/pictures/miso/Hosting-...,$251.00,"Los Angeles, California, United States",0.624323
17401,45257190.0,Nature lodge in Avalon · ★5.0 · 12 beds · 1 sh...,Our Mash tents provide you with the basics. Yo...,https://www.airbnb.com/rooms/45257193,https://a0.muscache.com/pictures/f9c0763e-9446...,$120.00,,0.6176
3186,3563679.0,Rental unit in Malibu · ★4.99 · 1 bedroom · 1 ...,Fall asleep to the sound of ocean waves in thi...,https://www.airbnb.com/rooms/3563679,https://a0.muscache.com/pictures/monet/Select-...,$643.00,"Malibu, California, United States",0.616162
15686,38890620.0,Bed and breakfast in Long Beach · ★5.0 · 2 bed...,Sleep on your own private yacht for the night....,https://www.airbnb.com/rooms/38890624,https://a0.muscache.com/pictures/8adfcd09-7a8e...,$412.00,,0.61567
