### Import Libraries

In [474]:
import pandas as pd
import os
from sentence_transformers import SentenceTransformer, util
import torch
from sklearn.metrics.pairwise import cosine_similarity
import pickle

pd.set_option("max_colwidth", 500)

### Read

#### Listings

In [475]:
listings_folder_path = '..\\data\\raw\\listings'

df_listings = pd.DataFrame()
for listing_file in os.listdir(listings_folder_path):
    listing_file_path = os.path.join(listings_folder_path,listing_file) 
    df = pd.read_csv(listing_file_path,compression='gzip')
    df_listings = pd.concat([df,df_listings])

#### Reviews

In [476]:
reviews_folder_path = '..\\data\\raw\\reviews'

df_reviews = pd.DataFrame()
for review_file in os.listdir(reviews_folder_path):
    review_file_path = os.path.join(reviews_folder_path,review_file) 
    df = pd.read_csv(review_file_path,compression='gzip')
    df_reviews = pd.concat([df,df_reviews])

### Join Data

In [477]:
# Listings
listings_id_column = 'id'
listings_nlp_columns = [
    'amenities',
    'accommodates',
    'name',
    'property_type',
    'room_type',
    'neighbourhood',
    'neighbourhood_cleansed',
    'description'
]

df_listings.loc[:,'corpus_text'] = ''
for nlp_col in listings_nlp_columns:
    df_listings.loc[:,'corpus_text'] += ' ' + df_listings.loc[:,nlp_col].fillna('').astype(str)+ '. '
df_listings = df_listings[['id','name','description','corpus_text']]    

# Reviews
df_reviews_grouped_id = df_reviews.groupby(
    by='listing_id',
    as_index=False
).agg(
    {'comments': lambda review: ' '.join(review.fillna(''))}
)

# Final
df = pd.merge(
    left=df_listings,
    right=df_reviews_grouped_id,
    left_on='id',
    right_on='listing_id',
    how='left'
)

df.loc[:,'corpus_text'] += '' + df.loc[:,'comments'].fillna('').astype(str)
df.drop(['comments','listing_id'],axis=1,inplace=True)

### Preprocess for NLP

In [478]:
import string
import re
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Download nltk
nltk.download('stopwords') 
nltk.download('wordnet')

# Function to perform all cleaning steps
def clean_text(text):
        
    # Remove punctuation
    text = "".join([char for char in text if char not in string.punctuation])
    
    # Lowercase the text
    text = text.lower()
    
    # Tokenization
    tokens = re.split(r'\W+', text)
    
    # Remove stopwords
    tokens = [word for word in tokens if word not in stopwords]
    
    # Stemming
    tokens = [porter_stemmer.stem(word) for word in tokens]
    
    # Lemmatization
    tokens = [wordnet_lemmatizer.lemmatize(word) for word in tokens]

    return tokens

# Set of English stopwords
stopwords = set(nltk.corpus.stopwords.words('english'))

# Initialize stemmer and lemmatizer
porter_stemmer = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\anton\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\anton\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Model

In [479]:
df = df[:6000]

#### Encode Corpus

In [480]:
# SBERT model name
model_name = 'multi-qa-MiniLM-L6-cos-v1'

# Initialize SBERT model
print('##### INITIALIZING SBERT MODEL #####')
model = SentenceTransformer(model_name)

# Cached Embeddings Path (changes according to model)
embedding_cache_path = f'cached-embeddings-{model_name}_clean.pkl'

# Current corpus texts
current_corpus_texts = df['corpus_text'].to_list()
   
# If cache pkl file path exists
if os.path.exists(embedding_cache_path):
    print('##### CACHED EMBEDDINGS PICKLE FOUND #####')

    # Read cached embeddings
    with open(embedding_cache_path, "rb") as fIn:
        cache_data = pickle.load(fIn)
    
    # Extract corpus text and embeddings from cache pkl  
    cache_corpus_texts = cache_data['text']
    cache_corpus_embeddings = cache_data['embeddings']

    print('##### IDENTIFYING CORPUS TEXTS NOT IN CACHE #####')
    corpus_text_not_in_cache = []
    for i, text in enumerate(current_corpus_texts):
        if text not in cache_corpus_texts:
            print('> TEXT NO. {:,.0f} ({:,.0%} OF TOTAL DATASET)'.format(i, len(corpus_text_not_in_cache)/len(current_corpus_texts)))
            corpus_text_not_in_cache.append(text)
    
    if corpus_text_not_in_cache != []:
        
        # Apply the cleaning function to the 'corpus_text' column
        print('##### CLEANING NEW CORPUS TEXTS #####')
        corpus_text_not_in_cache = pd.Series(corpus_text_not_in_cache).apply(clean_text).to_list()

        # Encode ONLY the current corpus texts that aren't in cache into embeddings
        print('##### ENCODING IDENTIFIED CORPUS TEXTS #####')
        remaining_corpus_embeddings = model.encode(corpus_text_not_in_cache,show_progress_bar=True,convert_to_tensor=True)
    else:
        print('> NO NEW CORPUS TEXTS')
        remaining_corpus_embeddings = torch.empty(0)
        
    # Joining corpus data into single objects for export later
    corpus_embeddings = torch.cat((cache_corpus_embeddings,remaining_corpus_embeddings), dim=0)
    corpus_texts = cache_corpus_texts + corpus_text_not_in_cache

else:
    print('##### CACHED EMBEDDINGS PICKLE NOT FOUND #####')
    
    corpus_texts = current_corpus_texts

    # Encode ALL the current corpus texts into embeddings
    print('##### ENCODING ALL CORPUS TEXTS #####')
    corpus_embeddings = model.encode(corpus_texts,show_progress_bar=True,convert_to_tensor=True)
    
# Update & export complete text and embeddings as pkl for future executions
print('##### EXPORTING  #####')
with open(embedding_cache_path, "wb") as fOut:
    pickle.dump({'text': corpus_texts, 'embeddings': corpus_embeddings}, fOut)

##### INITIALIZING SBERT MODEL #####
##### CACHED EMBEDDINGS PICKLE FOUND #####
##### IDENTIFYING CORPUS TEXTS NOT IN CACHE #####
> TEXT NO. 5,000 (0% OF TOTAL DATASET)
> TEXT NO. 5,001 (0% OF TOTAL DATASET)
> TEXT NO. 5,002 (0% OF TOTAL DATASET)
> TEXT NO. 5,003 (0% OF TOTAL DATASET)
> TEXT NO. 5,004 (0% OF TOTAL DATASET)
> TEXT NO. 5,005 (0% OF TOTAL DATASET)
> TEXT NO. 5,006 (0% OF TOTAL DATASET)
> TEXT NO. 5,007 (0% OF TOTAL DATASET)
> TEXT NO. 5,008 (0% OF TOTAL DATASET)
> TEXT NO. 5,009 (0% OF TOTAL DATASET)
> TEXT NO. 5,010 (0% OF TOTAL DATASET)
> TEXT NO. 5,011 (0% OF TOTAL DATASET)
> TEXT NO. 5,012 (0% OF TOTAL DATASET)
> TEXT NO. 5,013 (0% OF TOTAL DATASET)
> TEXT NO. 5,014 (0% OF TOTAL DATASET)
> TEXT NO. 5,016 (0% OF TOTAL DATASET)
> TEXT NO. 5,017 (0% OF TOTAL DATASET)
> TEXT NO. 5,018 (0% OF TOTAL DATASET)
> TEXT NO. 5,019 (0% OF TOTAL DATASET)
> TEXT NO. 5,020 (0% OF TOTAL DATASET)
> TEXT NO. 5,021 (0% OF TOTAL DATASET)
> TEXT NO. 5,022 (0% OF TOTAL DATASET)
> TEXT NO. 5

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

##### EXPORTING  #####


#### Encode Query

In [481]:
# Encode the query
query = "romantic cabin close to the beach"
clean_query = pd.Series(query).apply(clean_text)
query_embedding = model.encode(query,show_progress_bar=True,convert_to_tensor=True)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

#### Apply Semantic Search

In [482]:
top_k = 10

search_results = util.semantic_search(
    query_embedding, corpus_embeddings, top_k=top_k
)

# Extract the indices of the most similar sentences
similar_indices = search_results[0][0:top_k]

# Extract the actual sentences
similar_sentences = df.loc[[item['corpus_id'] for item in similar_indices],['name','description','corpus_text']]
similar_sentences['score'] = [item['score'] for item in similar_indices]
similar_sentences

Unnamed: 0,name,description,corpus_text,score
4086,Boat in Marina del Rey · ★4.70 · 1 bed · 1.5 baths,"SAILBOAT SLEEP-OVER.<br />Enjoy nature for yourself or with company to relax, as the experience can be an enjoyable scape from daily life at the ocean in Marina Del Rey/Venice Beach lodging in a small sailboat. California weather is the best all-year-round and you can book anytime of the year. The local attractions nearby are close to walk, bike, or drive around the Marina with their beaches and shopping centers, bars and restaurants, and plenty of nightlife.<br /><br /><b>The space</b><br /...","[""Host greets you"", ""Bay view"", ""Dining table"", ""Mini fridge"", ""Bed linens"", ""Dedicated workspace"", ""Suave or Alberto... shampoo"", ""Essentials"", ""Hot water kettle"", ""Fire extinguisher"", ""Beach view"", ""Suave or Alberto... body soap"", ""Wifi"", ""Free parking on premises"", ""Beach access"", ""Kitchenette"", ""Waterfront"", ""Hangers"", ""TV"", ""Hair dryer"", ""BBQ grill""]. 2. Boat in Marina del Rey · ★4.70 · 1 bed · 1.5 baths. Boat. Entire home/apt. Marina del Rey, California, United States. Marina de...",0.618174
4159,Home in Redondo Beach · ★4.80 · 1 bedroom · 1 bed · 1 private bath,"Private guest suite with private bathroom, patio and entrance. <br />Quiet neighborhood. 15 min walk to the beach.<br />25 min from LAX<br />Basically, you're getting a nice hotel room in residential area without hassle of checking in paperwork.<br /><br /><b>The space</b><br />Completely private suite with own bathroom, outdoor space and easy access","[""Central heating"", ""Mini fridge"", ""Lock on bedroom door"", ""Luggage dropoff allowed"", ""Books and reading material"", ""Bathtub"", ""Bed linens"", ""Smoke alarm"", ""Dedicated workspace"", ""Private patio or balcony"", ""Essentials"", ""Portable fans"", ""Outdoor dining area"", ""Room-darkening shades"", ""Portable heater"", ""Body soap"", ""Extra pillows and blankets"", ""Clothing storage: closet"", ""Hot water"", ""Private entrance"", ""Free street parking"", ""Long term stays allowed"", ""Shampoo"", ""Wifi"", ""Laundromat nearb...",0.599991
3549,Nature lodge in Avalon · ★4.63 · 1 bedroom · 0 shared baths,"Our Safari tents provide you with the basics. You will have a large tent cabin that sleeps up to 8 ppl, 4 bunk beds w/mattresses and a nice deck in front. Your tent is just steps from the beach, bathrooms, and the camp store. Bring a fitted sheet, a sleeping bag, and toiletries. At night when all is quiet, sit on your deck, gaze up at the stars, listen to the waves crashing on the beach. You are on island time!<br /><br /><b>The space</b><br />Whites Landing is a secluded cove 3 miles east ...","[""Beach access \u2013 Beachfront"", ""First aid kit"", ""Host greets you"", ""Waterfront"", ""Private entrance"", ""Hot water"", ""Breakfast"", ""Long term stays allowed"", ""Smoking allowed"", ""Luggage dropoff allowed""]. 8. Nature lodge in Avalon · ★4.63 · 1 bedroom · 0 shared baths. Private room in nature lodge. Private room. Avalon, California, United States. Unincorporated Catalina Island. Our Safari tents provide you with the basics. You will have a large tent cabin that sleeps up to 8 ppl, 4 bu...",0.599756
1434,Cottage in Malibu · ★4.87 · 2 bedrooms · 2 beds · 2 baths,"Stay in a spacious, clean, and charming beach cottage with 1 master bedroom (king bed) and 1 guest bedroom (queen bed) with private baths. Enjoy a full kitchen, breakfast nook, and front patio overlooking the pacific ocean! 10 min walk to the beach!<br /><br />Please read house rules. Booking a reservation means you will respect the rules of staying at this property.<br /><br /><b>The space</b><br />This ocean view cottage is clean, comfortable, and cozy for a group of friends, family or cou...","[""Stove"", ""Carbon monoxide alarm"", ""Iron"", ""Host greets you"", ""Microwave"", ""Bed linens"", ""Smoke alarm"", ""Essentials"", ""Refrigerator"", ""Fire extinguisher"", ""Extra pillows and blankets"", ""Hot water"", ""Oven"", ""Private entrance"", ""Beach essentials"", ""Shampoo"", ""Kitchen"", ""Wifi"", ""Patio or balcony"", ""Free parking on premises"", ""Dishes and silverware"", ""Coffee maker"", ""Cooking basics"", ""Hangers"", ""Hair dryer"", ""Heating""]. 4. Cottage in Malibu · ★4.87 · 2 bedrooms · 2 beds · 2 baths. Entire cot...",0.599208
3706,Guesthouse in Long Beach · 1 bedroom · 1 bed · 1 bath,"With the summer right around the corner, you’re probably dreaming of a warm beach getaway. This cottage is the perfect getaway. It sits just five blocks from the beach and is central to shops and restaurants. It offers a full-size fridge, stove, laundry machines, Wi-Fi, TV, shaded picnic deck, and lemon tree for the picking.<br /><br />For stays less than thirty days, inquire directly. Namaste.","[""Free parking on premises"", ""Smoke alarm"", ""Carbon monoxide alarm"", ""Dedicated workspace"", ""32\"" HDTV with Netflix"", ""Dishes and silverware"", ""Coffee maker"", ""Exercise equipment"", ""BBQ grill"", ""Pets allowed"", ""Free washer \u2013 In building"", ""Refrigerator"", ""Air conditioning"", ""Kitchen"", ""Fire extinguisher"", ""Free dryer \u2013 In building"", ""Fast wifi \u2013 242 Mbps""]. 2. Guesthouse in Long Beach · 1 bedroom · 1 bed · 1 bath. Entire guesthouse. Entire home/apt. . Long Beach. With ...",0.595431
3690,Place to stay in Long Beach · 1 bedroom · 2 beds · 1 bath,"Welcome to your charming Long Beach getaway at Beach Bungalow!<br /><br /><b>The space</b><br />This cozy home is the perfect seaside retreat, just a short walk from the beach. As soon as you arrive, you'll be struck by the quintessential Southern California landscaping that surrounds the property. Towering cacti and mature succulents fill the yard, creating an idyllic atmosphere that is both lively and tranquil. Enjoy your morning coffee while seated comfortably on the private patio, baskin...","[""Stove"", ""Carbon monoxide alarm"", ""Iron"", ""Toaster"", ""Air conditioning"", ""Microwave"", ""Bed linens"", ""Smoke alarm"", ""Dedicated workspace"", ""Indoor fireplace"", ""Pets allowed"", ""Essentials"", ""Refrigerator"", ""Dryer"", ""Private entrance"", ""Oven"", ""Beach essentials"", ""Washer"", ""Ceiling fan"", ""Backyard"", ""Shampoo"", ""Kitchen"", ""Wifi"", ""Wine glasses"", ""Free parking on premises"", ""Dishes and silverware"", ""Coffee maker"", ""Cooking basics"", ""Blender"", ""Self check-in"", ""TV"", ""Security cameras on property...",0.587478
3321,Home in Malibu · ★4.86 · 2 bedrooms · 2 beds · 2 baths,"Located on Malibu's pristine shore this property has a spectacular setting. This newly remodeled oceanfront beach home is the perfect place for a relaxing getaway. Large sun deck to enjoy California's beautiful weather, equipped with lounge chairs &<br /><br /><b>Other things to note</b><br />Refundable deposit: $1,000<br />Minimum stay: 3 nights<br />Total includes:<br />Cleaning fee: $150<br />Tax: 12%<br /><br /><b>Registration number</b><br />STR21-0061","[""Stove"", ""Carbon monoxide alarm"", ""Iron"", ""Air conditioning"", ""Microwave"", ""Luggage dropoff allowed"", ""Bed linens"", ""Smoke alarm"", ""First aid kit"", ""Baking sheet"", ""Pets allowed"", ""Essentials"", ""Refrigerator"", ""Indoor fireplace: electric, gas"", ""Shower gel"", ""Fire extinguisher"", ""Dryer"", ""Extra pillows and blankets"", ""Hot water"", ""Oven"", ""Private entrance"", ""Free street parking"", ""Washer"", ""Long term stays allowed"", ""Backyard"", ""Shampoo"", ""Kitchen"", ""Wifi"", ""Patio or balcony"", ""Free parkin...",0.586302
1464,Cottage in Malibu · ★4.95 · 2 bedrooms · 2 beds · 1.5 baths,"Stay at our newly listed ""Bu ""Seaside Cabin"". We recently restored this 2 bedroom, 1.5 bath 1970's vintage Malibu mobile home unit into a seaside and nature lovers escape! Enjoy the best of Malibu's beaches and mountains from this sweet spot.<br /><br />Note: please read house rules in detail before booking. Booking a reservation means you will respect the rules of staying at this property.<br /><br /><b>The space</b><br />Stay at an historic ""old malibu"" resort property. Enjoy tennis and pr...","[""Stove"", ""Iron"", ""Host greets you"", ""Microwave"", ""Bed linens"", ""Smoke alarm"", ""Private patio or balcony"", ""Essentials"", ""Refrigerator"", ""Fire extinguisher"", ""Extra pillows and blankets"", ""Hot water"", ""Oven"", ""Private entrance"", ""Beach essentials"", ""Shampoo"", ""Kitchen"", ""Wifi"", ""Free parking on premises"", ""Dishes and silverware"", ""Coffee maker"", ""Cooking basics"", ""Hangers"", ""Hair dryer"", ""Heating""]. 3. Cottage in Malibu · ★4.95 · 2 bedrooms · 2 beds · 1.5 baths. Entire cottage. Entire h...",0.58612
1952,Guest suite in Los Angeles · ★4.96 · 1 bedroom · 1 bed · 1 private bath,"Very comfy and spacious private Patio room with beauitufl King size bed and high end queen sleeper sofa couch. Private entrance. Double French doors open up into the patio deck with the hot tub Jacuzzi a few feet away. Wonderful lounge chairs and dining table and chairs make the patio deck a perfect place to dine, sun-bath, or simply relax. Recently remodeled bathroom is also large. Forget your worries in this spacious and serene space.<br /><br /><b>The space</b><br />The Patio Room is ...","[""Smoke alarm"", ""Carbon monoxide alarm"", ""First aid kit"", ""Wifi"", ""Washer"", ""TV"", ""Air conditioning"", ""Lock on bedroom door"", ""Hot tub"", ""Fire extinguisher""]. 4. Guest suite in Los Angeles · ★4.96 · 1 bedroom · 1 bed · 1 private bath. Private room in guest suite. Private room. Los Angeles, California, United States. Pacific Palisades. Very comfy and spacious private Patio room with beauitufl King size bed and high end queen sleeper sofa couch. Private entrance. Double French doors ...",0.580158
3752,Villa in Malibu · ★4.33 · 6 bedrooms · 6 beds · 4.5 baths,,"[""Beach access \u2013 Beachfront"", ""First aid kit"", ""Fire extinguisher"", ""Dining table"", ""Indoor fireplace"", ""TV with DVD player"", ""Washer"", ""Essentials"", ""Air conditioning"", ""Outdoor furniture"", ""Kitchen"", ""Wifi"", ""Heating"", ""Free residential garage on premises \u2013 3 spaces"", ""Dryer"", ""Patio or balcony""]. 12. Villa in Malibu · ★4.33 · 6 bedrooms · 6 beds · 4.5 baths. Entire villa. Entire home/apt. . Malibu. . Great! We had an incredibly relaxing and peaceful trip to Malibu. It w...",0.579726
