In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import random

### **Load Data**

In [3]:
user_df = pd.read_excel('../data/Visitors Preference Dataset .xlsx')
places_df = pd.read_csv('../data/Places Dataset Classified Reviews Cleaned Combined.csv')

In [4]:
from unidecode import unidecode

def clean_non_ascii(df):
    for col in df.select_dtypes(include=['object']).columns:
        df[col] = df[col].apply(lambda x: unidecode(str(x)))
    return df

user_df = clean_non_ascii(user_df)
places_df = clean_non_ascii(places_df)

### **Select a Random User**

In [5]:
user_id = 20

user_1 = user_df[user_df['User ID'] == user_id]
user_1

Unnamed: 0,User ID,Name,Email,Preferred Activities,Bucket list destinations Sri Lanka
19,20,Elizabeth Peterson,elizabeth.peterson@example.com,"['wildlife viewing', 'camping', 'surfing']","['Tangalle', 'Udawalawe National Park', 'Miris..."


In [6]:
preferred_activities = user_1["Preferred Activities"]
preferred_activities = preferred_activities.values[0]
preferred_activities

"['wildlife viewing', 'camping', 'surfing']"

In [7]:
import ast

def convert_string_to_list(input_string):
    return ast.literal_eval(input_string)

print(preferred_activities)

['wildlife viewing', 'camping', 'surfing']


In [8]:
prefered_destinations = user_1["Bucket list destinations Sri Lanka"]
prefered_destinations = prefered_destinations.values[0]
prefered_destinations_array = convert_string_to_list(prefered_destinations)
prefered_destinations_array

['Tangalle',
 'Udawalawe National Park',
 'Mirissa Beach',
 'Weligama Beach (surf and stay)',
 'Yala National Park']

In [9]:
prefered_activity_array = convert_string_to_list(preferred_activities)
prefered_activity_array

['wildlife viewing', 'camping', 'surfing']

In [10]:
for destination in prefered_destinations_array:
    print(destination)

Tangalle
Udawalawe National Park
Mirissa Beach
Weligama Beach (surf and stay)
Yala National Park


In [11]:
import pandas as pd

def find_places_based_on_user_preferences(prefered_destinations_array) -> pd.DataFrame:
    pattern = '|'.join(prefered_destinations_array)
    
    filtered_df = places_df[
        places_df['name'].str.contains(pattern, regex=True, na=False)
    ]
    
    return filtered_df


In [12]:
input_places = find_places_based_on_user_preferences(prefered_destinations_array)
input_places

  places_df['name'].str.contains(pattern, regex=True, na=False)


Unnamed: 0,name,lat,lng,formatted_address,rating,user_ratings_total,latest_reviews,location_info,activities,Geographical,Historical,Religious,Natural,Entertainment,Accommodation,Shopping,Food,classified_reviews,negetive_rate,combined_info
1,Mirissa Beach,5.944703,80.459161,"Mirissa, Sri Lanka",4.6,1748.0,['Mirissa Beach is truly a gem on Sri LankaAC/...,"Mirissa Beach, located on Sri Lanka's southern...","['Whale watching', 'Snorkeling and diving', 'S...",Mirissa Beach is located on the southern coast...,Mirissa has a rich maritime history and has be...,The area is close to the significant Buddhist ...,The beach is known for its stunning natural be...,Mirissa is a popular spot for nightlife activi...,Various types of accommodations are available ...,"Local markets and shops offer handicrafts, bea...",Mirissa offers a variety of seafood options an...,"[1, 1, 1, 1, 1]",0.0,mirissa beach located sri lankas southern coas...
5,Tangalle,6.024338,80.794073,"Tangalle, Sri Lanka",4.459437,582.5,['Tangalle was a bit of a letdown for me. The ...,"Tangalle, located on Sri Lanka's southern coas...","['Beach Relaxation', 'Snorkeling and Diving', ...",Tangalle is located on the southern coast of S...,Tangalle has a rich history with influences fr...,"The area features several temples, including t...","Famous for its beautiful sandy beaches, Tangal...",Tangalle offers tranquil beach experiences and...,While specifically not promoting accommodation...,Visitors can find local markets selling handic...,The local cuisine features fresh seafood and t...,"[1, 1, 1, 1, 1]",0.0,tangalle located sri lankas southern coast boa...
11,Yala National Park,6.463961,81.471885,Sri Lanka,4.1,2810.0,['Yala National Park is a true gem for wildlif...,"Yala National Park, located in southeastern Sr...","['Wildlife Safaris', 'Bird Watching', 'Photogr...",Yala National Park is located in the southeast...,"Yala has a rich heritage, with ancient Buddhis...",There are ancient temples and monasteries near...,Yala National Park is renowned for its diverse...,Visitors can engage in thrilling wildlife view...,While this section would detail places to stay...,Local crafts and wildlife-themed souvenirs can...,Yala is surrounded by local villages where tra...,"[1, 1, 1, 1, 1, 1]",0.0,yala national park located southeastern sri la...
12,Udawalawe National Park,6.474629,80.876319,Sri Lanka,4.3,6156.0,['Udawalawe National Park was absolutely breat...,"Udawalawe National Park, located in Sri Lanka,...","['Safari Tours', 'Bird Watching', 'Elephant Tr...",Udawalawe National Park is located in the sout...,The park was established in 1972 primarily to ...,While there are no specific religious sites wi...,Udawalawe is renowned for its diverse flora an...,Visitors can enjoy wildlife safaris that allow...,Nearby accommodations typically cater to visit...,The nearby town offers local handicrafts and s...,Local cuisine offers a variety of traditional ...,"[1, 1, 1, 1, 1]",0.0,udawalawe national park located sri lanka reno...
32,Tangalle Beach,6.022726,80.800836,"Tangalle, Sri Lanka",4.2,511.0,['Tangalle Beach is a hidden gem! The soft san...,"Tangalle Beach, located on the southern coast ...","['Snorkeling', 'Surfing', 'Beach Volleyball', ...",Tangalle Beach is located on the southern coas...,Tangalle has a history linked to early Portugu...,Near Tangalle are several significant religiou...,"The beach is known for its natural beauty, fea...",Tangalle Beach offers various outdoor activiti...,The area features various guesthouses and holi...,Tangalle has local markets offering handicraft...,"The area is famous for its seafood, particular...","[1, 1, 1, 1, 1]",0.0,tangalle beach located southern coast sri lank...


### **Model Building and Trainig**

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [14]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(places_df['combined_info'])

In [15]:
import pickle

# Save the TF-IDF vectorizer
with open('tfidf_vectorizer.pkl', 'wb') as file:
    pickle.dump(tfidf, file)

# Save the TF-IDF matrix
with open('tfidf_matrix.pkl', 'wb') as file:
    pickle.dump(tfidf_matrix, file)

In [16]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def get_most_similar_records(input_description, input_activities, tfidf_matrix)-> pd.DataFrame:
    input_activities_str = ' '.join(input_activities)
    input_description = input_description
    
    input_tfidf = tfidf.transform([input_description])
    cosine_similarities = cosine_similarity(input_tfidf, tfidf_matrix).flatten()
    
    related_docs_indices = cosine_similarities.argsort()[:-10:-1]
    return_dict = places_df.iloc[related_docs_indices]
    return_dict['Cosine Similarity'] = cosine_similarities[related_docs_indices]

    return return_dict
    

In [17]:
combined_infos = []

for index, row in input_places.iterrows():
    combined_infos.append(row['combined_info'])
    
combined_infos

['mirissa beach located sri lankas southern coast stunning destination known golden sands clear turquoise waters tourists enjoy activities like whale watching snorkeling surfing well relaxing beach exploring nearby coconut groves vibrant nightlife features beachside bars restaurants serving fresh seafood visitors also take boat trips see breathtaking coastline visit charming town mirissa local crafts culture natural beauty diverse activities mirissa beach perfect getaway beach lovers adventure seekers alike mirissa beach located southern coast sri lanka matara district mirissa rich maritime history part trade routes cultural interactions since ancient times evidence sea trade archaeological findings area close significant buddhist temples weherahena temple visited many pilgrims offer insight sri lankan buddhist practices beach known stunning natural beauty stretching along coastline golden sands clear turquoise waters surrounding area includes lush greenery palm trees mirissa popular s

In [18]:
input_activities = prefered_activity_array

### **Get Similar Results**

In [19]:
similar_records = pd.DataFrame()

for combined_info in combined_infos:
    new_records = get_most_similar_records(combined_info, input_activities, tfidf_matrix)
    
    if not similar_records.empty:
        new_records = new_records.loc[~new_records.index.isin(similar_records.index)]

    similar_records = pd.concat([similar_records, new_records], ignore_index=False)

print(similar_records.shape)


similar_records.shape


(30, 21)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return_dict['Cosine Similarity'] = cosine_similarities[related_docs_indices]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return_dict['Cosine Similarity'] = cosine_similarities[related_docs_indices]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return_dict['Cosine Similarity'] = cosine_similarit

(30, 21)

### **Post Processing**

In [20]:
def check_negetive_rate(places_df):
    places = []
    for index, row in places_df.iterrows():
        if row['negetive_rate'] <= 0.2:
            places.append(row)
    return pd.DataFrame(places)

In [21]:
def sort_by_confidence_then_rating(df_places):
    df_places_sorted = df_places.sort_values(by=['Cosine Similarity', 'rating'], ascending=False)
    return df_places_sorted

In [22]:
def except_recommendations_already_in_input(df_places_sorted, prefered_destinations_array):
    df_places_sorted = df_places_sorted[~df_places_sorted['name'].isin(prefered_destinations_array)]
    return df_places_sorted


similar_records = except_recommendations_already_in_input(similar_records, prefered_destinations_array)

In [23]:
similar_records = check_negetive_rate(similar_records)

In [24]:
similar_records = sort_by_confidence_then_rating(similar_records)
similar_records = similar_records[~similar_records['name'].isin(prefered_destinations_array)].head(5)

In [25]:
similar_records_infos = []

for index, row in similar_records.iterrows():
    similar_records_infos.append(row['combined_info'])

similar_records_infos
    

["udawalawe located sri lanka renowned stunning national park home diverse array wildlife including elephants leopards various bird species tourists enjoy jeep safaris explore park's natural beauty observe animals habitat udawalawe reservoir offers opportunities birdwatching photography additionally visitors learn elephant conservation udawalawe elephant transit home orphaned elephants rehabilitated serene landscape rich biodiversity make udawalawe mustvisit destination nature lovers adventure seekers udawalawe located southern part sri lanka primarily known udawalawe national park spans 30821 hectares features mix grasslands shrub jungles considered one important national parks sri lanka udawalawe established 1972 provide sanctuary elephants protect catchment area udawalawe reservoir several small temples shrines situated around area reflecting local culture religious practices including buddhist temples attract visitors region renowned rich biodiversity particularly large population 

### **Model Evaluation**

#### **Relevance**

In [26]:
similar_records_infos = ''.join(similar_records_infos)
combined_infos = ''.join(combined_infos)

def cosine_similarity_score(str1, str2):
    if not str1 or not str2:
        return 0.0
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform([str1, str2])
    return cosine_similarity(vectors[0], vectors[1])[0][0]

similarity = cosine_similarity_score(similar_records_infos, combined_infos)
print(f"Cosine similarity: {similarity:.2f}")

Cosine similarity: 0.79


#### **Diversity**

In [27]:
from geopy.distance import great_circle

coordinates_of_similar_places = similar_records[['lat', 'lng']]
coordinates_of_input_places = input_places[['lat', 'lng']]

all_coordinates = pd.concat([coordinates_of_similar_places, coordinates_of_input_places], ignore_index=True)

def geospatial_diversity(locations):
    if len(locations) < 2:
        return 0.0
    
    distances = []
    for i in range(len(locations)):
        for j in range(i + 1, len(locations)):
            distance = great_circle(locations[i], locations[j]).kilometers
            distances.append(distance)
    
    return np.mean(distances) if distances else 0.0


all_coordinates_list = all_coordinates.values.tolist()
diversity = geospatial_diversity(all_coordinates_list)
print(f"Geospatial Diversity (Average Distance): {diversity:.2f} km")

Geospatial Diversity (Average Distance): 54.36 km


### **Top 5 Recommendations**

In [28]:
similar_records["name"]

133                  Udawalawa
46                     Mirissa
198          Yala Green Safari
185    Mirissa whale Explorers
37                 Marakolliya
Name: name, dtype: object