In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

In [2]:
user_df = pd.read_excel('../data/Visitors Preference Dataset .xlsx')
places_df = pd.read_csv('../data/Places Dataset Classified Reviews Cleaned Combined .csv')

In [3]:
from unidecode import unidecode

def clean_non_ascii(df):
    for col in df.select_dtypes(include=['object']).columns:
        df[col] = df[col].apply(lambda x: unidecode(str(x)))
    return df

user_df = clean_non_ascii(user_df)
places_df = clean_non_ascii(places_df)

In [4]:
user_1 = user_df[user_df['User ID'] == 30]
user_1

Unnamed: 0,User ID,Name,Email,Preferred Activities,Bucket list destinations Sri Lanka
29,30,Kristin Lee,kristin.lee@example.com,"['cultural experiences', 'temple pilgrimages',...","[""Sri Pada / Adam's Peak"", 'Kandy Temple', 'An..."


In [5]:
preferred_activities = user_1["Preferred Activities"]
preferred_activities = preferred_activities.values[0]
preferred_activities

"['cultural experiences', 'temple pilgrimages', 'sailing']"

In [6]:
import ast

def convert_string_to_list(input_string):
    return ast.literal_eval(input_string)

print(preferred_activities)

['cultural experiences', 'temple pilgrimages', 'sailing']


In [7]:
prefered_destinations = user_1["Bucket list destinations Sri Lanka"]
prefered_destinations = prefered_destinations.values[0]
prefered_destinations_array = convert_string_to_list(prefered_destinations)
prefered_destinations_array

["Sri Pada / Adam's Peak",
 'Kandy Temple',
 'Anuradhapura',
 'Negombo Lagoon',
 'Arankelle Forest Monastery']

In [8]:
prefered_activity_array = convert_string_to_list(preferred_activities)
prefered_activity_array

['cultural experiences', 'temple pilgrimages', 'sailing']

In [9]:
for destination in prefered_destinations_array:
    print(destination)

Sri Pada / Adam's Peak
Kandy Temple
Anuradhapura
Negombo Lagoon
Arankelle Forest Monastery


In [10]:
import pandas as pd

def find_places_based_on_user_preferences(prefered_destinations_array) -> pd.DataFrame:
    pattern = '|'.join(prefered_destinations_array)
    
    filtered_df = places_df[
        places_df['name'].str.contains(pattern, regex=True, na=False)
    ]
    
    return filtered_df


In [11]:
input_places = find_places_based_on_user_preferences(prefered_destinations_array)
input_places

Unnamed: 0,name,lat,lng,formatted_address,rating,user_ratings_total,latest_reviews,activities,Geographical,Historical,Religious,Natural,Entertainment,Accommodation,Shopping,Food,classified_reviews,negetive_rate,combined_info
23,Anuradhapura,8.311352,80.403651,"Anuradhapura, Sri Lanka",4.459437,638.579937,"['Anuradhapura is rich in history, but I found...","['Biking around archaeological sites', 'Explor...",Ancient city located in the North Central Prov...,"One of the ancient capitals of Sri Lanka, know...","Home to numerous stupas and temples, including...","Surrounded by green landscapes, including sacr...",Cultural festivals and local events celebratin...,Variety of options from guesthouses to eco-lod...,"Local markets offer handicrafts, textiles, and...",Diverse cuisine featuring local specialties of...,"[1, 1, 0, 1, 1]",0.2,ancient city located north central province ch...
30,Sri Pada / Adam's Peak,6.809643,80.499388,"Sri Pada / Adam's Peak, Sri Lanka",4.9,5430.0,['Hiking AdamAC/AEURA(tm)s Peak was one of the...,"['Trekking', 'Pilgrimage']",Towering mountain; important topographical fea...,Significant pilgrimage site with rich history;...,Sacred to multiple religions; associated with ...,Biodiverse area; various flora and fauna; scen...,Cultural festivals; night climbing experiences.,Limited guesthouses; basic amenities available.,Local crafts; pilgrimage-related items.,Traditional local cuisine; simple eateries.,"[1, 1, 1, 1, 1, 1]",0.0,towering mountain important topographical feat...
45,Negombo Lagoon,7.158004,79.84648,"Negombo Lagoon, Sri Lanka",4.5,268.0,['Negombo Lagoon was a delightful escape! We t...,"['Bird watching', 'Fishing tours', 'Kayaking',...","Coastal lagoon region, rich in biodiversity.",Significant for trade; influenced by colonial ...,Shrine and local fishing communities with cult...,Home to mangrove forests and diverse wildlife.,Local festivals and traditional boat races.,Proximity to beach resorts and guesthouses.,Local craft markets and fish markets.,Seafood delicacies and local cuisine.,"[1, 1, 1, 1, 1]",0.0,coastal lagoon region rich biodiversity signif...
315,Anuradhapura New Town,8.322765,80.402577,"Anuradhapura, Sri Lanka",4.4,167.0,['Anuradhapura New Town is a fascinating blend...,"['Historical Ruins Exploration', 'Bicycle Tour...",Located in the north-central region of the cou...,Home to ancient Sri Lankan kings and numerous ...,Contains significant Buddhist temples and stup...,"Surrounded by lush greenery, rice fields, and ...",Various cultural performances and local events...,Options range from guesthouses to hotels cater...,"Local markets offering traditional crafts, tex...",Local cuisine featuring traditional Sri Lankan...,"[1, 1, 1, 1, 1]",0.0,located northcentral region country characteri...
316,Dakkhina Stupa - Anuradhapura,8.34116,80.395259,"Anuradhapura, Sri Lanka",4.6,183.0,['The Dakkhina Stupa is a hidden gem in Anurad...,"['Visit the stupa', 'Photography of ancient ar...","Located within a UNESCO World Heritage site, c...","One of the ancient stupas, significant to the ...","A vital pilgrimage site, showcasing Buddhist h...",Surrounded by tranquil landscapes and flora ty...,Educational tours focusing on history and arch...,"Various options available in nearby towns, cat...",Souvenir shops focusing on local crafts and re...,"Local cuisine available in nearby eateries, em...","[1, 1, 1, 1, 1, 1, 1]",0.0,located within unesco world heritage site char...
317,"Maha Viharaya, Anuradhapura",8.344689,80.396583,"Anuradhapura, Sri Lanka",4.7,187.0,['Maha Viharaya is a remarkable place steeped ...,"['Exploring ancient ruins', 'Guided historical...","Located in north-central part of Sri Lanka, su...","Once a prominent monastic complex, reflecting ...","Significant site for Buddhist pilgrims, housin...",Surrounded by lush greenery and integrated int...,Cultural performances and festivals related to...,Various lodging options available in nearby ar...,Local craft shops and souvenir vendors in vici...,Traditional Sri Lankan cuisine available at ne...,"[1, 1, 1, 1, 1]",0.0,located northcentral part sri lanka surrounded...


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [13]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(places_df['combined_info'])

In [14]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def get_most_similar_records(input_description, input_activities, tfidf_matrix)-> pd.DataFrame:
    input_activities_str = ' '.join(input_activities)
    input_description = input_description
    
    input_tfidf = tfidf.transform([input_description])
    cosine_similarities = cosine_similarity(input_tfidf, tfidf_matrix).flatten()
    
    related_docs_indices = cosine_similarities.argsort()[:-10:-1]
    return_dict = places_df.iloc[related_docs_indices]
    return_dict['Cosine Similarity'] = cosine_similarities[related_docs_indices]

    return return_dict
    

In [15]:
combined_infos = []

for index, row in input_places.iterrows():
    combined_infos.append(row['combined_info'])
    
combined_infos

['ancient city located north central province characterized flat plains significant water reservoirs one ancient capitals sri lanka known wellpreserved ruins role development buddhism region home numerous stupas temples including significant pilgrimage sites buddhists surrounded green landscapes including sacred bodhi trees network lakes reservoirs cultural festivals local events celebrating heritage traditions variety options guesthouses ecolodges catering different preferences local markets offer handicrafts textiles traditional items diverse cuisine featuring local specialties often based rice curry',
 'towering mountain important topographical feature significant pilgrimage site rich history references ancient texts sacred multiple religions associated buddha shiva biodiverse area various flora fauna scenic views cultural festivals night climbing experiences limited guesthouses basic amenities available local crafts pilgrimagerelated items traditional local cuisine simple eateries'

In [16]:
input_activities = prefered_activity_array

In [17]:
similar_records = pd.DataFrame()

for combined_info in combined_infos:
    new_records = get_most_similar_records(combined_info, input_activities, tfidf_matrix)
    
    if not similar_records.empty:
        new_records = new_records.loc[~new_records.index.isin(similar_records.index)]

    similar_records = pd.concat([similar_records, new_records], ignore_index=False)

print(similar_records.shape)


similar_records.shape


(45, 20)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return_dict['Cosine Similarity'] = cosine_similarities[related_docs_indices]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return_dict['Cosine Similarity'] = cosine_similarities[related_docs_indices]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return_dict['Cosine Similarity'] = cosine_similarit

(45, 20)

In [18]:
def check_negetive_rate(places_df):
    places = []
    for index, row in places_df.iterrows():
        if row['negetive_rate'] <= 0.2:
            places.append(row)
    return pd.DataFrame(places)

In [19]:
def sort_by_confidence_then_rating(df_places):
    df_places_sorted = df_places.sort_values(by=['Cosine Similarity', 'rating'], ascending=False)
    return df_places_sorted

In [20]:
similar_records = check_negetive_rate(similar_records)

In [21]:
similar_records = sort_by_confidence_then_rating(similar_records)
similar_records = similar_records[~similar_records['name'].isin(prefered_destinations_array)].head(5)

In [22]:
similar_records_infos = []

for index, row in similar_records.iterrows():
    similar_records_infos.append(row['combined_info'])

similar_records_infos
    

['located within unesco world heritage site characterized ancient ruins lush vegetation one ancient stupas significant early buddhist community reflecting architectural brilliance time vital pilgrimage site showcasing buddhist heritage history surrounded tranquil landscapes flora typical regions dry zone educational tours focusing history archaeology various options available nearby towns catering different budgets souvenir shops focusing local crafts religious artifacts local cuisine available nearby eateries emphasizing traditional flavors',
 'coastal lagoon area tropical vegetation significant local communities traditional fishing practices often associated local beliefs coastal rituals rich biodiversity including mangroves wildlife scenic boat rides local cultural events ecofriendly lodges guest houses available local crafts souvenirs often nearby markets fresh seafood traditional sri lankan cuisine',
 'coastal area sandy beaches clear waters influenced colonial history local fishi

### **Model Evaluation**

#### **Relevance**

In [23]:
similar_records_infos = ''.join(similar_records_infos)
combined_infos = ''.join(combined_infos)

def cosine_similarity_score(str1, str2):
    if not str1 or not str2:
        return 0.0
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform([str1, str2])
    return cosine_similarity(vectors[0], vectors[1])[0][0]

similarity = cosine_similarity_score(similar_records_infos, combined_infos)
print(f"Cosine similarity: {similarity:.2f}")

Cosine similarity: 0.72


#### **Diversity**

In [24]:
from geopy.distance import great_circle

coordinates_of_similar_places = similar_records[['lat', 'lng']]
coordinates_of_input_places = input_places[['lat', 'lng']]

all_coordinates = pd.concat([coordinates_of_similar_places, coordinates_of_input_places], ignore_index=True)

def geospatial_diversity(locations):
    if len(locations) < 2:
        return 0.0
    
    distances = []
    for i in range(len(locations)):
        for j in range(i + 1, len(locations)):
            distance = great_circle(locations[i], locations[j]).kilometers
            distances.append(distance)
    
    return np.mean(distances) if distances else 0.0


all_coordinates_list = all_coordinates.values.tolist()
diversity = geospatial_diversity(all_coordinates_list)
print(f"Geospatial Diversity (Average Distance): {diversity:.2f} km")

Geospatial Diversity (Average Distance): 129.54 km


In [25]:
similar_records["name"]

316    Dakkhina Stupa - Anuradhapura
314                     Umari Lagoon
34                          Uppuveli
138       Deegawapi Raja Maha Vihara
320             Hali-ela , Sri Lanka
Name: name, dtype: object

### Saving models

In [26]:
# Save the TF-IDF vectorizer
with open('tfidf_vectorizer.pkl', 'wb') as file:
    pickle.dump(tfidf, file)

# Save the TF-IDF matrix
with open('tfidf_matrix.pkl', 'wb') as file:
    pickle.dump(tfidf_matrix, file)