In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [3]:
# Load the Excel file
file_path = 'DataSets/Places_Dataset_Test.xlsx'
df = pd.read_excel(file_path)

In [4]:
# Limit the dataset to the first 50 rows
df1 = df.iloc[:50, :]

# Display the first few rows of the limited dataset
df1.head()

Unnamed: 0,name,activities,lat,lng,formatted_address,rating,user_ratings_total,user_ratings_total.1
0,Arugam Bay Beach,"['beach visits', 'beachfront dining', 'surfing']",6.840408,81.836848,"Arugam Bay Beach, Sri Lanka",4.8,1591.0,['Arugam Bay Beach is a surfer's paradise! I s...
1,Mirissa Beach,"['beach visits', 'beachfront dining', 'snorkel...",5.944703,80.459161,"Mirissa, Sri Lanka",4.6,1748.0,['Mirissa Beach is truly a gem on Sri Lanka's ...
2,Weligama Beach (surf and stay),"['beach visits', 'surfing']",5.972486,80.435714,"Weligama, Sri Lanka",4.4,325.0,['Weligama Beach is a fantastic spot for both ...
3,Ahangama,"['beach visits', 'surfing']",5.973975,80.362159,"Ahangama, Sri Lanka",,,['Ahangama was a bit disappointing for me as a...
4,Hikkaduwa Beach,"['beach visits', 'snorkeling', 'surfing', 'tur...",6.137727,80.09906,"Hikkaduwa Beach, Sri Lanka",4.7,1438.0,['Hikkaduwa Beach is a delightful escape for s...


In [5]:
# Drop rows that contain any null values
data = df1.dropna()
data.head()

Unnamed: 0,name,activities,lat,lng,formatted_address,rating,user_ratings_total,user_ratings_total.1
0,Arugam Bay Beach,"['beach visits', 'beachfront dining', 'surfing']",6.840408,81.836848,"Arugam Bay Beach, Sri Lanka",4.8,1591.0,['Arugam Bay Beach is a surfer's paradise! I s...
1,Mirissa Beach,"['beach visits', 'beachfront dining', 'snorkel...",5.944703,80.459161,"Mirissa, Sri Lanka",4.6,1748.0,['Mirissa Beach is truly a gem on Sri Lanka's ...
2,Weligama Beach (surf and stay),"['beach visits', 'surfing']",5.972486,80.435714,"Weligama, Sri Lanka",4.4,325.0,['Weligama Beach is a fantastic spot for both ...
4,Hikkaduwa Beach,"['beach visits', 'snorkeling', 'surfing', 'tur...",6.137727,80.09906,"Hikkaduwa Beach, Sri Lanka",4.7,1438.0,['Hikkaduwa Beach is a delightful escape for s...
6,Unawatuna Beach,"['feeding fish', 'paddleboarding', 'snorkeling']",6.009686,80.248424,"Unawatuna Beach, Sri Lanka",4.8,1868.0,['Unawatuna Beach is a slice of paradise! The ...


In [7]:
# Structure of our dataset
# data = {
#     'name': ['Place A', 'Place B', 'Place C'],
#     'rating': [4.5, 3.9, 4.8],
#     'user_ratings_total': [200, 150, 180],
#     'activities': ['Hiking, Swimming', 'Beach', 'Hiking, Sightseeing']
# }

places_df = pd.DataFrame(data)

# TF-IDF Vectorizer for Activities
tfidf = TfidfVectorizer(stop_words='english')
activities_matrix = tfidf.fit_transform(places_df['activities'])

# Convert the matrix to a DataFrame to see the result
activities_df = pd.DataFrame(activities_matrix.toarray(), columns=tfidf.get_feature_names_out())

In [16]:
activities_df

Unnamed: 0,adventures,architecture,beach,beachfront,biking,bird,boat,butterfly,camping,climbing,...,tours,turtle,viewing,visit,visits,watching,water,whale,wildlife,yoga
0,0.0,0.0,0.406839,0.513895,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.320407,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.309503,0.390946,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.24375,0.283284,0.0,0.472389,0.0,0.0
2,0.0,0.0,0.592287,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.466457,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.383425,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.526194,0.0,0.0,0.301967,0.350944,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.365739,0.0,0.0,0.0,0.0,0.0,0.365739,0.0
6,0.0,0.29693,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.814987,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.506164,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.330392,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.643037,0.0,...,0.0,0.0,0.402471,0.0,0.0,0.0,0.0,0.0,0.402471,0.0


In [11]:
# Add the TF-IDF vectorized activities back to the main DataFrame
places_df = pd.concat([places_df, activities_df], axis=1)
places_df.head()

Unnamed: 0,name,activities,lat,lng,formatted_address,rating,user_ratings_total,user_ratings_total.1,adventures,architecture,...,tours,turtle,viewing,visit,visits,watching,water,whale,wildlife,yoga
0,Arugam Bay Beach,"['beach visits', 'beachfront dining', 'surfing']",6.840408,81.836848,"Arugam Bay Beach, Sri Lanka",4.8,1591.0,['Arugam Bay Beach is a surfer's paradise! I s...,0.0,0.0,...,0.0,0.0,0.0,0.0,0.320407,0.0,0.0,0.0,0.0,0.0
1,Mirissa Beach,"['beach visits', 'beachfront dining', 'snorkel...",5.944703,80.459161,"Mirissa, Sri Lanka",4.6,1748.0,['Mirissa Beach is truly a gem on Sri Lanka's ...,0.0,0.0,...,0.0,0.0,0.0,0.0,0.24375,0.283284,0.0,0.472389,0.0,0.0
2,Weligama Beach (surf and stay),"['beach visits', 'surfing']",5.972486,80.435714,"Weligama, Sri Lanka",4.4,325.0,['Weligama Beach is a fantastic spot for both ...,0.0,0.0,...,0.0,0.0,0.0,0.0,0.466457,0.0,0.0,0.0,0.0,0.0
4,Hikkaduwa Beach,"['beach visits', 'snorkeling', 'surfing', 'tur...",6.137727,80.09906,"Hikkaduwa Beach, Sri Lanka",4.7,1438.0,['Hikkaduwa Beach is a delightful escape for s...,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Unawatuna Beach,"['feeding fish', 'paddleboarding', 'snorkeling']",6.009686,80.248424,"Unawatuna Beach, Sri Lanka",4.8,1868.0,['Unawatuna Beach is a slice of paradise! The ...,0.0,0.29693,...,0.814987,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
places_df.describe()

Unnamed: 0,lat,lng,rating,user_ratings_total,adventures,architecture,beach,beachfront,biking,bird,...,tours,turtle,viewing,visit,visits,watching,water,whale,wildlife,yoga
count,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,...,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0
mean,6.939242,80.611912,4.505,3386.325,0.024711,0.056046,0.07629,0.033799,0.010299,0.055271,...,0.030921,0.029621,0.090377,0.012654,0.138734,0.095161,0.010963,0.01181,0.091302,0.013506
std,0.843526,0.482311,0.307137,5725.122573,0.109434,0.12406,0.174681,0.121015,0.065136,0.151833,...,0.143575,0.131617,0.186613,0.080032,0.214173,0.192628,0.069336,0.074691,0.187832,0.085422
min,5.944703,79.84648,3.7,174.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,6.11091,80.234399,4.3,386.25,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,6.809544,80.570662,4.55,1157.5,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,7.668051,80.852582,4.8,2733.5,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.306577,0.0,0.0,0.0,0.0,0.0
max,8.721837,81.836848,5.0,26736.0,0.532799,0.392809,0.592287,0.513895,0.411955,0.588944,...,0.814987,0.658649,0.525533,0.506164,0.618714,0.661589,0.438519,0.472389,0.525533,0.540254


In [14]:
# Normalize ratings and total ratings to bring them to the same scale
scaler = MinMaxScaler()
places_df[['Normalized_Average_Rating', 'Normalized_Total_Ratings']] = scaler.fit_transform(places_df[['rating', 'user_ratings_total']])

print(places_df)


                                            name  \
0                               Arugam Bay Beach   
1                                  Mirissa Beach   
2                 Weligama Beach (surf and stay)   
4                                Hikkaduwa Beach   
6                                Unawatuna Beach   
7                                  Pigeon Island   
8                               Galle Dutch Fort   
9                       Polonnaruwa Ancient City   
10                                      Sigiriya   
11                            Yala National Park   
12                       Udawalawe National Park   
13                        Wilpattu National Park   
14                       Wasgamuwa National Park   
15                       Minneriya National Park   
16                      Sinharaja Forest Reserve   
17                   Horton Plains National Park   
18                          Kumana National Park   
19                         Bundala National Park   
21          

In [15]:
# bucket_list_destination = 'Place A'  # Example bucket list destination

# Add a boost to places that match the bucket list destination
places_df['Bucket_List_Boost'] = places_df['name'].apply(lambda x: 10 if x == bucket_list_destination else 0)


NameError: name 'bucket_list_destination' is not defined

In [2]:
# Example user input: preferred activities and bucket list destination
# user_activities = 'Hiking, Swimming'

# Vectorize the user's activities
user_activities_vector = tfidf.transform([user_activities])

# Calculate the similarity between user preferences and place activities
cosine_sim = cosine_similarity(user_activities_vector, activities_matrix)

# Add similarity scores to the DataFrame
places_df['Similarity_Score'] = cosine_sim[0]

ModuleNotFoundError: No module named 'sklearn'

In [None]:
# Calculate the final score
places_df['Final_Score'] = (
    (places_df['Similarity_Score'] * 0.5) + 
    (places_df['Normalized_Average_Rating'] * 0.2) + 
    (places_df['Bucket_List_Boost'] * 0.1) + 
    (places_df['Normalized_Total_Ratings'] * 0.2)
)

# Sort places based on the final score
recommended_places = places_df.sort_values(by='Final_Score', ascending=False)

# Display top 5 recommended places
print(recommended_places[['Place', 'Final_Score']].head(5))
