In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [2]:
# Load the Excel file
file_path = 'DataSets/PLACES_FINAL.xlsx'
df = pd.read_excel(file_path)

In [3]:
# Remove some columns from data set
df = df.drop(['latest_reviews'], axis = 1)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 390 entries, 0 to 389
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   name                390 non-null    object 
 1   activities          390 non-null    object 
 2   rating              390 non-null    float64
 3   user_ratings_total  390 non-null    int64  
dtypes: float64(1), int64(1), object(2)
memory usage: 12.3+ KB


In [4]:
df.head()

Unnamed: 0,name,activities,rating,user_ratings_total
0,Arugam Bay Beach,"['beach visits', 'beachfront dining', 'surfing']",4.8,1591
1,Mirissa Beach,"['beach visits', 'beachfront dining', 'snorkel...",4.6,1748
2,Weligama Beach (surf and stay),"['beach visits', 'surfing']",4.4,325
3,Hikkaduwa Beach,"['beach visits', 'snorkeling', 'surfing', 'tur...",4.7,1438
4,Tangalle,"['beach visits', 'surfing']",5.0,3


In [5]:
# Structure of our dataset
# data = {
#     'name': ['Place A', 'Place B', 'Place C'],
#     'rating': [4.5, 3.9, 4.8],
#     'user_ratings_total': [200, 150, 180],
#     'activities': ['Hiking, Swimming', 'Beach', 'Hiking, Sightseeing']
# }

places_df = pd.DataFrame(df)

# TF-IDF Vectorizer for Activities
tfidf = TfidfVectorizer(stop_words='english')
activities_matrix = tfidf.fit_transform(places_df['activities'])

# Convert the matrix to a DataFrame to see the result
activities_df = pd.DataFrame(activities_matrix.toarray(), columns=tfidf.get_feature_names_out())

In [6]:
activities_df.head()

Unnamed: 0,adventures,air,animal,archaeological,architecture,art,arts,ayurvedic,ballooning,beach,...,village,visits,walks,watching,water,waterfalls,whale,wildlife,workshops,yoga
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.369185,...,0.0,0.332438,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.285771,...,0.0,0.257326,0.0,0.273668,0.0,0.0,0.439083,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.495307,...,0.0,0.446006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333623,...,0.0,0.300415,0.0,0.319493,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.495307,...,0.0,0.446006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Add the TF-IDF vectorized activities back to the main DataFrame
places_df = pd.concat([places_df, activities_df], axis=1)
places_df.head()

Unnamed: 0,name,activities,rating,user_ratings_total,adventures,air,animal,archaeological,architecture,art,...,village,visits,walks,watching,water,waterfalls,whale,wildlife,workshops,yoga
0,Arugam Bay Beach,"['beach visits', 'beachfront dining', 'surfing']",4.8,1591,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.332438,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Mirissa Beach,"['beach visits', 'beachfront dining', 'snorkel...",4.6,1748,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.257326,0.0,0.273668,0.0,0.0,0.439083,0.0,0.0,0.0
2,Weligama Beach (surf and stay),"['beach visits', 'surfing']",4.4,325,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.446006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Hikkaduwa Beach,"['beach visits', 'snorkeling', 'surfing', 'tur...",4.7,1438,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.300415,0.0,0.319493,0.0,0.0,0.0,0.0,0.0,0.0
4,Tangalle,"['beach visits', 'surfing']",5.0,3,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.446006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
places_df.describe()

Unnamed: 0,rating,user_ratings_total,adventures,air,animal,archaeological,architecture,art,arts,ayurvedic,...,village,visits,walks,watching,water,waterfalls,whale,wildlife,workshops,yoga
count,390.0,390.0,390.0,390.0,390.0,390.0,390.0,390.0,390.0,390.0,...,390.0,390.0,390.0,390.0,390.0,390.0,390.0,390.0,390.0,390.0
mean,4.454615,1523.592308,0.055382,0.002407,0.014719,0.015658,0.05086,0.005703,0.011297,0.002961,...,0.001189,0.065839,0.018342,0.060605,0.006207,0.08241,0.012081,0.057529,0.0049,0.002648
std,0.437058,3444.008948,0.14646,0.033562,0.088927,0.084645,0.136903,0.05156,0.067479,0.041292,...,0.023484,0.153003,0.082295,0.153944,0.056891,0.223689,0.082008,0.147376,0.04835,0.037407
min,0.9,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.3,134.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,4.5,354.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,4.775,1224.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,5.0,26736.0,0.707107,0.469276,0.707107,0.763799,0.666996,0.6311,0.50892,0.57735,...,0.463762,0.669155,0.556714,0.666257,0.707107,1.0,0.848657,0.707107,0.517728,0.599055


In [9]:
# Normalize ratings and total ratings to bring them to the same scale
scaler = MinMaxScaler()
# Apply the scaler to the columns and assign the result
places_df[['Normalized_Average_Rating', 'Normalized_Total_Ratings']] = pd.DataFrame(
    scaler.fit_transform(places_df[['rating', 'user_ratings_total']]),
    columns=['Normalized_Average_Rating', 'Normalized_Total_Ratings'],
    index=places_df.index  # Ensure the index matches the original DataFrame
)

In [49]:
places_df.head()

Unnamed: 0,name,activities,rating,user_ratings_total,accommodations,activities.1,adventure,adventures,amusement,animal,...,water,waterfall,waterfalls,wellness,whale,wildlife,workshops,yoga,Normalized_Average_Rating,Normalized_Total_Ratings
0,Arugam Bay Beach,"['beach visits', 'beachfront dining', 'surfing']",4.8,1591,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.95122,0.059473
1,Mirissa Beach,"['beach visits', 'beachfront dining', 'snorkel...",4.6,1748,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.460168,0.0,0.0,0.0,0.902439,0.065345
2,Weligama Beach (surf and stay),"['beach visits', 'surfing']",4.4,325,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.853659,0.012119
3,Hikkaduwa Beach,"['beach visits', 'snorkeling', 'surfing', 'tur...",4.7,1438,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.926829,0.05375
4,Tangalle,"['beach visits', 'surfing']",5.0,3,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,7.5e-05


In [15]:

# Add a boost to places that match the bucket list destination
places_df['Bucket_List_Boost'] = places_df['name'].apply(lambda x: 10 if x == bucket_list_destination else 0)


NameError: name 'bucket_list_destination' is not defined

In [2]:
# Example user input: preferred activities and bucket list destination
# user_activities = 'Hiking, Swimming'

# Vectorize the user's activities
user_activities_vector = tfidf.transform([user_activities])

# Calculate the similarity between user preferences and place activities
cosine_sim = cosine_similarity(user_activities_vector, activities_matrix)

# Add similarity scores to the DataFrame
places_df['Similarity_Score'] = cosine_sim[0]

ModuleNotFoundError: No module named 'sklearn'

In [None]:
# Calculate the final score
places_df['Final_Score'] = (
    (places_df['Similarity_Score'] * 0.5) + 
    (places_df['Normalized_Average_Rating'] * 0.2) + 
    (places_df['Bucket_List_Boost'] * 0.1) + 
    (places_df['Normalized_Total_Ratings'] * 0.2)
)

# Sort places based on the final score
recommended_places = places_df.sort_values(by='Final_Score', ascending=False)

# Display top 5 recommended places
print(recommended_places[['Place', 'Final_Score']].head(5))
