In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [2]:
# Load the Excel file
file_path = 'DataSets/PLACES_FINAL.xlsx'
df1 = pd.read_excel(file_path)

In [3]:
df1.head()

Unnamed: 0,name,activities,rating,user_ratings_total,latest_reviews
0,Arugam Bay Beach,"['beach visits', 'beachfront dining', 'surfing']",4.8,1591,['Arugam Bay Beach is a surfer's paradise! I s...
1,Mirissa Beach,"['beach visits', 'beachfront dining', 'snorkel...",4.6,1748,['Mirissa Beach is truly a gem on Sri Lanka's ...
2,Weligama Beach (surf and stay),"['beach visits', 'surfing']",4.4,325,['Weligama Beach is a fantastic spot for both ...
3,Hikkaduwa Beach,"['beach visits', 'snorkeling', 'surfing', 'tur...",4.7,1438,['Hikkaduwa Beach is a delightful escape for s...
4,Tangalle,"['beach visits', 'surfing']",5.0,3,['Tangalle was a bit of a letdown for me. The ...


In [4]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 390 entries, 0 to 389
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   name                390 non-null    object 
 1   activities          390 non-null    object 
 2   rating              390 non-null    float64
 3   user_ratings_total  390 non-null    int64  
 4   latest_reviews      390 non-null    object 
dtypes: float64(1), int64(1), object(3)
memory usage: 15.4+ KB


In [5]:
# Drop rows that contain any null values
data = df1.dropna()
data.head()

Unnamed: 0,name,activities,rating,user_ratings_total,latest_reviews
0,Arugam Bay Beach,"['beach visits', 'beachfront dining', 'surfing']",4.8,1591,['Arugam Bay Beach is a surfer's paradise! I s...
1,Mirissa Beach,"['beach visits', 'beachfront dining', 'snorkel...",4.6,1748,['Mirissa Beach is truly a gem on Sri Lanka's ...
2,Weligama Beach (surf and stay),"['beach visits', 'surfing']",4.4,325,['Weligama Beach is a fantastic spot for both ...
3,Hikkaduwa Beach,"['beach visits', 'snorkeling', 'surfing', 'tur...",4.7,1438,['Hikkaduwa Beach is a delightful escape for s...
4,Tangalle,"['beach visits', 'surfing']",5.0,3,['Tangalle was a bit of a letdown for me. The ...


In [6]:
data.describe()

Unnamed: 0,rating,user_ratings_total
count,390.0,390.0
mean,4.454615,1523.592308
std,0.437058,3444.008948
min,0.9,1.0
25%,4.3,134.25
50%,4.5,354.5
75%,4.775,1224.75
max,5.0,26736.0


In [12]:
# structure of our dataset
# data = {
#     'name': ['Place A', 'Place B', 'Place C', 'Place D', 'Place E'],
#     'rating': [4.5, 3.9, 4.8, 4.2, 4.0],
#     'user_ratings_total': [200, 150, 180, 90, 130],
#     'activities': ['Hiking, Swimming', 'Beach', 'Hiking, Sightseeing', 'Beach, Hiking', 'Sightseeing, Swimming']
# }

places_df = pd.DataFrame(data)

# TF-IDF Vectorizer for Activities
tfidf = TfidfVectorizer(stop_words='english')
activities_matrix = tfidf.fit_transform(places_df['activities'])

# Normalize ratings and total ratings to bring them to the same scale
scaler = MinMaxScaler()
places_df[['Normalized_Average_Rating', 'Normalized_Total_Ratings']] = scaler.fit_transform(places_df[['rating', 'user_ratings_total']])

def recommend_places(preferred_activities, bucket_list_destination):
    # Add a boost to places that match the bucket list destination
    places_df['Bucket_List_Boost'] = places_df['name'].apply(lambda x: 10 if x == bucket_list_destination else 0)

    # Vectorize the user's activities
    user_activities_vector = tfidf.transform([preferred_activities])

    # Calculate the similarity between user preferences and place activities
    cosine_sim = cosine_similarity(user_activities_vector, activities_matrix)

    # Add similarity scores to the DataFrame
    places_df['Similarity_Score'] = cosine_sim[0]

    # Calculate the final score
    places_df['Final_Score'] = (
        (places_df['Similarity_Score'] * 0.5) + 
        (places_df['Normalized_Average_Rating'] * 0.1) + 
        (places_df['Bucket_List_Boost'] * 0.3) + 
        (places_df['Normalized_Total_Ratings'] * 0.1)
    )

    # Sort places based on the final score
    recommended_places = places_df.sort_values(by='Final_Score', ascending=False)

    # Return top 5 recommended places
    return recommended_places[['name', 'Final_Score','activities','rating','user_ratings_total']].head(5)

# Testing the model

In [13]:
# Example user inputs
preferred_activities = 'mountain biking,waterfalls,ayurvedic spa treatments'
bucket_list_destination = 'Ramboda Falls, Bambarakiri Ella, Bentota, Hatton, St ClairsÂ Falls'

# Call the function and display the recommended places
top_places = recommend_places(preferred_activities, bucket_list_destination)
top_places

Unnamed: 0,name,Final_Score,activities,rating,user_ratings_total
103,"Spa Ceylon Heritage Spa & Boutique,galle Fort",0.458787,['ayurvedic spa treatments'],4.6,282
65,Swastha Ayurveda,0.451272,['ayurvedic spa treatments'],4.3,229
27,Knuckles,0.298267,"['butterfly watching', 'hiking', 'landscape ph...",4.9,337
117,Bopath Falls,0.197835,"['waterfalls', 'photography']",4.7,2111
20,Sri Dalada Maligawa,0.195122,"['architecture photography', 'cultural experie...",4.8,26736


# Save model as pickle

In [14]:
# Save the TF-IDF vectorizer
with open('tfidf_model.pkl', 'wb') as file:
    pickle.dump(tfidf, file)

# Save the processed DataFrame (places_df)
with open('places_df.pkl', 'wb') as file:
    pickle.dump(places_df, file)

print("Models saved successfully.")


Models saved successfully.


In [None]:
# # Load the TF-IDF vectorizer
# with open('tfidf_model.pkl', 'rb') as file:
#     loaded_tfidf = pickle.load(file)

# # Load the processed DataFrame (places_df)
# with open('places_df.pkl', 'rb') as file:
#     loaded_places_df = pickle.load(file)

# print("Models loaded successfully.")
