In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity      # To measure how similar two things (like users or products) are
from sklearn.feature_extraction.text import TfidfVectorizer  # To convert text into numbers so a model can understand it
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the four source tables, in the same order as before, from the working directory.
destinations_df, reviews_df, userhistory_df, users_df = (
    pd.read_csv(csv_name)
    for csv_name in (
        "Expanded_Destinations.csv",
        "Final_Updated_Expanded_Reviews.csv",
        "Final_Updated_Expanded_UserHistory.csv",
        "Final_Updated_Expanded_Users.csv",
    )
)

In [3]:

# Step 1: attach destination attributes to every review.
reviews_destinations = reviews_df.merge(destinations_df, on='DestinationID', how='inner')

# Step 2: attach each reviewer's visit history. Both sides carry a DestinationID
# column, which is why the result has DestinationID_x / DestinationID_y.
reviews_destinations_userhistory = reviews_destinations.merge(
    userhistory_df, on='UserID', how='inner'
)

# Step 3: attach the user profile. Name collides with the destination name,
# producing Name_x (destination) and Name_y (user).
df = reviews_destinations_userhistory.merge(users_df, on='UserID', how='inner')

df

Unnamed: 0,ReviewID,DestinationID_x,UserID,Rating,ReviewText,Name_x,State,Type,Popularity,BestTimeToVisit,HistoryID,DestinationID_y,VisitDate,ExperienceRating,Name_y,Email,Preferences,Gender,NumberOfAdults,NumberOfChildren
0,1,178,327,2,Incredible monument!,Jaipur City,Rajasthan,City,8.544352,Oct-Mar,79,175,2024-01-01,3,Pooja,pooja@example.com,"City, Historical",Female,1,1
1,2,411,783,1,Loved the beaches!,Taj Mahal,Uttar Pradesh,Historical,8.284127,Nov-Feb,834,894,2024-03-20,2,Karan,karan@example.com,"City, Historical",Male,1,1
2,4,358,959,3,Incredible monument!,Jaipur City,Rajasthan,City,7.738761,Oct-Mar,998,660,2024-02-15,4,Ritvik,ritvik@example.com,"Nature, Adventure",Male,1,1
3,5,989,353,2,Loved the beaches!,Kerala Backwaters,Kerala,Nature,8.208088,Sep-Mar,202,894,2024-01-01,5,Isha,isha@example.com,"Nature, Adventure",Female,2,0
4,6,473,408,4,A historical wonder,Jaipur City,Rajasthan,City,8.138558,Oct-Mar,331,403,2024-01-01,2,Ishaan,ishaan@example.com,"City, Historical",Male,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
988,991,701,850,3,Incredible monument!,Taj Mahal,Uttar Pradesh,Historical,8.814029,Nov-Feb,138,131,2024-03-20,1,Hitesh,hitesh@example.com,"Beaches, Historical",Male,2,0
989,991,701,850,3,Incredible monument!,Taj Mahal,Uttar Pradesh,Historical,8.814029,Nov-Feb,643,761,2024-01-01,4,Hitesh,hitesh@example.com,"Beaches, Historical",Male,2,0
990,995,231,346,5,Loved the beaches!,Taj Mahal,Uttar Pradesh,Historical,7.788256,Nov-Feb,454,113,2024-01-01,2,Hitesh,hitesh@example.com,"Beaches, Historical",Male,2,2
991,995,231,346,5,Loved the beaches!,Taj Mahal,Uttar Pradesh,Historical,7.788256,Nov-Feb,556,128,2024-01-01,4,Hitesh,hitesh@example.com,"Beaches, Historical",Male,2,2


# Recommendation Model

# Content-based filtering

In [4]:
 # Combine relevant destination columns into a single text feature for recommendations
df['features'] = df['Type'] + ' ' + df['State'] + ' ' + df['BestTimeToVisit'] + " " + df['Preferences']

# The similarity matrix must be built from destinations_df (one row per destination),
# not from the merged review-level df: the scores are looked up by destination and the
# resulting positions are fed into destinations_df.iloc, so row i of cosine_sim has to
# describe row i of destinations_df. df['features'] above is kept only as a column on df.
destination_text = (
    destinations_df['Type'].fillna('') + ' '
    + destinations_df['State'].fillna('') + ' '
    + destinations_df['BestTimeToVisit'].fillna('')
)

# Convert the text into TF-IDF vectors and compute destination-to-destination similarity.
# stop_words='english' removes common words like "the", "is", "and" to focus on meaningful ones.
vectorizer = TfidfVectorizer(stop_words='english')
destination_features = vectorizer.fit_transform(destination_text)
cosine_sim = cosine_similarity(destination_features, destination_features)

user_id = 1
top_n = 5  # number of recommendations to return
visited_destinations = userhistory_df.loc[userhistory_df['UserID'] == user_id, 'DestinationID'].values

# Locate visited destinations by matching DestinationID rather than assuming
# "row position == DestinationID - 1", which only holds for contiguous, ordered IDs.
visited_rows = destinations_df['DestinationID'].isin(visited_destinations).to_numpy()
similar_scores = cosine_sim[visited_rows].sum(axis=0)

# Highest aggregate similarity first; the stable sort keeps ties in destinations_df row order.
recommended_destinations_idx = np.argsort(-similar_scores, kind='stable')

# Walk the ranking, skip anything already visited, and keep the first top_n rows.
ranked_destinations = destinations_df.iloc[recommended_destinations_idx]
not_yet_visited = ~ranked_destinations['DestinationID'].isin(visited_destinations).to_numpy()
recommended_destinations = (
    ranked_destinations.loc[
        not_yet_visited,
        ['DestinationID', 'Name', 'State', 'Type', 'Popularity', 'BestTimeToVisit'],
    ]
    .head(top_n)
    .reset_index(drop=True)
)
recommended_destinations


# Collaborative filtering

In [5]:
# Users x destinations matrix of experience ratings.
# DataFrame.pivot raises ValueError as soon as one user has two history rows for the
# same destination; averaging per (UserID, DestinationID) first gives the same matrix
# when pairs are unique and a sensible value when they are not.
# Unrated cells become 0 so cosine similarity can be computed on a dense matrix.
user_item_matrix = (
    userhistory_df
    .groupby(['UserID', 'DestinationID'])['ExperienceRating']
    .mean()
    .unstack()
    .fillna(0)
)

# user_similarity[i, j] is the cosine similarity between the i-th and j-th rows
# (users) of user_item_matrix, in the order of user_item_matrix.index.
user_similarity = cosine_similarity(user_item_matrix)

In [6]:
user_id = 1

# Look the user up by label. Rows of user_similarity follow user_item_matrix.index,
# which matches "user_id - 1" only when UserIDs are contiguous and start at 1.
# get_loc raises KeyError for a user with no history instead of silently using another row.
user_pos = user_item_matrix.index.get_loc(user_id)
similar_users = user_similarity[user_pos]

# Rank the other users by similarity (stable sort for deterministic ties) and drop the
# target user explicitly instead of assuming they always sort into first place.
ranked_users_idx = np.argsort(-similar_users, kind='stable')
similar_users_idx = ranked_users_idx[ranked_users_idx != user_pos][:5]

# Average rating each destination received from the five nearest neighbours.
similar_user_ratings = user_item_matrix.iloc[similar_users_idx].mean(axis=0)

# Never recommend a destination the user already has in their own history.
already_visited = userhistory_df.loc[userhistory_df['UserID'] == user_id, 'DestinationID'].unique()
recommended_destinations_ids = (
    similar_user_ratings
    .drop(index=already_visited, errors='ignore')
    .sort_values(ascending=False)
    .head(5)
    .index
)

# An isin() filter alone returns rows in destinations_df order, discarding the ranking;
# sort the matches back into recommendation order (best first).
rank_of = {dest_id: rank for rank, dest_id in enumerate(recommended_destinations_ids)}
collaborative_recommendations = (
    destinations_df.loc[
        destinations_df['DestinationID'].isin(recommended_destinations_ids),
        ['DestinationID', 'Name', 'State', 'Type', 'Popularity', 'BestTimeToVisit'],
    ]
    .sort_values('DestinationID', key=lambda ids: ids.map(rank_of))
)

collaborative_recommendations


Unnamed: 0,DestinationID,Name,State,Type,Popularity,BestTimeToVisit
61,62,Goa Beaches,Goa,Beach,9.379787,Nov-Mar
80,81,Taj Mahal,Uttar Pradesh,Historical,8.11373,Nov-Feb
295,296,Taj Mahal,Uttar Pradesh,Historical,8.013452,Nov-Feb
388,389,Kerala Backwaters,Kerala,Nature,9.409146,Sep-Mar
778,779,Kerala Backwaters,Kerala,Nature,8.861507,Sep-Mar


# Train Model

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor  
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score

In [8]:
# Reload the saved snapshot of the merged table from disk.
# NOTE(review): no later cell shown here reads `data`; the preview below and the
# training cells keep using the in-memory `df`. Confirm whether `data` was meant
# to replace `df`, and where 'final_df.csv' is written.
data = pd.read_csv(filepath_or_buffer='final_df.csv')
df.iloc[:2]

Unnamed: 0,ReviewID,DestinationID_x,UserID,Rating,ReviewText,Name_x,State,Type,Popularity,BestTimeToVisit,...,DestinationID_y,VisitDate,ExperienceRating,Name_y,Email,Preferences,Gender,NumberOfAdults,NumberOfChildren,features
0,1,178,327,2,Incredible monument!,Jaipur City,Rajasthan,City,8.544352,Oct-Mar,...,175,2024-01-01,3,Pooja,pooja@example.com,"City, Historical",Female,1,1,"City Rajasthan Oct-Mar City, Historical"
1,2,411,783,1,Loved the beaches!,Taj Mahal,Uttar Pradesh,Historical,8.284127,Nov-Feb,...,894,2024-03-20,2,Karan,karan@example.com,"City, Historical",Male,1,1,"Historical Uttar Pradesh Nov-Feb City, Historical"


In [9]:
# Model inputs: destination descriptors plus the traveller's profile columns.
x = df.loc[:, [
    'Name_x',
    'State',
    'Type',
    'BestTimeToVisit',
    'Preferences',
    'Gender',
    'NumberOfAdults',
    'NumberOfChildren',
]]
# Regression target: the Popularity column of the merged frame.
y = df.loc[:, 'Popularity']


In [10]:
# Preview the regression target (the Popularity column selected from df).
y

0      8.544352
1      8.284127
2      7.738761
3      8.208088
4      8.138558
         ...   
988    8.814029
989    8.814029
990    7.788256
991    7.788256
992    8.501225
Name: Popularity, Length: 993, dtype: float64

In [11]:
# Preview the feature frame (the eight columns selected from df).
x

Unnamed: 0,Name_x,State,Type,BestTimeToVisit,Preferences,Gender,NumberOfAdults,NumberOfChildren
0,Jaipur City,Rajasthan,City,Oct-Mar,"City, Historical",Female,1,1
1,Taj Mahal,Uttar Pradesh,Historical,Nov-Feb,"City, Historical",Male,1,1
2,Jaipur City,Rajasthan,City,Oct-Mar,"Nature, Adventure",Male,1,1
3,Kerala Backwaters,Kerala,Nature,Sep-Mar,"Nature, Adventure",Female,2,0
4,Jaipur City,Rajasthan,City,Oct-Mar,"City, Historical",Male,2,0
...,...,...,...,...,...,...,...,...
988,Taj Mahal,Uttar Pradesh,Historical,Nov-Feb,"Beaches, Historical",Male,2,0
989,Taj Mahal,Uttar Pradesh,Historical,Nov-Feb,"Beaches, Historical",Male,2,0
990,Taj Mahal,Uttar Pradesh,Historical,Nov-Feb,"Beaches, Historical",Male,2,2
991,Taj Mahal,Uttar Pradesh,Historical,Nov-Feb,"Beaches, Historical",Male,2,2


In [12]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
# Count how many rows of df carry each distinct Preferences value.
df['Preferences'].value_counts()

Preferences
Beaches, Historical    345
Nature, Adventure      328
City, Historical       320
Name: count, dtype: int64

In [14]:
# Text-valued columns that must be turned into numbers before the regressor sees them.
categorical_columns = [
    'Name_x',
    'State',
    'Type',
    'BestTimeToVisit',
    'Preferences',
    'Gender',
]

# Ordinal-encode the categorical columns; categories not seen during fit are encoded as -1.
# Every other column is passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        (
            'cat',
            OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
            categorical_columns,
        ),
    ],
    remainder='passthrough',
)

In [15]:
# Print the column dtypes, then the first rows, of the training features.
print(x_train.dtypes, x_train.head(), sep='\n')


Name_x              object
State               object
Type                object
BestTimeToVisit     object
Preferences         object
Gender              object
NumberOfAdults       int64
NumberOfChildren     int64
dtype: object
                Name_x              State        Type BestTimeToVisit  \
907          Taj Mahal      Uttar Pradesh  Historical         Nov-Feb   
923  Kerala Backwaters             Kerala      Nature         Sep-Mar   
660        Jaipur City          Rajasthan        City         Oct-Mar   
877          Taj Mahal      Uttar Pradesh  Historical         Nov-Feb   
909         Leh Ladakh  Jammu and Kashmir   Adventure         Apr-Jun   

             Preferences  Gender  NumberOfAdults  NumberOfChildren  
907  Beaches, Historical  Female               1                 2  
923    Nature, Adventure  Female               2                 1  
660  Beaches, Historical  Female               2                 2  
877  Beaches, Historical  Female               2       

In [16]:
# End-to-end model: encode the inputs, then fit a seeded random-forest regressor.
pipe = Pipeline(
    [
        ('preprocessing', preprocessor),
        ('regressor', RandomForestRegressor(random_state=42)),
    ]
)

In [17]:
# Fit on the training split (fit returns the pipeline itself), then predict the held-out rows.
p = pipe.fit(x_train, y_train).predict(x_test)

In [18]:
# Report held-out error and explained variance, one metric per line.
print(
    f"Mean Squared Error: {mean_squared_error(y_test, p):.2f}",
    f"R² Score: {r2_score(y_test,p):.2f}",
    sep="\n",
)

Mean Squared Error: 0.30
R² Score: 0.07


In [19]:
# One-row frame describing a single traveller/destination combination.
# Column names must match those the pipeline was fitted on.
user_input = pd.DataFrame({
    'Name_x': ['Jaipur City'],
    'Type': ['City'],
    'State': ['Rajasthan'],
    'BestTimeToVisit': ['Oct-Mar'],
    'Preferences': ['City, Historical'],
    'Gender': ['Female'],
    'NumberOfAdults': [2],
    'NumberOfChildren': [1],
})

# predict returns one value per input row; take the only one.
prediction = pipe.predict(user_input)[0]
print(f"Predicted Popularity Score: {prediction:.2f}")


Predicted Popularity Score: 7.88


In [24]:
import pickle

# Serialise the fitted pipeline (preprocessing + regressor) to model.pkl.
with open("model.pkl", mode='wb') as model_file:
    pickle.dump(pipe, model_file)
