In [138]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

import spacy
from sklearn.metrics.pairwise import cosine_similarity

# Load Data

In [102]:
data = pd.read_csv('FINAL_DATA.csv')
tours_data = pd.read_csv('final_tours_and_adventures.csv')

In [103]:
data.head(3)

Unnamed: 0,experience_id,user,liked,shared,bucketlist,purchased,attended,score,age,avg_accomodation_cost,avg_transport_cost,name,description,adventureLevel,price,gender_Male,featured,rating
0,64fc9b6b3d690a3e195ee90a,1.0,0.0,0.0,0.0,0.0,0.0,0.0,35.0,1200.0,600.0,Trip to Nairobi Snake Park,\nDive into the fascinating world of reptiles ...,6.0,5555.0,1.0,1.0,1.173121
1,6507ee68313443081a27234a,1.0,0.0,0.0,0.0,0.0,0.0,0.0,35.0,1200.0,600.0,Nairobi Mamba Village,Dive into an unforgettable reptilian adventure...,6.0,5555.0,1.0,1.0,1.173121
2,64fc7a953d690a3e195ee83c,1.0,0.0,0.0,0.0,0.0,0.0,0.0,35.0,1200.0,600.0,Hangout with Giraffes,Step into a world of wonder at the Giraffe Cen...,6.5,5555.0,1.0,0.0,1.173121


In [104]:
# Feature selection
selected_features = ['user', 'liked', 'shared', 'bucketlist', 'purchased', 'attended', 'score', 'age', 'avg_accomodation_cost',
                     'avg_transport_cost', 'adventureLevel', 'price', 'gender_Male', 'featured', 'rating']
numerical_data = data[selected_features]

In [105]:
# Correlation Analysis
correlation_matrix = numerical_data.corr()
correlation_matrix

Unnamed: 0,user,liked,shared,bucketlist,purchased,attended,score,age,avg_accomodation_cost,avg_transport_cost,adventureLevel,price,gender_Male,featured,rating
user,1.0,-0.019419,0.03169,0.04555,0.015096,0.002711,0.026521,0.04548,0.348113,0.376507,-0.009719,0.017873,-0.035855,0.00663,-0.141861
liked,-0.019419,1.0,0.0487,0.099637,0.113508,0.049935,0.503237,-0.028578,0.013683,0.004538,-0.015209,-0.010434,-0.051211,0.016357,-0.009264
shared,0.03169,0.0487,1.0,0.277283,0.12503,0.054721,0.567466,-0.008321,0.030547,0.017869,-0.013709,-0.005771,0.02138,0.008014,-0.026087
bucketlist,0.04555,0.099637,0.277283,1.0,0.301398,0.191238,0.656101,-0.016731,0.03179,0.02315,0.012155,0.007894,0.007539,0.006886,-0.026488
purchased,0.015096,0.113508,0.12503,0.301398,1.0,0.571011,0.654072,-0.009563,0.020597,0.023278,0.000217,0.029905,-0.016376,-0.017843,0.000112
attended,0.002711,0.049935,0.054721,0.191238,0.571011,1.0,0.516113,0.005371,0.005573,0.005672,-0.014783,0.016404,-0.031254,-0.015853,0.009578
score,0.026521,0.503237,0.567466,0.656101,0.654072,0.516113,1.0,-0.020701,0.038683,0.026357,-0.006858,0.009854,-0.021826,0.000522,-0.016666
age,0.04548,-0.028578,-0.008321,-0.016731,-0.009563,0.005371,-0.020701,1.0,-0.011492,-0.030067,0.036702,-0.007735,0.411349,-0.01271,0.088797
avg_accomodation_cost,0.348113,0.013683,0.030547,0.03179,0.020597,0.005573,0.038683,-0.011492,1.0,0.884887,-0.012675,0.021101,-0.054355,-0.005922,-0.205835
avg_transport_cost,0.376507,0.004538,0.017869,0.02315,0.023278,0.005672,0.026357,-0.030067,0.884887,1.0,-0.004661,0.021708,-0.069883,-0.011281,-0.227383


# Random Forest Regressor to Predict Score based on Correlated Features

In [106]:
# Impute missing values
imputer = SimpleImputer()
data_imputed = pd.DataFrame(imputer.fit_transform(numerical_data), columns=numerical_data.columns)

# Extract features and target
features = ['liked', 'shared', 'bucketlist', 'purchased', 'attended', 'avg_accomodation_cost', 'avg_transport_cost',
            'price', 'featured', 'rating', 'gender_Male', 'price']
target = 'score'

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data_imputed[features], data_imputed[target],
                                                    test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create and train the RandomForestRegressor
model = RandomForestRegressor(random_state=42)
model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test_scaled)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

# Add predicted scores to the original DataFrame
data['predicted_score'] = model.predict(scaler.transform(data_imputed[features]))

Mean Squared Error: 0.020013121717830755


In [107]:
data.sample()

Unnamed: 0,experience_id,user,liked,shared,bucketlist,purchased,attended,score,age,avg_accomodation_cost,avg_transport_cost,name,description,adventureLevel,price,gender_Male,featured,rating,predicted_score
133,64fc9b6b3d690a3e195ee90a,5.0,1.0,0.0,0.0,0.0,0.0,1.0,26.0,700.0,200.0,Trip to Nairobi Snake Park,\nDive into the fascinating world of reptiles ...,6.0,5555.0,0.0,1.0,0.115641,1.0


In [108]:
"""
Matrix where rows represent users, columns represent experiences, and the values indicate their predicted score

"""

user_scored_experiences = data.pivot_table(index='user', columns='experience_id', values='predicted_score', fill_value=0)

In [109]:
user_scored_experiences.head()

experience_id,64dfb10e7792cee05d3328d3,64fc2511148fd2e0b23d5031,64fc3d483d690a3e195ee6a4,64fc46763d690a3e195ee6c6,64fc4dd63d690a3e195ee6ee,64fc7a953d690a3e195ee83c,64fc7f3d3d690a3e195ee882,64fc8bc73d690a3e195ee898,64fc90f13d690a3e195ee8e0,64fc9b6b3d690a3e195ee90a,64fca0063d690a3e195ee937,64fca2693d690a3e195ee94d,64fca7ce3d690a3e195ee97a,6507ee68313443081a27234a,65083cd6313443081a272366,65285a2e66f321cbd9ef4ba0
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1.0,2.284286,1.6,1.166667,3.0,2.5,2.0,2.25,1.028571,1.0,1.5,2.333333,1.666667,1.0,1.666667,1.0,1.831667
2.0,2.05,4.0,1.0,0.5,2.0,0.0,1.333333,2.0,1.0,2.0,0.0,2.91,0.0,0.0,0.0,1.5
3.0,4.78,0.0,1.0,1.0,0.0,0.046345,2.0,0.0,3.0,1.0,0.0,0.0,2.74,1.0,1.0,1.0
4.0,0.0,2.87,0.0,1.01,1.0,0.0,1.0,0.0,3.0,1.0,1.0,2.0,0.0,0.0,0.0,0.0
5.0,0.0,0.0,2.0,3.0,1.0,0.0,1.0,1.0,0.0,1.0,2.0,1.0,0.0,0.0,0.0,0.0


In [110]:
# User-Experience Interaction Matrix
user_scored_experiences_matrix = user_scored_experiences.values

# KMeans Clustering

In [111]:
"""
We use K-Means clustering to group users into clusters based on their predicted scores. It finds the 
optimal number of clusters using the Silhouette score and then applies K-Means with that number of clusters.
"""

max_clusters = 10
best_score = -1
best_cluster = 0
for n_clusters in range(2, max_clusters + 1):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    cluster_labels = kmeans.fit_predict(user_scored_experiences_matrix)
    score = silhouette_score(user_scored_experiences_matrix, cluster_labels)
    if score > best_score:
        best_score = score
        best_cluster = n_clusters
        
print(f'Optimal Cluster Values: {best_cluster}')

Optimal Cluster Values: 4


In [112]:
# Fit K-Means with the best number of clusters
kmeans = KMeans(n_clusters=best_cluster, random_state=42, n_init=10)
user_scored_experiences['cluster'] = kmeans.fit_predict(user_scored_experiences_matrix)

In [113]:
user_scored_experiences

experience_id,64dfb10e7792cee05d3328d3,64fc2511148fd2e0b23d5031,64fc3d483d690a3e195ee6a4,64fc46763d690a3e195ee6c6,64fc4dd63d690a3e195ee6ee,64fc7a953d690a3e195ee83c,64fc7f3d3d690a3e195ee882,64fc8bc73d690a3e195ee898,64fc90f13d690a3e195ee8e0,64fc9b6b3d690a3e195ee90a,64fca0063d690a3e195ee937,64fca2693d690a3e195ee94d,64fca7ce3d690a3e195ee97a,6507ee68313443081a27234a,65083cd6313443081a272366,65285a2e66f321cbd9ef4ba0,cluster
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1.0,2.284286,1.60,1.166667,3.00,2.50,2.000000,2.250000,1.028571,1.000000,1.5,2.333333,1.666667,1.000000,1.666667,1.0,1.831667,2
2.0,2.050000,4.00,1.000000,0.50,2.00,0.000000,1.333333,2.000000,1.000000,2.0,0.000000,2.910000,0.000000,0.000000,0.0,1.500000,1
3.0,4.780000,0.00,1.000000,1.00,0.00,0.046345,2.000000,0.000000,3.000000,1.0,0.000000,0.000000,2.740000,1.000000,1.0,1.000000,3
4.0,0.000000,2.87,0.000000,1.01,1.00,0.000000,1.000000,0.000000,3.000000,1.0,1.000000,2.000000,0.000000,0.000000,0.0,0.000000,1
5.0,0.000000,0.00,2.000000,3.00,1.00,0.000000,1.000000,1.000000,0.000000,1.0,2.000000,1.000000,0.000000,0.000000,0.0,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135.0,1.000000,0.00,0.000000,0.00,2.00,0.000000,1.995000,1.000000,0.000000,2.0,0.500000,1.500000,2.000000,3.000000,1.0,0.000000,2
136.0,0.500000,0.00,0.000000,0.00,2.08,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,4.000000,1.0,3.020000,2
137.0,0.000000,0.50,1.000000,0.00,2.50,2.510000,0.000000,1.333333,0.000000,2.0,0.000000,0.000000,2.000000,0.000000,0.0,0.000000,0
138.0,1.000000,0.00,2.000000,2.00,0.00,0.000000,2.000000,1.000000,1.333333,1.5,3.710000,2.000000,1.666667,0.000000,0.0,1.000000,0


In [114]:
def cross_recommendations(user_id, num_recommendations=5):
    # Assuming 'user_scored_experiences' contains the clustered data
    user_cluster = user_scored_experiences.loc[user_id, 'cluster']
    
    # Select users in the same cluster
    cluster_users = user_scored_experiences[user_scored_experiences['cluster'] == user_cluster]
    
    # Get liked experiences of the target user
    user_liked_experiences = user_scored_experiences.loc[user_id, user_scored_experiences.columns[user_scored_experiences.loc[user_id] > 0]].index.tolist()
    
    recommendations = []
    
    # Iterate over other users in the cluster
    for idx, row in cluster_users.iterrows():
        if idx != user_id:
            # Get liked experiences of the current user in the cluster
            liked_experiences = row[row > 0].index.tolist()
            
            # Recommend experiences that the target user hasn't liked
            for exp_id in liked_experiences:
                if exp_id not in user_liked_experiences and exp_id not in recommendations:
                    recommendations.append(exp_id)
                    if len(recommendations) == num_recommendations:
                        return recommendations

In [115]:
# Example usage
user_id = 2  # Replace with the desired user ID
recommendations = cross_recommendations(user_id, num_recommendations=5)
print(f"Cross-Recommendations for User {user_id}: {recommendations}")

Cross-Recommendations for User 2: ['64fca0063d690a3e195ee937', '64fca7ce3d690a3e195ee97a', '65083cd6313443081a272366', '64fc7a953d690a3e195ee83c', '6507ee68313443081a27234a']


In [127]:
# show recommended experiences
experiences_data = pd.read_csv('final_experiences_data.csv')
experiences_data = experiences_data[['_id', 'name', 'description']]

In [128]:
experiences_data['description'] = experiences_data['description'].str.replace('\n', ' ')

In [129]:
experiences_data.head()

Unnamed: 0,_id,name,description
0,64dfb10e7792cee05d3328d3,Wasini Day Trip,Look for dolphins as you cruise the Indian Oce...
1,64fc2511148fd2e0b23d5031,Game Drive at Nairobi National Park,Experience the best of both worlds at Nairobi ...
2,64fc3d483d690a3e195ee6a4,Out of Africa Experience : Karen Blixen Museum,Step into the captivating world of Karen Blixe...
3,64fc46763d690a3e195ee6c6,Lunch with Elephants : Sheldrick Wildlife Trust,Embark on a transformative journey at the Shel...
4,64fc4dd63d690a3e195ee6ee,Nairobi National Museum Tour,Explore the wonders of Kenya's rich heritage a...


In [130]:
# Create a DataFrame for the recommended experiences
user_id = 123
recommended_experiences_df = experiences_data[experiences_data['_id'].isin(cross_recommendations(user_id))]

In [131]:
recommended_experiences_df

Unnamed: 0,_id,name,description
0,64dfb10e7792cee05d3328d3,Wasini Day Trip,Look for dolphins as you cruise the Indian Oce...
1,64fc2511148fd2e0b23d5031,Game Drive at Nairobi National Park,Experience the best of both worlds at Nairobi ...
3,64fc46763d690a3e195ee6c6,Lunch with Elephants : Sheldrick Wildlife Trust,Embark on a transformative journey at the Shel...
7,64fc8bc73d690a3e195ee898,Souvenir Shopping at Maasai Market,Dive into the vibrant world of Kenyan artistry...
8,64fc90f13d690a3e195ee8e0,Kazuri Beads Factory Tour,"Step into the world of Kazuri Beads Factory, w..."


# Content Based filtering based on the name and description of recommended experiences

In [132]:
recommended_experiences_df

Unnamed: 0,_id,name,description
0,64dfb10e7792cee05d3328d3,Wasini Day Trip,Look for dolphins as you cruise the Indian Oce...
1,64fc2511148fd2e0b23d5031,Game Drive at Nairobi National Park,Experience the best of both worlds at Nairobi ...
3,64fc46763d690a3e195ee6c6,Lunch with Elephants : Sheldrick Wildlife Trust,Embark on a transformative journey at the Shel...
7,64fc8bc73d690a3e195ee898,Souvenir Shopping at Maasai Market,Dive into the vibrant world of Kenyan artistry...
8,64fc90f13d690a3e195ee8e0,Kazuri Beads Factory Tour,"Step into the world of Kazuri Beads Factory, w..."


In [133]:
tours_data

Unnamed: 0,name,imageCover,price,description
0,Great Migration at Masai Mara Budget Safari,https://cloudfront.safaribookings.com/lib/keny...,550.0,Witness an amazing annual event of the great m...
1,2Nights Amboseli National Park Aa Lodge Amboseli,https://cloudfront.safaribookings.com/lib/keny...,540.0,"This amazing wild safari tour covers 3days, 2n..."
2,Budget Tour to Lake Nakuru and Masai Mara,https://cloudfront.safaribookings.com/lib/keny...,765.0,Experience abundant wildlife (including the Bi...
3,"Masai Mara, Lake Nakuru National Park Lodge Sa...",https://cloudfront.safaribookings.com/lib/keny...,1080.0,The Masai Mara National Reserve is located in ...
4,Maasai Mara Mid-Range Safari(Group Joining Saf...,https://cloudfront.safaribookings.com/lib/keny...,530.0,Enjoy thrilling 3 days Masai Mara Safari start...
...,...,...,...,...
98,Lamu Island Paradise Retreat,https://themajlisresorts.com/wp-content/upload...,293.0,Escape to a secluded paradise on Lamu Island a...
99,Lamu Sailing Expedition,https://images.squarespace-cdn.com/content/v1/...,298.0,Embark on a sailing adventure in Lamu's turquo...
100,Kilifi Beach Getaway,https://images.squarespace-cdn.com/content/v1/...,270.0,Relax on the stunning beaches of Kilifi and im...
101,Kilifi Coastal Safari,https://shanzubeachfront.com/wp-content/upload...,270.0,Embark on a coastal safari adventure in Kilifi...


# Calculates the cosine similarity between the recommended experiences and the new tours based on their descriptions using spaCy's Word2Vec embeddings. The tours are then ranked by similarity, and the top 12 recommendations are selected.

In [137]:
# Load spaCy model with Word2Vec embeddings
nlp = spacy.load("en_core_web_md")

# Function to get document vectors using spaCy
def get_doc_vector(text):
    return nlp(text).vector

# Extract descriptions from the recommended experiences and tours_data
recommended_descriptions = recommended_experiences_df['description'].tolist()
new_tours_descriptions = tours_data['description'].tolist()

# Get document vectors for recommended experiences and new tours
recommended_vectors = np.array([get_doc_vector(desc) for desc in recommended_descriptions])
new_tours_vectors = np.array([get_doc_vector(desc) for desc in new_tours_descriptions])

# Calculate cosine similarity between recommended experiences and new tours
similarity_matrix = cosine_similarity(recommended_vectors, new_tours_vectors)

# Sum the similarity scores for each new tour
total_similarity_scores = similarity_matrix.sum(axis=0)

# Add a new column to tours_data with the total similarity scores
tours_data['similarity_score'] = total_similarity_scores

# Sort tours_data based on the similarity scores
sorted_tours_data = tours_data.sort_values(by='similarity_score', ascending=False)

# Select the top 12 tours as recommendations
top_12_recommendations = sorted_tours_data.head(12)

# Display the top 12 recommendations
top_12_recommendations[['name', 'price', 'description']]

Unnamed: 0,name,price,description
49,Lake Naivasha and Masai Mara Safari (Mid-Range),784.0,If you are looking for the perfect retreat wit...
67,"Nairobi Park, Shedrick's Centre and Carnivore",252.0,This is a short safari tour of the only park w...
40,Amboseli National Park Mid Range Safari Tour,565.0,Amboseli National Park is one of the most spec...
6,Great Migration in Masai Mara & Lake Nakuru Sa...,550.0,This safari tour is everything that you have w...
62,Maasai Mara and Diani Beach Luxury Safari,2185.0,This safari gives you the lifetime opportunity...
63,Best of Safari Tanzania & Kenya Complete,5455.0,This trip allows you to visit the highlights o...
64,Best of Safari Tanzania & Kenya Complete,4130.0,This trip allows you to visit the highlights o...
36,Safari (Including Masai Mara) & Zanzibar Exten...,2250.0,This is a 6-day amazing safari with the best o...
22,"Amboseli NP, Lake Naivasha & Maasai Mara Mid-R...",1518.0,"This Safari will take you to the ""Land of Gian..."
25,Maasai Mara and Nakuru,1390.0,This tour takes you to the famous maasai mara ...
