In [2]:
from google.colab import files
uploaded = files.upload()

Saving dataset_cognifyz.csv to dataset_cognifyz.csv


In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from geopy.distance import geodesic
from scipy.spatial.distance import euclidean
df = pd.read_csv('dataset_cognifyz.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9551 entries, 0 to 9550
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Restaurant ID         9551 non-null   int64  
 1   Restaurant Name       9551 non-null   object 
 2   Country Code          9551 non-null   int64  
 3   City                  9551 non-null   object 
 4   Address               9551 non-null   object 
 5   Locality              9551 non-null   object 
 6   Locality Verbose      9551 non-null   object 
 7   Longitude             9551 non-null   float64
 8   Latitude              9551 non-null   float64
 9   Cuisines              9542 non-null   object 
 10  Average Cost for two  9551 non-null   int64  
 11  Currency              9551 non-null   object 
 12  Has Table booking     9551 non-null   object 
 13  Has Online delivery   9551 non-null   object 
 14  Is delivering now     9551 non-null   object 
 15  Switch to order menu 

In [4]:
df.columns

Index(['Restaurant ID', 'Restaurant Name', 'Country Code', 'City', 'Address',
       'Locality', 'Locality Verbose', 'Longitude', 'Latitude', 'Cuisines',
       'Average Cost for two', 'Currency', 'Has Table booking',
       'Has Online delivery', 'Is delivering now', 'Switch to order menu',
       'Price range', 'Aggregate rating', 'Rating color', 'Rating text',
       'Votes'],
      dtype='object')

In [5]:
#Data cleaning
columns_to_drop = ['Country Code', 'City', 'Address', 'Locality', 'Locality Verbose', 'Currency', 'Has Table booking', 'Has Online delivery', 'Is delivering now', 'Switch to order menu', 'Rating color', 'Rating text']
df = df.drop(columns=columns_to_drop)
df.head()

Unnamed: 0,Restaurant ID,Restaurant Name,Longitude,Latitude,Cuisines,Average Cost for two,Price range,Aggregate rating,Votes
0,6317637,Le Petit Souffle,121.027535,14.565443,"French, Japanese, Desserts",1100,3,4.8,314
1,6304287,Izakaya Kikufuji,121.014101,14.553708,Japanese,1200,3,4.5,591
2,6300002,Heat - Edsa Shangri-La,121.056831,14.581404,"Seafood, Asian, Filipino, Indian",4000,4,4.4,270
3,6318506,Ooma,121.056475,14.585318,"Japanese, Sushi",1500,4,4.9,365
4,6314302,Sambo Kojin,121.057508,14.58445,"Japanese, Korean",1500,4,4.8,229


In [6]:
print(df.isnull().sum())  # to check columns with null values

Restaurant ID           0
Restaurant Name         0
Longitude               0
Latitude                0
Cuisines                9
Average Cost for two    0
Price range             0
Aggregate rating        0
Votes                   0
dtype: int64


In [7]:
df = df.dropna(subset=['Cuisines'])  # drop rows with missing values in 'Cuisines' column
print(df.isnull().sum())

Restaurant ID           0
Restaurant Name         0
Longitude               0
Latitude                0
Cuisines                0
Average Cost for two    0
Price range             0
Aggregate rating        0
Votes                   0
dtype: int64


In [8]:
#Duplicating original data
df = df.assign(
    avg_cost_for_two=df['Average Cost for two'],
    price_range=df['Price range'],
    aggregate_rating=df['Aggregate rating'],
    votes=df['Votes']
)

In [9]:
#Normalizing numerical columns
scaler = StandardScaler()
columns_to_scale = ['Average Cost for two', 'Price range', 'Aggregate rating', 'Votes']
df[columns_to_scale] = scaler.fit_transform(df[columns_to_scale])

In [10]:
#Dealing with multiple values in cuisines
df['Cuisines'] = df['Cuisines'].str.split(', ')
df = df.explode('Cuisines')
print(df.head())

   Restaurant ID         Restaurant Name   Longitude   Latitude  Cuisines  \
0        6317637        Le Petit Souffle  121.027535  14.565443    French   
0        6317637        Le Petit Souffle  121.027535  14.565443  Japanese   
0        6317637        Le Petit Souffle  121.027535  14.565443  Desserts   
1        6304287        Izakaya Kikufuji  121.014101  14.553708  Japanese   
2        6300002  Heat - Edsa Shangri-La  121.056831  14.581404   Seafood   

   Average Cost for two  Price range  Aggregate rating     Votes  \
0             -0.006221     1.319726          1.407683  0.365493   
0             -0.006221     1.319726          1.407683  0.365493   
0             -0.006221     1.319726          1.407683  0.365493   
1             -0.000020     1.319726          1.209860  1.009408   
2              0.173592     2.424069          1.143919  0.263210   

   avg_cost_for_two  price_range  aggregate_rating  votes  
0              1100            3               4.8    314  
0       

In [11]:
print(df.isnull().sum())

Restaurant ID           0
Restaurant Name         0
Longitude               0
Latitude                0
Cuisines                0
Average Cost for two    0
Price range             0
Aggregate rating        0
Votes                   0
avg_cost_for_two        0
price_range             0
aggregate_rating        0
votes                   0
dtype: int64


In [12]:
# Extract features from cuisines
vectorizer = TfidfVectorizer()
cuisines_features = vectorizer.fit_transform(df['Cuisines'])

In [13]:
# Calculate distance feature
def calculate_distance(lat1, lon1, lat2, lon2):
    coords_1 = (lat1, lon1)
    coords_2 = (lat2, lon2)
    distance = geodesic(coords_1, coords_2).km
    return distance

df['Distance'] = df.apply(lambda row: calculate_distance(row['Latitude'], row['Longitude'], 0, 0), axis=1)
df['dist']=df['Distance'].copy()

In [14]:
#building reccomendation systems
def transform_user_profile(user_profile, scaler, vectorizer):
    # Transform user profile cuisines to TF-IDF feature space
    user_cuisines = vectorizer.transform([user_profile['Cuisines']])

    # Normalize user profile numerical features
    user_profile_normalized = user_profile.copy()
    user_profile_normalized['Price range'] = scaler.transform([[user_profile_normalized['Price range'], 0, 0, 0]])[0][0]
    user_profile_normalized['Distance'] = scaler.transform([[0, 0, 0, user_profile_normalized['Distance']]])[0][3]

    return user_profile_normalized, user_cuisines


def calculate_combined_similarities(user_profile_normalized, user_cuisines, df, cuisines_features):
    # Calculate cosine similarity for cuisines
    cuisines_similarity = cosine_similarity(cuisines_features, user_cuisines).flatten()

    # Calculate numerical features similarity using Euclidean distance
    numerical_features_data = df[['Price range', 'Distance']]
    user_numerical_features = np.array([[user_profile_normalized['Price range'], user_profile_normalized['Distance']]])
    numerical_distance = np.apply_along_axis(lambda x: euclidean(x, user_numerical_features[0]), 1, numerical_features_data.values)
    numerical_similarity = 1 / (1 + numerical_distance)  # Convert distance to similarity

    return cuisines_similarity, numerical_similarity


def generate_recommendations(cuisines_similarity, numerical_similarity, df, top_n=10):
    combined_similarity = (cuisines_similarity + numerical_similarity) / 2
    df['Similarity'] = combined_similarity

    df_sorted = df.sort_values(by='Similarity', ascending=False)

    top_recommendations = df_sorted.head(top_n)
    return top_recommendations

In [15]:
# Declaring a sample user profile
user_profile = {
    'Price range': 4,  # example value
    'Cuisines': 'French, Japanese, North Indian',  # example value
    'Distance': 13350  # example value in kilometers
}

# Transforming user profile
user_profile_normalized, user_cuisines = transform_user_profile(user_profile, scaler, vectorizer)

# Calculating similarities
cuisines_similarity, numerical_similarity = calculate_combined_similarities(user_profile_normalized, user_cuisines, df, cuisines_features)

# Generating recommendations
top_recommendations = generate_recommendations(cuisines_similarity, numerical_similarity, df, top_n=10)

# Displaying top recommendations
print(top_recommendations[['Restaurant Name', 'Cuisines', 'avg_cost_for_two', 'aggregate_rating','votes','dist','Similarity']])



                              Restaurant Name Cuisines  avg_cost_for_two  \
1145                                 La Plage   French               800   
4812                     Chez Jerome - Q Cafe   French              1500   
4993                          Tokyo Mon Amour   French              2200   
9484           Restaurant Mosaic @ The Orient   French              3210   
22                                Chez Michou   French                55   
70                          Paris 6 Classique   French               200   
67                            Les 3 Brasseurs   French               120   
9384                 Restaurant Gordon Ramsay   French               230   
9416  The French by Simon Rogan - The Midland   French               160   
9361                              The Kitchin   French                90   

      aggregate_rating  votes         dist  Similarity  
1145               4.6    302     0.000000    0.382019  
4812               0.0      1     0.000000    0.3