In [175]:
import numpy as np
import pandas as pd
import joblib

In [176]:
df = pd.read_csv('Data_Restaurants.csv', sep = '|')

In [177]:
# drop unnecessary columns
df = df.drop(['Numero de telephone'], axis=1)


In [178]:
df.duplicated().sum()
df.drop_duplicates(inplace=True)

In [179]:
df['Specialite'] = df['Specialite'].str.lower()
df['Specialite'] = df['Specialite'].str.replace(' ', '_')
df['Region'] = df['Region'].str.lower()
df['Region'] = df['Region'].str.replace(' ', '_')
df['Nom du restaurant'] = df['Nom du restaurant'].str.lower()
df['Nom du restaurant'] = df['Nom du restaurant'].str.replace(' ', '_')

In [180]:
# remove punctuation
df['Specialite'] = df['Specialite'].str.replace('[^\w\s]','')
df['Region'] = df['Region'].str.replace('[^\w\s]','')
df['Nom du restaurant'] = df['Nom du restaurant'].str.replace('[^\w\s]','')

  df['Specialite'] = df['Specialite'].str.replace('[^\w\s]','')
  df['Region'] = df['Region'].str.replace('[^\w\s]','')
  df['Nom du restaurant'] = df['Nom du restaurant'].str.replace('[^\w\s]','')


In [181]:
#remove stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('french')
df['Specialite'] = df['Specialite'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
df['Region'] = df['Region'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
df['Nom du restaurant'] = df['Nom du restaurant'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nostr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [182]:
# countvectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# linear kernel
from sklearn.metrics.pairwise import linear_kernel


In [183]:
df_percent = df.sample(frac=0.2)

In [184]:
df_percent.set_index('Nom du restaurant', inplace=True)
indices = pd.Series(df_percent.index)

# Creating tf-idf matrix
tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0, stop_words='english')
tfidf_matrix = tfidf.fit_transform(df_percent['Specialite'])

cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

In [185]:
joblib.dump(cosine_similarities, 'recommend_name_restaurant.pkl')

['recommend_name_restaurant.pkl']

In [186]:
def recommend_restaurant(name):
    name = name.lower()
    name = name.replace(' ', '_')
    name = name.replace('[^\w\s]','')
    words = name.split()
    # Remove stop words
    filtered_words = [word for word in words if word not in stopwords.words('french')]
    # Join the filtered words back into a sentence
    name = ' '.join(filtered_words)
    indices = pd.Series(df_percent.index)
    cosine_similarities = joblib.load('recommend_name_restaurant.pkl')
    # Create a list to put top restaurants
    recommend_restaurant = []
    
    # Find the index of the hotel entered
    try:
        idx = indices[indices == name].index[0]
    except:
        print('Restaurant non trouvé')
        return None
    
    # Find the restaurants with a similar cosine-sim value and order them from bigges number
    score_series = pd.Series(cosine_similarities[idx]).sort_values(ascending=False)
    
    # Extract top 30 restaurant indexes with a similar cosine-sim value
    top30_indexes = list(score_series.iloc[0:31].index)
# Names of the top 30 restaurants
    for each in top30_indexes:
        recommend_restaurant.append(list(df_percent.index)[each])
    
    # Creating the new data set to show similar restaurants
    df_new = pd.DataFrame(columns=['Specialite', 'Note moyenne', 'Indicateur de prix','Ville','Departement','Region'])
    
    # Create the top 30 similar restaurants with some of their columns
    for each in recommend_restaurant:

        df_new = df_new.append(pd.DataFrame(df_percent[['Specialite','Note moyenne', 'Indicateur de prix','Ville','Departement','Region']][df_percent.index == each].sample()))
        
    # Drop the same named restaurants and sort only the top 10 by the highest rating
    df_new = df_new.drop_duplicates(subset=['Specialite','Note moyenne', 'Indicateur de prix','Ville','Departement','Region'], keep=False)
    df_new = df_new.sort_values(by='Note moyenne', ascending=False)
    
    return df_new


recommend_restaurant('mcdonalds')

  df_new = df_new.append(pd.DataFrame(df_percent[['Specialite','Note moyenne', 'Indicateur de prix','Ville','Departement','Region']][df_percent.index == each].sample()))
  df_new = df_new.append(pd.DataFrame(df_percent[['Specialite','Note moyenne', 'Indicateur de prix','Ville','Departement','Region']][df_percent.index == each].sample()))
  df_new = df_new.append(pd.DataFrame(df_percent[['Specialite','Note moyenne', 'Indicateur de prix','Ville','Departement','Region']][df_percent.index == each].sample()))
  df_new = df_new.append(pd.DataFrame(df_percent[['Specialite','Note moyenne', 'Indicateur de prix','Ville','Departement','Region']][df_percent.index == each].sample()))
  df_new = df_new.append(pd.DataFrame(df_percent[['Specialite','Note moyenne', 'Indicateur de prix','Ville','Departement','Region']][df_percent.index == each].sample()))
  df_new = df_new.append(pd.DataFrame(df_percent[['Specialite','Note moyenne', 'Indicateur de prix','Ville','Departement','Region']][df_percent.index 

Unnamed: 0,Specialite,Note moyenne,Indicateur de prix,Ville,Departement,Region
mc_donalds,fast_food,5.0,€,Bures-sur-Yvette,91,îledefrance
mc_donalds,fast_food,5.0,€,Forges-les-Bains,91,îledefrance
ypres_burger,fast_food_burgers,4.5,€,Bailleul,59,hautsdefrance
mc_donalds,fast_food,4.5,Unknown,Kingersheim,68,grand_est
mcdonalds,burgers_fast_food,4.0,€€,Levroux,36,centreval_de_loire
mcdonalds,burgers_fast_food,4.0,€€€€,Courcelles-sur-Seine,27,normandie
mcdonalds,fast_food_burgers,3.5,€€€€,Lespignan,34,occitanie
mcdonalds,fast_food_burgers,3.5,€€,Magescq,40,nouvelleaquitaine
mcdonalds,fast_food_burgers,3.5,€€,Le Plessis-Belleville,60,hautsdefrance
mcdonalds,fast_food_burgers,3.0,Unknown,Les Mées,4,provencealpescôte_dazur


In [187]:
def recommend_region(name, region=None):
    name = name.lower()
    name = name.replace(' ', '_')
    name = name.replace('[^\w\s]','')
    words = name.split()
    # Remove stop words
    filtered_words = [word for word in words if word not in stopwords.words('french')]
    # Join the filtered words back into a sentence
    name = ' '.join(filtered_words)
    indices = pd.Series(df_percent.index)
    cosine_similarities = joblib.load('recommend_name_restaurant.pkl')
    # Create a list to put top restaurants
    recommend_restaurant = []
    
    try:
        # Find the index of the restaurant entered
        idx = indices[indices == name].index[0]
    except:
        print('Restaurant non trouvé')
        return None
    
    # Find the restaurants with a similar cosine-sim value and order them from biggest number
    score_series = pd.Series(cosine_similarities[idx]).sort_values(ascending=False)
    
    # Extract top 30 restaurant indexes with a similar cosine-sim value
    top30_indexes = list(score_series.iloc[0:31].index)
    # Names of the top 30 restaurants
    for each in top30_indexes:
        recommend_restaurant.append(list(df_percent.index)[each])
    
    # Filter dataframe by region if specified
    if region:
        df_new = df_percent[df_percent['Region'] == region].copy()
    else:
        df_new = df_percent.copy()
    
    # Create the top 30 similar restaurants with some of their columns
    df_new = df_new[df_new.index.isin(recommend_restaurant)][['Specialite', 'Note moyenne', 'Indicateur de prix','Ville','Departement','Region']]
        
    # Drop the same named restaurants and sort only the top 10 by the highest rating
    df_new = df_new.drop_duplicates(subset=['Specialite','Note moyenne', 'Indicateur de prix','Ville','Departement','Region'], keep=False)
    df_new = df_new.sort_values(by='Note moyenne', ascending=False)
    
    return df_new
  


In [188]:
recommend_region('McDonalds', region='îledefrance')


Unnamed: 0_level_0,Specialite,Note moyenne,Indicateur de prix,Ville,Departement,Region
Nom du restaurant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
mc_donalds,fast_food,5.0,€,Forges-les-Bains,91,îledefrance
mc_donalds,fast_food,5.0,€,Gometz-le-Châtel,91,îledefrance
mc_donalds,fast_food_burgers,5.0,Unknown,Gometz-le-Châtel,91,îledefrance
mc_donalds,fast_food,5.0,€,Gif-sur-Yvette,91,îledefrance
mc_donalds,fast_food_burgers,5.0,Unknown,Les Molières,91,îledefrance
mc_donalds,fast_food,5.0,€,Bures-sur-Yvette,91,îledefrance
quick,fast_food,5.0,Unknown,Beauchamp,95,îledefrance
quick,fast_food,5.0,Unknown,Franconville,95,îledefrance
mcdonalds,fast_food_burgers,4.5,€,Marcoussis,91,îledefrance
mcdonalds,fast_food_burgers,4.5,€,Fontenay-lès-Briis,91,îledefrance


In [189]:
def recommend_departement(name, departement=None):
    name = name.lower()
    name = name.replace(' ', '_')
    name = name.replace('[^\w\s]','')
    words = name.split()
    # Remove stop words
    filtered_words = [word for word in words if word not in stopwords.words('french')]
    # Join the filtered words back into a sentence
    name = ' '.join(filtered_words)
    indices = pd.Series(df_percent.index)
    cosine_similarities = joblib.load('recommend_name_restaurant.pkl')
    # Create a list to put top restaurants
    recommend_restaurant = []
    
    try:
        # Find the index of the restaurant entered
        idx = indices[indices == name].index[0]
    except:
        print('Restaurant non trouvé')
        return None
    
    # Find the restaurants with a similar cosine-sim value and order them from biggest number
    score_series = pd.Series(cosine_similarities[idx]).sort_values(ascending=False)
    
    # Extract top 30 restaurant indexes with a similar cosine-sim value
    top30_indexes = list(score_series.iloc[0:31].index)
    # Names of the top 30 restaurants
    for each in top30_indexes:
        recommend_restaurant.append(list(df_percent.index)[each])
    
    # Filter dataframe by department if specified
    if departement is not None:
        df_new = df_percent[df_percent['Departement'] == departement].copy()
    else:
        df_new = df_percent.copy()
    
    # Create the top 30 similar restaurants with some of their columns
    df_new = df_new[df_new.index.isin(recommend_restaurant)][['Specialite', 'Note moyenne', 'Indicateur de prix','Ville','Departement','Region']]
        
    # Drop the same named restaurants and sort only the top 10 by the highest rating
    df_new = df_new.drop_duplicates(subset=['Specialite','Note moyenne', 'Indicateur de prix','Ville','Departement','Region'], keep=False)
    df_new = df_new.sort_values(by='Note moyenne', ascending=False)
    
    return df_new


In [190]:
recommend_departement('mcdonalds', departement=78)


Unnamed: 0_level_0,Specialite,Note moyenne,Indicateur de prix,Ville,Departement,Region
Nom du restaurant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1


In [191]:
usr_input = input("type a restaurant name:")

import re
from difflib import SequenceMatcher

def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

#function to find between each word in the name of the restaurant and the name of the restaurant in the database
def find_resto(word):
    for i in df_percent.index:
        # check similarity between the word and the name of the restaurant
        similarity = similar(i, word)
        if similarity > 0.9:
            return i
            
def find_region(word):
    for i in df_percent['Region'].unique():
        # check similarity between the word and the name of the region
        similarity = similar(i, word)
        if similarity > 0.9:
            return i
        
def find_departement(word):
    pattern = r'\d+'
    if re.findall(pattern, word):
        for i in df_percent['Departement'].unique():
            # check similarity between the word and the name of the region
            if word == str(i):
                return i
            
def is_resto(word):
    for i in df_percent.index:
        # check similarity between the word and the name of the restaurant
        similarity = similar(i, word)
        if similarity > 0.9:
            return True
            
    return False
            
def is_region(word):
    for i in df_percent['Region'].unique():
        # check similarity between the word and the name of the region
        similarity = similar(i, word)
        if similarity > 0.9:
            return True

    return False

def is_departement(word):
    pattern = r'\d+'
    if re.findall(pattern, word):
        for i in df_percent['Departement'].unique():
            # check similarity between the word and the name of the region
            if word == str(i):
                return True
            
    return False

def print_resto(df,max=1000000):
    if len(df) == 0:
        return "Je n'ai pas trouvé de restaurant correspondant à votre demande"
    else:
        finalAnswer = str("Voici les restaurants que je vous recommande : \n")
        if len(df) < max:
            for i in range(len(df)):
                resto_string = str(i)+" : " +str('Spécialité : '+df.iloc[i]['Specialite']) + ' - Note (/5)' + str(df.iloc[i]['Note moyenne']) + ' - Prix : ' + str(df.iloc[i]['Indicateur de prix']) + ' - Ville : ' + str(df.iloc[i]['Ville']) + ' - Departement : ' + str(df.iloc[i]['Departement']) + ' - Région : ' + str(df.iloc[i]['Region']) 
                finalAnswer += resto_string + "\n"
        else:
            for i in range(max):
                resto_string = str(i)+" : " +str('Spécialité : '+df.iloc[i]['Specialite']) + ' - Note (/5)' + str(df.iloc[i]['Note moyenne']) + ' - Prix : ' + str(df.iloc[i]['Indicateur de prix']) + ' - Ville : ' + str(df.iloc[i]['Ville']) + ' - Departement : ' + str(df.iloc[i]['Departement']) + ' - Région : ' + str(df.iloc[i]['Region']) 
                finalAnswer += resto_string + "\n"

    return finalAnswer

def word_is_max(word):
    if  "max" in word:
        return True
    else:
        return False


def generate_response(user_input):
    restoFind = None
    regionFind = None
    departementFind = None
    max = 1000000
    for word in user_input.split():
            if word_is_max(word):
                max = int(word[4:])
            if is_resto(word):
                restoFind = find_resto(word)
            if is_region(word):
                regionFind = find_region(word)
            if is_departement(word):
                departementFind = find_departement(word)

            print(restoFind, regionFind, departementFind)

    if restoFind != None:
        if restoFind != None:
            print("region")
            print(print_resto(recommend_region(restoFind, regionFind),max))
        elif departementFind != None:
            print("departement")
            print(print_resto(recommend_departement(restoFind, departementFind),max))
        else:
            print("resto")
            print(print_resto(recommend_restaurant(restoFind),max))
    else:
        print("Je n'ai pas compris votre demande, veuillez réessayer")

generate_response(usr_input)


mcdonalds None None
mcdonalds None None
region
Voici les restaurants que je vous recommande : 
0 : Spécialité : burgers_fast_food_coffee__tea - Note (/5)5.0 - Prix : $ - Ville : Calais - Departement : 62 - Région : hautsdefrance
1 : Spécialité : fast_food_burgers - Note (/5)5.0 - Prix : Unknown - Ville : Gometz-le-Châtel - Departement : 91 - Région : îledefrance
2 : Spécialité : fast_food - Note (/5)5.0 - Prix : Unknown - Ville : Franconville - Departement : 95 - Région : îledefrance



: 