## Importing all necessary libraries

In [14]:
import os
import pandas as pd
import time
import json
import re
import numpy as np
from collections import defaultdict

# Importing our modules
import MyFunctions.crawler as crawler
import MyFunctions.parser as parser

import MyFunctions.utilities as utilities
import MyFunctions.engine as engine
import MyFunctions.scoring as scoring
import MyFunctions.geography as geography


import ipywidgets as widgets
from IPython.display import display
import ast
import math 
from geopy.geocoders import Nominatim


import folium
import random

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/yarayoussef/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yarayoussef/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/yarayoussef/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


# 1. Data Collection
This part involves creating a custom dataset for a search engine to process text documents.
The first step is gathering raw data by scraping the Michelin Guide's list of restaurants in Italy. The modular design of the code ensures efficient and manageable development. 
One key module in this process is the crawler, responsible for retrieving and storing web pages for further analysis.

## 1.1-1.2 Crawling the website to get all the listed restaurants
The crawler handles downloading the HTML content of the restaurant pages from the Michelin Guide website. It is implemented in the crawler.py module, and imported in this main

In [None]:
crawler.begin()

['https://guide.michelin.com/en/restaurants', 'https://guide.michelin.com/en/hotels', 'https://guide.michelin.com/en/articles', 'https://guide.michelin.com/en/restaurantlist', 'https://guide.michelin.com/en/restaurants', 'https://guide.michelin.com/en/hotels', 'https://guide.michelin.com/en/best-of', 'https://guide.michelin.com/en/articles', 'https://guide.michelin.com/en/restaurantlist', 'https://guide.michelin.comhttps://intercom.help/michelin-guide-contact-us/en/', 'https://guide.michelin.com/en/subscribe', 'https://guide.michelin.comhttps://guide.michelin.com/en/take-advantage-of-the-privileged-partnership-between-the-michelin-guide-thefork-and-tripadvisor', 'https://guide.michelin.comhttps://www.theforkmanager.com/partnership-thefork-michelin#cc=michelin-referral', 'https://guide.michelin.com#', 'https://guide.michelin.com/en/campania/gragnano/restaurant/o-me-o-il-mare', 'https://guide.michelin.com/en/abruzzo/popoli_1845563/restaurant/donevandro', 'https://guide.michelin.com/en/pi

## 1.3 Parsing html files and extracting info and create dataset
Once the HTML files have been successfully crawled and stored, the next step is to extract relevant information from each restaurant’s page. The parser module processes each HTML file, extracting key details to construct a structured dataset. The parser.py module reads the saved HTML files, parsing the content to find specific data points.
After extracting the data from each restaurant’s page, the information is organized and saved in a structured dataframe format

In [13]:
restaurant_data = []
folder_path ='restaurants_html'
counter = 1

# Iterate on directories and files
for root, dirs, files in os.walk(folder_path):
    for html_file in files:
        file_path = os.path.join(root, html_file)
        # open each html file and read it
        with open(file_path, 'r', encoding='utf-8') as file:
            html_content = file.read()
            # extract information using parser module
            restaurant_info = parser.extract_restaurant_info(html_content)
            # storing dictionary of restaurant in list
            restaurant_data.append(restaurant_info)
            # visualisation of correct execution of code
            print(f"Extracted info for {restaurant_info['restaurantName']} number {counter}")
            counter += 1
# Convert list of dictionaries to dataframe creating the dataset
df_restaurants = pd.DataFrame(restaurant_data)

Extracted info for L'Acciuga number 1
Extracted info for Jamantè number 2
Extracted info for Cucine Nervi number 3
Extracted info for Lokanda Devetak number 4
Extracted info for Felix Lo Basso home & restaurant number 5
Extracted info for Da Carla number 6
Extracted info for Antiche Sere number 7
Extracted info for Aria number 8
Extracted info for Olio number 9
Extracted info for Gimmi Restaurant number 10
Extracted info for Dry Aged number 11
Extracted info for Specus number 12
Extracted info for Rada Rooftop number 13
Extracted info for Hydra number 14
Extracted info for Hosteria Toblino number 15
Extracted info for Locanda Marchesani number 16
Extracted info for Quattro Passi number 17
Extracted info for Locanda La Raia number 18
Extracted info for Filo number 19
Extracted info for Santa Elisabetta number 20
Extracted info for Cala Luna number 21
Extracted info for L'Argaj number 22
Extracted info for Anastasia number 23
Extracted info for La Conchiglia number 24
Extracted info for 

Visualize dataframe

In [None]:
display(df_restaurants)

Unnamed: 0,restaurantName,address,city,postalCode,country,priceRange,cuisineType,description,facilitiesServices,creditCards,phoneNumber,website
0,L'Acciuga,via Settevalli 217,Perugia,06128,Italy,€€€,"Contemporary,International",You would never guess that there was a gourmet...,"[Air conditioning, Interesting wine list, Terr...","[Amex, Unionpay, Dinersclub, Discover, Jcb, Ma...",+39 339 263 2591,https://www.lacciuga.net/
1,Jamantè,via San Vito 97,Polignano a Mare,70044,Italy,€€€,ModernCuisine,Located not far from the historic centre and t...,"[Air conditioning, Terrace]","[Amex, Dinersclub, Mastercard, Visa]",+39 351 628 7773,https://www.jamanteristorante.com
2,Cucine Nervi,corso Vercelli 117,Gattinara,13045,Italy,€€€,ModernCuisine,Housed within the Cantine Nervi in the town ce...,[Air conditioning],"[Mastercard, Visa]",+39 333 182 4123,https://cucinenervi.com
3,Lokanda Devetak,via Brezici 22,Savogna d'Isonzo,34070,Italy,€€,"RegionalCuisine,TraditionalCuisine",Just two kilometres from the Slovenian border ...,"[Air conditioning, Car park, Garden or park, I...","[Amex, Mastercard, Visa]",+39 0481 882488,https://www.devetak.com/
4,Felix Lo Basso home & restaurant,via Carlo Goldoni 36,Milan,20129,Italy,€€€€,"ItalianContemporary,Creative",Brilliant chef Felix Lo Basso’s menu is inspir...,"[Air conditioning, Counter dining, Wheelchair ...","[Amex, Mastercard, Visa]",+39 02 4540 9759,https://www.felixlobassorestaurant.it/
...,...,...,...,...,...,...,...,...,...,...,...,...
1978,Kleine Flamme,via Cittanuova 31,Vipiteno,39049,Italy,€€€,"Fusion,Creative",In his small open-view kitchen situated in the...,[Terrace],"[Mastercard, Visa]",+39 0472 766065,https://kleineflamme.com/it/
1979,Antica Locanda al Cervo - Landgasthof zum Hirs...,via Schrann 9/c,San Genesio Atesino,39050,Italy,€€,"RegionalCuisine,TraditionalCuisine","Enjoy generous, regional cuisine influenced by...","[Car park, Garden or park, Great view, Terrace...","[Mastercard, Visa]",+39 0471 354195,https://www.hirschenwirt.it/
1980,Lapprodo,via Roma 22,Vibo Valentia Marina,89900,Italy,€€€,"Seafood,ClassicCuisine",Definitely one of the most reliable restaurant...,"[Air conditioning, Terrace, Wheelchair access]","[Amex, Unionpay, Dinersclub, Discover, Maestro...",+39 0963 572640,https://www.lapprodo.com/
1981,MikEle,via Flavio Gioia 1,Maranello,41053,Italy,€€€,Seafood,Located in a residential district on the edge ...,[Air conditioning],"[Amex, Dinersclub, Mastercard, Visa]",+39 0536 941027,https://www.ristorantemikele.com/


Make sure there are no null values

In [None]:
print(df_restaurants.isna().sum())  

restaurantName        0
address               0
city                  0
postalCode            0
country               0
priceRange            0
cuisineType           0
description           0
facilitiesServices    0
creditCards           0
phoneNumber           0
website               0
dtype: int64


In [16]:
# Save dataframe to csv file to avoid re-running the parser everytime
df_restaurants.to_csv('df_restaurants.csv', index=False)

In [2]:
# Load the data from the csv file
df_restaurants = pd.read_csv('df_restaurants.csv')

Checking shape of df

In [None]:
print(df_restaurants.shape)

(1983, 12)


This code saves each restaurant's information, stored in a pandas DataFrame, into a tab-separated values (TSV) file for easy storage

In [None]:
for i, row in df_restaurants.iterrows():
    # Define the file name using the index
    file_name = f"restaurant_{i}.tsv"

    # Prepare row data as a single line with tab-separated values
    content =  f"{row['restaurantName']}\t{row['address']}\t{row['city']}\t{row['postalCode']}\t{row['country']}\t{row['priceRange']}\t{row['cuisineType']}\t{row['description']}\t{row['facilitiesServices']}\t{row['creditCards']}\t{row['phoneNumber']}\t{row['website']}\n"

    subfolder = f"files_tsv"
    file_path = os.path.join(subfolder, file_name)

    # Check if the subfolder exists, create it if it doesn't
    if not os.path.exists(subfolder):
        os.makedirs(subfolder)
    # Write the row data to the .tsv file
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(content)

    print(f"Created file: {file_name}")

Created file: restaurant_0.tsv
Created file: restaurant_1.tsv
Created file: restaurant_2.tsv
Created file: restaurant_3.tsv
Created file: restaurant_4.tsv
Created file: restaurant_5.tsv
Created file: restaurant_6.tsv
Created file: restaurant_7.tsv
Created file: restaurant_8.tsv
Created file: restaurant_9.tsv
Created file: restaurant_10.tsv
Created file: restaurant_11.tsv
Created file: restaurant_12.tsv
Created file: restaurant_13.tsv
Created file: restaurant_14.tsv
Created file: restaurant_15.tsv
Created file: restaurant_16.tsv
Created file: restaurant_17.tsv
Created file: restaurant_18.tsv
Created file: restaurant_19.tsv
Created file: restaurant_20.tsv
Created file: restaurant_21.tsv
Created file: restaurant_22.tsv
Created file: restaurant_23.tsv
Created file: restaurant_24.tsv
Created file: restaurant_25.tsv
Created file: restaurant_26.tsv
Created file: restaurant_27.tsv
Created file: restaurant_28.tsv
Created file: restaurant_29.tsv
Created file: restaurant_30.tsv
Created file: rest

# 2. Search engines
This project implements two types of search engines to retrieve restaurant information based on user queries:

1. Conjunctive Search Engine: This search engine returns restaurants where all query terms appear in the restaurant description. It performs an AND search, requiring each term to be present.
2. Ranked Search Engine: This search engine returns the top-k restaurants sorted by their similarity to the query, using TF-IDF (Term Frequency-Inverse Document Frequency) and Cosine Similarity to measure relevance.

These search engines enable efficient querying of restaurant data, offering both exact and ranked search results based on user input.

## 2.0 Preprocessing the text
We must clean and preprocess each restaurant's description to ensure accurate search results, using the nltk (Natural Language Toolkit) library to perform these tasks. 
In order to do that we create a ***clean_desc()*** function in the utilities.py file, this function takes the description of the dataset and cleans it applying the following steps:
1. Remove stopwords
2. Lower casing
3. Remove special characters
4. Remove punctuations
5. Apply stemming 
6. Remove extra spaces

We apply this function to the description column of the df, creating a new cleaned description column

In [3]:
# apply function to description column 
df_restaurants['cleaned_desc'] = df_restaurants['description'].apply(utilities.clean_desc)

Visualize new df

In [4]:
df_restaurants

Unnamed: 0,restaurantName,address,city,postalCode,country,priceRange,cuisineType,description,facilitiesServices,creditCards,phoneNumber,website,cleaned_desc
0,L'Acciuga,via Settevalli 217,Perugia,6128,Italy,€€€,"Contemporary,International",You would never guess that there was a gourmet...,"['Air conditioning', 'Interesting wine list', ...","['Amex', 'Unionpay', 'Dinersclub', 'Discover',...",+39 339 263 2591,https://www.lacciuga.net/,would never guess gourmet restaur tuck away co...
1,Jamantè,via San Vito 97,Polignano a Mare,70044,Italy,€€€,ModernCuisine,Located not far from the historic centre and t...,"['Air conditioning', 'Terrace']","['Amex', 'Dinersclub', 'Mastercard', 'Visa']",+39 351 628 7773,https://www.jamanteristorante.com,locat far histor centr picturesqu lama monachi...
2,Cucine Nervi,corso Vercelli 117,Gattinara,13045,Italy,€€€,ModernCuisine,Housed within the Cantine Nervi in the town ce...,['Air conditioning'],"['Mastercard', 'Visa']",+39 333 182 4123,https://cucinenervi.com,hous within cantin nervi town centr contempora...
3,Lokanda Devetak,via Brezici 22,Savogna d'Isonzo,34070,Italy,€€,"RegionalCuisine,TraditionalCuisine",Just two kilometres from the Slovenian border ...,"['Air conditioning', 'Car park', 'Garden or pa...","['Amex', 'Mastercard', 'Visa']",+39 0481 882488,https://www.devetak.com/,two kilometr slovenian border crow fli restaur...
4,Felix Lo Basso home & restaurant,via Carlo Goldoni 36,Milan,20129,Italy,€€€€,"ItalianContemporary,Creative",Brilliant chef Felix Lo Basso’s menu is inspir...,"['Air conditioning', 'Counter dining', 'Wheelc...","['Amex', 'Mastercard', 'Visa']",+39 02 4540 9759,https://www.felixlobassorestaurant.it/,brilliant chef felix lo basso menu inspir nort...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1978,Kleine Flamme,via Cittanuova 31,Vipiteno,39049,Italy,€€€,"Fusion,Creative",In his small open-view kitchen situated in the...,['Terrace'],"['Mastercard', 'Visa']",+39 0472 766065,https://kleineflamme.com/it/,small openview kitchen situat centr restaur ch...
1979,Antica Locanda al Cervo - Landgasthof zum Hirs...,via Schrann 9/c,San Genesio Atesino,39050,Italy,€€,"RegionalCuisine,TraditionalCuisine","Enjoy generous, regional cuisine influenced by...","['Car park', 'Garden or park', 'Great view', '...","['Mastercard', 'Visa']",+39 0471 354195,https://www.hirschenwirt.it/,enjoy gener region cuisin influenc famili invo...
1980,Lapprodo,via Roma 22,Vibo Valentia Marina,89900,Italy,€€€,"Seafood,ClassicCuisine",Definitely one of the most reliable restaurant...,"['Air conditioning', 'Terrace', 'Wheelchair ac...","['Amex', 'Unionpay', 'Dinersclub', 'Discover',...",+39 0963 572640,https://www.lapprodo.com/,definit one reliabl restaur area vibo valentia...
1981,MikEle,via Flavio Gioia 1,Maranello,41053,Italy,€€€,Seafood,Located in a residential district on the edge ...,['Air conditioning'],"['Amex', 'Dinersclub', 'Mastercard', 'Visa']",+39 0536 941027,https://www.ristorantemikele.com/,locat residenti district edg town unexpect ele...


## 2.1 Conjunctive Query
The initial implementation of the search engine focuses exclusively on the description field of each restaurant. It performs a conjunctive search, meaning that only restaurants whose descriptions contain all the query words will be returned.

In [5]:
# Creates a set where all the words of the cleaned description are stored
my_set = set(word for description in df_restaurants['cleaned_desc'] for word in description.split())

A file named **vocabulary.csv** is created, mapping each unique word in the dataset to a unique integer identifier (term_id). This file serves as a reference for the terms used in the search engine.

Inverted Index:
A dictionary is constructed, where each term_id maps to a list of document IDs (restaurant descriptions) in which the term appears.


In [6]:
engine.vocabulary_creator(my_set)
engine.inverted_index_creator(df_restaurants)

Given a user query, the search engine processes the query terms and retrieves restaurants whose descriptions contain all the specified words. This function *find_restaurants* in **utilities.py** calls the **engine.py** conjuctive query function to retreive the restaurants

In [7]:
utilities.find_restaurants("modern seasonal cuisine", df_restaurants)

Unnamed: 0,Restaurant Name,Address,Description,Website
0,Chichibio,via Guglielmo Marconi 1,"Despite its lack of awards, this restaurant st...",
1,Osteria Taviani,piazza Vittorio Emanuele II 28,"This pleasant, warmly decorated restaurant is ...",
2,San Michele,via Castello di Fagagna 33,Situated next to the ruins of the old castle a...,http://sanmichele.restaurant
3,Aprudia,largo del Forno 16,"At this restaurant in the historic centre, whe...",http://www.aprudia.com
4,Esplanade,via Lario 3,"One of Italy’s long-established restaurants, t...",https://www.ristorante-esplanade.com/
5,Vesta Mare,viale Roma 41,"This typical, elegant Versilian beach club wit...",https://vestafiorichiari.com/mare/
6,Cappuccini Cucina San Francesco,via Cappuccini 54,"Housed in the resort of the same name, this el...",https://www.cappuccini.it/
7,Razzo,via Andrea Doria 17/f,"A quiet restaurant with a relaxed, young and m...",https://vadoarazzo.it/
8,Sintesi,viale dei Castani 17,"A modern, welcoming restaurant whose motto “Tr...",http://ristorantesintesi.it
9,Winter Garden Florence,piazza Ognissanti 1,Horse-drawn carriages once entered the old cou...,https://www.wintergardenflorence.com/it/


## 2.2 Ranked Search Engine with TF-IDF and Cosine Similarity
For the second search engine, restaurants are ranked by relevance to the query. The top-k results are retrieved based on their similarity scores. This engine is implemented in the **engine.py** file as the *process_query* function.

To create an inverted Index with TF-IDF Scores:
For each term in a restaurant’s description, calculate the TF-IDF score to measure its importance. Save these scores in a file that maps each term (term_id) and restaurant to its corresponding TF-IDF value, which will be used to rank restaurants by relevance to the query. This is done in the **engine.py** file using the *inverted_tfidf* function

In [22]:
engine.inverted_tfidf(df_restaurants)

The process_query function processes the query, computes Cosine Similarity between the query and restaurant TF-IDF vectors, and returns the top-k ranked results, or all matches if fewer than k have non-zero similarity.

In [None]:
### Test cell ###
results = engine.process_query(utilities.clean_desc('After many years’ experience in Michelin-starred restaurants, Luigi Tramontano and his wife Nicoletta have opened their first restaurant in the chef’s native Gargnano. Previously a pasta factory, the building has been converted into an elegant, contemporary-style restaurant which has nonetheless retained its charming high ceilings. The cuisine is inspired by regional traditions which are reinterpreted to create gourmet dishes, all prepared with respect for the ingredients used and a strong focus on local produce.'),df_restaurants)
for restaurant, score in results:
    print(f'{restaurant}: {score:.4f}')

Reloaded
O Me O Il Mare: 0.6872
La Tortuga: 0.0781
Trattoria del Vicolo: 0.0491
La Pignata: 0.0398
Casa Perrotta Restaurant: 0.0389
La Terrazza: 0.0382
Badessa: 0.0365
Il Ristorantino - Da Dino: 0.0317
Tana de 'l Ors: 0.0312
Ansitz Steinbock: 0.0308
Acciuga: 0.0305
SaQua by Il Frantoio: 0.0299
Zia: 0.0294
Da Odino: 0.0294
Marco Martini Restaurant: 0.0289
VeRo - Venetian Roots: 0.0276
IO Luigi Taglienti: 0.0275
Re Santi e Leoni: 0.0271
Regallo: 0.0260
Casa Vicina: 0.0257
Bis Osteria Italiana Contemporanea: 0.0251
Due Pini: 0.0242
San Tommaso 10: 0.0239
Lazzaro 1915: 0.0226
Gambero Rosso: 0.0218
Trattoria contemporanea: 0.0216
Miil: 0.0212
Il Fuoco Sacro: 0.0211
Altatto Bistrot: 0.0205
Atelier Moessmer Norbert Niederkofler: 0.0199
La Pista: 0.0197
Rossellinis: 0.0191
Luigi Lepore: 0.0187
Collina: 0.0186
Palais Royal Restaurant: 0.0186
Alici: 0.0185
Ratanà: 0.0169
Magazzino 52: 0.0151
Altriménti: 0.0148
Osteria dalla Peppa: 0.0142
Milano: 0.0141
Taverna dello Spuntino: 0.0141
Casa Coloni:

Given a user query, and top-k restaurants this function calls the *process_query* function in **engine.py** file and it returns a dataframe with the required information sorted by the similarity score calculated. 

In [None]:
utilities.find_ranked_restaurants('modern seasonal cuisine', 5, df_restaurants)

Reloaded


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['similarity'] = restaurant[1]


Unnamed: 0,restaurantName,address,description,website,similarity
179,Saur,via Filippo Turati 8,"In a tiny rural village, this contemporary, almost minimalist-style restaurant serves modern cuisine with an emphasis on seasonal, regional produce.",https://ristorantesaur.it,0.250392
997,Piccolo Lord,corso San Maurizio 69 bis/g,"Professional service in a welcoming, modern restaurant run by a young couple. He works in the kitchen while she (having also worked as a chef in the past) runs the front of house. Delicious Mediterranean cuisine with a seasonal focus.",https://www.ristorantepiccololord.it/,0.231623
1160,Razzo,via Andrea Doria 17/f,"A quiet restaurant with a relaxed, young and modern feel serving contemporary cuisine prepared from seasonal, regional products. Charming romantic outdoor area with soft lighting.",https://vadoarazzo.it/,0.230267
1652,La Botte,via Giuseppe Garibaldi 8,"A modern and welcoming contemporary bistro situated in the heart of Stresa’s historic centre. Run by an entire family, the restaurant serves modern and imaginative fish and meat dishes where the focus is always on seasonal ingredients. The interesting wine list also includes a selection of wines by the glass.",http://www.trattorialabottestresa.it,0.225117
1003,Al Vecchio Convento,viale Borri 348,"Ask for a table in the main dining room, with a classic atmosphere and elegant furnishings, to taste the dishes of a cuisine, which is seasonal and mainly Tuscan.",https://www.alvecchioconvento.it/,0.219166


# 3. Define a new score
We enhance the ranking process by incorporating a custom scoring function that evaluates multiple restaurant attributes alongside the description match. 

The scoring function assigns the same weight to the different attributes so that we take them all in consideration equally. 

The ranking function, we search for the matching ones using the 2.1 search engine. Once these descriptions are retrieved we calculate their similarity scores, calculate the actual score based on user preferences, and rank them extracting the top k results. 

In [None]:
# Testing #
user_pref = {
    'query': "modern seasonal cuisine",
    'cuisines': "Contemporary, Seasonal Cuisine",
    'facilities': "Air conditioning"
}

# List to store the links 
urls=[]
# Iterate on each page and saving its content into the html_content variable
for page in range(1,101):
    # get url of each page
    url = first_url + f'page/{page}'
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        html_content = response.text
    else:
        print("Error during download of current page:", response.status_code)

    # Pase the content of each page
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find all the links in the page 
    restaurant_links = soup.find_all("a", class_="link")
    url_base="https://guide.michelin.com"
    page_urls = []
    # Iterate through all links and extract the URL of each link and concatenate it with the base url
    for i in restaurant_links:
        href = i.get("href")
        href= url_base + href
        page_urls.append(href)
    urls.append(page_urls)
urls = [url for page_urls in urls for url in page_urls]


Reloaded


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['similarity'] = restaurant[1]


Unnamed: 0,restaurantName,address,description,website,score
0,Saur,via Filippo Turati 8,"In a tiny rural village, this contemporary, almost minimalist-style restaurant serves modern cuisine with an emphasis on seasonal, regional produce.",https://ristorantesaur.it,0.418848
1,Piccolo Lord,corso San Maurizio 69 bis/g,"Professional service in a welcoming, modern restaurant run by a young couple. He works in the kitchen while she (having also worked as a chef in the past) runs the front of house. Delicious Mediterranean cuisine with a seasonal focus.",https://www.ristorantepiccololord.it/,0.414156
2,Razzo,via Andrea Doria 17/f,"A quiet restaurant with a relaxed, young and modern feel serving contemporary cuisine prepared from seasonal, regional products. Charming romantic outdoor area with soft lighting.",https://vadoarazzo.it/,0.413817
3,La Botte,via Giuseppe Garibaldi 8,"A modern and welcoming contemporary bistro situated in the heart of Stresa’s historic centre. Run by an entire family, the restaurant serves modern and imaginative fish and meat dishes where the focus is always on seasonal ingredients. The interesting wine list also includes a selection of wines by the glass.",http://www.trattorialabottestresa.it,0.412529
4,Al Vecchio Convento,viale Borri 348,"Ask for a table in the main dining room, with a classic atmosphere and elegant furnishings, to taste the dishes of a cuisine, which is seasonal and mainly Tuscan.",https://www.alvecchioconvento.it/,0.336041


In [23]:
# Only keep the restaurants links and make sure there are no duplicates
rest_urls = []
for url in urls:
    if '/restaurant/' in url and url not in rest_urls:
        rest_urls.append(url)

VBox(children=(Text(value='', description='Query:', placeholder='Insert what you are looking for...', style=De…

In [24]:
# A dataset with two new columns, 'lat' and 'lon' 
geo_restaurants_df = pd.merge(df_restaurants, city_coords_unique, left_on='city', right_on='denominazione_ita_altra', how='left')
geo_restaurants_df.shape

(1983, 16)

In [None]:
# Check the presence of duplicated rows 
geo_restaurants_df[geo_restaurants_df['description'].duplicated()]

Unnamed: 0,restaurantName,address,city,postalCode,country,priceRange,cuisineType,description,facilitiesServices,creditCards,phoneNumber,website,region,denominazione_ita_altra,lat,lon


In [25]:
# Dataframe with missing values 
df_restaurants_missing_coor = geo_restaurants_df[geo_restaurants_df['lat'].isna()]

In [None]:
df_restaurants_missing_coor.shape

(643, 16)

There are no duplicted rows in the dataste, so the merge didn't give any problems.
But looking at the column about the latitude, there are still Nan values, for two main reasons:
- some cities in our dataset are wrote in english, while the dataset used is in italian, for this reason there is no match bewteen 'city' and 'denominazione_ita_altra' even if they are about the same city (this happened mostly for bit cities, that have an english version of their name, as Roma and Rome);
- some municipalities may simply be missing due to lack of information in the external dataset.

For this reason that we'll try to find the missing values using the second method, geopy.

### 4.2.2 Using geopy


Geopy is a Python library used for geocoding (converting addresses into latitude/longitude) and reverse geocoding (converting coordinates into addresses).
In this case we used it in order to get the missing coordinates of the dataframe df_restaurants_missing_coor.

A new function is built, get_coordinates, and using 'city' and 'region' will get the latitude and longitude for each missing city. 
In the df with missing coordinates, there are way less 'unique' cities than rows, for this reason the function get_coordinates is applied on the dataframe with duplicated rows for city and region are dropped. The coordinates are stored in a dictionary and it will be use from another function, apply_coordinates.
Its goal is to get the coordinates for a city, and then is used to replace Nan values with new 'lat' and 'lon' values.

In [7]:
# Creating geolocator
geolocator = Nominatim(user_agent="MyRestaurantLocatorApp")

# Appling on a new dataframe with only unique values for the pair city and region
unique_cities = df_restaurants_missing_coor[['city', 'region']].drop_duplicates()

# dictionary where memorize coordinates for each city 
city_coords = {}

# Apply the function get_coordinates for each row 
for index, row in unique_cities.iterrows():
    city = row['city']
    region = row['region']
    lat, lon = geography.get_coordinates(city, region, geolocator)
    if lat is not None and lon is not None:
        city_coords[f"{city}, {region}"] = (lat, lon)
        print(f"Coordinates for {city}, {region} - Latitude: {lat}, Longitude: {lon}")
    else:
        print(f"Could not find coordinates for {city}, {region}")
    time.sleep(random.uniform(1, 2))  # needed to use geopy 

# Applying apply_coordinates at all the dataset 
df_restaurants_missing_coor[['lat', 'lon']] = df_restaurants_missing_coor.apply(
    lambda row: geography.apply_coordinates(row, city_coords=city_coords), axis=1, result_type="expand"
)

Successfully downloaded: mater-terrae.html in restaurants_html/page_1
Successfully downloaded: il-colmetto.html in restaurants_html/page_1
Successfully downloaded: osteria-la-lanterna.html in restaurants_html/page_1
Successfully downloaded: orto.html in restaurants_html/page_1
Successfully downloaded: linfa.html in restaurants_html/page_1
Successfully downloaded: locanda-belvedere.html in restaurants_html/page_1
Successfully downloaded: il-ciabot.html in restaurants_html/page_1
Successfully downloaded: rio-bistrot.html in restaurants_html/page_1
Successfully downloaded: sala-dei-grappoli.html in restaurants_html/page_1
Successfully downloaded: san-baylon.html in restaurants_html/page_1
Successfully downloaded: seta-sushi-restaurant.html in restaurants_html/page_1
Successfully downloaded: da-gelsomina.html in restaurants_html/page_1
Successfully downloaded: locanda-gesu-vecchio.html in restaurants_html/page_1
Successfully downloaded: razzo.html in restaurants_html/page_1
Successfully do

In [None]:
df_restaurants_missing_coor

Unnamed: 0,restaurantName,address,city,postalCode,country,priceRange,cuisineType,description,facilitiesServices,creditCards,phoneNumber,website,region,denominazione_ita_altra,lat,lon
0,20Tre,via David Chiossone 20 r,Genoa,16123,Italy,€€,"Farmtotable,ModernCuisine",Situated in the heart of Genoa’s historic cent...,['Air conditioning'],"['Amex', 'Dinersclub', 'Mastercard', 'Visa']",+39 010 247 6191,https://www.ristorante20tregenova.it/,Liguria,,44.407260,8.933862
1,Alessandro Feo,via Angelo Lista 24,Marina di Casal Velino,84040,Italy,€€,"Campanian,Seafood",In a beautiful stone-vaulted building (an old ...,[],"['Amex', 'Dinersclub', 'Mastercard', 'Visa']",+39 328 893 7083,https://www.alessandrofeoristorante.it/,Campania,,40.176791,15.124097
7,Donevandro,via Garibaldi 2,Popoli,65026,Italy,€€,"Contemporary,SeasonalCuisine","Up until a few years ago, the owner-chef at th...",['Air conditioning'],"['Mastercard', 'Visa']",+39 388 887 6858,http://www.donevandroristorante.it,Abruzzo,,42.171311,13.832817
8,Etra,piazza De Ferrari 4,Genoa,16121,Italy,€€€,"Creative,ItalianContemporary",Etra is an anagram of the Italian word “arte” ...,['Air conditioning'],"['Amex', 'Dinersclub', 'Mastercard', 'Visa']",+39 010 731 6733,https://www.etra.art/,Liguria,,44.407260,8.933862
9,Il Ristorante Alain Ducasse Napoli,Via Cristoforo Colombo 45,Naples,80133,Italy,€€€€,"Creative,MediterraneanCuisine","Alain Ducasse, one of the great names in conte...","['Air conditioning', 'Great view', 'Interestin...","['Amex', 'Dinersclub', 'Discover', 'Maestrocar...",+39 081 604 1580,https://theromeocollection.com/en/romeo-napoli...,Campania,,40.835885,14.248768
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1966,Eggentaler,via Val d'Ega 47,Cardano,39053,Italy,€€,"MeatsandGrills,ClassicCuisine",Renowned for being one of the best places in t...,"['Air conditioning', 'Car park', 'Terrace', 'W...","['Amex', 'Mastercard', 'Visa']",+39 0471 365294,https://www.eggentaler.com/it/ristorante-eggen...,Trentino-South Tyrol,,46.493717,11.393647
1969,Hebbo Wine & Deli,Località Lago di Dobbiaco,Toblach,39034,Italy,€€€€,"Innovative,Contemporary",Situated on a campsite on the shores of Lake D...,"['Car park', 'Interesting wine list', 'Wheelch...","['Mastercard', 'Visa']",+39 0474 869025,https://www.hebbo.it/it/,Trentino-South Tyrol,,46.735425,12.224280
1973,Orma Roma,via Boncompagni 31,Rome,187,Italy,€€€€,"ItalianContemporary,Colombian",An oak door with three steps leads into the di...,"['Air conditioning', 'Interesting wine list', ...","['Amex', 'Dinersclub', 'Mastercard', 'Visa']",+39 06 854 3182,http://www.ormaroma.it,Lazio,,41.893320,12.482932
1978,Shiroya,via dei Baullari 147,Rome,186,Italy,€€,"Japanese,Asian",One of the most popular restaurants in the his...,"['Air conditioning', 'Terrace']","['Amex', 'Mastercard', 'Visa']",+39 06 6476 0753,https://www.shiroya.it,Lazio,,41.893320,12.482932


In [None]:
df_last_missing = df_restaurants_missing_coor[df_restaurants_missing_coor['lat'].isna()]
list(df_last_missing['city'])

['Fiera di Primiero',
 'Pozza di Fassa',
 'Sorni',
 'Fasano del Garda',
 'San Martino di Castrozza',
 'Torbole',
 'Giulianova Lido',
 'Madonna di Campiglio',
 'Barbian',
 'Varena',
 'Madonna di Campiglio',
 'Fasano del Garda',
 'Ravina',
 'Saint Martin in Passeier',
 'Madonna di Campiglio',
 'Madonna di Campiglio',
 'Rocella Jonica',
 'San Pellegrino',
 'Castelnovo di Baganzola',
 'Vicomero di Torrile',
 'Sarche di Madruzzo',
 'Saint Lorenzen',
 'Giulianova Lido',
 'Torbole',
 'Coriano Veronese',
 'Castel Toblino']

After this process, there are about 20 unique cities left that are missing their coordinates. This is a manageable number so we can proceed with the last method, that is, asking an LLM to provide them to us.

### 4.2.3 Asking coordinates to a LLM

The list of missing cities was sent to ChatGTP and was asked to provide the coordinates of these. 

Ones get them, we firstly checked if they were right, looking one by one on Google Maps, after that they were stored in another dictionary in order to replace the Nan values in the last dataframe, df_last_missing. 
We already built a function to apply the coordinates to a column, so we'll use it again. 

In [None]:
# gtp coordinates provided, checked one by one on Google Maps
city_coords_gtp = {
    'Fiera di Primiero, Trentino-South Tyrol': (46.1763, 11.8293),
    'Pozza di Fassa, Trentino-South Tyrol': (46.4299, 11.6858),
    'Sorni, Trentino-South Tyrol': (46.1746, 11.1234),
    'Fasano del Garda, Lombardy': (45.6240, 10.5716),
    'San Martino di Castrozza, Trentino-South Tyrol': (46.2617, 11.8007),
    'Torbole, Trentino-South Tyrol': (45.8711, 10.8759),
    'Giulianova Lido, Abruzzo': (42.7506, 13.9652),
    'Madonna di Campiglio, Trentino-South Tyrol': (46.2304, 10.8271),
    'Barbian, Trentino-South Tyrol': (46.6036, 11.5198),
    'Varena, Trentino-South Tyrol': (46.3052, 11.4579),
    'Ravina, Trentino-South Tyrol': (46.0378, 11.1081),
    'Saint Martin in Passeier, Trentino-South Tyrol': (46.7825, 11.2289),
    'Rocella Jonica, Calabria': (38.3185, 16.3961),
    'San Pellegrino, Trentino-South Tyrol': (46.3788, 11.7856),
    'Castelnovo di Baganzola, Emilia-Romagna': (44.8700, 10.3160),
    'Vicomero di Torrile, Emilia-Romagna': (44.8777, 10.3210),
    'Sarche di Madruzzo, Trentino-South Tyrol': (46.0470, 10.9524),
    'Saint Lorenzen, Trentino-South Tyrol': (46.7828, 11.9020),
    'Coriano Veronese, Veneto': (45.2749, 11.2878),
    'Castel Toblino, Trentino-South Tyrol': (46.0559, 10.9778)
}

# Main folder where to download HTML of each restaurant on each page
main_folder = "restaurants_html"
Path(main_folder).mkdir(exist_ok=True)

# Number of the pages
page_number = 1

# Number of restaurants in current page
restaurant_count = 0
restaurants_per_page = 20 

for url in urls:
    # Define folder for each page
    page_folder = os.path.join(main_folder, f"page_{page_number}")
    Path(page_folder).mkdir(exist_ok=True)

    try:
        # Download the HTML of each restaurant
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            # Name of the restaurant
            filename = url.split("/")[-1] + ".html"
            filepath = os.path.join(page_folder, filename)

            # Save the HTML
            with open(filepath, "w", encoding="utf-8") as f:
                f.write(response.text)

            print(f"Successfully downloaded: {filename} in {page_folder}")

        else:
            print(f"Error {response.status_code} per URL: {url}")
            break

        restaurant_count += 1
        if restaurant_count % restaurants_per_page == 0:
            page_number += 1

    except Exception as e:
        print(f"Error during download of {url}: {str(e)}")

print("Download completed successfully")


### 1.3

In [8]:
def extract_restaurant_info(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # Estrarre il nome del ristorante
    restaurant_name = soup.find("h1", class_="data-sheet__title").get_text(strip=True) if soup.find("h1", class_="data-sheet__title") else ''

    # Finding Address and Pricing
    big_div = soup.find_all("div", class_="data-sheet__block--text")
    if big_div:
        address_city__postal_country, price_range = big_div[0], big_div[1]
        address_city__postal_country = address_city__postal_country.get_text(separator=", ", strip=True) if address_city__postal_country else ''
        address_city__postal_country_list = address_city__postal_country.split(', ')

        address = address_city__postal_country_list[0]
        city = address_city__postal_country_list[1]
        postal = address_city__postal_country_list[2]
        country = address_city__postal_country_list[3]

        #Price 
        if big_div[1]:
            price_range = price_range.get_text(strip=True)
            price_range_list=price_range.split('·')

            price = price_range_list[0]
            price = re.sub(r"\s+", "", price)

            # Cuisine type
            cuisine_type = price_range_list[1]
            cuisine_type = re.sub(r"\s+", "", cuisine_type)

    # description 
    description = soup.find("div", class_="data-sheet__description").get_text(strip=True) if soup.find("div", class_="data-sheet__description") else None

    # facilities

    services_column = soup.find('div', class_='col col-12 col-lg-6')
    facilities_services = [item.get_text(strip=True) for item in services_column.find_all('li') ] if services_column else []

    #CreditCards

    creditCards_column = soup.find('div',class_='list--card')
    if creditCards_column:
        creditCards_img =[img['data-src'] for img in creditCards_column.find_all('img')]
        creditCards_names = [re.search(r'icons/([a-zA-Z]+)', cc).group(1).capitalize() for cc in creditCards_img]
    else: 
        creditCards_names = []

    # Phone Number
    phone_number = soup.find('span', class_="flex-fill").get_text(strip=True) if soup.find("span", class_="flex-fill") else None

    # Website
    Website_section = soup.find('div', class_='collapse__block-item link_item').get_text(strip=True) if soup.find('div', class_='collapse__block-item link_item') else None
    if Website_section:
        website = Website_section.find('a')['href']
    else:
        website = ''



    restaurant_info = {
        "restaurantName": restaurant_name,
        "address": address,
        "city": city,
        "postalCode": postal,
        "country": country,
        "priceRange": price,
        "cuisineType": cuisine_type,
        "description": description,
        "facilitiesServices": facilities_services,
        "creditCards": creditCards_names,
        "phoneNumber": phone_number,
        "website": website
    }
    return restaurant_info
    


In [9]:
file_path = os.path.join('downloaded_html/page_1', 'al-baccanale.html')

with open(file_path, "r", encoding="utf-8") as f:
    html_content = f.read()
    print(extract_restaurant_info(html_content))



{'restaurantName': 'Al Baccanale', 'address': 'via XX Settembre 20', 'city': 'Piombino', 'postalCode': '57025', 'country': 'Italy', 'priceRange': '€€', 'cuisineType': 'Tuscan', 'description': 'Situated in the heart of the historic centre just a stone’s throw from Piombino’s former Medici fortress, this quiet restaurant boasts a backdrop of exposed stonework and vaulted ceilings and just a few tables, so booking ahead is recommended. Here, the enthusiastic and talented owner-chef serves traditional cuisine reinterpreted with a modern and highly individual twist.', 'facilitiesServices': ['Air conditioning', 'Terrace', 'Wheelchair access'], 'creditCards': ['Amex', 'Mastercard', 'Visa'], 'phoneNumber': '+39 0565 222039', 'website': ''}


In [5]:
# Taking in input user preferences
k = int(input('Number of restaurants to view:'))
user_preferences = {
        'query': input('What do you want?'),
        'cuisines': input('Preferences about cuisines?'),
        'facilities': input('Do you need any services?')
}

# Find top k restaurants considering user preferences
top_k_restaurants = scoring.rank_restaurants(user_preferences, k, df_restaurants)

# Merging the df to keep the 'score'
matching_rows = pd.merge(
    df_restaurants_coordinates,
    top_k_restaurants[['score']],  
    left_index=True,
    right_index=True,
    how='inner'
)

# Create a folium map centered around Italy, selecting a tiles to have a better representation of the borders between the regions
m = folium.Map(location=[41.9028, 12.4964], zoom_start=6, tiles='CartoDB Positron')  


# Add markers for the top-k restaurants with all the most interesting information about them
for index,restaurant in matching_rows.iterrows():
    display(restaurant)
    color = geography.price_to_color(restaurant['priceRange'])
    popup_text = (
        f"{restaurant['restaurantName']};\n"
        f"Location: {restaurant['city']}, {restaurant['region']};\n"
        f"Price: {restaurant['priceRange']} ;\n"
        f"Cuisine: {restaurant['cuisineType']};\n"
        f"Services: {restaurant['facilitiesServices']};\n"
        f"Score: {restaurant['score']:.3f}" 
    )
    
    folium.Marker(
        location=[restaurant['lat'], restaurant['lon']],
        popup=folium.Popup(popup_text, max_width=300),
        icon=folium.Icon(color=color)
    ).add_to(m)

Reloaded


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['similarity'] = restaurant[1]


restaurantName                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   Ape Vino e Cucina
address                                                                                                                                                                                                                                                                                                                              

restaurantName                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 Charleston
address                                                                                                                                                                                                                                       

restaurantName                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       Da Bob Cook Fish
address                                                                                                                                                                                                                                                                                                                                                                                                                                                                           

restaurantName                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                DA_MÓ
address                                                                                                                                                                                                                                                                                                                                                                                                                                             

restaurantName                                                                                                                                                                                                                                                                                                                                                                                                                                           Dama
address                                                                                                                                                                                                                                                                                                                                                                                                                    Via Mulino, località San Gaudenzio
city                                                                                                        

In [None]:
m

# Algorithmic Question (AQ)

In [None]:
'''
function find_shortest_paths(test_cases):
    results = [] # To store results for each test case

for each test_case in test_cases:
    n = number of packages in test_case
    packages = list of (x, y) coordinates for the packages

    # Step 1: Sort packages by (x, y) coordinates
    sort (packages by (x, y)) # First by x, then by y

    # Step 2: Initialize variables for robot's starting position
    current_x, current_y = 0, 0
    path = ““# To record the path taken
    possible = True # Flag to check if the path is valid

    # Step 3: Traverse the sorted list of packages
    for each (x, y) in packages:
        # Calculate the moves required to reach (x, y) from (current_x, current_y)
        right_moves = x - current_x
        up_moves = y - current_y

        # Check if the target position is reachable using only R and U
        if right_moves < 0 or up_moves < 0:
                possible = False
                break # Exit the loop as the path is invalid

        # Append the moves to the path
        path += "R" * right_moves + "U" * up_moves

        # Update the current position
        current_x, current_y = x, y

# Step 4: Record the result for this test case
if possible:
        append ("YES", path) to results
else:
        append ("NO") to results

return results
'''

•	Prove that the algorithm is correct.

Sorting the packages by (x, y) coordinates ensures we follow the lexicographically smallest path by moving right (R) before up (U). As we check each package, we verify that each one is reachable from the previous by only moving right or up. If any package requires moving left or down, we return "NO." This approach guarantees a valid path exists only when all packages can be collected in the sorted order.


•	Compute the time complexity of your algorithm in Big O notation. Break down the steps involved in the algorithm, and explain which parts contribute most to the overall time complexity.

•	Ask an LLM tool (such as ChatGPT, Claude AI, Gemini, Perplexity, etc.) to evaluate the time complexity of your code using Big O notation. Is the assessment accurate? If it differs from your previous analysis, which would be correct? Please explain your reasoning.
•	Assume now that the robot can also move towards the left or downwards and consider the greedy approach: from the current location go to the closest package. Notice that now we can always collect all packages. Prove that the greedy algorithm is optimal (i.e., it minimizes the total distance traveled), or provide a counterexample showing that it is not	
