# Web scraping

### 1. Data Collection

#### 1.1

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import time
from pathlib import Path
import json
import re


In [None]:

first_url = "https://guide.michelin.com/en/it/restaurants/"

# Define headers to make the request look like it's coming from a regular browser.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# List to store the links 
urls=[]
# Iterate on each page and saving its content into the html_content variable
for page in range(1,101):
    # get url of each page
    url = first_url + f'page/{page}'
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        html_content = response.text
    else:
        print("Error during download of current page:", response.status_code)

    # Pase the content of each page
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find all the links in the page 
    restaurant_links = soup.find_all("a", class_="link")
    url_base="https://guide.michelin.com"
    # Iterate through all links and extract the URL of each link and concatenate it with the base url
    for i in restaurant_links:
        href = i.get("href")
        href= url_base + href
        urls.append(href)

In [None]:
# Only keep the restaurants links and make sure there are no duplicates
rest_urls= [url for url in  urls if '/restaurant/' in url]
rest_urls=set(rest_urls)

In [None]:
# Verify execution 
len(rest_urls)

1983

In [None]:
# Save all links of restaurants in a file
with open("restaurant_urls.txt", "w") as file:
    for url in rest_urls:
        file.write(url + "\n")

#### 1.2

In [None]:
# Iterate through links of restaurants
with open("restaurant_urls.txt", "r") as file:
    urls = file.read().splitlines()

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# Main folder where to download HTML of each restaurant on each page
main_folder = "restaurants_html"
Path(main_folder).mkdir(exist_ok=True)

# Number of the pages
page_number = 1

# Number of restaurants in current page
restaurant_count = 0
restaurants_per_page = 20 

for url in urls:
    # Define folder for each page
    page_folder = os.path.join(main_folder, f"page_{page_number}")
    Path(page_folder).mkdir(exist_ok=True)

    try:
        # Download the HTML of each restaurant
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            # Name of the restaurant
            filename = url.split("/")[-1] + ".html"
            filepath = os.path.join(page_folder, filename)

            # Save the HTML
            with open(filepath, "w", encoding="utf-8") as f:
                f.write(response.text)

            print(f"Successfully downloaded: {filename} in {page_folder}")

        else:
            print(f"Error {response.status_code} per URL: {url}")
            break

        restaurant_count += 1
        if restaurant_count % restaurants_per_page == 0:
            page_number += 1

    except Exception as e:
        print(f"Error during download of {url}: {str(e)}")

print("Download completed successfully")


Scaricato: da-michele.html in downloaded_html\page_1
Scaricato: zunica-1880.html in downloaded_html\page_1
Scaricato: locanda-martinelli.html in downloaded_html\page_1
Scaricato: hydra.html in downloaded_html\page_1
Scaricato: osteria-di-passignano.html in downloaded_html\page_1
Scaricato: condividere.html in downloaded_html\page_1
Scaricato: bel-ami.html in downloaded_html\page_1
Scaricato: al-baccanale.html in downloaded_html\page_1
Scaricato: la-tortuga.html in downloaded_html\page_1
Scaricato: onda-blu.html in downloaded_html\page_1
Scaricato: alessandro-mecca-al-castello-di-grinzane-cavour.html in downloaded_html\page_1
Scaricato: fre.html in downloaded_html\page_1
Scaricato: la-pergola69016.html in downloaded_html\page_1
Scaricato: marelet.html in downloaded_html\page_1
Scaricato: inkiostro.html in downloaded_html\page_1
Scaricato: burro-alici.html in downloaded_html\page_1
Scaricato: marsupino.html in downloaded_html\page_1
Scaricato: vitique.html in downloaded_html\page_1
Scari

### 1.3

In [30]:
def extract_restaurant_info(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # Estrarre il nome del ristorante
    restaurant_name = soup.find("h1", class_="data-sheet__title").get_text(strip=True) if soup.find("h1", class_="data-sheet__title") else ''

    # Finding Address and Pricing
    big_div = soup.find_all("div", class_="data-sheet__block--text")
    if big_div:
        address_city__postal_country, price_range = big_div[0], big_div[1]
        address_city__postal_country_text = " ".join(address_city__postal_country.stripped_strings)
        address_city__postal_country_list = re.split(r',\s*', address_city__postal_country_text)


        address = address_city__postal_country_list[0]
        city = address_city__postal_country_list[1]
        postal = address_city__postal_country_list[2]
        country = address_city__postal_country_list[3]

        #Price 
        if big_div[1]:
            price_range = price_range.get_text(strip=True)
            price_range_list=price_range.split('·')

            price = price_range_list[0]
            price = re.sub(r"\s+", "", price)

            # Cuisine type
            cuisine_type = price_range_list[1]
            cuisine_type = re.sub(r"\s+", "", cuisine_type)

    # description 
    description = soup.find("div", class_="data-sheet__description").get_text(strip=True) if soup.find("div", class_="data-sheet__description") else ''

    # facilities

    services_column = soup.find('div', class_='col col-12 col-lg-6')
    facilities_services = [item.get_text(strip=True) for item in services_column.find_all('li') ] if services_column else []

    #CreditCards

    creditCards_column = soup.find('div',class_='list--card')
    if creditCards_column:
        creditCards_img =[img['data-src'] for img in creditCards_column.find_all('img')]
        creditCards_names = [re.search(r'icons/([a-zA-Z]+)', cc).group(1).capitalize() for cc in creditCards_img]
    else: 
        creditCards_names = []

    # Phone Number
    phone_number = soup.find('span', class_="flex-fill").get_text(strip=True) if soup.find("span", class_="flex-fill") else ''

    # Website
    link_div = soup.find('div', class_='collapse__block-item link-item')

    if link_div:
        website_tag = link_div.find('a', class_='link js-dtm-link')
        website = website_tag['href'] if website_tag else ''
    else:
        website = ''




    restaurant_info = {
        "restaurantName": restaurant_name,
        "address": address,
        "city": city,
        "postalCode": postal,
        "country": country,
        "priceRange": price,
        "cuisineType": cuisine_type,
        "description": description,
        "facilitiesServices": facilities_services,
        "creditCards": creditCards_names,
        "phoneNumber": phone_number,
        "website": website
    }
    return restaurant_info
    


# from tsv to dataframe (problems)

In [34]:
def find_restaurant_folder(restaurant_name, main_folder):
    for root, dirs, files in os.walk(main_folder):
        for folder in dirs:
            # Verifica se il nome del ristorante è contenuto nel nome della cartella
            if restaurant_name.lower() in folder.lower():
                print(f"Trovato: {restaurant_name} nella cartella: {os.path.join(root, folder)}")
                return os.path.join(root, folder)
    
    print(f"Ristorante '{restaurant_name}' non trovato.")
    return None

# Esempio di utilizzo
main_folder = "restaurants_htmli"
restaurant_name = "antica-osteria-la-rampina.html"
find_restaurant_folder(restaurant_name, main_folder)

Ristorante 'antica-osteria-la-rampina.html' non trovato.


In [31]:
file_path = os.path.join('restaurants_html/page_1', '20tre.html')

with open(file_path, "r", encoding="utf-8") as f:
    html_content = f.read()
    print(extract_restaurant_info(html_content))



{'restaurantName': '20Tre', 'address': 'via David Chiossone 20 r', 'city': 'Genoa', 'postalCode': '16123', 'country': 'Italy', 'priceRange': '€€', 'cuisineType': 'Farmtotable,ModernCuisine', 'description': 'Situated in the heart of Genoa’s historic centre, this contemporary-style restaurant focuses on just a few dishes, almost all fish-based, presented in a very modern style and in generous portions. Seasonal ingredients and market-fresh produce are the guiding philosophy here.', 'facilitiesServices': ['Air conditioning'], 'creditCards': ['Amex', 'Dinersclub', 'Mastercard', 'Visa'], 'phoneNumber': '+39 010 247 6191', 'website': 'https://www.ristorante20tregenova.it/'}


In [21]:
main_folder = "files_tsv"
Path(main_folder).mkdir(exist_ok=True)

# Funzione per salvare i dati in un file TSV
def save_restaurant_to_tsv(restaurant_data, index, folder):
    # Nome del file TSV
    filename = os.path.join(folder, f"restaurant_{index}.tsv")
    
    # Lista dei campi con gestione dei valori mancanti
    fields = [
        restaurant_data.get("restaurantName", ""),
        restaurant_data.get("address", ""),
        restaurant_data.get("city", ""),
        restaurant_data.get("postalCode", ""),
        restaurant_data.get("country", ""),
        restaurant_data.get("priceRange", ""),
        restaurant_data.get("cuisineType", ""),
        restaurant_data.get("description", ""),
        ", ".join(restaurant_data.get("facilitiesServices", [])),
        ", ".join(restaurant_data.get("creditCards", [])),
        restaurant_data.get("phoneNumber", ""),
        restaurant_data.get("website", "")
    ]
    
    # Salvataggio nel file TSV
    with open(filename, "w", encoding="utf-8") as f:
        # Scrive l'intestazione
        f.write("\t".join(["restaurantName", "address", "city", "postalCode", "country", "priceRange", "cuisineType", "description", "facilitiesServices", "creditCards", "phoneNumber", "website"]) + "\n")
        # Scrive i dati
        f.write("\t".join(fields) + "\n")
    
    print(f"Salvato il file {filename}")

# Processa le cartelle HTML e crea i file TSV per ogni ristorante
restaurant_index = 0  # Contatore per numerare i ristoranti
for folder in os.listdir('restaurants_html'):
    page_folder = os.path.join('restaurants_html', folder)
    
    if os.path.isdir(page_folder):
        for html_file in os.listdir(page_folder):
            file_path = os.path.join(page_folder, html_file)
            with open(file_path, "r", encoding="utf-8") as f:
                html_content = f.read()

                # Estrai informazioni sul ristorante
                restaurant_data = extract_restaurant_info(html_content)
                                
                # Salva il ristorante in un file TSV e stampa il progresso
                save_restaurant_to_tsv(restaurant_data, restaurant_index, main_folder)
                
                # Incrementa l'indice del ristorante per il prossimo file
                restaurant_index += 1


Salvato il file files_tsv\restaurant_0.tsv
Salvato il file files_tsv\restaurant_1.tsv
Salvato il file files_tsv\restaurant_2.tsv
Salvato il file files_tsv\restaurant_3.tsv
Salvato il file files_tsv\restaurant_4.tsv
Salvato il file files_tsv\restaurant_5.tsv
Salvato il file files_tsv\restaurant_6.tsv
Salvato il file files_tsv\restaurant_7.tsv
Salvato il file files_tsv\restaurant_8.tsv
Salvato il file files_tsv\restaurant_9.tsv
Salvato il file files_tsv\restaurant_10.tsv
Salvato il file files_tsv\restaurant_11.tsv
Salvato il file files_tsv\restaurant_12.tsv
Salvato il file files_tsv\restaurant_13.tsv
Salvato il file files_tsv\restaurant_14.tsv
Salvato il file files_tsv\restaurant_15.tsv
Salvato il file files_tsv\restaurant_16.tsv
Salvato il file files_tsv\restaurant_17.tsv
Salvato il file files_tsv\restaurant_18.tsv
Salvato il file files_tsv\restaurant_19.tsv
Salvato il file files_tsv\restaurant_20.tsv
Salvato il file files_tsv\restaurant_21.tsv
Salvato il file files_tsv\restaurant_22.ts

# From dataframe to tsv

In [56]:
restaurant_data = []

folder_path = 'restaurants_html'  # Assicurati che questa sia la cartella corretta

# Itera ricorsivamente attraverso tutte le sottocartelle e i file
for root, dirs, files in os.walk(folder_path):
    for html_file in files:
        # Crea il percorso completo del file
        file_path = os.path.join(root, html_file)
        
        # Verifica se il file è un file HTML
        if file_path.endswith('.html'):
            with open(file_path, 'r', encoding='utf-8') as file:
                html_content = file.read()

            # Estrazione delle informazioni per ogni ristorante
            restaurant_info = extract_restaurant_info(html_content)
            
            # Aggiungiamo il dizionario con le informazioni del ristorante alla lista
            restaurant_data.append(restaurant_info)

# Creazione del DataFrame con i dati raccolti
df_restaurants = pd.DataFrame(restaurant_data)


In [57]:
df_restaurants.head()

Unnamed: 0,restaurantName,address,city,postalCode,country,priceRange,cuisineType,description,facilitiesServices,creditCards,phoneNumber,website
0,20Tre,via David Chiossone 20 r,Genoa,16123,Italy,€€,"Farmtotable,ModernCuisine",Situated in the heart of Genoa’s historic cent...,[Air conditioning],"[Amex, Dinersclub, Mastercard, Visa]",+39 010 247 6191,https://www.ristorante20tregenova.it/
1,Alessandro Feo,via Angelo Lista 24,Marina di Casal Velino,84040,Italy,€€,"Campanian,Seafood",In a beautiful stone-vaulted building (an old ...,[],"[Amex, Dinersclub, Mastercard, Visa]",+39 328 893 7083,https://www.alessandrofeoristorante.it/
2,Ape Vino e Cucina,Piazza Risorgimento 3,Alba,12051,Italy,€€,"Piedmontese,Contemporary",This attractive restaurant in the heart of Alb...,"[Air conditioning, Terrace, Wheelchair access]","[Amex, Dinersclub, Maestrocard, Mastercard, Visa]",+39 0173 363453,https://www.apewinebar.it/alba/
3,Charleston,via Generale Magliocco 19,Palermo,90141,Italy,€€€€,"ModernCuisine,Creative","Before it became famous in Mondello, the renow...","[Air conditioning, Counter dining, Terrace, Wh...","[Amex, Mastercard, Visa]",+39 091 450171,https://casacharleston.net/
4,Da Bob Cook Fish,largo Parsano vecchio 16,Sorrento,80067,Italy,€€,Seafood,Working in partnership with the nearby fishmon...,"[Air conditioning, Terrace]","[Amex, Dinersclub, Mastercard, Visa]",+39 081 1778 3873,https://www.dabobcookfish.com/


In [59]:
print(df_restaurants.isna().sum())  

restaurantName        0
address               0
city                  0
postalCode            0
country               0
priceRange            0
cuisineType           0
description           0
facilitiesServices    0
creditCards           0
phoneNumber           0
website               0
dtype: int64


In [62]:
print(df_restaurants.shape)

(1983, 12)


In [64]:
output_folder = 'files_tsv'
os.makedirs(output_folder, exist_ok=True)

# Funzione per rimuovere caratteri non validi dal nome del file
def clean_filename(filename):
    # Rimuove i caratteri invalidi come le virgolette e i backslash
    return re.sub(r'[<>:"/\\|?*]', '', filename).replace(' ', '_')

# Creazione di un file TSV per ogni ristorante
for index, row in df_restaurants.iterrows():
    # Estrai il nome del ristorante (o un altro identificatore unico)
    restaurant_name = row['restaurantName']
    
    # Pulizia del nome del ristorante per il file
    cleaned_name = clean_filename(restaurant_name)
    
    # Definisci il nome del file TSV
    tsv_filename = f"{cleaned_name}.tsv"
    
    # Crea il percorso completo per il file TSV nella cartella 'files_tsv'
    tsv_filepath = os.path.join(output_folder, tsv_filename)
    
    # Salva il ristorante come file TSV
    row.to_frame().T.to_csv(tsv_filepath, sep='\t', index=False)
    
    print(f"File {tsv_filepath} creato con successo.")

File files_tsv\20Tre.tsv creato con successo.
File files_tsv\Alessandro_Feo.tsv creato con successo.
File files_tsv\Ape_Vino_e_Cucina.tsv creato con successo.
File files_tsv\Charleston.tsv creato con successo.
File files_tsv\Da_Bob_Cook_Fish.tsv creato con successo.
File files_tsv\DA_MÓ.tsv creato con successo.
File files_tsv\Dama.tsv creato con successo.
File files_tsv\Donevandro.tsv creato con successo.
File files_tsv\Etra.tsv creato con successo.
File files_tsv\Il_Ristorante_Alain_Ducasse_Napoli.tsv creato con successo.
File files_tsv\Il_Tirabusciò.tsv creato con successo.
File files_tsv\La_Buca.tsv creato con successo.
File files_tsv\La_Trattoria_Enrico_Bartolini.tsv creato con successo.
File files_tsv\LoRo.tsv creato con successo.
File files_tsv\Ménage.tsv creato con successo.
File files_tsv\O_Me_O_Il_Mare.tsv creato con successo.
File files_tsv\Palazzo_Utini.tsv creato con successo.
File files_tsv\Procaccini.tsv creato con successo.
File files_tsv\Sa_Domu_Sarda.tsv creato con suc