# Web scraping

### 1. Data Collection

#### 1.1

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import time
from pathlib import Path
import json
import re


In [None]:

first_url = "https://guide.michelin.com/en/it/restaurants/"
url_base = "https://guide.michelin.com"

# Define headers to make the request look like it's coming from a regular browser.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# List to store every link found on the listing pages
urls = []
# Visit each listing page and collect the links it contains
for page in range(1, 101):
    # Build the URL of the current listing page
    url = first_url + f'page/{page}'
    try:
        # A timeout keeps one stalled connection from hanging the whole crawl
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as exc:
        print("Error during download of current page:", exc)
        continue

    if response.status_code != 200:
        print("Error during download of current page:", response.status_code)
        # Skip this page: parsing here would re-read the previous page's HTML
        # (or fail on an undefined variable if the very first page failed).
        continue
    html_content = response.text

    # Parse the content of the page
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find all the links in the page
    restaurant_links = soup.find_all("a", class_="link")
    # Prefix each relative link with the site's base URL
    for i in restaurant_links:
        href = i.get("href")
        if not href:
            # Anchors without an href cannot be turned into a URL
            continue
        urls.append(url_base + href)

In [None]:
# Keep only restaurant detail links; building a set drops any duplicates
rest_urls = {link for link in urls if '/restaurant/' in link}

In [None]:
# Verify execution: show how many unique restaurant links were collected
len(rest_urls)

1983

In [None]:
# Persist the restaurant links to a text file, one URL per line
with open("restaurant_urls.txt", "w") as file:
    file.writelines(url + "\n" for url in rest_urls)

#### 1.2

In [None]:
# Iterate through links of restaurants
# Load the restaurant links saved in the previous step, ignoring blank lines
with open("restaurant_urls.txt", "r") as file:
    urls = [line.strip() for line in file.read().splitlines() if line.strip()]

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# Main folder where the HTML of each restaurant is stored, grouped by page.
# It must be the same folder the parsing cells below read from.
main_folder = "downloaded_html"
Path(main_folder).mkdir(exist_ok=True)

# Number of the page folder currently being filled
page_number = 1

# Number of restaurants saved so far, and how many go in each page folder
restaurant_count = 0
restaurants_per_page = 20

# URLs that could not be saved, and whether the loop was stopped early
failed_urls = []
aborted = False

for url in urls:
    # Define folder for the current page
    page_folder = os.path.join(main_folder, f"page_{page_number}")
    Path(page_folder).mkdir(exist_ok=True)

    try:
        # Download the HTML of the restaurant; the timeout prevents a stalled
        # connection from blocking the remaining downloads
        response = requests.get(url, headers=headers, timeout=30)
        if response.status_code == 200:
            # File name is the last path segment of the URL (the restaurant slug)
            filename = url.rstrip("/").split("/")[-1] + ".html"
            filepath = os.path.join(page_folder, filename)

            # Save the HTML
            with open(filepath, "w", encoding="utf-8") as f:
                f.write(response.text)

            print(f"Successfully downloaded: {filename} in {page_folder}")

        else:
            print(f"Error {response.status_code} per URL: {url}")
            failed_urls.append(url)
            # Stop at the first HTTP error, as the original run did
            aborted = True
            break

        # Only successful downloads count towards filling a page folder
        restaurant_count += 1
        if restaurant_count % restaurants_per_page == 0:
            page_number += 1

    except (requests.RequestException, OSError) as e:
        print(f"Error during download of {url}: {str(e)}")
        failed_urls.append(url)

# Report the real outcome instead of always claiming success
if aborted:
    print(f"Download stopped early after {restaurant_count} of {len(urls)} restaurants")
elif failed_urls:
    print(f"Download finished with {len(failed_urls)} failed URL(s)")
else:
    print("Download completed successfully")


Scaricato: da-michele.html in downloaded_html\page_1
Scaricato: zunica-1880.html in downloaded_html\page_1
Scaricato: locanda-martinelli.html in downloaded_html\page_1
Scaricato: hydra.html in downloaded_html\page_1
Scaricato: osteria-di-passignano.html in downloaded_html\page_1
Scaricato: condividere.html in downloaded_html\page_1
Scaricato: bel-ami.html in downloaded_html\page_1
Scaricato: al-baccanale.html in downloaded_html\page_1
Scaricato: la-tortuga.html in downloaded_html\page_1
Scaricato: onda-blu.html in downloaded_html\page_1
Scaricato: alessandro-mecca-al-castello-di-grinzane-cavour.html in downloaded_html\page_1
Scaricato: fre.html in downloaded_html\page_1
Scaricato: la-pergola69016.html in downloaded_html\page_1
Scaricato: marelet.html in downloaded_html\page_1
Scaricato: inkiostro.html in downloaded_html\page_1
Scaricato: burro-alici.html in downloaded_html\page_1
Scaricato: marsupino.html in downloaded_html\page_1
Scaricato: vitique.html in downloaded_html\page_1
Scari

#### 1.3

In [3]:
def _text_or(tag, default):
    """Return the stripped text of *tag*, or *default* when the tag is missing."""
    return tag.get_text(strip=True) if tag else default


def _collapse_spaces(text):
    """Trim *text* and reduce every run of whitespace to a single space."""
    return " ".join(text.split())


def extract_restaurant_info(html_content):
    """Parse one restaurant page and return its details as a dictionary.

    Args:
        html_content: HTML source of a single restaurant page.

    Returns:
        A dict with the keys ``restaurantName``, ``address``, ``city``,
        ``postalCode``, ``country``, ``priceRange``, ``cuisineType``,
        ``description``, ``facilitiesServices``, ``creditCards``,
        ``phoneNumber`` and ``website``. Text fields that cannot be found are
        empty strings, except ``description`` and ``phoneNumber`` which are
        ``None``; the two list fields are empty lists.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Restaurant name
    restaurant_name = _text_or(soup.find("h1", class_="data-sheet__title"), '')

    # Defaults, so a page with an unexpected layout still yields a full record
    address = city = postal = country = ''
    price = cuisine_type = ''

    # Address and pricing live in the first two "block--text" divs
    text_blocks = soup.find_all("div", class_="data-sheet__block--text")
    if text_blocks:
        location = text_blocks[0].get_text(separator=", ", strip=True)
        parts = location.split(', ')
        if len(parts) >= 4:
            # The street part may itself contain commas, so take the last
            # three pieces as city, postal code and country.
            # NOTE(review): assumes the block always ends with those three.
            address = ', '.join(parts[:-3])
            city, postal, country = parts[-3:]
        else:
            # Incomplete address: fill what is there, left to right
            address, city, postal, country = parts + [''] * (4 - len(parts))

    if len(text_blocks) > 1:
        # Text looks like "<price> · <cuisine>"
        price_parts = text_blocks[1].get_text(strip=True).split('·')
        price = re.sub(r"\s+", "", price_parts[0])
        if len(price_parts) > 1:
            # Keep single spaces so multi-word cuisine names stay readable
            cuisine_type = _collapse_spaces(price_parts[1])

    # Description
    description = _text_or(soup.find("div", class_="data-sheet__description"), None)

    # Facilities and services
    services_column = soup.find('div', class_='col col-12 col-lg-6')
    facilities_services = (
        [item.get_text(strip=True) for item in services_column.find_all('li')]
        if services_column else []
    )

    # Credit cards: the card name is taken from each icon's file path
    creditCards_names = []
    creditCards_column = soup.find('div', class_='list--card')
    if creditCards_column:
        for img in creditCards_column.find_all('img'):
            match = re.search(r'icons/([a-zA-Z]+)', img.get('data-src') or '')
            if match:
                creditCards_names.append(match.group(1).capitalize())

    # Phone number
    phone_number = _text_or(soup.find('span', class_="flex-fill"), None)

    # Website: read the href of the link inside the website block
    website = ''
    website_section = soup.find('div', class_='collapse__block-item link_item')
    if website_section:
        link = website_section.find('a')
        if link and link.get('href'):
            website = link['href']

    restaurant_info = {
        "restaurantName": restaurant_name,
        "address": address,
        "city": city,
        "postalCode": postal,
        "country": country,
        "priceRange": price,
        "cuisineType": cuisine_type,
        "description": description,
        "facilitiesServices": facilities_services,
        "creditCards": creditCards_names,
        "phoneNumber": phone_number,
        "website": website
    }
    return restaurant_info
    


In [4]:
# Quick check of the extractor on one saved restaurant page
file_path = os.path.join('downloaded_html/page_1', 'al-baccanale.html')
html_content = Path(file_path).read_text(encoding="utf-8")
print(extract_restaurant_info(html_content))



{'restaurantName': 'Al Baccanale', 'address': 'via XX Settembre 20', 'city': 'Piombino', 'postalCode': '57025', 'country': 'Italy', 'priceRange': '€€', 'cuisineType': 'Tuscan', 'description': 'Situated in the heart of the historic centre just a stone’s throw from Piombino’s former Medici fortress, this quiet restaurant boasts a backdrop of exposed stonework and vaulted ceilings and just a few tables, so booking ahead is recommended. Here, the enthusiastic and talented owner-chef serves traditional cuisine reinterpreted with a modern and highly individual twist.', 'facilitiesServices': ['Air conditioning', 'Terrace', 'Wheelchair access'], 'creditCards': ['Amex', 'Mastercard', 'Visa'], 'phoneNumber': '+39 0565 222039', 'website': ''}


In [None]:
# Parsed records for every downloaded restaurant page
restaurant_data_list = []

main_folder = "files_tsf"
Path(main_folder).mkdir(exist_ok=True)

for folder in os.listdir('downloaded_html'):
    page_folder = os.path.join('downloaded_html', folder)

    # Only page sub-folders contain restaurant files
    if not os.path.isdir(page_folder):
        continue

    for html_file in os.listdir(page_folder):
        file_path = os.path.join(page_folder, html_file)
        html_content = Path(file_path).read_text(encoding="utf-8")
        restaurant_data_list.append(extract_restaurant_info(html_content))
        




KeyboardInterrupt: 

In [5]:
# Folder that receives one TSV file per restaurant; created if it does not exist
main_folder = "files_tsf"
Path(main_folder).mkdir(exist_ok=True)

# Save the data of one restaurant to its own TSV file
def save_restaurant_to_tsv(restaurant_data, index, folder):
    """Write one restaurant record to ``<folder>/restaurant_<index>.tsv``.

    The file holds a header line followed by a single data line. Missing or
    ``None`` values are written as empty fields, and tabs or line breaks
    inside a value are replaced by spaces so the row always has one field
    per column.

    Args:
        restaurant_data: Dict as returned by ``extract_restaurant_info``.
        index: Number used in the file name.
        folder: Existing directory in which the file is created.
    """
    columns = [
        "restaurantName", "address", "city", "postalCode", "country",
        "priceRange", "cuisineType", "description", "facilitiesServices",
        "creditCards", "phoneNumber", "website",
    ]
    # Columns whose value is a list, written as a comma-separated string
    list_columns = {"facilitiesServices", "creditCards"}
    # Tab, line feed and carriage return would break the TSV layout
    separators_to_space = {9: " ", 10: " ", 13: " "}

    def clean(value):
        # None (e.g. a missing description) becomes an empty field
        if value is None:
            return ""
        return str(value).translate(separators_to_space)

    # TSV file name
    filename = os.path.join(folder, f"restaurant_{index}.tsv")

    # Field values in column order, with missing values handled
    fields = []
    for column in columns:
        value = restaurant_data.get(column, "")
        if column in list_columns:
            value = ", ".join(clean(item) for item in (value or []))
        fields.append(clean(value))

    # Write the TSV file
    with open(filename, "w", encoding="utf-8") as f:
        # Header line
        f.write("\t".join(columns) + "\n")
        # Data line
        f.write("\t".join(fields) + "\n")

    print(f"Salvato il file {filename}")

# Walk the downloaded HTML folders and write one TSV file per restaurant
restaurant_index = 0  # Running number used in the TSV file names
for folder in os.listdir('downloaded_html'):
    page_folder = os.path.join('downloaded_html', folder)

    # Ignore anything that is not a page sub-folder
    if not os.path.isdir(page_folder):
        continue

    for html_file in os.listdir(page_folder):
        file_path = os.path.join(page_folder, html_file)
        with open(file_path, "r", encoding="utf-8") as f:
            html_content = f.read()

        # Extract the restaurant details from the page
        restaurant_data = extract_restaurant_info(html_content)

        # Save them to a TSV file (the helper prints the progress)
        save_restaurant_to_tsv(restaurant_data, restaurant_index, main_folder)

        # Move on to the next file number
        restaurant_index += 1


Salvato il file files_tsf\restaurant_0.tsv
Salvato il file files_tsf\restaurant_1.tsv
Salvato il file files_tsf\restaurant_2.tsv
Salvato il file files_tsf\restaurant_3.tsv
Salvato il file files_tsf\restaurant_4.tsv
Salvato il file files_tsf\restaurant_5.tsv
Salvato il file files_tsf\restaurant_6.tsv
Salvato il file files_tsf\restaurant_7.tsv
Salvato il file files_tsf\restaurant_8.tsv
Salvato il file files_tsf\restaurant_9.tsv
Salvato il file files_tsf\restaurant_10.tsv
Salvato il file files_tsf\restaurant_11.tsv
Salvato il file files_tsf\restaurant_12.tsv
Salvato il file files_tsf\restaurant_13.tsv
Salvato il file files_tsf\restaurant_14.tsv
Salvato il file files_tsf\restaurant_15.tsv
Salvato il file files_tsf\restaurant_16.tsv
Salvato il file files_tsf\restaurant_17.tsv
Salvato il file files_tsf\restaurant_18.tsv
Salvato il file files_tsf\restaurant_19.tsv
Salvato il file files_tsf\restaurant_20.tsv
Salvato il file files_tsf\restaurant_21.tsv
Salvato il file files_tsf\restaurant_22.ts

In [3]:
# Build a single DataFrame from the per-restaurant TSV files

# Must be the folder the TSV files were written to ("files_tsf")
main_folder = "files_tsf"
all_dataframes = []

# Sorted so the row order is the same on every run
for file in sorted(os.listdir(main_folder)):
    # Skip anything in the folder that is not a restaurant TSV file
    if not file.endswith(".tsv"):
        continue
    file_path = os.path.join(main_folder, file)

    # Read postal codes as text so leading zeros are not lost
    restaurant_df = pd.read_csv(file_path, sep="\t", dtype={"postalCode": str})
    all_dataframes.append(restaurant_df)

combined_df = pd.concat(all_dataframes, ignore_index=True)
combined_df.to_csv("combined_restaurants.tsv", sep="\t", index=False)
