# Web scraping

### 1. Data Collection

#### 1.1

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import time
from pathlib import Path
import json
import re


In [None]:
first_url = "https://guide.michelin.com/en/it/restaurants/"

# Define headers to make the request look like it's coming from a regular browser.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# List to store the links 
urls=[]
# Iterate on each page and saving its content into the html_content variable
for page in range(1,101):
    # get url of each page
    url = first_url + f'page/{page}'
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        html_content = response.text
    else:
        print("Error during download of current page:", response.status_code)

    # Pase the content of each page
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find all the links in the page 
    restaurant_links = soup.find_all("a", class_="link")
    url_base="https://guide.michelin.com"
    page_urls = []
    # Iterate through all links and extract the URL of each link and concatenate it with the base url
    for i in restaurant_links:
        href = i.get("href")
        href= url_base + href
        page_urls.append(href)
    urls.append(page_urls)
urls = [url for page_urls in urls for url in page_urls]

# Only keep the restaurants links and make sure there are no duplicates
rest_urls = []
for url in urls:
    if '/restaurant/' in url and url not in rest_urls:
        rest_urls.append(url)

In [None]:
# Verify execution 
len(rest_urls)

1983

In [None]:
# Save all links of restaurants in a file
with open("restaurant_urls.txt", "w") as file:
    for url in rest_urls:
        file.write(url + "\n")

#### 1.2

In [None]:
# Iterate through links of restaurants
with open("restaurant_urls.txt", "r") as file:
    urls = file.read().splitlines()

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# Main folder where to download HTML of each restaurant on each page
main_folder = "restaurants_html"
Path(main_folder).mkdir(exist_ok=True)

# Number of the pages
page_number = 1

# Number of restaurants in current page
restaurant_count = 0
restaurants_per_page = 20 

for url in urls:
    # Define folder for each page
    page_folder = os.path.join(main_folder, f"page_{page_number}")
    Path(page_folder).mkdir(exist_ok=True)

    try:
        # Download the HTML of each restaurant
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            # Name of the restaurant
            filename = url.split("/")[-1] + ".html"
            filepath = os.path.join(page_folder, filename)

            # Save the HTML
            with open(filepath, "w", encoding="utf-8") as f:
                f.write(response.text)

            print(f"Successfully downloaded: {filename} in {page_folder}")

        else:
            print(f"Error {response.status_code} per URL: {url}")
            break

        restaurant_count += 1
        if restaurant_count % restaurants_per_page == 0:
            page_number += 1

    except Exception as e:
        print(f"Error during download of {url}: {str(e)}")

print("Download completed successfully")


Scaricato: da-michele.html in downloaded_html\page_1
Scaricato: zunica-1880.html in downloaded_html\page_1
Scaricato: locanda-martinelli.html in downloaded_html\page_1
Scaricato: hydra.html in downloaded_html\page_1
Scaricato: osteria-di-passignano.html in downloaded_html\page_1
Scaricato: condividere.html in downloaded_html\page_1
Scaricato: bel-ami.html in downloaded_html\page_1
Scaricato: al-baccanale.html in downloaded_html\page_1
Scaricato: la-tortuga.html in downloaded_html\page_1
Scaricato: onda-blu.html in downloaded_html\page_1
Scaricato: alessandro-mecca-al-castello-di-grinzane-cavour.html in downloaded_html\page_1
Scaricato: fre.html in downloaded_html\page_1
Scaricato: la-pergola69016.html in downloaded_html\page_1
Scaricato: marelet.html in downloaded_html\page_1
Scaricato: inkiostro.html in downloaded_html\page_1
Scaricato: burro-alici.html in downloaded_html\page_1
Scaricato: marsupino.html in downloaded_html\page_1
Scaricato: vitique.html in downloaded_html\page_1
Scari

### 1.3

In [2]:
def extract_restaurant_info(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # Estrarre il nome del ristorante
    restaurant_name = soup.find("h1", class_="data-sheet__title").get_text(strip=True) if soup.find("h1", class_="data-sheet__title") else ''

    # Finding Address and Pricing
    big_div = soup.find_all("div", class_="data-sheet__block--text")
    if big_div:
        address_city__postal_country, price_range = big_div[0], big_div[1]
        address_city__postal_country_text = " ".join(address_city__postal_country.stripped_strings)
        address_city__postal_country_list = re.split(r',\s*', address_city__postal_country_text)


        address = address_city__postal_country_list[0]
        city = address_city__postal_country_list[1]
        postal = address_city__postal_country_list[2]
        country = address_city__postal_country_list[3]

        #Price 
        if big_div[1]:
            price_range = price_range.get_text(strip=True)
            price_range_list=price_range.split('·')

            price = price_range_list[0]
            price = re.sub(r"\s+", "", price)

            # Cuisine type
            cuisine_type = price_range_list[1]
            cuisine_type = re.sub(r"\s+", "", cuisine_type)

    # description 
    description = soup.find("div", class_="data-sheet__description").get_text(strip=True) if soup.find("div", class_="data-sheet__description") else ''

    # facilities

    services_column = soup.find('div', class_='col col-12 col-lg-6')
    facilities_services = [item.get_text(strip=True) for item in services_column.find_all('li') ] if services_column else []

    #CreditCards

    creditCards_column = soup.find('div',class_='list--card')
    if creditCards_column:
        creditCards_img =[img['data-src'] for img in creditCards_column.find_all('img')]
        creditCards_names = [re.search(r'icons/([a-zA-Z]+)', cc).group(1).capitalize() for cc in creditCards_img]
    else: 
        creditCards_names = []

    # Phone Number
    phone_number = soup.find('span', class_="flex-fill").get_text(strip=True) if soup.find("span", class_="flex-fill") else ''

    # Website
    link_div = soup.find('div', class_='collapse__block-item link-item')

    if link_div:
        website_tag = link_div.find('a', class_='link js-dtm-link')
        website = website_tag['href'] if website_tag else ''
    else:
        website = ''




    restaurant_info = {
        "restaurantName": restaurant_name,
        "address": address,
        "city": city,
        "postalCode": postal,
        "country": country,
        "priceRange": price,
        "cuisineType": cuisine_type,
        "description": description,
        "facilitiesServices": facilities_services,
        "creditCards": creditCards_names,
        "phoneNumber": phone_number,
        "website": website
    }
    return restaurant_info
    


# From dataframe to tsv

In [None]:
restaurant_data = []

folder_path = 'restaurants_html'  
counter = 1

# Iterates ricursively on directories and files inside of them
for root, dirs, files in os.walk(folder_path):
    for html_file in files:
        # Creates full file path
        file_path = os.path.join(root, html_file)
        
        with open(file_path, 'r', encoding='utf-8') as file:
            html_content = file.read()

            restaurant_info = extract_restaurant_info(html_content)
            
            restaurant_data.append(restaurant_info)
            
            print(f"Extracted info for {restaurant_info['restaurantName']} number {counter}")
            counter += 1 

# Creation of data frame 
df_restaurants = pd.DataFrame(restaurant_data)


Extracted info for 20Tre number 1
Extracted info for Alessandro Feo number 2
Extracted info for Ape Vino e Cucina number 3
Extracted info for Charleston number 4
Extracted info for Da Bob Cook Fish number 5
Extracted info for DA_MÓ number 6
Extracted info for Dama number 7
Extracted info for Donevandro number 8
Extracted info for Etra number 9
Extracted info for Il Ristorante Alain Ducasse Napoli number 10
Extracted info for Il Tirabusciò number 11
Extracted info for La Buca number 12
Extracted info for La Trattoria Enrico Bartolini number 13
Extracted info for LoRo number 14
Extracted info for Ménage number 15
Extracted info for O Me O Il Mare number 16
Extracted info for Palazzo Utini number 17
Extracted info for Procaccini number 18
Extracted info for Sa Domu Sarda number 19
Extracted info for Soul & Fish number 20
Extracted info for Bàcaro Il Gusto number 21
Extracted info for Casa Rispoli number 22
Extracted info for Castello di Fighine number 23
Extracted info for Dolada number 2

In [6]:
display(df_restaurants)

Unnamed: 0,restaurantName,address,city,postalCode,country,priceRange,cuisineType,description,facilitiesServices,creditCards,phoneNumber,website
0,20Tre,via David Chiossone 20 r,Genoa,16123,Italy,€€,"Farmtotable,ModernCuisine",Situated in the heart of Genoa’s historic cent...,[Air conditioning],"[Amex, Dinersclub, Mastercard, Visa]",+39 010 247 6191,https://www.ristorante20tregenova.it/
1,Alessandro Feo,via Angelo Lista 24,Marina di Casal Velino,84040,Italy,€€,"Campanian,Seafood",In a beautiful stone-vaulted building (an old ...,[],"[Amex, Dinersclub, Mastercard, Visa]",+39 328 893 7083,https://www.alessandrofeoristorante.it/
2,Ape Vino e Cucina,Piazza Risorgimento 3,Alba,12051,Italy,€€,"Piedmontese,Contemporary",This attractive restaurant in the heart of Alb...,"[Air conditioning, Terrace, Wheelchair access]","[Amex, Dinersclub, Maestrocard, Mastercard, Visa]",+39 0173 363453,https://www.apewinebar.it/alba/
3,Charleston,via Generale Magliocco 19,Palermo,90141,Italy,€€€€,"ModernCuisine,Creative","Before it became famous in Mondello, the renow...","[Air conditioning, Counter dining, Terrace, Wh...","[Amex, Mastercard, Visa]",+39 091 450171,https://casacharleston.net/
4,Da Bob Cook Fish,largo Parsano vecchio 16,Sorrento,80067,Italy,€€,Seafood,Working in partnership with the nearby fishmon...,"[Air conditioning, Terrace]","[Amex, Dinersclub, Mastercard, Visa]",+39 081 1778 3873,https://www.dabobcookfish.com/
...,...,...,...,...,...,...,...,...,...,...,...,...
1978,Shiroya,via dei Baullari 147,Rome,00186,Italy,€€,"Japanese,Asian",One of the most popular restaurants in the his...,"[Air conditioning, Terrace]","[Amex, Mastercard, Visa]",+39 06 6476 0753,https://www.shiroya.it
1979,Sotto l'Arco,via Aretusi 5,Bologna,40132,Italy,€€€,"Italian,Creative",Villa Aretusi is a pleasant 17C villa surround...,"[Air conditioning, Car park, Garden or park, T...","[Mastercard, Visa]",+39 051 619 9848,https://www.villa-aretusi.it/ristorante-sotto-...
1980,Umami,Via Ugo Secondo Partigiano 1,Badalucco,18010,Italy,€€,ModernCuisine,A young chef with experience in renowned resta...,"[Terrace, Wheelchair access]","[Amex, Mastercard, Visa]",+39 331 338 6005,https://www.umamirestaurant.it/
1981,Visione Restaurant and Living,Strada Nicolini Basso 34,loc. Tre Stelle,Barbaresco,12050,€€€,"Contemporary,Piedmontese","At this restaurant, new, young and enthusiasti...","[Air conditioning, Car park]","[Amex, Maestrocard, Mastercard, Visa]",+39 328 134 0218,https://www.ristorantevisione.it


In [7]:
print(df_restaurants.isna().sum())  

restaurantName        0
address               0
city                  0
postalCode            0
country               0
priceRange            0
cuisineType           0
description           0
facilitiesServices    0
creditCards           0
phoneNumber           0
website               0
dtype: int64


In [8]:
print(df_restaurants.shape)

(1983, 12)


In [10]:
for i, row in df_restaurants.iterrows():
    # Define the file name using the index
    file_name = f"restaurant_{i}.tsv"

    # Prepare row data as a single line with tab-separated values
    content =  f"{row['restaurantName']}\t{row['address']}\t{row['city']}\t{row['postalCode']}\t{row['country']}\t{row['priceRange']}\t{row['cuisineType']}\t{row['description']}\t{row['facilitiesServices']}\t{row['creditCards']}\t{row['phoneNumber']}\t{row['website']}\n"

    subfolder = f"files_tsv"
    file_path = os.path.join(subfolder, file_name)

    # Check if the subfolder exists, create it if it doesn't
    if not os.path.exists(subfolder):
        os.makedirs(subfolder)
    # Write the row data to the .tsv file
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(content)

    print(f"Created file: {file_name}")

Created file: restaurant_0.tsv
Created file: restaurant_1.tsv
Created file: restaurant_2.tsv
Created file: restaurant_3.tsv
Created file: restaurant_4.tsv
Created file: restaurant_5.tsv
Created file: restaurant_6.tsv
Created file: restaurant_7.tsv
Created file: restaurant_8.tsv
Created file: restaurant_9.tsv
Created file: restaurant_10.tsv
Created file: restaurant_11.tsv
Created file: restaurant_12.tsv
Created file: restaurant_13.tsv
Created file: restaurant_14.tsv
Created file: restaurant_15.tsv
Created file: restaurant_16.tsv
Created file: restaurant_17.tsv
Created file: restaurant_18.tsv
Created file: restaurant_19.tsv
Created file: restaurant_20.tsv
Created file: restaurant_21.tsv
Created file: restaurant_22.tsv
Created file: restaurant_23.tsv
Created file: restaurant_24.tsv
Created file: restaurant_25.tsv
Created file: restaurant_26.tsv
Created file: restaurant_27.tsv
Created file: restaurant_28.tsv
Created file: restaurant_29.tsv
Created file: restaurant_30.tsv
Created file: rest