# Web scraping - Tripadvisor restaurants

#### Import Python libraries:

In [33]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd

#### Declare functions

In [31]:
def get_soup_content(gc, do):
    """Download one restaurant listing page and parse its HTML.

    :param gc: str - Geo-code for restaurants.
    :param do: int - Restaurant data offset.
    :return: BeautifulSoup - Parsed HTML content of the page.
    """
    url = f"https://www.tripadvisor.com/FindRestaurants?geo={gc}&offset={do}"
    headers = {
        # Adjacent string literals are joined by the parser. A backslash
        # continuation *inside* the quotes would instead embed the next
        # line's leading indentation in the header value.
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/114.0.0.0 Safari/537.36"
        ),
        "Accept-Language": "en-US, en;q=0.5",
    }
    response = requests.get(url, timeout=10, headers=headers)

    return BeautifulSoup(response.text, "html.parser")

def get_card(rest_cnt, soup_content):
    """Find the listing card that belongs to one restaurant index.

    The card is the ``div`` whose ``data-test`` attribute equals
    ``"<rest_cnt>_list_item"``. The attribute value is also printed as a
    progress message.

    :param rest_cnt: int - The count/index of the restaurant.
    :param soup_content: BeautifulSoup - Parsed HTML content of the page.
    :return: The value returned by ``soup_content.find`` for that ``div``
        (callers in this file treat ``None`` as "no such card").
    """
    data_test_value = f"{rest_cnt}_list_item"
    print(f"Scraping item number: {data_test_value}")

    return soup_content.find("div", {"data-test": data_test_value})

def scrape_title(card):
    """Extract the title from a restaurant card.

    The text of the first matching title ``div`` is split at its first
    space and everything after that space is returned (presumably this
    drops a leading rank token such as ``"1."`` - confirm against the
    live markup). When the text contains no space at all, the whole text
    is returned instead of raising ``IndexError``.

    :param card: BeautifulSoup element of the restaurant card.
    :return: str or None - The title of the restaurant, or ``None`` when
        the card has no title element.
    """
    matches = card.find_all('div', class_='biGQs _P fiohW alXOW NwcxK GzNcM ytVPx UTQMg RnEEZ ngXxk')
    if not matches:
        return None

    text = matches[0].text
    _first_token, separator, remainder = text.partition(" ")

    # No separator means there is nothing to strip; keep the text as-is.
    return remainder if separator else text

def scrape_star_ratings(card):
    """Read the star rating shown on a restaurant card.

    Looks at every ``title`` element whose ``id`` starts with
    ``:lithium-`` and takes the first space-delimited token of the last
    one.

    :param card: BeautifulSoup element of the restaurant card.
    :return: str or None - The leading token of the rating text, or
        ``None`` when no such element exists.
    """
    rating_titles = card.find_all('title', {'id': re.compile('^:lithium-')})
    if not rating_titles:
        return None

    rating, _, _ = rating_titles[-1].text.partition(" ")
    return rating

def scrape_reviews(card):
    """Read the review count shown on a restaurant card.

    Uses the last ``span`` with class ``IiChw``, removes commas from its
    text and keeps the first space-delimited token.

    :param card: BeautifulSoup element of the restaurant card.
    :return: str or None - The review count as text, or ``None`` when
        the card has no such ``span``.
    """
    review_spans = card.find_all('span', class_="IiChw")
    if not review_spans:
        return None

    without_commas = review_spans[-1].text.replace(",", "")
    count, _, _ = without_commas.partition(" ")
    return count


def scrape_cuisines(card):
    """Read the cuisine text shown on a restaurant card.

    Finds the ``div`` with class ``OvkNT K u FGSTQ`` and, inside it, the
    ``span`` with class ``YECgr Tsrjt``.

    :param card: BeautifulSoup element of the restaurant card.
    :return: str or None - The span's text, or ``None`` when either
        element is missing or the text is empty.
    """
    container = card.find('div', class_='OvkNT K u FGSTQ')
    if not container:
        return None

    label = container.find('span', class_='YECgr Tsrjt')
    label_text = getattr(label, 'text', None)

    # An empty string is treated the same as a missing span.
    return label_text if label_text else None

def scrape_city_name(scraping_control_variables):
    """Retrieve the city name from the TripAdvisor page content.

    Fetches the listing page at offset 0 for the configured geo-code,
    finds the first ``div`` whose class starts with ``ZOMeU w`` and
    returns everything after the second space of its text (presumably
    the heading reads like "Restaurants in <city>" - confirm against the
    live markup).

    :param scraping_control_variables: dict - A dictionary containing scraping
        parameters; only ``'geo_code'`` is read here.
    :return: str or None - The name of the city, or ``None`` when the
        heading is missing, empty, or has fewer than three
        space-separated parts.
    """
    soup_content = get_soup_content(scraping_control_variables['geo_code'], 0)
    heading = soup_content.find('div', class_=re.compile('^ZOMeU w'))
    heading_text = getattr(heading, 'text', None)
    if not heading_text:
        return None

    parts = heading_text.split(" ", 2)
    # A shorter heading has no third part; report "not found" rather than
    # raising IndexError.
    if len(parts) < 3:
        return None

    return parts[2]
    
def get_restaurant_data_from_card(rest_cnt, data_offset_current, page_num, card):
    """Build the record for one restaurant from its card.

    The scraped fields come from ``scrape_title``, ``scrape_cuisines``,
    ``scrape_reviews`` and ``scrape_star_ratings``; the remaining fields
    are the bookkeeping values passed in by the caller.

    :param rest_cnt: int - The count/index of the restaurant.
    :param data_offset_current: int - The current data offset for pagination.
    :param page_num: int - The current page number in the pagination.
    :param card: BeautifulSoup element of the restaurant card.
    :return: dict - A dictionary containing the restaurant's data.
    """
    title = scrape_title(card)
    cuisines = scrape_cuisines(card)
    reviews = scrape_reviews(card)
    star_rating = scrape_star_ratings(card)

    return {
        'title': title,
        'cuisines': cuisines,
        'reviews': reviews,
        'star rating': star_rating,
        'page number': page_num,
        'data offset': data_offset_current,
        'restaurant serial number': rest_cnt,
    }

def parse_tripadvisor(scraping_control_variables):
    """Parse the TripAdvisor listing pages and extract restaurant data.

    Walks the data offsets from ``data_offset_lower_limit`` up to and
    including ``data_offset_upper_limit``, fetching one page per step and
    collecting one record per card found on it.

    :param scraping_control_variables: dict - A dictionary containing scraping
        parameters. Keys read: ``'geo_code'``, ``'data_offset_lower_limit'``,
        ``'data_offset_upper_limit'``, ``'page_num'`` and ``'page_size'``.
    :return: list - A list of dictionaries, each containing data for one restaurant.
    """
    restaurants_scraped = []
    data_offset_lower_limit = scraping_control_variables['data_offset_lower_limit']
    data_offset_upper_limit = scraping_control_variables['data_offset_upper_limit']
    page_num = scraping_control_variables['page_num']
    page_size = scraping_control_variables['page_size']
    geo_code = scraping_control_variables['geo_code']

    data_offset_current = data_offset_lower_limit

    while data_offset_current <= data_offset_upper_limit:
        print("Scraping Page Number: ", page_num)
        print("Scraping Data Offset: ", data_offset_current)
        # Card indices are 1-based and derived from the page number, so
        # page 0 covers 1..page_size, page 1 the next page_size, and so on.
        page_start_offset = (page_num * page_size) + 1
        page_end_offset = page_start_offset + page_size
        soup_content = get_soup_content(geo_code, data_offset_current)
        for rest_cnt in range(page_start_offset, page_end_offset):
            card = get_card(rest_cnt, soup_content)
            if card is None:
                # First missing card ends this page; later pages are
                # still attempted.
                break
            restaurant_data = get_restaurant_data_from_card(
                rest_cnt, data_offset_current, page_num, card
            )
            restaurants_scraped.append(restaurant_data)
        print("Scraping Completed for Page Number: ", page_num, "\n" )
        print("Data Offset: ", data_offset_current)
        page_num = page_num + 1
        # Advance by the configured page size (not a fixed 30) so the
        # requested offset stays in step with the card indices above.
        data_offset_current = data_offset_current + page_size

    return restaurants_scraped

def save_to_csv(restaurants_scraped, scraping_control_variables):
    """Write the scraped restaurant records to a CSV file.

    The file is named ``ta_<city>_restaurants_scraped.csv``, where
    ``<city>`` is whatever ``scrape_city_name`` returns. Duplicate rows
    are dropped and the index column is not written.

    :param restaurants_scraped: list - A list of dictionaries, each containing data for one restaurant.
    :param scraping_control_variables: dict - A dictionary containing scraping parameters.
    """
    city = scrape_city_name(scraping_control_variables)
    print("storing the data in csv")

    csv_path = f"ta_{city}_restaurants_scraped.csv"
    deduplicated = pd.DataFrame(restaurants_scraped).drop_duplicates()
    deduplicated.to_csv(csv_path, index=False)

    print("csv stored")

#### Run:

In [32]:
# Parameters consumed by parse_tripadvisor and scrape_city_name.
scraping_control_variables = {
    'geo_code' : '190454',
    'data_offset_lower_limit' : 0,
    'data_offset_upper_limit' : 5100,
    'page_num' : 0,
    'page_size' : 30
}

restaurants_scraped = parse_tripadvisor(scraping_control_variables)
# save_to_csv takes the control variables as its second argument (it uses
# them to look up the city name for the file name); omitting it raises
# TypeError after the scrape has already finished.
save_to_csv(restaurants_scraped, scraping_control_variables)

Scraping Page Number:  0
Scraping Data Offset:  0
Scraping item number: 1_list_item
Scraping item number: 2_list_item
Scraping item number: 3_list_item
Scraping item number: 4_list_item
Scraping item number: 5_list_item
Scraping item number: 6_list_item
Scraping item number: 7_list_item
Scraping item number: 8_list_item
Scraping item number: 9_list_item
Scraping item number: 10_list_item
Scraping item number: 11_list_item
Scraping item number: 12_list_item
Scraping item number: 13_list_item
Scraping item number: 14_list_item
Scraping item number: 15_list_item
Scraping item number: 16_list_item
Scraping item number: 17_list_item
Scraping item number: 18_list_item
Scraping item number: 19_list_item
Scraping item number: 20_list_item
Scraping item number: 21_list_item
Scraping item number: 22_list_item
Scraping item number: 23_list_item
Scraping item number: 24_list_item
Scraping item number: 25_list_item
Scraping item number: 26_list_item
Scraping item number: 27_list_item
Scraping item 