In [459]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import time
import re
from tqdm import tqdm

In [None]:
# Parse the body of every search-result response into a BeautifulSoup tree.
webpages = [BeautifulSoup(response.content, 'html.parser') for response in responses]

len(webpages)  # Should be 24 for restaurants

In [None]:
# Business urls
# https://www.yelp.co.uk/search?cflt=homeservices&find_loc=Berlin%2C%20Germany
# https://www.yelp.co.uk/search?cflt=restaurants&find_loc=Berlin%2C%20Germany

businesses = ["restaurants", "homeservices"]

main_url = f"https://www.yelp.co.uk/search?cflt={businesses[0]}&find_loc=Berlin%2C%20Germany"

# other pages url (for restaurants case)
# https://www.yelp.co.uk/search?cflt=restaurants&find_loc=Berlin%2C%20Germany&start=10
# pages --> main_url + (start = range(10,231,10)), can automate finding ending index

# Download the first result page, then every following page with a short
# random pause after each request.
responses = [requests.get(main_url)]
for index in range(10,231,10):
    responses.append(requests.get(f"{main_url}&start={index}"))
    delay = np.random.randint(1,6)
    time.sleep(delay)

# Parse the responses fetched above right here, so this cell never works on
# a stale (or missing) `webpages` list left behind by another cell.
webpages = [BeautifulSoup(response.content, 'html.parser') for response in responses]

# Class to get hrefs
span_class = "css-1pxmz4g"

# Needed regex here as strip was leaving some characters
# Strangely those characters only appeared on appending but
# not in a direct print

regex = re.compile('[^a-zA-Z]')
child_urls = []
names = []
for webpage in webpages:
    for item in webpage.findAll('span', class_ = span_class):
        link = item.find('a')
        # Some matching spans carry no link; skip them entirely so that
        # `names` and `child_urls` stay the same length.
        if link is None or not link.get('href'):
            continue
        names.append(regex.sub('', item.text))
        child_urls.append(f"https://www.yelp.co.uk{link['href']}")


In [None]:
# Read the total number of result pages from the pager label on the first
# search page.
num_page_class = "css-e81eai"
response = requests.get("https://www.yelp.de/search?find_desc=restaurants&find_loc=Berlin&start=0")
soup = BeautifulSoup(response.content,'html.parser')

num_pages = soup.findAll('span', class_ = num_page_class)

# The label presumably reads like "1 von <total>" (confirm against the page).
# str.strip('1 von') removes any of the characters '1', ' ', 'v', 'o', 'n'
# from both ends, so a total such as 21 would be cut down to "2".
# Take the last number in the label instead.
pager_text = num_pages[-3].text
page_numbers = re.findall(r'\d+', pager_text)
print(page_numbers[-1] if page_numbers else pager_text.strip())

In [None]:
# Combine the scraped names and URLs into one table and persist it.
df = pd.DataFrame({f"{businesses[0]}_name".title(): names, "url": child_urls})
df.to_csv('yelp_dataset.csv', index = False)

# Keep the shape as the last expression so the notebook actually displays it.
df.shape # Should be (240,2)

In [None]:
# Soupify all business pages
df = pd.read_csv('yelp_dataset.csv')

# Split the URLs across sessions. Looping over the full URL list once per
# session would download every page NUM_SESSIONS times.
NUM_SESSIONS = 2
urls = df['url'].tolist()
batch_size = max(1, -(-len(urls) // NUM_SESSIONS))  # ceiling division
url_batches = [urls[start:start + batch_size] for start in range(0, len(urls), batch_size)]

business_pages = []
for session, batch in enumerate(tqdm(url_batches)):
    session_delay = np.random.randint(600,1200)

    for url in tqdm(batch):
        response = requests.get(url)
        delay = np.random.randint(20,40)
        time.sleep(delay)
        business_pages.append(BeautifulSoup(response.content, 'html.parser'))

    # Long pause between sessions only; nothing follows the last one.
    if session < len(url_batches) - 1:
        time.sleep(session_delay)

print(len(business_pages)) # Should be 240

In [None]:
# Trial run: download and parse only the business page at row `index`.
business_pages, index = [], 0
response = requests.get(df['url'][index])
time.sleep(np.random.randint(20,40))  # random pause after the request
soup = BeautifulSoup(response.content, 'html.parser')
business_pages += [soup]



In [None]:
# Save the parsed page to disk. The encoding is set explicitly because the
# file is read back as utf8 elsewhere in this notebook and the platform
# default may not be able to encode every character on the page.
with open(f"./webpages/business_page{index}.html", "w", encoding = "utf8") as file:
    file.write(str(soup))

In [None]:
# Number of photos
# photos_class = "display--inline__373c0__1DbOG margin-l2__373c0__wvUpT border-color--default__373c0__2oFDT"
photos_class = "css-ardur"
regex = re.compile('[^0-9]')

df['photos_count'] = 0

# The fifth matching span is taken as the photo counter; keep only its digits.
num_photos = business_pages[0].findAll('span', class_ = photos_class)[4]
num_photos = int(regex.sub('', num_photos.text))

df.loc[index, 'photos_count'] = num_photos

# Overwrite the file like every other cell does. Appending (mode='a') would
# write the whole frame, header included, after the existing rows.
df.to_csv('yelp_dataset.csv', index = False)

In [None]:
# Preview the first rows of df.
df.head()

In [None]:
# Stars
# The rating <div> has a class that encodes the rating itself
# (e.g. "...i-stars--large-4-half..."), so selecting on that class only
# matches pages with that exact rating. Select the wrapping <span> and read
# the first <div> inside it instead.
stars_class = "display--inline__373c0__2SfH_ border-color--default__373c0__30oMI"
regex = re.compile('[^0-9.]')

df['stars'] = "0"

stars = business_pages[0].findAll('span', class_ = stars_class)[0]
stars = stars.findChildren('div')
stars = stars[0]['aria-label']
# Keep only digits and the decimal point from the aria-label text.
stars = regex.sub('', stars)

df.loc[index, 'stars'] = stars
df.to_csv('yelp_dataset.csv', index = False)

In [None]:
# Review count
reviews_class =  "css-bq71j2"
regex = re.compile('[^0-9]')

df['review_count'] = 0

# Text of the first matching span, reduced to its digits and cast to int.
review_count = business_pages[0].find_all('span', class_ = reviews_class)[0].text
review_count = int(regex.sub('', review_count))

df.loc[index, 'review_count'] = review_count
df.to_csv('yelp_dataset.csv', index = False)

In [None]:
# Claimed vs Unclaimed
claim_class ="css-166la90"

df['claimed'] = 0

# 1 when the first matching link reads exactly "Claimed", otherwise 0.
claimed = business_pages[0].find_all('a', class_ = claim_class)
claimed = 1 if claimed[0].text == "Claimed" else 0

df.loc[index, 'claimed'] = claimed
df.to_csv('yelp_dataset.csv', index = False)

In [None]:
# Address
address_class = "css-e81eai"

df['address'] = "0"

# The second matching <p> is used as the address text.
address = business_pages[0].find_all('p', class_ = address_class)[1].text

df.loc[index, 'address'] = address
df.to_csv('yelp_dataset.csv', index = False)

In [None]:
# Hours
day_class = "day-of-the-week__373c0__124RF css-1h1j0y3"
time_class = "no-wrap__373c0__2vNX7 css-1h1j0y3"

df['hours'] = "0"

# The first <table> on the page is taken as the opening-hours table.
hours_table = business_pages[0].findAll('table')[0]

rows = hours_table.findChildren('tr')

# Collect the first time cell of every row that has one.
times = []
for row in rows:
    times_text = row.findAll('p', class_ = time_class)
    if times_text != []:
        times.append(times_text[0].text)

days = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']
# Pair each day with the time in the same position. A nested comprehension
# (for day ... for time ...) would leave every day mapped to the last time.
hours = dict(zip(days, times))

df.loc[index, 'hours'] = str(hours)
df.to_csv('yelp_dataset.csv', index = False)

In [None]:
# Attributes
# Candidate class strings tried while locating the attribute section:
#   parent div: "arrange__373c0__UHqhV gutter-2__373c0__3Zpeq layout-wrap__373c0__34d4b layout-2-units__373c0__3CiAk border-color--default__373c0__2oFDT"
#   other:      "margin-b3__373c0__q1DuY border-color--default__373c0__2oFDT"
#   inner div (alternative): "arrange__373c0__UHqhV gutter-2__373c0__3Zpeq vertical-align-baseline__373c0__2s3Ze border-color--default__373c0__2oFDT"
outer_div = "arrange__373c0__UHqhV gutter-2__373c0__3Zpeq layout-wrap__373c0__34d4b layout-2-units__373c0__3CiAk border-color--default__373c0__2oFDT"
inner_div = "arrange-unit__373c0__1piwO border-color--default__373c0__2oFDT"

attributes = business_pages[0].find_all('div', class_ = outer_div)

# Print the raw text of every matched container to inspect its contents.
for attribute in attributes:
    print(attribute.text)

# Placeholder column; no parsed value is written in this cell.
df['attributes'] = "0"

# Note kept from exploration: icon_greyed - 0, 1

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By

In [None]:
# NOTE: despite its name this holds the first parsed page, not a URL, and it
# is not used by get_cpf below.
url = business_pages[0]

def get_cpf(
    driver_path="C:/Users/rajat/Downloads/chromedriver",
    page_url="file:///C:/Users/rajat/OneDrive/Documents/GitHub/Strive/BuildWeeks/Week1/ymvc-berlin/webpages/business_page0.html",
):
    """Open a saved business page in Chrome and return its page source.

    Parameters
    ----------
    driver_path : str
        Location of the chromedriver executable. Defaults to the path that
        used to be hard-coded in the body.
    page_url : str
        URL to load. Defaults to the locally saved copy of business page 0.

    Returns
    -------
    str
        The page source reported by the driver after loading `page_url`.
    """
    driver = webdriver.Chrome(driver_path)
    try:
        driver.get(page_url)
        # Ideas left from exploration (not active):
        # css_selector = "button[class = ' css-174jypu']"
        # driver.find_element_by_css_selector(css_selector).click()
        # time.sleep(np.random.randint(5,10))
        # outer_div = "arrange__373c0__UHqhV gutter-2__373c0__3Zpeq layout-wrap__373c0__34d4b layout-2-units__373c0__3CiAk border-color--default__373c0__2oFDT"
        # text = driver.find_elements_by_class_name(outer_div)
        # print(text)
        return driver.page_source
    finally:
        # Always shut the browser down, even when loading the page fails,
        # so no driver process is left running.
        driver.quit()

# Assign the result so the full page source is not echoed as cell output.
page_source = get_cpf()

In [None]:
# Reload the saved copy of business page `index` from disk. The encoding is
# given explicitly, matching the other cell that reads these files as utf8.
index = 0
with open(f"./webpages/business_page{index}.html", encoding = "utf8") as f:
    soup = BeautifulSoup(f, 'html.parser')

In [None]:
# Class strings used to locate each field on a business page.
photos_class = "css-ardur"
reviews_class =  "arrange-unit__373c0__1piwO arrange-unit-fill__373c0__17z0h border-color--default__373c0__2oFDT nowrap__373c0__1_N1j"
claim_class = "border-color--default__373c0__2oFDT nowrap__373c0__1_N1j"
day_class = "day-of-the-week__373c0__124RF css-1h1j0y3"
time_class = "no-wrap__373c0__2vNX7 css-1h1j0y3"
# Wrapper <span> around the rating <div>. The <div>'s own class changes with
# the rating, so it cannot be used as a selector for every page.
stars_class = "display--inline__373c0__2SfH_ border-color--default__373c0__30oMI"
categories_class = "css-bq71j2"
attributes_outer_div = "arrange__373c0__UHqhV gutter-2__373c0__3Zpeq layout-wrap__373c0__34d4b layout-2-units__373c0__3CiAk border-color--default__373c0__2oFDT"


days = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday', 'Sunday']

# Compiled once instead of on every page.
digits_only = re.compile('[^0-9]')
digits_and_dot = re.compile('[^0-9.]')

# Start at row 68 and stop at the last row of df. A fixed upper bound past
# the end of the frame would fail on the final lookup of df['url'][index].
total_webpages = range(68, len(df))
n = 8  # pages per session
sessions = [total_webpages[i:i+n] for i in range(0, len(total_webpages), n)]

for session_range in tqdm(sessions):
    for index in tqdm(session_range):

        response = requests.get(df['url'][index])
        soup = BeautifulSoup(response.content, 'html.parser')

        # Number of photos: fifth matching span, digits only. Pages without
        # it (or without digits in it) get an empty value.
        try:
            num_photos = soup.findAll('span', class_ = photos_class)[4]
            num_photos = int(digits_only.sub('', num_photos.text))

            df.loc[index, 'photos_count'] = num_photos

        except (ValueError, IndexError):
            df.loc[index, 'photos_count'] = None

        # Stars: aria-label of the first <div> inside the wrapper span,
        # reduced to digits and the decimal point.
        stars = soup.findAll('span', class_ = stars_class)[0]
        stars = stars.findChildren('div')
        stars = stars[0]['aria-label']
        stars = digits_and_dot.sub('', stars)

        df.loc[index, 'stars'] = stars

        # Review count
        review_count = soup.findAll('div', class_ = reviews_class)
        review_count = review_count[0].text
        review_count = int(digits_only.sub('', review_count))

        df.loc[index, 'review_count'] = review_count

        # Claimed vs Unclaimed
        claimed = soup.findAll('div', class_ = claim_class)

        if claimed[0].text.strip() == "Claimed":
            claimed = 1
        else:
            claimed = 0

        df.loc[index, 'claimed'] = claimed

        # Address: concatenate every <p> except the last one of each
        # class-less <address> element.
        address = soup.findAll('address', class_ = "")

        location = ""
        for element in address:

            for p in element.findAll('p')[:-1]:
                location += p.text

        df.loc[index, 'address'] = location

        # Hours: first time cell of every row of the first <table>.
        hours_table = soup.findAll('table')[0]

        rows = hours_table.findChildren('tr')

        times = []
        for row in rows:
            times_text = row.findAll('p', class_ = time_class)
            if times_text != []:
                times.append(times_text[0].text)

        # Pair day and time by position. A nested comprehension would map
        # every day to the last collected time.
        hours = dict(zip(days, times))

        df.loc[index, 'hours'] = str(hours)

        # Categories: text of every link inside the matching spans.
        # (A stray `category.findAll('a')` used to run here before
        # `category` was bound, raising NameError on a fresh kernel.)
        categories_elements = soup.findAll('span', class_ = categories_class)

        categories = []
        for element in categories_elements:
            for category in element.findAll('a'):
                categories.append(category.text)

        categories = ",".join(categories)

        df.loc[index,'categories'] = categories

        # Attributes: every second <span> (starting with the second) inside
        # each attribute container.
        attributes_elements = soup.findAll('div', class_ = attributes_outer_div)

        attributes = []
        for div_element in attributes_elements:

            for span_element in div_element.findAll('span')[1::2]:
                attributes.append(span_element.text)
        attributes = ",".join(attributes)

        df.loc[index,'attributes'] = attributes

        # Saving to dataset after every page so progress survives a crash.
        df.to_csv('yelp_dataset.csv', index = False)
        delay = np.random.randint(40,100)
        time.sleep(delay)

    # Longer pause between sessions.
    time.sleep(np.random.randint(150,1200))



In [None]:
# Re-check the star extraction against the saved copy of page `index`.
with open(f"./webpages/business_page{index}.html", encoding = "utf8") as f:
    soup = BeautifulSoup(f, 'html.parser')

# Stars
# Earlier selectors that targeted the rating <div> directly:
# ratings_class ="i-stars__373c0__1T6rz i-stars--large-4-half__373c0__2lYkD border-color--default__373c0__30oMI overflow--hidden__373c0__2B0kz"
# stars_class = "i-stars__373c0__1T6rz i-stars--large-4-half__373c0__2lYkD border-color--default__373c0__30oMI overflow--hidden__373c0__2B0kz"
stars_class = "display--inline__373c0__2SfH_ border-color--default__373c0__30oMI"
regex = re.compile('[^0-9.]')

# First wrapper span -> its first <div> -> aria-label, digits and dot only.
stars = soup.find_all('span', class_ = stars_class)[0].findChildren('div')[0]['aria-label']
stars = regex.sub('', stars)
stars

In [None]:
# Compare the rating <div> class string containing "large-5" with the one
# containing "large-4-half"; the two literals differ, so this evaluates to False.
"i-stars__373c0__1T6rz i-stars--large-5__373c0__1GcGD border-color--default__373c0__30oMI overflow--hidden__373c0__2B0kz" == "i-stars__373c0__1T6rz i-stars--large-4-half__373c0__2lYkD border-color--default__373c0__30oMI overflow--hidden__373c0__2B0kz"

In [None]:
# Display the current value of index (set by whichever cell assigned it last).
index

In [None]:
# Review count
reviews_class =  "arrange-unit__373c0__1piwO arrange-unit-fill__373c0__17z0h border-color--default__373c0__2oFDT nowrap__373c0__1_N1j"
regex = re.compile('[^0-9]')

# Show the raw text of the first matching <div>. The numeric conversion is
# left disabled while the selector is being checked:
# review_count = review_count[0].text
# review_count = int(regex.sub('', review_count))
review_count = soup.find_all('div', class_ = reviews_class)
review_count[0].text

In [None]:
# Fetch the second business page and collect the candidate "claimed" <div>s.
response = requests.get(df['url'][1])

claim_class = "border-color--default__373c0__2oFDT nowrap__373c0__1_N1j"

soup = BeautifulSoup(response.content, 'html.parser')
claimed = soup.find_all('div', class_ = claim_class)

# Conversion to a 0/1 flag is left disabled while the selector is checked:
# if claimed[0].text.strip() == "Claimed":
#     claimed = 1
# else:
#     claimed = 0



In [None]:
# Display the current value of claimed.
claimed

In [None]:
address_class = "arrange-unit__373c0__1piwO arrange-unit-fill__373c0__17z0h border-color--default__373c0__2oFDT"

response = requests.get(df['url'][1])

soup = BeautifulSoup(response.content, 'html.parser')
address = soup.find_all('address', class_ = "")

# Join the text of every <p> except the last one inside each <address>.
location = "".join(
    p.text
    for element in address
    for p in element.find_all('p')[:-1]
)

In [None]:
# Cost_category

In [None]:
# Categories
categories_class = "css-bq71j2"

# Fetch the third business page and list every span with the category class.
response = requests.get(df['url'][2])
soup = BeautifulSoup(response.content, 'html.parser')

categories = soup.find_all('span', class_ = categories_class)
categories

In [None]:
# Categories
categories_class = "css-bq71j2"

response = requests.get(df['url'][2])

soup = BeautifulSoup(response.content, 'html.parser')
categories_elements = soup.findAll('span', class_ = categories_class)

# Collect the text of every link inside the matching spans.
# (A stray `category.findAll('a')` used to run at the top of the outer loop,
# before `category` was bound, which raises NameError on a fresh kernel.)
categories = []
for element in categories_elements:
    for category in element.findAll('a'):
        categories.append(category.text)

categories = ",".join(categories)

In [None]:
attributes_outer_div = "arrange__373c0__UHqhV gutter-2__373c0__3Zpeq layout-wrap__373c0__34d4b layout-2-units__373c0__3CiAk border-color--default__373c0__2oFDT"
# Alternative inner selector tried earlier:
# inner_div = "arrange__373c0__UHqhV gutter-2__373c0__3Zpeq vertical-align-baseline__373c0__2s3Ze border-color--default__373c0__2oFDT"
inner_div = "arrange-unit__373c0__1piwO border-color--default__373c0__2oFDT"

response = requests.get(df['url'][2])

soup = BeautifulSoup(response.content, 'html.parser')

attributes_elements = soup.find_all('div', class_ = attributes_outer_div)

# Every second <span> (starting with the second) of each container,
# joined with commas.
attributes = ",".join(
    span_element.text
    for div_element in attributes_elements
    for span_element in div_element.find_all('span')[1::2]
)
attributes

In [None]:
    business = 'japanese'
    num_pages = 2

    main_url = f"https://www.yelp.co.uk/search?cflt={business}&find_loc=Berlin%2C%20Germany"

    # First page, then the remaining pages in steps of 10 results with a
    # short random pause after each request.
    responses = [requests.get(main_url)]
    for index in tqdm(range(10, num_pages*10, 10), desc=f"Requesting pages for {business}"):
        responses.append(requests.get(f"{main_url}&start={index}"))
        time.sleep(np.random.randint(1,6))

    webpages = [BeautifulSoup(response.content, 'html.parser') for response in responses]

    # Class to get hrefs
    span_class = "css-1pxmz4g"

    # Needed regex here as strip was leaving some characters Strangely those
    # characters only appeared on appending but not in a direct print
    regex = re.compile('[^a-zA-Z]')
    child_urls = []
    names = []
    for webpage in webpages:
        items = webpage.find_all('span', class_ = span_class)
        print(items)

        for item in items:
            names.append(regex.sub('', item.text))
            anchor = item.find('a')
            # Debug output: show the link of every span that has one.
            if anchor is not None:
                print(anchor)
            # print(item.findAll('a', class_ = "css-166la90"))
            # child_urls.append(f"https://www.yelp.co.uk{item.find('a')['href']}")

In [466]:
day_class = "day-of-the-week__373c0__124RF css-1h1j0y3"
time_class = "no-wrap__373c0__2vNX7 css-1h1j0y3"

days = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']
response = requests.get("https://www.yelp.co.uk/biz/lorenz-adlon-esszimmer-berlin")
soup = BeautifulSoup(response.content, 'html.parser')

# The first <table> on the page is taken as the opening-hours table.
hours_table = soup.findAll('table')[0]

rows = hours_table.findChildren('tr')

# Collect the first time cell of every row that has one.
times = []
for row in rows:
    times_text = row.findAll('p', class_ = time_class)
    if times_text != []:
        times.append(times_text[0].text)

print(times)
# Pair each day with the time in the same position.
hours = dict(zip(days, times))
# The loop variable must not be called `time`: at cell level that rebinds the
# imported `time` module to a string and breaks every later time.sleep call.
for opening_time in times:
    print(opening_time)
print(hours)

['Closed', 'Closed', '7:00 PM - 12:00 AM (Next day)', '7:00 PM - 12:00 AM (Next day)', '7:00 PM - 12:00 AM (Next day)', '7:00 PM - 12:00 AM (Next day)', 'Closed']
Closed
Closed
7:00 PM - 12:00 AM (Next day)
7:00 PM - 12:00 AM (Next day)
7:00 PM - 12:00 AM (Next day)
7:00 PM - 12:00 AM (Next day)
Closed
{'Monday': 'Closed', 'Tuesday': 'Closed', 'Wednesday': '7:00 PM - 12:00 AM (Next day)', 'Thursday': '7:00 PM - 12:00 AM (Next day)', 'Friday': '7:00 PM - 12:00 AM (Next day)', 'Saturday': '7:00 PM - 12:00 AM (Next day)', 'Sunday': 'Closed'}


In [470]:
# Euros
euros_class = "css-1xxismk"

# Price level = length of the stripped text of the first matching span
# (presumably one currency symbol per level; confirm against the page).
euros_category = soup.find_all('span', class_ = euros_class)[0]
euros_category = len(euros_category.text.strip())

4