# Scraping Grocery Website
https://www.brasil-latino.de/

Wallace G. Ferreira - Project: Omdena Berlin - Groceries Recommendation System

# Imports

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup


# Function to extract dynamic data

Created by **Kahled Atef** with minor updates to accomodate driver path

In [2]:
def get_name_rate_reviews(url, driver_path):
    options = Options()
    options.add_argument("--headless=new")
    driver = webdriver.Chrome(executable_path=driver_path, options=options)
    driver.get(url)
    #Product Name
    p = '//*[@id="product-name"]'
    element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, p))
    )
    p_name = element.get_attribute("textContent")
    # Rating
    try:
        p = '//*[@id="product-intro"]/div[1]/a/div/div[2]/span[1]'
        element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, p))
        )
        text = element.get_attribute("textContent")
    except:
        text = None #product doesnt have rating
    #Reviews
    try:
        wait = WebDriverWait(driver, 10)
        p ='//*[@id="trustedshops-productreviews-sticker-wrapper"]/div[1]/div[2]/ul'
        ul = wait.until(EC.presence_of_element_located((By.XPATH, p)))
        inner_html = ul.get_attribute('innerHTML')
        soup = BeautifulSoup(inner_html, 'html.parser')
        li = soup.find_all(class_='ts-review-text')
        li = [item.text for item in li]
        li = list(filter(lambda x: x.strip(), li))
    except:
        li = []
        #Product has no reviews
    driver.quit()
    return p_name ,text,li

In [15]:
url = 'https://www.brasil-latino.de/de/ponto-brasil-latino-pan-america-biere-12-fl-set-cervezas-pan-america-set-regalo-12-botellas'
name , rating , review = get_name_rate_reviews(url, 'C:\\Windows\\chromedriver_win32\\chromedriver.exe')

In [16]:
name

'PONTO BRASIL LATINO Pan America Biere-12 FL Set Cervezas Pan America Set Regalo 12 botellas'

In [17]:
rating

'4.80'

In [18]:
review

['Immer auf der Suche nach unbekannten Bieren zum Probieren, diese Mischung schien (und ist) dafür ideal.',
 'Top! Bin sehr zufrieden. Bier-Sortiment in dieser Form einzigartig',
 'Schnelle Lieferung u tolles Produkt',
 'Sie sind alle lecker. Schade, dass die Flasche klein ist.']

# Collecting 2nd level data for all product pages

In [2]:
TEAM_MEMBER = 'Wallace Ferreira'
STORE_NAME = 'Ponto Brasil & Latino'
STORE_SITE = 'https://www.brasil-latino.de/'

In [3]:
import pandas as pd
df = pd.read_csv('ponto_brasil-latino_1st-level.csv')

In [5]:
df.head()

Unnamed: 0,DATE,TEAM_MEMBER,PROD_CATEGORY,PROD_DESCRIPTION,PROD_PRICE,PROD_LINK,PROD_IMAGE_URL,STORE_NAME,STORE_SITE
0,Apr-22-2023,Wallace Ferreira,Bier,PONTO BRASIL LATINO Pan America Biere-12 FL Se...,"29,80 EUR*",https://www.brasil-latino.de/de/ponto-brasil-l...,https://www.brasil-latino.de/media/images/ewsu...,Ponto Brasil & Latino,https://www.brasil-latino.de/
1,Apr-22-2023,Wallace Ferreira,Bier,PONTO BRASIL LATINO Pan America Biere- 9 FL Se...,"25,50 EUR*",https://www.brasil-latino.de/de/ponto-brasil-l...,https://www.brasil-latino.de/media/images/ewsu...,Ponto Brasil & Latino,https://www.brasil-latino.de/
2,Apr-22-2023,Wallace Ferreira,Bier,PONTO BRASIL LATINO Biere: Welt Lateinamerikas...,"30,15 EUR*",https://www.brasil-latino.de/de/ponto-brasil-l...,https://www.brasil-latino.de/media/images/ewsu...,Ponto Brasil & Latino,https://www.brasil-latino.de/
3,Apr-22-2023,Wallace Ferreira,Bier,"FIESTA DE LOS MUERTOS IPA, 355ml, 6,5% vol. -...","3,20 EUR*",https://www.brasil-latino.de/de/fiesta-de-los-...,https://www.brasil-latino.de/media/images/ewsu...,Ponto Brasil & Latino,https://www.brasil-latino.de/
4,Apr-22-2023,Wallace Ferreira,Bier,"FIESTA DE LOS MUERTOS Porter, 355ml, 5,4% vol...","3,20 EUR*",https://www.brasil-latino.de/de/fiesta-de-los-...,https://www.brasil-latino.de/media/images/ewsu...,Ponto Brasil & Latino,https://www.brasil-latino.de/


In [4]:
DATE = str(df['DATE'].unique()).split('\'')[1]
DATE

'Apr-22-2023'

In [7]:
URLs = list(df['PROD_LINK'].unique())
URLs

['https://www.brasil-latino.de/de/ponto-brasil-latino-pan-america-biere-12-fl-set-cervezas-pan-america-set-regalo-12-botellas',
 'https://www.brasil-latino.de/de/ponto-brasil-latino-pan-america-biere-9-fl-set-cervezas-pan-america-set-regalo-9-botellas',
 'https://www.brasil-latino.de/de/ponto-brasil-latino-biere-welt-lateinamerikas-12-flasche-set-cervezas-de-latinoamrica',
 'https://www.brasil-latino.de/de/fiesta-de-los-muertos-ipa-355ml-6-5-vol-dpg1',
 'https://www.brasil-latino.de/de/fiesta-de-los-muertos-porter-355ml-5-4-vol-dpg1',
 'https://www.brasil-latino.de/de/fiesta-de-los-muertos-amber-ale-355ml-5-5-vol-dpg1',
 'https://www.brasil-latino.de/de/fiesta-de-los-muertos-pale-ale-355ml-5vol-dpg1',
 'https://www.brasil-latino.de/de/cerveza-modelo-especial-4-5-vol-355ml-dpg1',
 'https://www.brasil-latino.de/de/cerveza-negra-modelo-5-3-vol-355ml-dpg1',
 'https://www.brasil-latino.de/de/polar-pilsener-bier-355ml-dpg1',
 'https://www.brasil-latino.de/de/cerveza-pacifico-clara-4-5-vol-32

In [5]:
from tqdm import tqdm

In [9]:
df_2 = pd.DataFrame()
df_log_2 = pd.DataFrame()
for url in tqdm(URLs):
    try:
        PROD_NAME , PROD_RATING , PROD_REVIEWS = get_name_rate_reviews(url, 'C:\\Windows\\chromedriver_win32\\chromedriver.exe')
        df_2 = pd.concat([df_2,pd.DataFrame({'PROD_NAME': PROD_NAME,
                                     'PROD_RATING': PROD_RATING,
                                     'PROD_REVIEWS': str(PROD_REVIEWS),
                                     'TEAM_MEMBER': TEAM_MEMBER,
                                     'STORE_NAME': STORE_NAME,
                                     'STORE_SITE': STORE_SITE,
                                     'DATE': DATE},index=[0])],axis=0)
        df_2.to_csv('ponto_brasil-latino_2nd-level_run.csv',index=False)
    except:
        LOG = 'Fail to scrape: ' + url
        df_log_2 = pd.concat([df_log_2, pd.DataFrame({'LOG': LOG},index=[0])],axis=0)
        df_log_2.to_csv('log_2.csv',index=False)
    
df_2.shape

100%|██████████| 1174/1174 [5:56:32<00:00, 18.22s/it] 


(1172, 7)

In [10]:
df_2.to_csv('ponto_brasil-latino_2nd-level.csv',index=False)

## Failed runs

In [8]:
df_2= pd.read_csv('ponto_brasil-latino_2nd-level.csv')

In [10]:
URLs = ['https://www.brasil-latino.de/de/el-plebeyo-getrocknete-gelbe-kartoffeln-papa-seca-amarilla-500g1',
'https://www.brasil-latino.de/de/la-meridana-rote-habanero-chilisauce-salsa-roja-chile-habanero-150ml']

In [11]:
df_log_2 = pd.DataFrame()
for url in tqdm(URLs):
    try:
        PROD_NAME , PROD_RATING , PROD_REVIEWS = get_name_rate_reviews(url, 'C:\\Windows\\chromedriver_win32\\chromedriver.exe')
        df_2 = pd.concat([df_2,pd.DataFrame({'PROD_NAME': PROD_NAME,
                                     'PROD_RATING': PROD_RATING,
                                     'PROD_REVIEWS': str(PROD_REVIEWS),
                                     'TEAM_MEMBER': TEAM_MEMBER,
                                     'STORE_NAME': STORE_NAME,
                                     'STORE_SITE': STORE_SITE,
                                     'DATE': DATE},index=[0])],axis=0)
        df_2.to_csv('ponto_brasil-latino_2nd-level_run.csv',index=False)
    except:
        LOG = 'Fail to scrape: ' + url
        df_log_2 = pd.concat([df_log_2, pd.DataFrame({'LOG': LOG},index=[0])],axis=0)
        df_log_2.to_csv('log_2.csv',index=False)
    
df_2.shape

100%|██████████| 2/2 [00:00<00:00, 661.56it/s]


(1172, 7)