# Scraping Grocery Website
https://www.brasil-latino.de/

Wallace G. Ferreira - Project: Omdena Berlin - Groceries Recommendation System

# Imports

In [1]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Constant Fields

In [59]:
TEAM_MEMBER = 'Wallace Ferreira'
STORE_NAME = 'Ponto Brasil & Latino'
STORE_SITE = 'https://www.brasil-latino.de/'

In [58]:
from datetime import date
today = date.today()
DATE = today.strftime("%b-%d-%Y")
DATE

'Apr-22-2023'

# Product Category - Bier - Find Scrapping Rules for Fields

In [2]:
url = 'https://www.brasil-latino.de/de/bier'
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36'}
page = requests.get( url, headers=headers )

In [3]:
soup = BeautifulSoup( page.text, 'html.parser' )

In [4]:
print(soup.prettify())

<!DOCTYPE html>
<html lang="de">
 <head>
  <base href="https://www.brasil-latino.de/"/>
  <title>
   Biere aus Lateinamerika online kaufen | Riesen Auswahl
  </title>
  <link href="https://www.brasil-latino.de/de/bier" rel="canonical"/>
  <meta content="index,follow" name="robots"/>
  <meta content="Biere aus Lateinamerika bequem online kaufen | Die größte Auswahl an brasilianischen und lateinamerikanischen Produkten im Internet | Über 100.000 zufriedene Kunden!" name="description"/>
  <meta content="Bier, Cerveca, Lateinamerika, Latino america, biera, beer, latin america, cerveza, cerveja" name="keywords"/>
  <meta content="xt:Commerce 4 - 4.2.00" name="generator"/>
  <meta content="kgg9janz9wxg4n0p95g9oufuww5zdz" name="facebook-domain-verification"/>
  <!-- RESPONSIVE SETUP -->
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <meta content="width=device-width, initial-scale=1, maximum-scale=1, user-scalable=no" name="viewport"/>
  <!-- FILE::jquery.min.js -->
  <script type

In [29]:
PROD_CATEGORY = soup.find('li',{'class': 'active'}).find('a',class_='visible-xs ripple nav-name').get_text()
PROD_CATEGORY

'Bier'

In [6]:
products = soup.find_all('div', class_="image-wrap")

In [7]:
len(products)

25

In [10]:
type(products[0].a)

bs4.element.Tag

In [16]:
a_tag = products[0].find("a", {"class": "vertical-helper image-link"})

In [30]:
PROD_LINK = a_tag["href"]
PROD_LINK

'https://www.brasil-latino.de/de/ponto-brasil-latino-pan-america-biere-12-fl-set-cervezas-pan-america-set-regalo-12-botellas'

In [31]:
PROD_IMAGE_URL = a_tag.find("img")["src"]
PROD_IMAGE_URL

'https://www.brasil-latino.de/media/images/ewsucosthumb/x8153_1-biere-pan-america-12FL-geschenk-set.jpg'

In [32]:
PROD_DESCRIPTION = a_tag.find("img")["alt"]
PROD_DESCRIPTION

'PONTO BRASIL LATINO Pan America Biere-12 FL Set Cervezas Pan America Set Regalo 12 botellas'

In [52]:
PROD_PRICE = soup.find_all('div',class_="panel-body-info text-center")[0].find('p',class_='product-price').get_text()

' 29,80 EUR*'

# Scrapping All Product Listing Pages  - 1st Level 

## List of URLs

In [96]:
URLs = ['https://www.brasil-latino.de/de/bier',
'https://www.brasil-latino.de/de/categorie?cat=198&next_page=1',
'https://www.brasil-latino.de/de/categorie?cat=198&next_page=2',
'https://www.brasil-latino.de/de/categorie?cat=198&next_page=3',
'https://www.brasil-latino.de/de/categorie?cat=198&next_page=4',
'https://www.brasil-latino.de/de/categorie?cat=198&next_page=5',
'https://www.brasil-latino.de/de/categorie?cat=198&next_page=6',
'https://www.brasil-latino.de/de/categorie?cat=198&next_page=7',
'https://www.brasil-latino.de/de/categorie?cat=198&next_page=8',
'https://www.brasil-latino.de/de/categorie?cat=199&next_page=1',
'https://www.brasil-latino.de/de/categorie?cat=199&next_page=2',
'https://www.brasil-latino.de/de/categorie?cat=199&next_page=3',
'https://www.brasil-latino.de/de/categorie?cat=199&next_page=4',
'https://www.brasil-latino.de/de/categorie?cat=199&next_page=5',
'https://www.brasil-latino.de/de/categorie?cat=199&next_page=6',
'https://www.brasil-latino.de/de/categorie?cat=199&next_page=7',
'https://www.brasil-latino.de/de/categorie?cat=199&next_page=6',
'https://www.brasil-latino.de/de/categorie?cat=199&next_page=9',
'https://www.brasil-latino.de/de/categorie?cat=199&next_page=10',
'https://www.brasil-latino.de/de/categorie?cat=199&next_page=11',
'https://www.brasil-latino.de/de/categorie?cat=199&next_page=12',
'https://www.brasil-latino.de/de/categorie?cat=199&next_page=13',
'https://www.brasil-latino.de/de/categorie?cat=199&next_page=14',
'https://www.brasil-latino.de/de/categorie?cat=199&next_page=15',
'https://www.brasil-latino.de/de/categorie?cat=199&next_page=16',
'https://www.brasil-latino.de/de/categorie?cat=199&next_page=17',
'https://www.brasil-latino.de/de/categorie?cat=200&next_page=1',
'https://www.brasil-latino.de/de/categorie?cat=200&next_page=2',
'https://www.brasil-latino.de/de/yerba-mate-kaffee-tee?cat=201&next_page=1',
'https://www.brasil-latino.de/de/yerba-mate-kaffee-tee?cat=201&next_page=2',
'https://www.brasil-latino.de/de/yerba-mate-kaffee-tee?cat=201&next_page=3',
'https://www.brasil-latino.de/de/wein-sekt',
'https://www.brasil-latino.de/de/kosmetik',
'https://www.brasil-latino.de/de/non-food?cat=203&next_page=1',
'https://www.brasil-latino.de/de/non-food?cat=203&next_page=2',
'https://www.brasil-latino.de/de/geschenke?cat=212&next_page=1',
'https://www.brasil-latino.de/de/geschenke?cat=212&next_page=2',
'https://www.brasil-latino.de/de/geschenke?cat=212&next_page=3',
'https://www.brasil-latino.de/de/sparpacks',
'https://www.brasil-latino.de/de/gesunde-ernaehrung?cat=326&next_page=1',
'https://www.brasil-latino.de/de/gesunde-ernaehrung?cat=326&next_page=2',
'https://www.brasil-latino.de/de/gesunde-ernaehrung?cat=326&next_page=3',
'https://www.brasil-latino.de/de/tiefkuehlprodukte1?cat=323&next_page=1',
'https://www.brasil-latino.de/de/tiefkuehlprodukte1?cat=323&next_page=2',
'https://www.brasil-latino.de/de/dia-de-los-muertos',
'https://www.brasil-latino.de/de/brasilien?cat=196&next_page=1',
'https://www.brasil-latino.de/de/brasilien?cat=196&next_page=2',
'https://www.brasil-latino.de/de/brasilien?cat=196&next_page=3',
'https://www.brasil-latino.de/de/brasilien?cat=196&next_page=4',
'https://www.brasil-latino.de/de/brasilien?cat=196&next_page=5',
'https://www.brasil-latino.de/de/brasilien?cat=196&next_page=6',
'https://www.brasil-latino.de/de/brasilien?cat=196&next_page=7',
'https://www.brasil-latino.de/de/brasilien?cat=196&next_page=8',
'https://www.brasil-latino.de/de/brasilien?cat=196&next_page=9',
'https://www.brasil-latino.de/de/brasilien?cat=196&next_page=10',
'https://www.brasil-latino.de/de/brasilien?cat=196&next_page=11',
'https://www.brasil-latino.de/de/brasilien?cat=196&next_page=12',
'https://www.brasil-latino.de/de/dominikanische-republik?cat=204&next_page=1',
'https://www.brasil-latino.de/de/dominikanische-republik?cat=204&next_page=2',
'https://www.brasil-latino.de/de/dominikanische-republik?cat=204&next_page=3',
'https://www.brasil-latino.de/de/dominikanische-republik?cat=204&next_page=4',
'https://www.brasil-latino.de/de/argentinien?cat=205&next_page=1',
'https://www.brasil-latino.de/de/argentinien?cat=205&next_page=2',
'https://www.brasil-latino.de/de/argentinien?cat=205&next_page=3',
'https://www.brasil-latino.de/de/argentinien?cat=205&next_page=4',
'https://www.brasil-latino.de/de/kolumbien?cat=206&next_page=1',
'https://www.brasil-latino.de/de/kolumbien?cat=206&next_page=2',
'https://www.brasil-latino.de/de/kolumbien?cat=206&next_page=3',
'https://www.brasil-latino.de/de/kuba?cat=207&next_page=1',
'https://www.brasil-latino.de/de/kuba?cat=207&next_page=2',
'https://www.brasil-latino.de/de/venezuela-bolivien',
'https://www.brasil-latino.de/de/peru?cat=209&next_page=1',
'https://www.brasil-latino.de/de/peru?cat=209&next_page=2',
'https://www.brasil-latino.de/de/mexiko?cat=210&next_page=1',
'https://www.brasil-latino.de/de/mexiko?cat=210&next_page=2',
'https://www.brasil-latino.de/de/mexiko?cat=210&next_page=3',
'https://www.brasil-latino.de/de/mexiko?cat=210&next_page=4',
'https://www.brasil-latino.de/de/mexiko?cat=210&next_page=5',
'https://www.brasil-latino.de/de/mexiko?cat=210&next_page=6',
'https://www.brasil-latino.de/de/sonstige-laender?cat=211&next_page=1',
'https://www.brasil-latino.de/de/sonstige-laender?cat=211&next_page=2',
'https://www.brasil-latino.de/de/gutscheine']

## Loop for all URLs

In [97]:
#URLs = URLs[0:3]
#URLs


In [103]:
df = pd.DataFrame()
df_log = pd.DataFrame()
for url in URLs:
  headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36'}
  page = requests.get( url, headers=headers )
  soup = BeautifulSoup( page.text, 'html.parser')

  PROD_CATEGORY = soup.find('li',{'class': 'active'}).find('a',class_='visible-xs ripple nav-name').get_text()

  products = soup.find_all('div', class_="image-wrap")
  num_prod = len(products)
  for i in range(num_prod):
    a_tag = products[i].find("a", {"class": "vertical-helper image-link"})
    if (a_tag):
      PROD_DESCRIPTION = a_tag.find("img")["alt"]
      PROD_LINK = a_tag["href"]
      PROD_IMAGE_URL = a_tag.find("img")["src"]
      PROD_PRICE = soup.find_all('div',class_="panel-body-info text-center")[i].find('p',class_='product-price').get_text()
      df = pd.concat([df,pd.DataFrame({'DATE': DATE, 
                                     'TEAM_MEMBER': TEAM_MEMBER,
                                     'PROD_CATEGORY': PROD_CATEGORY,
                                     'PROD_DESCRIPTION': PROD_DESCRIPTION,
                                     'PROD_PRICE': PROD_PRICE,
                                     'PROD_LINK': PROD_LINK,
                                     'PROD_IMAGE_URL': PROD_IMAGE_URL
                                     },index=[0])],axis=0)
      df.to_csv('ponto_brasil-latino_1st-level_run.csv',index=False)
    else:
      LOG = 'Fail to scrape: ' + url
      df_log = pd.concat([df_log, pd.DataFrame({'LOG': LOG},index=[0])],axis=0)

df.shape

(2649, 7)

In [104]:
df_log.shape

(3, 1)

In [106]:
df_log

Unnamed: 0,LOG
0,Fail to scrape: https://www.brasil-latino.de/d...
0,Fail to scrape: https://www.brasil-latino.de/d...
0,Fail to scrape: https://www.brasil-latino.de/d...


In [100]:
df.head()

Unnamed: 0,DATE,TEAM_MEMBER,PROD_CATEGORY,PROD_DESCRIPTION,PROD_PRICE,PROD_LINK,PROD_IMAGE_URL
0,Apr-22-2023,Wallace Ferreira,Bier,PONTO BRASIL LATINO Pan America Biere-12 FL Se...,"29,80 EUR*",https://www.brasil-latino.de/de/ponto-brasil-l...,https://www.brasil-latino.de/media/images/ewsu...
0,Apr-22-2023,Wallace Ferreira,Bier,PONTO BRASIL LATINO Pan America Biere- 9 FL Se...,"25,50 EUR*",https://www.brasil-latino.de/de/ponto-brasil-l...,https://www.brasil-latino.de/media/images/ewsu...
0,Apr-22-2023,Wallace Ferreira,Bier,PONTO BRASIL LATINO Biere: Welt Lateinamerikas...,"30,15 EUR*",https://www.brasil-latino.de/de/ponto-brasil-l...,https://www.brasil-latino.de/media/images/ewsu...
0,Apr-22-2023,Wallace Ferreira,Bier,"FIESTA DE LOS MUERTOS IPA, 355ml, 6,5% vol. -...","3,20 EUR*",https://www.brasil-latino.de/de/fiesta-de-los-...,https://www.brasil-latino.de/media/images/ewsu...
0,Apr-22-2023,Wallace Ferreira,Bier,"FIESTA DE LOS MUERTOS Porter, 355ml, 5,4% vol...","3,20 EUR*",https://www.brasil-latino.de/de/fiesta-de-los-...,https://www.brasil-latino.de/media/images/ewsu...


In [101]:
df.tail()

Unnamed: 0,DATE,TEAM_MEMBER,PROD_CATEGORY,PROD_DESCRIPTION,PROD_PRICE,PROD_LINK,PROD_IMAGE_URL
0,Apr-22-2023,Wallace Ferreira,Lebensmittel,MADE IN MARKET Mini Hähnchen & Käse Krokette -...,"13,10 EUR*",https://www.brasil-latino.de/de/made-in-market...,https://www.brasil-latino.de/media/images/ewsu...
0,Apr-22-2023,Wallace Ferreira,Lebensmittel,NESCAU Kakaohaltiges Instant-Getränkepulver - ...,"3,95 EUR*",https://www.brasil-latino.de/de/nescau-kakaoha...,https://www.brasil-latino.de/media/images/ewsu...
0,Apr-22-2023,Wallace Ferreira,Lebensmittel,FRIGOSTO - Stockfisch Kroketten - Bolinhos de ...,"8,97 EUR*",https://www.brasil-latino.de/de/frigosto-stock...,https://www.brasil-latino.de/media/images/ewsu...
0,Apr-22-2023,Wallace Ferreira,Lebensmittel,"Massa de Pastel de Feira FERRETH, Rolo 750g (...","11,95 EUR*",https://www.brasil-latino.de/de/massa-de-paste...,https://www.brasil-latino.de/media/images/ewsu...
0,Apr-22-2023,Wallace Ferreira,Lebensmittel,VILLA D AGRI TK-Blätterteig für Empanadas zum ...,"5,65 EUR*",https://www.brasil-latino.de/de/villa-d-agri-t...,https://www.brasil-latino.de/media/images/ewsu...


In [108]:
df.to_csv('ponto_brasil-latino_1st-level.csv',index=False)

In [120]:
df_log

Unnamed: 0,LOG
0,Fail to scrape: https://www.brasil-latino.de/d...
0,Fail to scrape: https://www.brasil-latino.de/d...
0,Fail to scrape: https://www.brasil-latino.de/d...


## Scraping failed pages manually

In [126]:
URLs = ['https://www.brasil-latino.de/de/argentinien?cat=205&next_page=3']

df_log = pd.DataFrame()
for url in URLs:
  headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36'}
  page = requests.get( url, headers=headers )
  soup = BeautifulSoup( page.text, 'html.parser')

  PROD_CATEGORY = soup.find('li',{'class': 'active'}).find('a',class_='visible-xs ripple nav-name').get_text()

  products = soup.find_all('div', class_="image-wrap")
  num_prod = len(products)
  for i in range(num_prod):
    a_tag = products[i].find("a", {"class": "vertical-helper image-link"})
    if (a_tag):
      PROD_DESCRIPTION = a_tag.find("img")["alt"]
      PROD_LINK = a_tag["href"]
      PROD_IMAGE_URL = a_tag.find("img")["src"]
      PROD_PRICE = soup.find_all('div',class_="panel-body-info text-center")[i].find('p',class_='product-price').get_text()
      df = pd.concat([df,pd.DataFrame({'DATE': DATE, 
                                     'TEAM_MEMBER': TEAM_MEMBER,
                                     'PROD_CATEGORY': PROD_CATEGORY,
                                     'PROD_DESCRIPTION': PROD_DESCRIPTION,
                                     'PROD_PRICE': PROD_PRICE,
                                     'PROD_LINK': PROD_LINK,
                                     'PROD_IMAGE_URL': PROD_IMAGE_URL
                                     },index=[0])],axis=0)
      df.to_csv('ponto_brasil-latino_1st-level_run.csv',index=False)
    else:
      LOG = 'Fail to scrape: ' + url
      df_log = pd.concat([df_log, pd.DataFrame({'LOG': LOG},index=[0])],axis=0)

df.shape

(2961, 7)

In [122]:
df_log

Unnamed: 0,LOG
0,Fail to scrape: https://www.brasil-latino.de/d...


In [127]:
df.tail()

Unnamed: 0,DATE,TEAM_MEMBER,PROD_CATEGORY,PROD_DESCRIPTION,PROD_PRICE,PROD_LINK,PROD_IMAGE_URL
0,Apr-22-2023,Wallace Ferreira,Argentinien,"BLACK RANCH Hüftdeckel - Picanha (TK-Produkt),...","99,00 EUR*",https://www.brasil-latino.de/de/black-ranch-hu...,https://www.brasil-latino.de/media/images/ewsu...
0,Apr-22-2023,Wallace Ferreira,Argentinien,"BLACK RANCH Hüftdeckel - Picanha (TK-Produkt),...","49,50 EUR*",https://www.brasil-latino.de/de/black-ranch-hu...,https://www.brasil-latino.de/media/images/ewsu...
0,Apr-22-2023,Wallace Ferreira,Argentinien,"BLACK RANCH Hüftdeckel - Picanha (TK-Produkt),...","54,00 EUR*",https://www.brasil-latino.de/de/black-ranch-hu...,https://www.brasil-latino.de/media/images/ewsu...
0,Apr-22-2023,Wallace Ferreira,Argentinien,"BLACK RANCH Hüftdeckel - Picanha (TK-Produkt),...","58,50 EUR*",https://www.brasil-latino.de/de/black-ranch-hu...,https://www.brasil-latino.de/media/images/ewsu...
0,Apr-22-2023,Wallace Ferreira,Argentinien,"BLACK RANCH Hüftdeckel - Picanha (TK-Produkt),...","63,00 EUR*",https://www.brasil-latino.de/de/black-ranch-hu...,https://www.brasil-latino.de/media/images/ewsu...


In [128]:
df = df.drop_duplicates()

In [129]:
df.shape

(2610, 7)

In [130]:
df.to_csv('ponto_brasil-latino_1st-level.csv',index=False)

In [131]:
df.head()

Unnamed: 0,DATE,TEAM_MEMBER,PROD_CATEGORY,PROD_DESCRIPTION,PROD_PRICE,PROD_LINK,PROD_IMAGE_URL
0,Apr-22-2023,Wallace Ferreira,Bier,PONTO BRASIL LATINO Pan America Biere-12 FL Se...,"29,80 EUR*",https://www.brasil-latino.de/de/ponto-brasil-l...,https://www.brasil-latino.de/media/images/ewsu...
0,Apr-22-2023,Wallace Ferreira,Bier,PONTO BRASIL LATINO Pan America Biere- 9 FL Se...,"25,50 EUR*",https://www.brasil-latino.de/de/ponto-brasil-l...,https://www.brasil-latino.de/media/images/ewsu...
0,Apr-22-2023,Wallace Ferreira,Bier,PONTO BRASIL LATINO Biere: Welt Lateinamerikas...,"30,15 EUR*",https://www.brasil-latino.de/de/ponto-brasil-l...,https://www.brasil-latino.de/media/images/ewsu...
0,Apr-22-2023,Wallace Ferreira,Bier,"FIESTA DE LOS MUERTOS IPA, 355ml, 6,5% vol. -...","3,20 EUR*",https://www.brasil-latino.de/de/fiesta-de-los-...,https://www.brasil-latino.de/media/images/ewsu...
0,Apr-22-2023,Wallace Ferreira,Bier,"FIESTA DE LOS MUERTOS Porter, 355ml, 5,4% vol...","3,20 EUR*",https://www.brasil-latino.de/de/fiesta-de-los-...,https://www.brasil-latino.de/media/images/ewsu...


In [132]:
df['STORE_NAME'] = STORE_NAME
df['STORE_SITE'] = STORE_SITE

In [133]:
df.to_csv('ponto_brasil-latino_1st-level.csv',index=False)

In [134]:
df.columns

Index(['DATE', 'TEAM_MEMBER', 'PROD_CATEGORY', 'PROD_DESCRIPTION',
       'PROD_PRICE', 'PROD_LINK', 'PROD_IMAGE_URL', 'STORE_NAME',
       'STORE_SITE'],
      dtype='object')