# Scalable Web Scraping Pipeline

## Imports

In [321]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from pandas import DataFrame as DataFrame
import urllib.parse
import re

## Opens the link with the tables -> PAGE 1

In [322]:
# Search-results page 1 for the "winery supplies" query on Europages.
url = "https://www.europages.co.uk/en/search?isPserpFirst=1&q=winery+supplies"
# Bot name and contact address. They are written as one string because adjacent
# string literals are concatenated with no separator between them.
headers = {"User-Agent": "KoupBot/0.0 (Contact: georgekoupni@gmail.com)"}
# The timeout keeps the cell from hanging forever on an unresponsive server.
page = requests.get(url, headers=headers, timeout=10)
# Stop here on an HTTP error instead of silently parsing an error page.
page.raise_for_status()
# Name the parser explicitly so the parse tree does not depend on which
# optional parsers happen to be installed.
soup = BeautifulSoup(page.text, 'html.parser')

## Prints the prettify soup

In [323]:
# print(soup.prettify())

## Finds all the tags except for the last one that is not necessary -> PAGE 1

In [324]:
# All <h2> headings on page 1; kept in h2_tags for the later CSV cells.
h2_tags = soup.find_all('h2')
# The last heading is skipped (the notebook notes it is not needed).
company_headings = h2_tags[:-1]
for heading in company_headings:
    print(heading.text)

BARRICASDEMADERA S.L.
MA TAPAS TRADING
PRINTMASTA
Sélection Patrick Clerget
ETICA CRITICA
TECI & COM
CAÑIZOS ALPE SL
ASCO CARBON DIOXIDE LTD
CANARY ISLAND WORLDWIDE S.L.
Bodega Les Freses
TRANSFLUID
PROINNOVA CONSULTORÍA TÉCNICA
Agricole Pietraventosa di Annio Marianna
NURIMAR
Green Life Revolution sl
BUCHER VASLIN
R.G. MANIFATTURE S.R.L.
Sustainable Grove FlexCo
API PLASTIQUES
Bodegas Vega Norte
CONCEPT-S LADENBAU & OBJEKTDESIGN GMBH
LIQUOSYSTEMS GMBH
FERD. BARDENHEWER GMBH & CO. KG
ARTEBOTTI  SRLS
HAUSSMANN FAMILLE
Barnacork
IBEHI SUMINISTROS,S.L.
VINS CAN MAS SL
GOJUTE FRANCE
ADEGA COOPERATIVA DE ALMEIRIM, CRL.


## Finds the .html link of each company -> PAGE 1

In [325]:
# One <div> with this exact class string is looked up per search result.
company_tiles = soup.find_all('div', class_='flex flex-col shadow-100 rounded bg-white company-tile')

# Look up the data-test="company-name" anchor of every tile, then print its href.
name_anchors = [card.find('a', attrs={'data-test': 'company-name'}) for card in company_tiles]
for company_link in name_anchors:
    if not company_link:
        continue  # this tile has no company-name anchor
    print("Found the URL:", company_link['href'])

Found the URL: /BARRICASDEMADERA-SL/00000004977382-548775001.html
Found the URL: /en/company/ma-tapas-trading-22323622
Found the URL: /en/company/printmasta-20041825
Found the URL: /SELECTION-CLERGET-PATRICK/FRA372319-00101.html
Found the URL: /en/company/etica-critica-22304935
Found the URL: /TECI-COM/00000005327374-624251001.html
Found the URL: /CANIZOS-ALPE-SL/00000005469141-842256001.html
Found the URL: /ASCO-CARBON-DIOXIDE-LTD/CHE051501-001.html
Found the URL: /CANARY-ISLAND-WORLDWIDE-SL/00000005425544-763896001.html
Found the URL: /en/company/bodega-les-freses-22317857
Found the URL: /TRANSFLUID/FRA318315-00101.html
Found the URL: /en/company/proinnova-consultoria-tecnica-22316274
Found the URL: /en/company/agricole-pietraventosa-di-annio-marianna-22351109
Found the URL: /NURIMAR/ESP194408-000005860001.html
Found the URL: /en/company/green-life-revolution-sl-22282039
Found the URL: /BUCHER-VASLIN/FRA008441-000018053001.html
Found the URL: /RG-MANIFATTURE-SRL/00000004696894-495231

## Finds the country -> PAGE 1

In [326]:
# Every <div class="truncate"> on page 1; the captured output shows their text
# as "Country,City" strings.
country_tags1 = soup.find_all('div', class_='truncate')

# Whitespace-stripped text of each tag, in page order.
countriespage1 = [entry.get_text(strip=True) for entry in country_tags1]

for country in countriespage1:
    print(country)

Spain,Esquivias
Netherlands,Sint-Oedenrode
Poland,Warsaw
France,Beaune Cedex
Spain,Chella VALENCIA
France,Sainte-Sigolène
Spain,Alcoletge
Switzerland,Wittenbach
Spain,Candelaria
Spain,Jesus Pobre - Denia,
France,Ste Hélène Du Lac
Spain,Jumilla
Italy,Gioia del Colle
Spain,Montichelvo-Valencia
Spain,iznalloz
France,Chalonnes Sur Loire
Italy,Galatina
Austria,Wien
France,BRENELLE
Spain,Tijarafe
Germany,Schorndorf
Germany,Kirchheim Am Neckar
Germany,Kiel
Italy,Ispica
France,St Andre De Cubzac
Spain,BARCELONA
Spain,Tomelloso
Spain,Girona
France,Gujan-Mestras
Portugal,Almeirim


## Makes a .csv -> page vs html links -> PAGE 1

In [327]:
# Company names: the text of every <h2> on page 1 except the last one.
names = [heading.text for heading in h2_tags[:-1]]

# Relative profile links: the href of the company-name anchor in each tile that has one.
company_tiles = soup.find_all('div', class_='flex flex-col shadow-100 rounded bg-white company-tile')
anchors = (card.find('a', attrs={'data-test': 'company-name'}) for card in company_tiles)
links = [anchor['href'] for anchor in anchors if anchor]

# Names and links are paired purely by position, so both lists are cut to the
# shorter length when their sizes differ.
if len(names) != len(links):
    print("Warning: Mismatch between number of names and links found.")
    min_len = min(len(names), len(links))
    names, links = names[:min_len], links[:min_len]

df = pd.DataFrame({'NAME': names, 'LINKS': links})

# The sector label is the search URL's q parameter with spaces replaced by underscores.
query_values = urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
sector = query_values.get('q', ['default_sector'])[0].replace(' ', '_')

df.to_csv(f'links_{sector}.csv', index=False)
print(df)

                                        NAME  \
0                      BARRICASDEMADERA S.L.   
1                           MA TAPAS TRADING   
2                                 PRINTMASTA   
3                  Sélection Patrick Clerget   
4                              ETICA CRITICA   
5                                 TECI & COM   
6                            CAÑIZOS ALPE SL   
7                    ASCO CARBON DIOXIDE LTD   
8               CANARY ISLAND WORLDWIDE S.L.   
9                          Bodega Les Freses   
10                                TRANSFLUID   
11             PROINNOVA CONSULTORÍA TÉCNICA   
12  Agricole Pietraventosa di Annio Marianna   
13                                   NURIMAR   
14                  Green Life Revolution sl   
15                             BUCHER VASLIN   
16                   R.G. MANIFATTURE S.R.L.   
17                  Sustainable Grove FlexCo   
18                            API PLASTIQUES   
19                        Bodegas Vega N

## Opens the link with the tables -> PAGE 2

In [328]:
# Search-results page 2 for the same query.
url2 = "https://www.europages.co.uk/en/search/page/2?isPserpFirst=1&q=winery%20supplies"
# Send the same identifying headers as the page 1 request (the `headers` dict
# defined in that cell) and bound the wait with a timeout.
page2 = requests.get(url2, headers=headers, timeout=10)
# Stop here on an HTTP error instead of silently parsing an error page.
page2.raise_for_status()
# Name the parser explicitly so the result does not depend on installed parsers.
soup2 = BeautifulSoup(page2.text, 'html.parser')

## Finds all the tags except for the last one that is not necessary -> PAGE 2

In [329]:
# All <h2> headings on page 2; kept in h2_tags2 for the later CSV cells.
h2_tags2 = soup2.find_all('h2')
# The last heading is skipped (the notebook notes it is not needed).
company_headings2 = h2_tags2[:-1]
for heading2 in company_headings2:
    print(heading2.text)

G. WEIN GMBH & CO.KG
ACRI WINE SERVICE
ELECTRON INDUSTRIAL MACHINES
VOLUTEX
INDUSTRIEBEDARF CASTAN GMBH
KELLEREIARTIKEL BEISTEINER INH. THOMAS BEISTEINER
JSA GROUP / LANZAROTE UNIVERSAL TRADE S.L.
MONTIONI
MIROS GROUP
CANTINA SOCIALE LUCA GENTILE SOC. COOP.
CPS CONVERTING PACKAGING & SERVICE
CASALE MATTIA
PARRAMON EXPORTAP S.L.
INOXSA SRL
TARDITO S.A.S. DI TARDITO LUCA E C.
FERDINAND PRÜCKLER
BODEGAS VINIVAL SL
MH WINES
RASIN IT (INTERNATIONAL TRADING) GMBH
DIJSO
CAVALLI GIUSEPPE & VINCENZO S.N.C.
LEVINI HANDELSGESELLSCHAFT MBH
FORATIA EUROPE OU
RTB SRL
C.O.A. DI ELIO RAPINO
ESQUIO INGENIERÍA
PUBLICA
MARSILINOX INDUSTRIA METALURGICA LDA.
WIJNHANDEL ANNICAERT
Europagricultproduct  (EAP)


## Finds the country -> PAGE 2

In [330]:
# Every <div class="truncate"> on page 2.
country_tags2 = soup2.find_all('div', class_='truncate')

# Whitespace-stripped text of each tag, in page order.
countriespage2 = [entry2.get_text(strip=True) for entry2 in country_tags2]

for country2 in countriespage2:
    print(country2)

Germany,Bönnigheim
Italy,Gussago
Italy,Foiano Della Chiana
France,Ozoir-La-Ferrière
Germany,Freiberg
Austria,Neckenmarkt
Spain,Socuéllamos, Ciudad Real
Italy,Montefalco
Italy,Fano
Italy,Cassano Delle Murge
Italy,Brindisi
Italy,Frascati
Spain,Cassà de la Selva
Italy,San Pancrazio Salentino
Italy,San Marzano Oliveto
Austria,Langenlois
Spain,Chiva
Netherlands,Amsterdam
Austria,Vienna
Netherlands,Andelst
Italy,Bagnara Di Romagna
Germany,Weinstadt
Finland,Turku
Italy,Gallo D'alba
Italy,Ortona
Spain,Vigo (pontevedra)
Belgium,Dessel
Portugal,Vale De Cambra
Belgium,Sint-Andries Brugge
Spain,VALENCIA


## Finds the .html link of each company -> PAGE 2

In [331]:
# Result tiles of page 2, matched on the exact class string.
company_tiles2 = soup2.find_all('div', class_='flex flex-col shadow-100 rounded bg-white company-tile')

# Look up the data-test="company-name" anchor of every tile, then print its href.
name_anchors2 = [card2.find('a', attrs={'data-test': 'company-name'}) for card2 in company_tiles2]
for company_link2 in name_anchors2:
    if not company_link2:
        continue  # this tile has no company-name anchor
    print("Found the URL:", company_link2['href'])

Found the URL: /G-WEIN-GMBH-COKG/00000005312746-001.html
Found the URL: /ACRI-WINE-SERVICE/00000005388822-707741001.html
Found the URL: /ELECTRON-INDUSTRIAL-MACHINES/00000005414028-751813001.html
Found the URL: /VOLUTEX/00000005291526-001.html
Found the URL: /INDUSTRIEBEDARF-CASTAN-GMBH/DEU056533-001.html
Found the URL: /KELLEREIARTIKEL-BEISTEINER-INH-THOMAS-BEISTEINER/00000005477076-001.html
Found the URL: /JSA-GROUP-LANZAROTE-UNIVERSAL-TRADE-SL/00000004754689-540110001.html
Found the URL: /MONTIONI/00000004504897-316973001.html
Found the URL: /MIROS-GROUP/SEAC001707068-001.html
Found the URL: /CANTINA-SOCIALE-LUCA-GENTILE-SOC-COOP/SEAC000066200-001.html
Found the URL: /CPS-CONVERTING-PACKAGING-SERVICE/00000004487150-313221001.html
Found the URL: /CASALE-MATTIA/00000005413296-750243001.html
Found the URL: /en/company/parramon-exportap-sl-22283245
Found the URL: /INOXSA-SRL/00000005364643-664330001.html
Found the URL: /TARDITO-SAS-DI-TARDITO-LUCA-E-C/SEAC005441106-001.html
Found the UR

## Makes a .csv -> page vs html links -> PAGE 2


In [332]:
# Company names for page 2: the text of every <h2> except the last one.
names2 = [h2tag2.text for h2tag2 in h2_tags2[:-1]]

# Relative profile links, one per result tile that has a company-name anchor.
links2 = []
company_tiles2 = soup2.find_all('div', class_='flex flex-col shadow-100 rounded bg-white company-tile')
for tile2 in company_tiles2:
    company_link2 = tile2.find('a', attrs={'data-test': 'company-name'})
    if company_link2:
        links2.append(company_link2['href'])

# Names and links are paired by position only, so cut both to the shorter
# length when the counts differ.
if len(names2) != len(links2):
    print("Warning: Mismatch between number of names and links found.")

    min_len2 = min(len(names2), len(links2))
    names2 = names2[:min_len2]
    links2 = links2[:min_len2]

df2 = pd.DataFrame({
    'NAME': names2,
    'LINKS': links2,
})

# The sector label is the search URL's q parameter with spaces replaced by underscores.
query_params2 = urllib.parse.urlparse(url2).query
parsed_query2 = urllib.parse.parse_qs(query_params2)
sector2 = parsed_query2.get('q', ['default_sector'])[0].replace(' ', '_')

# Page 1 and page 2 use the same query, so the sector label is identical for
# both. The page number is part of the file name so that this cell does not
# overwrite the CSV written for page 1.
df2.to_csv(f'links_{sector2}_page_2.csv', index=False)
print(df2)

                                                 NAME  \
0                                G. WEIN GMBH & CO.KG   
1                                   ACRI WINE SERVICE   
2                        ELECTRON INDUSTRIAL MACHINES   
3                                             VOLUTEX   
4                         INDUSTRIEBEDARF CASTAN GMBH   
5   KELLEREIARTIKEL BEISTEINER INH. THOMAS BEISTEINER   
6          JSA GROUP / LANZAROTE UNIVERSAL TRADE S.L.   
7                                            MONTIONI   
8                                         MIROS GROUP   
9             CANTINA SOCIALE LUCA GENTILE SOC. COOP.   
10                 CPS CONVERTING PACKAGING & SERVICE   
11                                      CASALE MATTIA   
12                             PARRAMON EXPORTAP S.L.   
13                                         INOXSA SRL   
14                TARDITO S.A.S. DI TARDITO LUCA E C.   
15                                 FERDINAND PRÜCKLER   
16                             

## RIGHT ORDER -> PAGE 1 = PAGE 1

In [333]:
# Company names: the text of every <h2> on page 1 except the last one.
names = [title_tag.text for title_tag in h2_tags[:-1]]

# Relative profile links taken from the company-name anchor of each result tile.
company_tiles = soup.find_all('div', class_='flex flex-col shadow-100 rounded bg-white company-tile')
links = [
    name_anchor['href']
    for name_anchor in (box.find('a', attrs={'data-test': 'company-name'}) for box in company_tiles)
    if name_anchor
]

# Pairing is positional, so both lists are trimmed to the shorter one on a mismatch.
name_count, link_count = len(names), len(links)
if name_count != link_count:
    print("Warning: Mismatch between number of names and links found.")
    min_len = min(name_count, link_count)
    names = names[:min_len]
    links = links[:min_len]

df = pd.DataFrame({'NAME': names, 'LINKS': links})

# Sector label: the q parameter of the search URL, spaces replaced by underscores.
q_values = urllib.parse.parse_qs(urllib.parse.urlparse(url).query).get('q', ['default_sector'])
sector = q_values[0].replace(' ', '_')

# The page number in the file name keeps each page's CSV separate.
df.to_csv(f'links_{sector}_page_1.csv', index=False)
print(df)

                                        NAME  \
0                      BARRICASDEMADERA S.L.   
1                           MA TAPAS TRADING   
2                                 PRINTMASTA   
3                  Sélection Patrick Clerget   
4                              ETICA CRITICA   
5                                 TECI & COM   
6                            CAÑIZOS ALPE SL   
7                    ASCO CARBON DIOXIDE LTD   
8               CANARY ISLAND WORLDWIDE S.L.   
9                          Bodega Les Freses   
10                                TRANSFLUID   
11             PROINNOVA CONSULTORÍA TÉCNICA   
12  Agricole Pietraventosa di Annio Marianna   
13                                   NURIMAR   
14                  Green Life Revolution sl   
15                             BUCHER VASLIN   
16                   R.G. MANIFATTURE S.R.L.   
17                  Sustainable Grove FlexCo   
18                            API PLASTIQUES   
19                        Bodegas Vega N

## RIGHT ORDER -> PAGE 2 = PAGE 2

In [334]:
def _links_table(headings, page_soup, page_url, page_number):
    """Build, save and print the NAME/LINKS table for one search-results page.

    Parameters
    ----------
    headings : list of <h2> tags found on the page; the last one is ignored.
    page_soup : parsed search-results page the company tiles are read from.
    page_url : URL of that page; its q parameter gives the sector label.
    page_number : number used in the CSV file name.

    Returns
    -------
    (names, links, tiles, frame, sector) so the caller can keep the same
    notebook-level variables the copy-pasted version of this cell produced.
    """
    page_names = [heading.text for heading in headings[:-1]]

    # One href per tile that has a data-test="company-name" anchor.
    tiles = page_soup.find_all('div', class_='flex flex-col shadow-100 rounded bg-white company-tile')
    page_links = []
    for tile in tiles:
        anchor = tile.find('a', attrs={'data-test': 'company-name'})
        if anchor:
            page_links.append(anchor['href'])

    # Names and links are paired by position only; trim to the shorter list.
    if len(page_names) != len(page_links):
        print("Warning: Mismatch between number of names and links found.")
        shortest = min(len(page_names), len(page_links))
        page_names = page_names[:shortest]
        page_links = page_links[:shortest]

    frame = pd.DataFrame({
        'NAME': page_names,
        'LINKS': page_links,
    })

    # Sector label: the q parameter of the search URL, spaces replaced by underscores.
    query = urllib.parse.parse_qs(urllib.parse.urlparse(page_url).query)
    page_sector = query.get('q', ['default_sector'])[0].replace(' ', '_')

    frame.to_csv(f'links_{page_sector}_page_{page_number}.csv', index=False)
    print(frame)
    return page_names, page_links, tiles, frame, page_sector


# Page 1 -> links_<sector>_page_1.csv
names, links, company_tiles, df, sector = _links_table(h2_tags, soup, url, 1)

# Page 2 -> links_<sector>_page_2.csv
names2, links2, company_tiles2, df2, sector2 = _links_table(h2_tags2, soup2, url2, 2)

                                        NAME  \
0                      BARRICASDEMADERA S.L.   
1                           MA TAPAS TRADING   
2                                 PRINTMASTA   
3                  Sélection Patrick Clerget   
4                              ETICA CRITICA   
5                                 TECI & COM   
6                            CAÑIZOS ALPE SL   
7                    ASCO CARBON DIOXIDE LTD   
8               CANARY ISLAND WORLDWIDE S.L.   
9                          Bodega Les Freses   
10                                TRANSFLUID   
11             PROINNOVA CONSULTORÍA TÉCNICA   
12  Agricole Pietraventosa di Annio Marianna   
13                                   NURIMAR   
14                  Green Life Revolution sl   
15                             BUCHER VASLIN   
16                   R.G. MANIFATTURE S.R.L.   
17                  Sustainable Grove FlexCo   
18                            API PLASTIQUES   
19                        Bodegas Vega N

## Opens the link with the tables -> PAGE 3

In [335]:
# Search-results page 3 for the same query.
url3 = "https://www.europages.co.uk/en/search/page/3?isPserpFirst=1&q=winery%20supplies"
# Send the same identifying headers as the page 1 request (the `headers` dict
# defined in that cell) and bound the wait with a timeout.
page3 = requests.get(url3, headers=headers, timeout=10)
# Stop here on an HTTP error instead of silently parsing an error page.
page3.raise_for_status()
# Name the parser explicitly so the result does not depend on installed parsers.
soup3 = BeautifulSoup(page3.text, 'html.parser')

## Finds all the tags except for the last one that is not necessary -> PAGE 3

In [336]:
# All <h2> headings on page 3; kept in h2_tags3 for the later CSV cell.
h2_tags3 = soup3.find_all('h2')
# The last heading is skipped (the notebook notes it is not needed).
company_headings3 = h2_tags3[:-1]
for heading3 in company_headings3:
    print(heading3.text)

RENATE ENGEL
PAPOUNTI
WEINGUT JOHANNES HAAS
Ega Perfil SL
WATERWORLD-ITALY
LABORATOIRE COGNAC ŒNOLOGIE
VITISERVE GMBH
NET MARKETING S.R.L.
D.A.R. Metall AG
LEOPOLD SIEGRIST GMBH
BROADLAND WINERIES LIMITED
PATAZA PTY LTD
TECNIWOOD
EMIL WISSING GMBH WEINKELLEREI
HUMBERTO RIVERA&ASOC


## Finds the country -> PAGE 3

In [337]:
# Every <div class="truncate"> on page 3.
country_tags3 = soup3.find_all('div', class_='truncate')

# Whitespace-stripped text of each tag, in page order.
countriespage3 = [entry3.get_text(strip=True) for entry3 in country_tags3]

for country3 in countriespage3:
    print(country3)

Germany,Malente
Italy,Pavia
Germany,Langenlonsheim
Spain,Villatuerta
Italy,Basciano
France,Cognac
Germany,Sommerhausen
Italy,Forli'
Germany,Ebeleben
Germany,Karlsruhe
United Kingdom,Norwich Norfolk
Australia,Sydney
France,Lézignan-Corbières
Germany,Oberotterbach
Spain,Martos


## Finds the .html link of each company -> PAGE 3

In [338]:
# Result tiles of page 3, matched on the exact class string.
company_tiles3 = soup3.find_all('div', class_='flex flex-col shadow-100 rounded bg-white company-tile')

# Look up the data-test="company-name" anchor of every tile, then print its href.
name_anchors3 = [card3.find('a', attrs={'data-test': 'company-name'}) for card3 in company_tiles3]
for company_link3 in name_anchors3:
    if not company_link3:
        continue  # this tile has no company-name anchor
    print("Found the URL:", company_link3['href'])

Found the URL: /RENATE-ENGEL/00000005036715-001.html
Found the URL: /PAPOUNTI/00000003885759-178420001.html
Found the URL: /WEINGUT-JOHANNES-HAAS/00000004936661-001.html
Found the URL: /en/company/ega-perfil-sl-22345975
Found the URL: /WATERWORLDITALY/00000005442461-779979001.html
Found the URL: /LABORATOIRE-COGNAC-%25C5%2592NOLOGIE/FRA723582-000018794001.html
Found the URL: /VITISERVE-GMBH/00000005541169-001.html
Found the URL: /NET-MARKETING-SRL/SEAC005411911-001.html
Found the URL: /en/company/dar-metall-ag-22333891
Found the URL: /LEOPOLD-SIEGRIST-GMBH/00000005318985-001.html
Found the URL: /BROADLAND-WINERIES-LIMITED/00000005356760-654504001.html
Found the URL: /PATAZA-PTY-LTD/00000005267441-581096001.html
Found the URL: /TECNIWOOD/00000005404184-001.html
Found the URL: /EMIL-WISSING-GMBH-WEINKELLEREI/DEU466192-00101.html
Found the URL: /HUMBERTO-RIVERAASOC/00000004659356-460497001.html


## RIGHT ORDER -> PAGE 3 = PAGE 3

In [339]:
# Company names: the text of every <h2> on page 3 except the last one.
names3 = [title_tag3.text for title_tag3 in h2_tags3[:-1]]

# Relative profile links taken from the company-name anchor of each result tile.
company_tiles3 = soup3.find_all('div', class_='flex flex-col shadow-100 rounded bg-white company-tile')
found_anchors3 = [box3.find('a', attrs={'data-test': 'company-name'}) for box3 in company_tiles3]
links3 = [found['href'] for found in found_anchors3 if found]

# Pairing is positional, so both lists are trimmed to the shorter one on a mismatch.
if len(names3) != len(links3):
    print("Warning: Mismatch between number of names and links found.")
    min_len3 = min(len(names3), len(links3))
    names3, links3 = names3[:min_len3], links3[:min_len3]

df3 = pd.DataFrame({'NAME': names3, 'LINKS': links3})

# Sector label: the q parameter of the search URL, spaces replaced by underscores.
q_values3 = urllib.parse.parse_qs(urllib.parse.urlparse(url3).query).get('q', ['default_sector'])
sector3 = q_values3[0].replace(' ', '_')

df3.to_csv(f'links_{sector3}_page_3.csv', index=False)
print(df3)

                              NAME  \
0                     RENATE ENGEL   
1                         PAPOUNTI   
2            WEINGUT JOHANNES HAAS   
3                    Ega Perfil SL   
4                 WATERWORLD-ITALY   
5      LABORATOIRE COGNAC ŒNOLOGIE   
6                   VITISERVE GMBH   
7             NET MARKETING S.R.L.   
8                 D.A.R. Metall AG   
9            LEOPOLD SIEGRIST GMBH   
10      BROADLAND WINERIES LIMITED   
11                  PATAZA PTY LTD   
12                       TECNIWOOD   
13  EMIL WISSING GMBH WEINKELLEREI   
14            HUMBERTO RIVERA&ASOC   

                                                LINKS  
0               /RENATE-ENGEL/00000005036715-001.html  
1             /PAPOUNTI/00000003885759-178420001.html  
2      /WEINGUT-JOHANNES-HAAS/00000004936661-001.html  
3                  /en/company/ega-perfil-sl-22345975  
4      /WATERWORLDITALY/00000005442461-779979001.html  
5   /LABORATOIRE-COGNAC-%25C5%2592NOLOGIE/FRA72358...  

## Csv Merge

In [340]:
# The per-page CSVs were written with relative names (to the current working
# directory), so they are read back the same way instead of from a hard-coded
# absolute /content/ directory.
page_csv_files = [f'links_winery_supplies_page_{page_number}.csv' for page_number in (1, 2, 3)]

# Stack the three page tables into one frame with a fresh 0..n-1 index.
dfall = pd.concat(map(pd.read_csv, page_csv_files), ignore_index=True)
print(dfall)
dfall.to_csv('links_winery_supplies_all.csv', index=False)

                              NAME  \
0            BARRICASDEMADERA S.L.   
1                 MA TAPAS TRADING   
2                       PRINTMASTA   
3        Sélection Patrick Clerget   
4                    ETICA CRITICA   
..                             ...   
70      BROADLAND WINERIES LIMITED   
71                  PATAZA PTY LTD   
72                       TECNIWOOD   
73  EMIL WISSING GMBH WEINKELLEREI   
74            HUMBERTO RIVERA&ASOC   

                                                LINKS  
0   /BARRICASDEMADERA-SL/00000004977382-548775001....  
1               /en/company/ma-tapas-trading-22323622  
2                     /en/company/printmasta-20041825  
3     /SELECTION-CLERGET-PATRICK/FRA372319-00101.html  
4                  /en/company/etica-critica-22304935  
..                                                ...  
70  /BROADLAND-WINERIES-LIMITED/00000005356760-654...  
71      /PATAZA-PTY-LTD/00000005267441-581096001.html  
72                 /TECNIWOOD/000000054

## Reads the merged CSV

In [341]:
# Read the merged CSV from the same relative location the merge cell wrote it
# to, rather than from a hard-coded absolute /content/ path.
dfall2 = pd.read_csv('links_winery_supplies_all.csv')
# to_string() prints every row and the full column width.
print(dfall2.to_string())

                                                 NAME                                                                      LINKS
0                               BARRICASDEMADERA S.L.                         /BARRICASDEMADERA-SL/00000004977382-548775001.html
1                                    MA TAPAS TRADING                                      /en/company/ma-tapas-trading-22323622
2                                          PRINTMASTA                                            /en/company/printmasta-20041825
3                           Sélection Patrick Clerget                            /SELECTION-CLERGET-PATRICK/FRA372319-00101.html
4                                       ETICA CRITICA                                         /en/company/etica-critica-22304935
5                                          TECI & COM                                    /TECI-COM/00000005327374-624251001.html
6                                     CAÑIZOS ALPE SL                             /CANIZOS-ALPE-S

## Finds the whole link + europages + html

In [342]:
# Relative profile links as a plain Python list.
onlylinks = dfall2["LINKS"].tolist()
print(onlylinks)

# Collect the absolute URLs. The list is appended to on every iteration so that
# it ends up holding all links instead of being replaced by the last string.
updatedstring = []
for linksepparate in onlylinks:
    absolute_link = "https://www.europages.co.uk" + linksepparate
    updatedstring.append(absolute_link)
    print(absolute_link)


['/BARRICASDEMADERA-SL/00000004977382-548775001.html', '/en/company/ma-tapas-trading-22323622', '/en/company/printmasta-20041825', '/SELECTION-CLERGET-PATRICK/FRA372319-00101.html', '/en/company/etica-critica-22304935', '/TECI-COM/00000005327374-624251001.html', '/CANIZOS-ALPE-SL/00000005469141-842256001.html', '/ASCO-CARBON-DIOXIDE-LTD/CHE051501-001.html', '/CANARY-ISLAND-WORLDWIDE-SL/00000005425544-763896001.html', '/en/company/bodega-les-freses-22317857', '/TRANSFLUID/FRA318315-00101.html', '/en/company/proinnova-consultoria-tecnica-22316274', '/en/company/agricole-pietraventosa-di-annio-marianna-22351109', '/NURIMAR/ESP194408-000005860001.html', '/en/company/green-life-revolution-sl-22282039', '/BUCHER-VASLIN/FRA008441-000018053001.html', '/RG-MANIFATTURE-SRL/00000004696894-495231001.html', '/en/company/sustainable-grove-flexco-22286852', '/API-PLASTIQUES/00000003735547-122221001.html', '/en/company/bodegas-vega-norte-22311492', '/CONCEPTS-LADENBAU-OBJEKTDESIGN-GMBH/00000005362575-

## Makes Csv Whole .html

In [343]:
base_url = "https://www.europages.co.uk"
# urljoin resolves each relative link against the site root and returns an
# already-absolute link unchanged, so re-running this cell does not prepend
# the host a second time.
dfall2['LINKS'] = dfall2['LINKS'].apply(lambda x: urllib.parse.urljoin(base_url, x))

dfall2.to_csv('links_winery_supplies_all_absolute.csv', index=False)
print(dfall2.head())

                        NAME  \
0      BARRICASDEMADERA S.L.   
1           MA TAPAS TRADING   
2                 PRINTMASTA   
3  Sélection Patrick Clerget   
4              ETICA CRITICA   

                                               LINKS  
0  https://www.europages.co.uk/BARRICASDEMADERA-S...  
1  https://www.europages.co.uk/en/company/ma-tapa...  
2  https://www.europages.co.uk/en/company/printma...  
3  https://www.europages.co.uk/SELECTION-CLERGET-...  
4  https://www.europages.co.uk/en/company/etica-c...  


## Scrapes each company page and saves the parsed soups in souplist

In [344]:
# One parsed page per row of dfall2, in row order.
souplist = []

# Bot name and contact address, written as a single string because adjacent
# string literals are concatenated with no separator between them.
headers = {"User-Agent": "KoupBot/0.0 (Contact: georgekoupni@gmail.com)"}

# Links whose request failed; used only for the summary count below.
failed_links = []

# Loop-local names are used here so the page 1 `soup` and `page` variables
# from the earlier cells are not overwritten.
for eachlink in dfall2['LINKS']:
    try:
        company_page = requests.get(eachlink, headers=headers, timeout=10)
        souplist.append(BeautifulSoup(company_page.content, "html.parser"))
        print(f"Successfully scraped: {eachlink}")
    except requests.exceptions.RequestException as e:
        # Keep one entry per row: an empty document stands in for the failed
        # page so later positions still line up with the rows of dfall2.
        souplist.append(BeautifulSoup("", "html.parser"))
        failed_links.append(eachlink)
        print(f"Error scraping {eachlink}: {e}")

print(f"\nTotal {len(souplist) - len(failed_links)} pages scraped and stored in souplist.")


Successfully scraped: https://www.europages.co.uk/BARRICASDEMADERA-SL/00000004977382-548775001.html
Successfully scraped: https://www.europages.co.uk/en/company/ma-tapas-trading-22323622
Successfully scraped: https://www.europages.co.uk/en/company/printmasta-20041825
Successfully scraped: https://www.europages.co.uk/SELECTION-CLERGET-PATRICK/FRA372319-00101.html
Successfully scraped: https://www.europages.co.uk/en/company/etica-critica-22304935
Successfully scraped: https://www.europages.co.uk/TECI-COM/00000005327374-624251001.html
Successfully scraped: https://www.europages.co.uk/CANIZOS-ALPE-SL/00000005469141-842256001.html
Successfully scraped: https://www.europages.co.uk/ASCO-CARBON-DIOXIDE-LTD/CHE051501-001.html
Successfully scraped: https://www.europages.co.uk/CANARY-ISLAND-WORLDWIDE-SL/00000005425544-763896001.html
Successfully scraped: https://www.europages.co.uk/en/company/bodega-les-freses-22317857
Successfully scraped: https://www.europages.co.uk/TRANSFLUID/FRA318315-00101.h

## Finds URL for each company.

In [345]:
# Exactly one entry per scraped page, so positions match the rows of dfall2.
linkspermanent = []

# A loop-local name is used so the page 1 `soup` variable is not overwritten.
for company_soup in souplist:
    # Only the first website button is used. Appending one entry per button
    # would add extra items for a page with several buttons and shift every
    # later company's link.
    website_button = company_soup.find('a', class_='btn btn--subtle btn--md website-button')
    href = website_button.get('href') if website_button else None
    linkspermanent.append(href)
    if href:
        print(href)
    else:
        print("No website button found for this company.")

http://www.barricasdemadera.com
https://tapastrading.com/
http://cdprintmasta.com/
https://www.patrick-clerget.com/
https://www.lacasadeltabaco.es
http://www.tecietcom.fr
https://www.canizosalpe.com
http://www.ascoco2.com
https://www.canaryislandworldwide.es
https://www.lesfreses.com/
https://www.transfluid.fr/
https://proinnovacon.es/
https://www.pietraventosa.it
http://www.nurimar.com
https://eurogreenlife@gmail.com
http://www.buchervaslin.com
http://www.rgservice.it
https://sustainablegrove.com/
http://www.apiplast.fr
https://veganorte.es/
https://www.concept-s-design.com/en/
No website button found for this company.
http://www.bardenhewer.de
https://www.artebotti.com/
http://www.haussmannfamille.com/
https://www.barnacork.com/
https://www.ibehi.com
https://vinscanmas.es/
https://gojuteinternational.com/fr
http://www.adegaalmeirim.pt
http://www.gwein.de
https://www.acriwineservice.com
http://www.electronweb.it/etichettatura/
https://www.volutex.fr/
http://www.industriebedarf-castan.

## Makes a csv with all the data |Name|Links|

In [346]:
# Make linkspermanent exactly as long as the DataFrame before assigning it as a column.
row_count = len(dfall2)
if row_count != len(linkspermanent):
    print("Warning: The number of new links does not match the number of rows in the DataFrame. Truncating/padding linkspermanent.")
    missing = row_count - len(linkspermanent)
    if missing > 0:
        # Too few links: pad the tail with None.
        linkspermanent.extend([None] * missing)
    else:
        # Too many links: drop the surplus at the end.
        linkspermanent = linkspermanent[:row_count]

# The LINKS column now holds the company websites instead of the Europages
# profile URLs, and the CSV of the same name is rewritten with them.
dfall2['LINKS'] = linkspermanent

dfall2.to_csv('links_winery_supplies_all_absolute.csv', index=False)
print(dfall2.head())

                        NAME                             LINKS
0      BARRICASDEMADERA S.L.   http://www.barricasdemadera.com
1           MA TAPAS TRADING         https://tapastrading.com/
2                 PRINTMASTA          http://cdprintmasta.com/
3  Sélection Patrick Clerget  https://www.patrick-clerget.com/
4              ETICA CRITICA    https://www.lacasadeltabaco.es


## Merges all the countries + CSV |Name|Emails|Country

In [347]:
# Read the NAME/LINKS table from the same relative location the previous cell
# wrote it to, rather than from a hard-coded absolute /content/ path.
dfall2 = pd.read_csv('links_winery_supplies_all_absolute.csv')

# Bot name and contact address, written as a single string because adjacent
# string literals are concatenated with no separator between them.
headers = {"User-Agent": "KoupBot/0.0 (Contact: georgekoupni@gmail.com)"}


def _fetch_locations(search_url):
    """Return the stripped text of every <div class="truncate"> on one search page.

    Raises requests.exceptions.RequestException on network or HTTP errors, so
    a failed page is not silently treated as a page with no entries.
    """
    listing = requests.get(search_url, headers=headers, timeout=10)
    listing.raise_for_status()
    listing_soup = BeautifulSoup(listing.text, 'html.parser')
    return [tag.get_text(strip=True) for tag in listing_soup.find_all('div', class_='truncate')]


# Page 1
countriespage1 = _fetch_locations(url)

# Page 2
countriespage2 = _fetch_locations(url2)

# Page 3
countriespage3 = _fetch_locations(url3)

# Merge them
all_countries_combined = countriespage1 + countriespage2 + countriespage3

# Fix possible mismatch: countries are attached to rows by position only, so
# both sides are cut to the shorter length.
if len(all_countries_combined) != len(dfall2):
    print(f"Mismatch: {len(all_countries_combined)} countries extracted, df has {len(dfall2)} rows")
    min_len = min(len(all_countries_combined), len(dfall2))
    all_countries_combined = all_countries_combined[:min_len]
    dfall2 = dfall2.iloc[:min_len].copy()
    print(f"Adjusted: now using {min_len} entries.")

dfall2["COUNTRY"] = all_countries_combined

print("\nCountries aligned:\n")
print(dfall2[["NAME", "LINKS", "COUNTRY"]].head())

email_regex = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"

# One entry per row of dfall2: a list of addresses, or None.
all_emails = []

print("\nStarting email extraction...\n")

for idx, row in dfall2.iterrows():
    # Not named `url`: that variable holds the page 1 search URL read at the
    # top of this cell, and overwriting it would break a re-run.
    site_url = row["LINKS"]
    clean_emails = None

    if pd.isna(site_url):
        # No website was recorded for this company, so there is nothing to request.
        print(f"[{idx}] {row['NAME']} → emails: {clean_emails}")
        all_emails.append(clean_emails)
        continue

    try:
        response = requests.get(site_url, headers=headers, timeout=10)

        # Extract emails from the raw text, drop duplicates, and sort so the
        # order is the same on every run; an empty result becomes None.
        clean_emails = sorted(set(re.findall(email_regex, response.text))) or None

        print(f"[{idx}] {row['NAME']} → emails: {clean_emails}")

    except Exception as e:
        # Deliberately broad: one unreachable or malformed third-party site
        # must not abort the whole run.
        print(f"Error scraping {site_url}: {e}")

    # Appended exactly once per row, whatever happened above.
    all_emails.append(clean_emails)

dfall2["EMAILS"] = all_emails

final_df = dfall2[["NAME", "EMAILS", "COUNTRY"]]
final_df.to_csv("final_name_emails_country.csv", index=False)

print("\nSaved final CSV: final_name_emails_country.csv")
print(final_df.head())



Countries aligned:

                        NAME                             LINKS  \
0      BARRICASDEMADERA S.L.   http://www.barricasdemadera.com   
1           MA TAPAS TRADING         https://tapastrading.com/   
2                 PRINTMASTA          http://cdprintmasta.com/   
3  Sélection Patrick Clerget  https://www.patrick-clerget.com/   
4              ETICA CRITICA    https://www.lacasadeltabaco.es   

                      COUNTRY  
0             Spain,Esquivias  
1  Netherlands,Sint-Oedenrode  
2               Poland,Warsaw  
3         France,Beaune Cedex  
4       Spain,Chella VALENCIA  

Starting email extraction...

[0] BARRICASDEMADERA S.L. → emails: None
[1] MA TAPAS TRADING → emails: None
[2] PRINTMASTA → emails: ['info@cdprintmasta.com']
[3] Sélection Patrick Clerget → emails: ['info@patrick-clerget.com']
[4] ETICA CRITICA → emails: None
[5] TECI & COM → emails: None
[6] CAÑIZOS ALPE SL → emails: ['nuria_alpe23@hotmail.com', 'info@canizosalpe.com']
Error scraping h

## DIFFICULTIES
- Some company pages had no 'Visit Webpage' button, so an early approach left some slots empty in the email CSV.

- "Routing" to the different webpages.

- "Green Life Revolution sl" had an email address instead of a 'Visit Webpage' link. This case was not tested.

- In the end, the task also required finding the countries, so I backtracked to make that work.
- Logic & Code

## REFERENCES
1. https://www.notion.so/digiole/Scalable-Web-Scraping-Pipeline-21425969342680b7a99ef9f999a96f06
2. https://www.europages.co.uk/en/search?isPserpFirst=1&q=winery+supplies
3. https://beautiful-soup-4.readthedocs.io/en/latest/#quick-start
4. https://pandas.pydata.org/docs/user_guide/index.html
5. W3Schools
6. Stack Overflow
7. YouTube
8. ChatGPT
9. Gemini AI
10. GeeksforGeeks