In [114]:
!pwd

/Users/vadramandisang/Documents/Vadrama/data/Product/ViaMichelin_Scraping


In [207]:
!pip install sqlalchemy



In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import uuid
import hashlib
from snowflake.sqlalchemy import URL
from sqlalchemy import create_engine
import json

In [None]:
# Load the Snowflake configuration from the JSON file
with open("snow_cred.json") as f:
    config = json.load(f)
    
    
# Creating Snowflake Connection
snow_engine = create_engine(URL(
    account = config['account'],
    user = config['user'],
    password = config['password'],
    database = config['database'],
    schema = config['schema'],
    warehouse = config['warehouse'],
    role= config['role']
))

snow_connection = snow_engine.connect()
snow_connection

<sqlalchemy.engine.base.Connection at 0x10b923a90>

In [None]:
country_url_list = ['https://www.viamichelin.fr/web/Stations-service?address=Czechia',]
countries = ['France', 'United Kingdom', 'Switzerland', 'Germany', 'Belgium', 'Netherlands', 'Spain', 'Luxembourg', 'Czech Republic', 'Italy']
# 'https://www.viamichelin.fr/web/Stations-service?address=France',

In [None]:
countries = ['France', 'United Kingdom', 'Switzerland', 'Germany', 'Belgium', 'Netherlands', 'Spain', 'Luxembourg', 'Czech Republic', 'Italy']

country_url_list = ['https://www.viamichelin.fr/web/Stations-service?address=France', 'https://www.viamichelin.fr/web/Stations-service?address=united%20kingdom', 'https://www.viamichelin.fr/web/Stations-service?address=switzerland', 'https://www.viamichelin.fr/web/Stations-service?address=germany', 'https://www.viamichelin.fr/web/Stations-service?address=Belgium', 'https://www.viamichelin.fr/web/Stations-service?address=netherlands', 'https://www.viamichelin.fr/web/Stations-service?address=spain', 'https://www.viamichelin.fr/web/Stations-service?address=luxemburg', 'https://www.viamichelin.fr/web/Stations-service?address=Czechia', 'https://www.viamichelin.fr/web/Stations-service?address=Italy']



## Working Script

In [6]:
#!/usr/bin/env python3


import requests
from bs4 import BeautifulSoup
import pandas as pd
import uuid
import hashlib
from snowflake.sqlalchemy import URL
from sqlalchemy import create_engine
import json
from deepparse.parser import AddressParser
from deepparse.dataset_container import CSVDatasetContainer

address_parser = AddressParser(model_type="bpemb", device=0)


# Load the Snowflake configuration from the JSON file
with open("snow_cred.json") as f:
    config = json.load(f)
    
    
# Creating Snowflake Connection
snow_engine = create_engine(URL(
    account = config['account'],
    user = config['user'],
    password = config['password'],
    database = config['database'],
    schema = config['schema'],
    warehouse = config['warehouse'],
    role= config['role']
))

snow_connection = snow_engine.connect()
snow_connection


countries = ['France', 'United Kingdom', 'Switzerland', 'Germany', 'Belgium', 'Netherlands', 'Spain', 'Luxembourg', 'Czech Republic', 'Italy']
country_url_list = ['https://www.viamichelin.fr/web/Stations-service?address=France', 'https://www.viamichelin.fr/web/Stations-service?address=united%20kingdom', 'https://www.viamichelin.fr/web/Stations-service?address=switzerland', 'https://www.viamichelin.fr/web/Stations-service?address=germany', 'https://www.viamichelin.fr/web/Stations-service?address=Belgium', 'https://www.viamichelin.fr/web/Stations-service?address=netherlands', 'https://www.viamichelin.fr/web/Stations-service?address=spain', 'https://www.viamichelin.fr/web/Stations-service?address=luxemburg', 'https://www.viamichelin.fr/web/Stations-service?address=Czechia', 'https://www.viamichelin.fr/web/Stations-service?address=Italy']


def get_rqt(url):
    r = requests.get(url)
    sp = BeautifulSoup(r.content, 'html.parser')
    return sp


def get_pagi_url_max_num(page_n):
    pg = BeautifulSoup(str(page_n[0]), 'html.parser')
    pg_a = pg.findAll('a')

    if len(pg_a) > 0:
        pg_a = pg_a[-1]
        pgination_url = pg_a['href']
        pgination_url = pgination_url[0:pgination_url.index('=')+1]
        max_pg = int(pg_a.get_text())
        return pgination_url, max_pg
    
    
def get_pagination(url):
    r = requests.get(url)
    #print(r)
    soup = BeautifulSoup(r.content, 'html.parser')
    page_n = soup.findAll("p", class_="pagination-second-line")
    return page_n


def save_data(data_df):
    target_table = 'viamichelin_stations'
    del_id_lst = str(list(data_df['id'])).replace('[','(').replace(']',')') 
    try:
        # Delete existing rows to be updated with new version
        del_existing_rows_query = "DELETE FROM " +target_table+ " WHERE id IN " + del_id_lst
        #print('Del Query is ', del_existing_rows_query)
        with snow_engine.connect() as conn:
            conn.execute(del_existing_rows_query)

        # Save data in the DataFrame    
        data_df.to_sql(target_table, con=snow_engine, schema='EXTERNAL_TABLES', index=False, if_exists='append')
        print(data_df.loc[0])
        data_df = pd.DataFrame()
    except Exception as err:
        print(err)    
        

# Parse Addresses and get 'Street_Number', 'Street_Name', 'City', and 'Postal_Code'
def parse_addresses(address):
    try:
        parse_res = address_parser(address)
        result =  [str(parse_res.StreetNumber), str(parse_res.StreetName).title(), str(parse_res.Municipality).title(), str(parse_res.PostalCode)]
        return result
    except:
        return ['', '', '', '']


    
def get_and_save_data(url, ctry):
    r = requests.get(url)
    print(ctry, url, r)
    soup = BeautifulSoup(r.content, 'html.parser')

    # Get station Names
    station_names = soup.findAll('div', class_='poi-item-name truncate')
    station_names = [val.get_text() for val in station_names]

    # Get station Addresses
    station_address = soup.findAll('div', class_="poi-item-details-address truncate")
    station_address = [val.get_text() for val in station_address]
    print(len(station_names), len(station_address))
    #print(station_names,'\n\n', station_address)

    # Combine the Scrapped data into a list ready to be converted into a DataFrame
    dat_zip = zip(station_names, station_address)

    zipped_dat = list(dat_zip)
    dat_zip = ''
    data = [list(i) for i in zipped_dat]
    zipped_dat.clear()
    
    # Clear the list to save memory        
    station_names.clear()
    station_address.clear()
    

    # Create DataFrame from the data list
    data_df = pd.DataFrame(data, columns=['Station_Name', 'Station_Address'])
    data.clear()

    # Generate a UUID for each row based on the values in 'Station_Name' and 'Station_Address'
    generate_uuid = lambda row: uuid.uuid5(uuid.NAMESPACE_DNS, str(row['Station_Name']) + str(row['Station_Address'])).hex
    
    # Enrich Station Address here
    enriched_df = pd.DataFrame(data_df.apply(lambda x: parse_addresses(x['Station_Address']), axis=1 ).tolist(), columns = ['Street_Number', 'Street_Name', 'City', 'Postal_Code'])


    # Add new columns to the created DataFrame
    data_df['id'] = data_df.apply(generate_uuid, axis=1)

    # Add the new DataFrame to the old one by concatenating the dataframes vertically
    # Concatenate the two DataFrames and Rearrange the new DataFrame
    data_df = pd.concat([data_df, enriched_df], axis=1)[['id','Station_Name', 'Station_Address', 'Street_Number', 'Street_Name', 'City', 'Postal_Code']]

    #print('Len data_fr -> ', len(data_df), 'Len enriched_df -> ', len(enriched_df))
    
 
    # Add the column country to the DataFrame
    data_df["country"] = ctry
    
    #print('Len data_df -> ', len(data_df))
    # Save the Data to DB
    save_data(data_df)
    enriched_df = pd.DataFrame()
    print('SAVED')

        

for url, ctry in zip(country_url_list, countries):
    try:
        # Scrape the page and save data
        get_and_save_data(url, ctry)        
        
        # Get and prepare Pagination        
        page_n = get_pagination(url)
        if get_pagi_url_max_num(page_n):
            #print('More')
            pgination_url, max_pg = get_pagi_url_max_num(page_n)
            for i in range(2, max_pg + 1):  # range(2, max_pg+1)
                get_and_save_data(pgination_url+str(i), ctry)
        else:
            #print('less')
            continue
        
    except Exception as err:
        print(err)
        

ModuleNotFoundError: No module named 'deepparse'

In [12]:
#!/usr/bin/env python3


import requests
from bs4 import BeautifulSoup
import pandas as pd
import uuid
import hashlib
from snowflake.sqlalchemy import URL
from sqlalchemy import create_engine
import json
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="viamichelin")


# Load the Snowflake configuration from the JSON file
with open("snow_cred.json") as f:
    config = json.load(f)
    
    
# Creating Snowflake Connection
snow_engine = create_engine(URL(
    account = config['account'],
    user = config['user'],
    password = config['password'],
    database = config['database'],
    schema = config['schema'],
    warehouse = config['warehouse'],
    role= config['role']
))

snow_connection = snow_engine.connect()
snow_connection


countries = ['France', 'United Kingdom', 'Switzerland', 'Germany', 'Belgium', 'Netherlands', 'Spain', 'Luxembourg', 'Czech Republic', 'Italy']
country_url_list = ['https://www.viamichelin.fr/web/Stations-service?address=France', 'https://www.viamichelin.fr/web/Stations-service?address=united%20kingdom', 'https://www.viamichelin.fr/web/Stations-service?address=switzerland', 'https://www.viamichelin.fr/web/Stations-service?address=germany', 'https://www.viamichelin.fr/web/Stations-service?address=Belgium', 'https://www.viamichelin.fr/web/Stations-service?address=netherlands', 'https://www.viamichelin.fr/web/Stations-service?address=spain', 'https://www.viamichelin.fr/web/Stations-service?address=luxemburg', 'https://www.viamichelin.fr/web/Stations-service?address=Czechia', 'https://www.viamichelin.fr/web/Stations-service?address=Italy']


def get_rqt(url):
    r = requests.get(url)
    sp = BeautifulSoup(r.content, 'html.parser')
    return sp


def get_pagi_url_max_num(page_n):
    pg = BeautifulSoup(str(page_n[0]), 'html.parser')
    pg_a = pg.findAll('a')

    if len(pg_a) > 0:
        pg_a = pg_a[-1]
        pgination_url = pg_a['href']
        pgination_url = pgination_url[0:pgination_url.index('=')+1]
        max_pg = int(pg_a.get_text())
        return pgination_url, max_pg
    
    
def get_pagination(url):
    r = requests.get(url)
    #print(r)
    soup = BeautifulSoup(r.content, 'html.parser')
    page_n = soup.findAll("p", class_="pagination-second-line")
    return page_n


def save_data(data_df):
    target_table = 'viamichelin_stations'
    del_id_lst = str(list(data_df['id'])).replace('[','(').replace(']',')') 
    try:
        # Delete existing rows to be updated with new version
        del_existing_rows_query = "DELETE FROM " +target_table+ " WHERE id IN " + del_id_lst
        #print('Del Query is ', del_existing_rows_query)
        with snow_engine.connect() as conn:
            conn.execute(del_existing_rows_query)

        # Save data in the DataFrame    
        data_df.to_sql(target_table, con=snow_engine, schema='public', index=False, if_exists='append')
        print(data_df.loc[0])
        data_df = pd.DataFrame()
    except Exception as err:
        print(err)    


def get_street(address, station_address):
    if 'house_number' in address:
        print('Yes')
        if address['house_number'] in station_address.split():
            street = str(address['house_number']) + ' ' + str(address['road']) if 'road' in address else ''
        else:
            street = str(address['road']) if 'road' in address else ''
        print('Yes ', street)
    else:
        street = str(address['road']) if 'road' in address else ''
        
    return street


def get_city(address):
    if 'city' in address:
        city = address['city']
    elif 'town' in address: 
        city = address['town']
    elif 'village' in address: 
        city = address['village']    
    else:
        city = ''
    return city



# Enrich Station Address
def split_address(station_address):
    enriched_address = []
    for i in station_address:
        new_station_address = []
        location = geolocator.geocode(i) # 18 Hauptstrasse, 3752 Wimmis
        print(type(location))
        if location:
            print(location.raw)
            lat = location.raw['lat']
            lon = location.raw['lon']
            location_geop = geolocator.reverse(f"{lat}, {lon}")
            address = location_geop.raw['address']
            postcode = str(address['postcode']) if 'postcode' in address else ''
            place_id = str(location_geop.raw['place_id']) if 'place_id' in location_geop.raw else ''
            station_street = get_street(address, i)
            station_city = get_city(address)
            #print(location_geop.raw)    
            new_station_address.extend([station_street, station_city, postcode, lat, lon, place_id])
        else:
            print("Location not found")
            new_station_address.extend(['', '', '', '', '', ''])
    print(type(enriched_address), '\nNow Appending enriched_address...')                
    enriched_address.append(new_station_address)
    enriched_address_df = pd.DataFrame(enriched_address, columns = ['Street', 'City', 'Postcode', 'Latitude', 'Longitude', 'Place_id'])
    return enriched_address_df



    
def get_and_save_data(url, ctry):
    r = requests.get(url)
    print(ctry, url, r)
    soup = BeautifulSoup(r.content, 'html.parser')

    # Get station Names
    station_names = soup.findAll('div', class_='poi-item-name truncate')
    station_names = [val.get_text() for val in station_names]

    # Get station Addresses
    station_address = soup.findAll('div', class_="poi-item-details-address truncate")
    station_address = [val.get_text() for val in station_address]
    print(len(station_names), len(station_address))
    #print(station_names,'\n\n', station_address)

    # Combine the Scrapped data into a list ready to be converted into a DataFrame
    dat_zip = zip(station_names[:10], station_address[:10])

    zipped_dat = list(dat_zip)
    dat_zip = ''
    data = [list(i) for i in zipped_dat]
    zipped_dat.clear()
    
    # Clear the list to save memory        
    station_names.clear()
    station_address.clear()
    

    # Create DataFrame from the data list
    data_df = pd.DataFrame(data, columns=['Station_Name', 'Station_Address'])
    data.clear()

    # Generate a UUID for each row based on the values in 'Station_Name' and 'Station_Address'
    generate_uuid = lambda row: uuid.uuid5(uuid.NAMESPACE_DNS, str(row['Station_Name']) + str(row['Station_Address'])).hex
    
    # Enrich Station Address here
    enriched_address_df = split_address(list(data_df['Station_Address']))        

    # Add new columns to the created DataFrame    
    data_df['id'] = data_df.apply(generate_uuid, axis=1)

    #Rearrange DataFrame
    data_df = data_df[['id', 'Station_Name', 'Station_Address']]
    
    # Add the new DataFrame to the old one by concatenating the dataframes vertically
    data_df = pd.concat([data_df, enriched_address_df])
    data_df["country"] = ctry
    
    print('DataFrame to SAVE\n\n',data_df.head(5))
    
    # Save the Data to DB
    save_data(data_df)

        

for url, ctry in zip(country_url_list, countries):
    try:
        # Scrape the page and save data
        get_and_save_data(url, ctry)        
        
        # Get and prepare Pagination        
        page_n = get_pagination(url)
        if get_pagi_url_max_num(page_n):
            print('More')
            pgination_url, max_pg = get_pagi_url_max_num(page_n)
            for i in range(2, max_pg + 1):  # range(2, max_pg+1)
                get_and_save_data(pgination_url+str(i), ctry)
        else:
            print('less')
            continue
        
    except Exception as err:
        print(err)
        

France https://www.viamichelin.fr/web/Stations-service?address=France <Response [200]>
48 48
<class 'NoneType'>
Location not found
<class 'NoneType'>
Location not found
<class 'NoneType'>
Location not found
<class 'geopy.location.Location'>
{'place_id': 100363813, 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. https://osm.org/copyright', 'osm_type': 'way', 'osm_id': 4710350, 'boundingbox': ['48.8349979', '48.8353329', '-1.2192066', '-1.2187055'], 'lat': '48.8350853', 'lon': '-1.2187997', 'display_name': 'Route de Vire, La Croix au Grand, Villedieu-les-Poêles, Villedieu-les-Poêles-Rouffigny, Saint-Lô, Manche, Normandie, France métropolitaine, 50800, France', 'class': 'highway', 'type': 'primary', 'importance': 0.6000099999999999}
<class 'geopy.location.Location'>
{'place_id': 278455470, 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. https://osm.org/copyright', 'osm_type': 'way', 'osm_id': 925966642, 'boundingbox': ['43.6879612', '43.6880001', '4.2731883', '4.2732285'

KeyboardInterrupt: 

In [152]:
#!/usr/bin/env python3


import requests
from bs4 import BeautifulSoup
import pandas as pd
import uuid
import hashlib
from snowflake.sqlalchemy import URL
from sqlalchemy import create_engine
import json



# Load the Snowflake configuration from the JSON file
with open("snow_cred.json") as f:
    config = json.load(f)
    
    
# Creating Snowflake Connection
snow_engine = create_engine(URL(
    account = config['account'],
    user = config['user'],
    password = config['password'],
    database = config['database'],
    schema = config['schema'],
    warehouse = config['warehouse'],
    role= config['role']
))

snow_connection = snow_engine.connect()
snow_connection


countries = ['France', 'United Kingdom', 'Switzerland', 'Germany', 'Belgium', 'Netherlands', 'Spain', 'Luxembourg', 'Czech Republic', 'Italy']
country_url_list = ['https://www.viamichelin.fr/web/Stations-service?address=France', 'https://www.viamichelin.fr/web/Stations-service?address=united%20kingdom', 'https://www.viamichelin.fr/web/Stations-service?address=switzerland', 'https://www.viamichelin.fr/web/Stations-service?address=germany', 'https://www.viamichelin.fr/web/Stations-service?address=Belgium', 'https://www.viamichelin.fr/web/Stations-service?address=netherlands', 'https://www.viamichelin.fr/web/Stations-service?address=spain', 'https://www.viamichelin.fr/web/Stations-service?address=luxemburg', 'https://www.viamichelin.fr/web/Stations-service?address=Czechia', 'https://www.viamichelin.fr/web/Stations-service?address=Italy']


def get_rqt(url):
    r = requests.get(url)
    sp = BeautifulSoup(r.content, 'html.parser')
    return sp


def get_pagi_url_max_num(page_n):
    pg = BeautifulSoup(str(page_n[0]), 'html.parser')
    pg_a = pg.findAll('a')

    if len(pg_a) > 0:
        pg_a = pg_a[-1]
        pgination_url = pg_a['href']
        pgination_url = pgination_url[0:pgination_url.index('=')+1]
        max_pg = int(pg_a.get_text())
        return pgination_url, max_pg
    
    
def get_pagination(url):
    r = requests.get(url)
    #print(r)
    soup = BeautifulSoup(r.content, 'html.parser')
    page_n = soup.findAll("p", class_="pagination-second-line")
    return page_n


def save_data(data_df):
    target_table = 'viamichelin_stations'
    del_id_lst = str(list(data_df['id'])).replace('[','(').replace(']',')') 
    try:
        # Delete existing rows to be updated with new version
        del_existing_rows_query = "DELETE FROM " +target_table+ " WHERE id IN " + del_id_lst
        #print('Del Query is ', del_existing_rows_query)
        with snow_engine.connect() as conn:
            conn.execute(del_existing_rows_query)

        # Save data in the DataFrame    
        data_df.to_sql(target_table, con=snow_engine, schema='public', index=False, if_exists='append')
        print(data_df.loc[0])
        data_df = pd.DataFrame()
    except Exception as err:
        print(err)    

    
def get_and_save_data(url, ctry):
    r = requests.get(url)
    print(ctry, url, r)
    soup = BeautifulSoup(r.content, 'html.parser')

    # Get station Names
    station_names = soup.findAll('div', class_='poi-item-name truncate')
    station_names = [val.get_text() for val in station_names]

    # Get station Addresses
    station_address = soup.findAll('div', class_="poi-item-details-address truncate")
    station_address = [val.get_text() for val in station_address]
    print(len(station_names), len(station_address))
    #print(station_names,'\n\n', station_address)

    # Combine the Scrapped data into a list ready to be converted into a DataFrame
    dat_zip = zip(station_names, station_address)

    zipped_dat = list(dat_zip)
    dat_zip = ''
    data = [list(i) for i in zipped_dat]
    zipped_dat.clear()
    
    # Clear the list to save memory        
    station_names.clear()
    station_address.clear()
    

    # Create DataFrame from the data list
    data_df = pd.DataFrame(data, columns=['Station_Name', 'Station_Address'])
    data.clear()

    # Generate a UUID for each row based on the values in 'Station_Name' and 'Station_Address'
    generate_uuid = lambda row: uuid.uuid5(uuid.NAMESPACE_DNS, str(row['Station_Name']) + str(row['Station_Address'])).hex

    # Add new columns to the created DataFrame
    data_df["country"] = ctry
    data_df['id'] = data_df.apply(generate_uuid, axis=1)

    #Rearrange DataFrame
    data_df = data_df[['id', 'Station_Name', 'Station_Address', 'country']]
    
    # Save the Data to DB
    save_data(data_df)

        

for url, ctry in zip(country_url_list, countries):
    try:
        # Scrape the page and save data
        get_and_save_data(url, ctry)        
        
        # Get and prepare Pagination        
        page_n = get_pagination(url)
        if get_pagi_url_max_num(page_n):
            print('More')
            pgination_url, max_pg = get_pagi_url_max_num(page_n)
            for i in range(2, max_pg + 1):  # range(2, max_pg+1)
                get_and_save_data(pgination_url+str(i), ctry)
        else:
            print('less')
            continue
        
    except Exception as err:
        print(err)
        

France https://www.viamichelin.fr/web/Stations-service?address=France <Response [200]>
48 48
id                 2392873b80645709b79ebc7afaf057f0
Station_Name                       CARREFOUR Market
Station_Address       route de vire, 50000 Saint-Lô
country                                      France
Name: 0, dtype: object
More
France https://www.viamichelin.fr/web/Stations-service/Stations-service-France?page=2 <Response [200]>
48 48
id                             6b055a4ea7005feb9b34de6f00d6668d
Station_Name                                            HYPER U
Station_Address    Voie des Alliés, 14440 DOUVRES-LA-DéLIVRANDE
country                                                  France
Name: 0, dtype: object
France https://www.viamichelin.fr/web/Stations-service/Stations-service-France?page=3 <Response [200]>
48 48
id                 c04539d04e295e25aaceb15cfb5f9e79
Station_Name                            AVIA XPRESS
Station_Address            RD N 3 -, 21260 SELONGEY
country           

KeyboardInterrupt: 

In [6]:
for url, ctry in zip(country_url_list, countries):
    print(ctry,'\n',url)

France 
 https://www.viamichelin.fr/web/Stations-service?address=France
United Kingdom 
 https://www.viamichelin.fr/web/Stations-service?address=united%20kingdom
Switzerland 
 https://www.viamichelin.fr/web/Stations-service?address=switzerland
Germany 
 https://www.viamichelin.fr/web/Stations-service?address=germany
Belgium 
 https://www.viamichelin.fr/web/Stations-service?address=Belgium
Netherlands 
 https://www.viamichelin.fr/web/Stations-service?address=netherlands
Spain 
 https://www.viamichelin.fr/web/Stations-service?address=spain
Luxembourg 
 https://www.viamichelin.fr/web/Stations-service?address=luxemburg
Czech Republic 
 https://www.viamichelin.fr/web/Stations-service?address=Czechia
Italy 
 https://www.viamichelin.fr/web/Stations-service?address=Italy


In [113]:
def get_rqt(url):
    r = requests.get(url)
    sp = BeautifulSoup(r.content, 'html.parser')
    return sp


def get_pagi_url_max_num(page_n):
    pg = BeautifulSoup(str(page_n[0]), 'html.parser')
    pg_a = pg.findAll('a')

    if len(pg_a) > 0:
        pg_a = pg_a[-1]
        pgination_url = pg_a['href']
        pgination_url = pgination_url[0:pgination_url.index('=')+1]
        max_pg = int(pg_a.get_text())
        return pgination_url, max_pg
    
    
def get_pagination(url):
    r = requests.get(url)
    #print(r)
    soup = BeautifulSoup(r.content, 'html.parser')
    page_n = soup.findAll("p", class_="pagination-second-line")
    return page_n


def save_data(data_df):
    target_table = 'viamichelin_stations'
    del_id_lst = str(list(data_df['id'])).replace('[','(').replace(']',')')    
    try:
        # Delete existing rows to be updated with new version
        del_existing_rows_query = "DELETE FROM " +target_table+ " WHERE id IN " + del_id_lst
        with snow_engine.connect() as conn:
            conn.execute(del_existing_rows_query)

        # Save data in the DataFrame    
        data_df.to_sql(target_table, con=snow_engine, schema='public', index=False, if_exists='append')
        print('done')
    except Exception as err:
        print(err)    

    
def get_and_save_data(url, ctry):
    r = requests.get(url)
    print(url, r)
    soup = BeautifulSoup(r.content, 'html.parser')

    # Get station Names
    station_names = soup.findAll('div', class_='poi-item-name truncate')
    station_names = [val.get_text() for val in station_names]

    # Get station Addresses
    station_address = soup.findAll('div', class_="poi-item-details-address truncate")
    station_address = [val.get_text() for val in station_address]
    print(len(station_names), len(station_address))

    # Combine the Scrapped data into a list ready to be converted into a DataFrame
    dat_zip = zip(station_names[:2], station_address[:2])

    # Clear the list to save memory        
    station_names.clear()
    station_address.clear()

    zipped_dat = list(dat_zip)
    dat_zip = ''
    data = [list(i) for i in zipped_dat]
    zipped_dat.clear()

    # Create DataFrame from the data list
    data_df = pd.DataFrame(data, columns=['Station_Name', 'Station_Address'])
    data.clear()

    # Generate a UUID for each row based on the values in 'Station_Name' and 'Station_Address'
    generate_uuid = lambda row: uuid.uuid5(uuid.NAMESPACE_DNS, str(row['Station_Name']) + str(row['Station_Address'])).hex

    # Add new columns to the created DataFrame
    data_df["country"] = ctry
    data_df['id'] = data_df.apply(generate_uuid, axis=1)

    #Rearrange DataFrame
    data_df = data_df[['id', 'Station_Name', 'Station_Address', 'country']]
    
    # Save the Data to DB
    save_data(data_df)
    data_df
    print(data_df)
        

for url, ctry in zip(country_url_list, countries):
    try:
        # Scrape the page and save data
        get_and_save_data(url, ctry)        
        
        # Get and prepare Pagination        
        page_n = get_pagination(url)
        if get_pagi_url_max_num(page_n):
            print('More')
            pgination_url, max_pg = get_pagi_url_max_num(page_n)
            for i in range(2, (max_pg-max_pg) + 4):  # range(2, max_pg+1)
                get_and_save_data(pgination_url+str(i), ctry)
        else:
            print('less')
            continue
        
    except Exception as err:
        print(err)
        

https://www.viamichelin.fr/web/Stations-service?address=Czechia <Response [200]>
48 48
done
                                 id        Station_Name  \
0  0db0c81b1cba5668b692cc09a66d2264           MOL Ruská   
1  ef3a0ce755ce52a8b4e51f15b127281e  EuroOil Havlíčkova   

                   Station_Address country  
0            Ruská, 10000 Praha 10  France  
1  1673 Havlíčkova, 58301 Chotěboř  France  
More
https://www.viamichelin.fr/web/Stations-service/Stations-service-Tchequie?page=2 <Response [200]>
48 48
done
                                 id       Station_Name  \
0  93cdced41f9252f98757d1fce95f6477  EuroOil Břidličná   
1  be43f06ea5835deba9e45c4376fcd489         MOL Karlov   

                     Station_Address country  
0    488 Rýmařovská, 79351 Břidličná  France  
1  1247 Karlov, 59401 Velké Meziříčí  France  
https://www.viamichelin.fr/web/Stations-service/Stations-service-Tchequie?page=3 <Response [200]>
48 48
done
                                 id   Station_Name  \
0 

In [11]:
for url, ctry in zip(country_url_list, countries):
    print(url, ctry)

https://www.viamichelin.fr/web/Stations-service?address=Czechia France


In [13]:
ulr = 'https://www.viamichelin.fr/web/Stations-service?address=Czechia'

In [14]:
r = requests.get(ulr)
r

<Response [200]>

In [15]:
soup = BeautifulSoup(r.content, 'html.parser')
#print(soup.prettify())

In [16]:
type(soup)

bs4.BeautifulSoup

In [17]:
[i.get_text() for i in soup.findAll('div', class_='poi-item-details-address truncate')[:5]]

['Ruská, 10000 Praha 10',
 '1673 Havlíčkova, 58301 Chotěboř',
 '',
 '1452/20 Dvorní, 57101 Moravská Třebová',
 '825/4 Roviny, 64300 Brno-Chrlice']

In [18]:
station_names = soup.findAll('div', class_='poi-item-name truncate')
station_names = [val.get_text() for val in station_names]
station_names[:5]

['MOL Ruská',
 'EuroOil Havlíčkova',
 'Benzina Nová Houžná',
 'OMV Moravská Třebová',
 'AD']

In [19]:
len(station_names)

48

In [124]:
df_data = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
df_data

Unnamed: 0,A,B
0,1,4
1,2,5
2,3,6


In [127]:
print(df_data.loc[0])

A    1
B    4
Name: 0, dtype: int64


In [20]:
type(station_names[3])

str

In [21]:
station_address = soup.findAll('div', class_="poi-item-details-address truncate")
station_address = [val.get_text() for val in station_address]
station_address[:5]

['Ruská, 10000 Praha 10',
 '1673 Havlíčkova, 58301 Chotěboř',
 '',
 '1452/20 Dvorní, 57101 Moravská Třebová',
 '825/4 Roviny, 64300 Brno-Chrlice']

In [22]:
needed_data = [station_names, station_address]
#needed_data

In [23]:
dat_zip = zip(station_names[:2], station_address[:2])
zipped_dat = list(dat_zip)
dat_zip = ''
data = [list(i) for i in zipped_dat]
zipped_dat.clear()
print(data)

[['MOL Ruská', 'Ruská, 10000 Praha 10'], ['EuroOil Havlíčkova', '1673 Havlíčkova, 58301 Chotěboř']]


In [24]:
type(zipped_dat)

list

In [27]:
import uuid
import hashlib

namespace = uuid.UUID('6ba7b810-9dad-11d1-80b4-00c04fd430c8')
input_string = "my_stringt"

uuid_val = uuid.uuid5(namespace, input_string)

print(uuid_val)


d0e00062-2ca8-5e7f-9915-e837894327d2


In [28]:
def generate_id(input_string):
    namespace = uuid.UUID('6ba7b810-9dad-11d1-80b4-00c04fd430c8')
    uuid_val = uuid.uuid5(namespace, input_string)
    return str(uuid_val)

In [29]:
def generate_id(df):
    namespace = uuid.UUID('6ba7b810-9dad-11d1-80b4-00c04fd430c8')
    uuid_val = uuid.uuid5(namespace, str(df['Station_Name'])+str(df['Station_Address']) )
    return str(uuid_val)

In [31]:
uuid_value = generate_id(str(data_df['Station_Name'])+str(data_df['Station_Address']))
uuid_value

NameError: name 'data_df' is not defined

In [172]:
str(uuid_value)

'd98ee33c-9e04-5e58-aae8-4173c4e6842a'

In [32]:
# Generate a UUID for each row based on the values in 'Station_Name' and 'Station_Address'
generate_uuid = lambda row: uuid.uuid5(uuid.NAMESPACE_DNS, str(row['Station_Name']) + str(row['Station_Address'])).hex

data_df = pd.DataFrame(data, columns=['Station_Name', 'Station_Address'])
data_df["country"] = 'France'
data_df['id'] = data_df.apply(generate_uuid, axis=1)

#Rearrange DataFrame
data_df = data_df[['id', 'Station_Name', 'Station_Address', 'country']]
data_df

Unnamed: 0,id,Station_Name,Station_Address,country
0,0db0c81b1cba5668b692cc09a66d2264,MOL Ruská,"Ruská, 10000 Praha 10",France
1,ef3a0ce755ce52a8b4e51f15b127281e,EuroOil Havlíčkova,"1673 Havlíčkova, 58301 Chotěboř",France


In [33]:
data_df = data_df[['id', 'Station_Name', 'Station_Address', 'country']]
data_df

Unnamed: 0,id,Station_Name,Station_Address,country
0,0db0c81b1cba5668b692cc09a66d2264,MOL Ruská,"Ruská, 10000 Praha 10",France
1,ef3a0ce755ce52a8b4e51f15b127281e,EuroOil Havlíčkova,"1673 Havlíčkova, 58301 Chotěboř",France


In [34]:
data_df

Unnamed: 0,id,Station_Name,Station_Address,country
0,0db0c81b1cba5668b692cc09a66d2264,MOL Ruská,"Ruská, 10000 Praha 10",France
1,ef3a0ce755ce52a8b4e51f15b127281e,EuroOil Havlíčkova,"1673 Havlíčkova, 58301 Chotěboř",France


In [35]:
data_df

Unnamed: 0,id,Station_Name,Station_Address,country
0,0db0c81b1cba5668b692cc09a66d2264,MOL Ruská,"Ruská, 10000 Praha 10",France
1,ef3a0ce755ce52a8b4e51f15b127281e,EuroOil Havlíčkova,"1673 Havlíčkova, 58301 Chotěboř",France


In [36]:
snow_engine = create_engine(URL(
    account = config['account'],
    user = config['user'],
    password = config['password'],
    database = config['database'],
    schema = config['schema'],
    warehouse = config['warehouse'],
    role= config['role']
))

snow_connection = snow_engine.connect()
snow_connection

<sqlalchemy.engine.base.Connection at 0x1a2d58fd0>

In [40]:
data_df

Unnamed: 0,id,Station_Name,Station_Address,country
0,0db0c81b1cba5668b692cc09a66d2264,MOL Ruská,"Ruská, 10000 Praha 10",France
1,ef3a0ce755ce52a8b4e51f15b127281e,EuroOil Havlíčkova,"1673 Havlíčkova, 58301 Chotěboř",France


In [None]:
snow_engine

In [64]:
lst = list(data_df['id'])
lst

['0db0c81b1cba5668b692cc09a66d2264', 'ef3a0ce755ce52a8b4e51f15b127281e']

In [None]:
for i in lst

In [93]:
import numpy as np
data_df['country'] = np.where(data_df['id'] == '0db0c81b1cba5668b692cc09a66d2264', 'Czech Republic', data_df['country'])
data_df = data_df.head(1)
data_df

Unnamed: 0,id,Station_Name,Station_Address,country
0,0db0c81b1cba5668b692cc09a66d2264,MOL Ruská,"Ruská, 10000 Praha 10",Czech Republic


In [84]:
data_df[data_df['id'] == '0db0c81b1cba5668b692cc09a66d2264'].country = 'Czech Republic'
data_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_df[data_df['id'] == '0db0c81b1cba5668b692cc09a66d2264'].country = 'Czech Republic'


Unnamed: 0,id,Station_Name,Station_Address,country
0,0db0c81b1cba5668b692cc09a66d2264,MOL Ruská,"Ruská, 10000 Praha 10",France
1,ef3a0ce755ce52a8b4e51f15b127281e,EuroOil Havlíčkova,"1673 Havlíčkova, 58301 Chotěboř",France


In [94]:
data_df

Unnamed: 0,id,Station_Name,Station_Address,country
0,0db0c81b1cba5668b692cc09a66d2264,MOL Ruská,"Ruská, 10000 Praha 10",Czech Republic


In [105]:
target_table = 'viamichelin_stations'
del_id_lst = str(list(data_df['id'])).replace('[','(').replace(']',')')    
try:
    # Delete existing rows to be updated with new version
    del_existing_rows_query = "DELETE FROM " +target_table+ " WHERE id IN " + del_id_lst
    with snow_engine.connect() as conn:
        conn.execute(del_existing_rows_query)
        
    # Save data in the DataFrame    
    data_df.to_sql(target_table, con=snow_engine, schema='public', index=False, if_exists='append')
    print('done')
except Exception as err:
    print(err)    

done


In [107]:
def save_data(data_df):
    target_table = 'viamichelin_stations'
    del_id_lst = str(list(data_df['id'])).replace('[','(').replace(']',')')    
    try:
        # Delete existing rows to be updated with new version
        del_existing_rows_query = "DELETE FROM " +target_table+ " WHERE id IN " + del_id_lst
        with snow_engine.connect() as conn:
            conn.execute(del_existing_rows_query)

        # Save data in the DataFrame    
        data_df.to_sql(target_table, con=snow_engine, schema='public', index=False, if_exists='append')
        print('done')
    except Exception as err:
        print(err)    

In [97]:
str(tuple(data_df['id']))

"('0db0c81b1cba5668b692cc09a66d2264',)"

In [104]:
del_id_lst = str(list(data_df['id'])).replace('[','(').replace(']',')')
del_id_lst

"('0db0c81b1cba5668b692cc09a66d2264')"

In [103]:
str(list(data_df['id'])).replace('[','(').replace(']',')')

"('0db0c81b1cba5668b692cc09a66d2264')"

In [74]:
target_table

'viamichelin_stations'

In [75]:

# MY_DATABASE
try:
    data_df.to_sql(target_table, con=snow_engine, schema='public', index=False, if_exists='append')
    print('done')
except Exception as err:
    print(err)

done


In [47]:
data_df

Unnamed: 0,id,Station_Name,Station_Address,country
0,0db0c81b1cba5668b692cc09a66d2264,MOL Ruská,"Ruská, 10000 Praha 10",France
1,ef3a0ce755ce52a8b4e51f15b127281e,EuroOil Havlíčkova,"1673 Havlíčkova, 58301 Chotěboř",France


In [70]:
%%time
tab = 'sno_engine_conn_insert'
data_df.to_sql(f'{tab}', snow_engine, if_exists='replace', index=False)

CPU times: user 91.6 ms, sys: 14.2 ms, total: 106 ms
Wall time: 3.49 s


In [38]:
page_n = soup.findAll("p", class_="pagination-second-line")
page_n

[<p class="pagination-second-line"> <a data-pagination="20" href="https://www.viamichelin.fr/web/Stations-service/Stations-service-Tchequie?page=20">20</a> <a data-pagination="30" href="https://www.viamichelin.fr/web/Stations-service/Stations-service-Tchequie?page=30">30</a> <a data-pagination="40" href="https://www.viamichelin.fr/web/Stations-service/Stations-service-Tchequie?page=40">40</a> <a data-pagination="50" href="https://www.viamichelin.fr/web/Stations-service/Stations-service-Tchequie?page=50">50</a></p>]

In [104]:
page_n = soup.findAll("p", class_="pagination-second-line")
def get_pagi_url_max_num(page_n):
    pg = BeautifulSoup(str(page_n[0]), 'html.parser')
    pg_a = pg.findAll('a')

    if len(pg_a) < 0:
        pg_a = pg_a[-1]
        pgination_url = pg_a['href']
        pgination_url = pgination_url[0:pgination_url.index('=')+1]
        max_pg = int(pg_a.get_text())
        return pgination_url, max_pg
        

In [105]:
try:
    get_pagi_url_max_num(page_n)
    print('More')
except:
    print('less')

More


In [101]:
if get_pagi_url_max_num(page_n):
    print('More')
    pgination_url, max_pg = get_pagi_url_max_num(page_n)
else:
    print('less')

More


In [None]:
page_n = soup.findAll("p", class_="pagination-second-line")
pg = BeautifulSoup(str(page_n[0]), 'html.parser')
pg_a = pg.findAll('a')

if len(pg_a) > 0:
    pg_a = pg_a[-1]
    pgination_url = pg_a['href']
    pgination_url = pgination_url[0:pgination_url.index('=')+1]
    max_pg = int(pg_a.get_text())
    pgination_url max_pg

In [90]:
#page_n =
pg = BeautifulSoup(str(page_n[0]), 'html.parser')
pg

<p class="pagination-second-line"> <a data-pagination="20" href="https://www.viamichelin.fr/web/Stations-service/Stations-service-Tchequie?page=20">20</a> <a data-pagination="30" href="https://www.viamichelin.fr/web/Stations-service/Stations-service-Tchequie?page=30">30</a> <a data-pagination="40" href="https://www.viamichelin.fr/web/Stations-service/Stations-service-Tchequie?page=40">40</a> <a data-pagination="50" href="https://www.viamichelin.fr/web/Stations-service/Stations-service-Tchequie?page=50">50</a></p>

In [91]:
pg_a = pg.findAll('a')
pg_a

[<a data-pagination="20" href="https://www.viamichelin.fr/web/Stations-service/Stations-service-Tchequie?page=20">20</a>,
 <a data-pagination="30" href="https://www.viamichelin.fr/web/Stations-service/Stations-service-Tchequie?page=30">30</a>,
 <a data-pagination="40" href="https://www.viamichelin.fr/web/Stations-service/Stations-service-Tchequie?page=40">40</a>,
 <a data-pagination="50" href="https://www.viamichelin.fr/web/Stations-service/Stations-service-Tchequie?page=50">50</a>]

In [92]:
if len(pg_a) > 0:
    pg_a = pg_a[-1]
    pgination_url = pg_a['href']
    pgination_url = pgination_url[0:pgination_url.index('=')+1]
    max_pg = int(pg_a.get_text())

In [93]:
pgination_url 

'https://www.viamichelin.fr/web/Stations-service/Stations-service-Tchequie?page='

In [94]:
max_pg

50

In [89]:
pgination_url[0:pgination_url.index('=')+1]

'https://www.viamichelin.fr/web/Stations-service/Stations-service-Tchequie?page='