In [None]:
# Run parameters for this notebook.
# String flag: the scraping cell compares it against the literal "True" to
# decide between reading the static CSV and hitting the live API.
offline_mode = "False"
# How many random grape ids to draw for one scraping run.
n = 20
# Scale for the random grape ids (passed to generate_numbers as max_val).
upper = 1500

In [None]:
import logging
import requests
import pandas as pd
import numpy as np
import random
import pyarrow as pa
import pyarrow.parquet as pq

In [None]:

# Column names of the scraped wine table. The order must match the order of
# the values in each row tuple built by the scraping code.
wine_cols = [
    # identity
    "ID", "Winery", "Name", "Vintage",
    # origin
    "Country", "Region",
    # classification
    "Wine_Style", "Wine_Type", "Wine_Category", "Grape_Type", "Grape_ID",
    # popularity and price
    "Rating", "Review_Count", "Price",
    # taste structure
    "Acidity", "Fizziness", "Intensity", "Sweetness", "Tannin",
    # bookkeeping
    "Scrape_Date",
]

# Unofficial Vivino JSON endpoint used instead of HTML parsing. A single query
# is reported to return at most ~2000 results
# (https://stackoverflow.com/questions/71264253/web-scraping-vivino-using-python),
# so the scrape is restarted per (grape id, wine type) combination.
VIVINO_EXPLORE_URL = "https://www.vivino.com/api/explore/explore"

# Wine type ids: 1 red, 2 white, 3 sparkling, 4 rosé, 7 dessert, 24 fortified.
WINE_TYPE_IDS = (1, 2, 3, 4, 7, 24)

# Seconds to wait for the API before giving up on one page; without a timeout
# a stalled connection would block the whole pipeline run indefinitely.
REQUEST_TIMEOUT_S = 30

REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
    'Accept': 'application/json',
    'Accept-Language': 'en-US,en;q=0.5',
}


def generate_numbers(n, max_val):
    """Draw ``n`` random non-negative integers, denser among small values.

    Half of the draws come from an exponential distribution scaled by
    ``0.5 * floor(0.1 * max_val)`` and the rest from one scaled by
    ``0.5 * max_val``. This lets the pipeline run daily and grow the data set
    randomly while sampling low ids more often than high ones.

    NOTE(review): the exponential distribution is unbounded, so results can be
    0 and can exceed ``max_val``; ``max_val`` only sets the scale.

    Args:
        n: Total number of values to draw.
        max_val: Scale of the "large" half of the draws.

    Returns:
        A list of ``n`` Python ints (duplicates are possible).
    """
    low_scale = int(np.floor(max_val * 0.1))
    small_count = int(np.floor(n * 0.5))
    # Give the remainder to the second half so an odd n still yields n values.
    large_count = int(n) - small_count

    small_numbers = np.random.exponential(scale=1.0, size=small_count)
    small_numbers = np.floor(small_numbers * 0.5 * low_scale).astype(int)
    large_numbers = np.random.exponential(scale=1.0, size=large_count)
    large_numbers = np.floor(large_numbers * 0.5 * max_val).astype(int)

    return [int(value) for value in np.concatenate((small_numbers, large_numbers))]


def _fetch_page(grape_id, wine_type_id, page):
    """Request one result page and return the decoded JSON, or None on failure.

    Network errors, non-2xx responses and undecodable bodies are logged and
    reported as None so that one bad page does not abort the whole run.
    """
    try:
        response = requests.get(
            VIVINO_EXPLORE_URL,
            # Filters seen in earlier experiments but not used here:
            # country_code, country_codes[], grape_filter, order_by, order.
            params={
                'grape_ids[]': grape_id,
                "currency_code": "EUR",
                "min_rating": "1",
                "page": page,
                "price_range_max": "1500",
                "price_range_min": "0",
                "wine_type_ids[]": wine_type_id,
                "language": "en",
                "per_page": 50,
            },
            headers=REQUEST_HEADERS,
            timeout=REQUEST_TIMEOUT_S,
        )
        response.raise_for_status()
        return response.json()
    except (requests.RequestException, ValueError) as exc:
        logging.warning(
            "Skipping grape %s, wine type %s, page %s: %s",
            grape_id, wine_type_id, page, exc,
        )
        return None


def _parse_page(payload):
    """Turn one decoded API response into a list of row tuples.

    The tuple layout follows ``wine_cols``. Records with a missing field
    (for example no style or no taste structure) are skipped individually
    instead of discarding the whole page.
    """
    try:
        # The first selected filter is read as the grape the query asked for.
        grape = payload["selected_filters"][0]["items"][0]
        grape_name = grape["name"]
        grape_id = grape["id"]
        matches = payload["explore_vintage"]["matches"]
    except (KeyError, IndexError, TypeError) as exc:
        logging.warning("Unexpected response layout, page skipped: %r", exc)
        return []

    scrape_date = pd.to_datetime('today').strftime("%d-%m-%Y")
    rows = []
    skipped = 0
    for match in matches or []:
        try:
            vintage = match["vintage"]
            wine = vintage["wine"]
            region = wine["region"]
            style = wine["style"]
            structure = wine["taste"]["structure"]
            statistics = vintage["statistics"]
            rows.append((
                f'{wine["name"]} {vintage["year"]}',  # ID
                wine["winery"]["name"],               # Winery
                wine["name"],                         # Name
                vintage["year"],                      # Vintage
                region["country"]["name"],            # Country
                region["name"],                       # Region
                style["seo_name"],                    # Wine_Style
                style["varietal_name"],               # Wine_Type
                wine["type_id"],                      # Wine_Category
                grape_name,                           # Grape_Type
                grape_id,                             # Grape_ID
                statistics["ratings_average"],        # Rating
                statistics["ratings_count"],          # Review_Count
                match["price"]["amount"],             # Price
                structure["acidity"],                 # Acidity
                structure["fizziness"],               # Fizziness
                structure["intensity"],               # Intensity
                structure["sweetness"],               # Sweetness
                structure["tannin"],                  # Tannin
                scrape_date,                          # Scrape_Date
            ))
        except (KeyError, IndexError, TypeError):
            skipped += 1
    if skipped:
        logging.debug("Skipped %d incomplete wine records", skipped)
    return rows


if offline_mode == "True":
    logging.info("Reading from static file")
    file_path = "/opt/airflow/resources/Scraped_12k.csv"
    wine_df = pd.read_csv(file_path, sep=",")
else:
    # Drop repeated ids (keeping first-seen order) so the same grape is not
    # scraped, and its wines not stored, more than once per run.
    random_grapes = list(dict.fromkeys(generate_numbers(int(n), int(upper))))

    temp_df = []  # flat list of row tuples, one per wine

    number_of_pages = 10

    for grape_id in random_grapes:
        for wine_type_id in WINE_TYPE_IDS:
            logging.info(f"Scraping grape {grape_id} and wine type {wine_type_id}")
            # Pages are 1-based; include the last page so that exactly
            # number_of_pages pages are requested.
            for page in range(1, number_of_pages + 1):
                payload = _fetch_page(grape_id, wine_type_id, page)
                if payload is None:
                    continue
                temp_df.extend(_parse_page(payload))

    wine_df = pd.DataFrame(temp_df, columns=wine_cols)

wine_df.to_parquet('/opt/airflow/resources/wine_data_raw.parquet', engine='fastparquet')
logging.info(f"Scraped {len(wine_df)} wines")