In [5]:
import requests
import pandas as pd
import numpy as np
import random
import pyarrow as pa
import pyarrow.parquet as pq

wine_cols = ["ID", "Winery","Name", "Vintage","Country", "Region", "Wine_Style", "Wine_Type", "Wine_Category", "Grape_Type",
            "Grape_ID", 'Rating', 'Review_Count','Price', 'Acidity', "Fizziness", "Intensity", "Sweetness", "Tannin", "Scrape_Date"]

#  This way we can run the pipeline daily and grow naturally and randomly
# But lower numbers are more common than higher numbers
# thus we need a higher density among the lower numbers
def generate_numbers(n, max_val):
    min = np.floor(max_val*0.1).astype(int)
    small_numbers = np.random.exponential(scale=1.0, size=np.floor(n*0.5).astype(int))
    small_numbers = np.floor(small_numbers * 0.5*min).astype(int)
    large_numbers = np.random.exponential(scale=1.0, size=np.floor(n*0.5).astype(int))
    large_numbers = np.floor(large_numbers * 0.5*max_val).astype(int)
    numbers = np.concatenate((small_numbers, large_numbers))
    return list(numbers)

random_grapes = generate_numbers(20, 1500)

temp_df = []

number_of_pages = 15

for y in random_grapes: #y range is the number of grape types (up to 200)  
    for z in [1,2,3,4,7,24]: # z is the wine type (1: red, 2: white, 3: sparkling, 4: rosé 7: dessert wine 24: fortified wine) 
        for x in range(1, number_of_pages): # x range is the number of pages (up to ?? - depends on grape)  
            # instead of parsing we found a somewhat unofficial API that we can use to get the data
            # But normally one would only get 2000 results (https://stackoverflow.com/questions/71264253/web-scraping-vivino-using-python)
            # thats why we analyzed all the data one can use as payload to design restarts for the random walk of API scraping
            
            r = requests.get(
            "https://www.vivino.com/api/explore/explore",
            params = {
                #"country_code": "en",
                'grape_ids[]':y,
                #"country_codes[]":["pt", "es", "fr", "de"],
                "currency_code":"EUR",
                #"grape_filter":"varietal",
                "min_rating":"1",
                #"order_by":"price", #  "ratings_average"
                #"order":"asc",
                "page": x,
                "price_range_max":"1500",
                "price_range_min":"0",
                "wine_type_ids[]":z,
                "language":"en",
                "per_page":50
            },
                headers= {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",  
                'Accept': 'application/json',
                'Accept-Language': 'en-US,en;q=0.5',
            }

            )
            try:
                    
                results = [
                    (
                    f'{t["vintage"]["wine"]["name"]} {t["vintage"]["year"]}',#ID
                    t["vintage"]["wine"]["winery"]["name"], #winery
                    t["vintage"]["wine"]["name"], #Name
                    t["vintage"]["year"], #Vintage
                    t["vintage"]["wine"]["region"]["country"]["name"], #Country
                    t["vintage"]["wine"]["region"]["name"], #region
                    t["vintage"]["wine"]["style"]["seo_name"], # wine style
                    t["vintage"]["wine"]["style"]["varietal_name"], # wine type
                    t["vintage"]["wine"]["type_id"], #wine type by id
                    r.json()["selected_filters"][0]["items"][0]["name"], # grape type
                    r.json()["selected_filters"][0]["items"][0]["id"], # grape id
                    t["vintage"]["statistics"]["ratings_average"], #rating
                    t["vintage"]["statistics"]["ratings_count"],# number of ratings
                    t["price"]["amount"],#price
                    t["vintage"]["wine"]["taste"]["structure"]["acidity"], # wine dimensions 1
                    t["vintage"]["wine"]["taste"]["structure"]["fizziness"],# wine dimensions 2
                    t["vintage"]["wine"]["taste"]["structure"]["intensity"], # wine dimensions 3
                    t["vintage"]["wine"]["taste"]["structure"]["sweetness"],# wine dimensions 4
                    t["vintage"]["wine"]["taste"]["structure"]["tannin"],    # wine dimensions 5
                    # add scrape date as date
                    pd.to_datetime('today').strftime("%d-%m-%Y")
                    )
                    for t in r.json()["explore_vintage"]["matches"]
                    ]
                temp_df.append(results)
            except:
                    pass

if all(isinstance(i, list) for i in temp_df):
    temp_df = [item for sublist in temp_df for item in sublist]  # Flatten the list of lists
    wine_df = pd.DataFrame(temp_df, columns=wine_cols)


In [1]:
wine_df

NameError: name 'wine_df' is not defined

In [104]:
# map wine type id to wine type
wine_df["Wine_Category"] = wine_df["Wine_Category"].replace({1: "Red", 2: "White", 3: "Sparkling", 4: "Rosé", 7: "Dessert Wine", 24: "Fortified Wine"})

# Remove duplicates
wine_df = wine_df.drop_duplicates(subset=['ID'])

In [105]:
#cleaning region data
wine_df['Region'] = wine_df['Region'].str.replace('Grand Cru', '')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wine_df['Region'] = wine_df['Region'].str.replace('Grand Cru', '')


In [106]:
#clean NaN values
wine_df['Rating'] = wine_df['Rating'].replace(0, np.nan)
wine_df['Vintage'] = wine_df['Vintage'].replace("N.V.", np.nan)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wine_df['Rating'] = wine_df['Rating'].replace(0, np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wine_df['Vintage'] = wine_df['Vintage'].replace("N.V.", np.nan)


In [121]:
# drop rows of wine_df['Vintage'] with value '' (empty string)
wine_df = wine_df[wine_df['Vintage'] != '']
wine_df['Vintage'] = wine_df['Vintage'].fillna(0)
wine_df['Vintage'].unique()

array([2016, 2017, 2003, 2019, 1998, 2018, 2007, 2013, 2021, 2020, 2022,
       2015, 2006, 2014, 2011, 2008, 2012, 2010, 2004, 1993, 2000, 1988,
       1997, 1990, 1989, 1992, 2009, 2001, 1999, 2005, 1996, 1980, 2002,
       1995,    0, 1987, 1994, 1966, 1975, 1985, 1991, 1968, 1934, 1952,
       1965, 1969, 1963, 1941, 1940, 1978, 1944, 1967, 1982, 1977, 1962,
       1928, 1947, 1896, 1919, 1920, 1924, 1936, 1927, 1899, 1932, 1922,
       1937, 1983, 1986, 1976, 1935, 1950, 1970, 1959, 1972, 1961, 1949,
       1981, 1979, 1971, 1948, 1925, 1938, 1913, 1908, 1933, 1926, 1891,
       1956, 1939, 1954, 1984, 1929, 1921, 1953, 1955, 1964, 1973, 2023])

In [122]:


# redefine column types for parquet (otherwise it will be object)
wine_df['Vintage'] = wine_df['Vintage'].astype(int)
wine_df['Grape_ID'] = wine_df['Grape_ID'].astype(int)
wine_df['Rating'] = wine_df['Rating'].astype(float)
wine_df['Review_Count'] = wine_df['Review_Count'].astype(int)
wine_df['Price'] = wine_df['Price'].astype(float)
wine_df['Acidity'] = wine_df['Acidity'].astype(float)
wine_df['Fizziness'] = wine_df['Fizziness'].astype(float)
wine_df['Intensity'] = wine_df['Intensity'].astype(float)
wine_df['Sweetness'] = wine_df['Sweetness'].astype(float)
wine_df['Tannin'] = wine_df['Tannin'].astype(float)

wine_df.dtypes


ID                object
Winery            object
Name              object
Vintage            int64
Country           object
Region            object
Wine_Style        object
Wine_Type         object
Wine_Category     object
Grape Type        object
Grape_ID           int64
Rating           float64
Review_Count       int64
Price            float64
Acidity          float64
Fizziness        float64
Intensity        float64
Sweetness        float64
Tannin           float64
dtype: object

In [123]:
wine_df.to_parquet('data.parquet', engine='fastparquet')