# Scraping the vintage charts of Wine Spectator

We scrape the website winespectator.com and its subpages and combine all the information into a single structured JSON file.

In [None]:
import json
import os
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup, NavigableString


# Scraped result: {country: {region: [header_row, data_row, ...]}}.
final_data = {}

base = "https://www.winespectator.com"

# Seconds to wait for the server before giving up; without a timeout a
# stalled connection would block the scrape forever.
REQUEST_TIMEOUT = 30

# CSS class strings used to locate elements on the site.
TABLE_CLASS = 'table table-condensed'
CATEGORY_TILE_CLASS = 'col-3 p-8'
REGION_ENTRY_CLASS = (
    'border-t-0 border-b border-l-0 border-r-0 border-solid col-12 '
    'border-grey-light p-15'
)
OUTPUT_PATH = 'data/data.json'


def _fetch_soup(url):
    """Download *url* and return its HTML parsed with BeautifulSoup."""
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    return BeautifulSoup(response.text, 'html.parser')


def _linked_urls(soup, div_class):
    """Yield the absolute URL of the first link in every matching <div>.

    Divs without an <a href=...> are skipped instead of raising.
    """
    for div in soup.find_all('div', class_=div_class):
        anchor = div.find('a')
        if anchor is None or not anchor.get('href'):
            continue
        yield base + anchor['href']


def _cell_text(cell):
    """Return the text of one table cell as a single string.

    The result is the cell's direct text nodes, the text of its last <a>,
    the text of its last <span> and the text of its first <p>, joined by
    single spaces (missing parts contribute an empty string) and stripped.
    """
    direct_text = ' '.join(
        node.strip()
        for node in cell.contents
        if isinstance(node, NavigableString)
    )
    anchors = cell.find_all('a')
    spans = cell.find_all('span')
    paragraph = cell.find('p')
    pieces = [
        direct_text,
        anchors[-1].text.strip() if anchors else '',
        spans[-1].text.strip() if spans else '',
        paragraph.text.strip() if paragraph is not None else '',
    ]
    return ' '.join(pieces).strip()


def _extract_table(soup):
    """Return the vintage table on a page as a list of rows.

    The first entry is the header (the <th> texts of the first <tr>); the
    remaining entries hold the cell texts of each row, ignoring cells
    carrying the "d-md-none" class. Empty rows are dropped. A fresh list is
    built on every call so data from one page can never leak into another.
    Returns an empty list when the page has no table or the table has no
    rows.
    """
    table = soup.find('table', class_=TABLE_CLASS)
    if table is None:
        return []
    rows = table.find_all('tr')
    if not rows:
        return []

    vintage_list = [[th.text.strip() for th in rows[0].find_all('th')]]
    for row in rows:
        cells = [
            td for td in row.find_all('td')
            if 'd-md-none' not in td.get('class', [])
        ]
        vintage_list.append([_cell_text(td) for td in cells])

    # The header row yields no <td> cells, so it shows up as an empty list
    # in the loop above; filter those out.
    return [row for row in vintage_list if row]


def _store(country, region, rows):
    """Record *rows* under final_data[country][region]."""
    final_data.setdefault(country, {})[region] = rows


# Walk every category tile on the vintage-chart landing page.
index_soup = _fetch_soup(base + "/vintage-charts")

for category_url in _linked_urls(index_soup, CATEGORY_TILE_CLASS):
    category_soup = _fetch_soup(category_url)

    if category_soup.find(class_=TABLE_CLASS) is None:
        # No table here: the page is an index of regions, each of which
        # links to its own page holding the table.
        for region_url in _linked_urls(category_soup, REGION_ENTRY_CLASS):
            region_soup = _fetch_soup(region_url)

            country = region_soup.find(
                'a', class_='text-black text-decoration-none hover:underline'
            ).text

            # The region name is the first link after the chevron span.
            chevron = region_soup.find(
                'span', class_="glyphicon glyphicon-chevron-right"
            )
            region = chevron.find_next('a').text

            _store(country, region, _extract_table(region_soup))
    else:
        # The category page carries the table itself; country and region
        # are read from its title, split on "/" or ": ".
        title = category_soup.find(class_='md:text-3xl')
        title_text = title.find().get_text(strip=True)

        parts = re.split(r'/|: ', title_text)
        country = parts[0]
        # A title without a separator has no region; the None key is
        # written by json.dump as "null".
        region = parts[1] if len(parts) > 1 else None

        _store(country, region, _extract_table(category_soup))


# Write the data to a JSON file, creating the target directory first so a
# missing folder does not discard the whole scrape at the very end.
output_dir = os.path.dirname(OUTPUT_PATH)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
    json.dump(final_data, f)