In [7]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
from io import StringIO
from collections import defaultdict
import json

# Browser-like User-Agent sent with every request below.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}

# Seconds to wait for the server before giving up; without a timeout
# requests.get can block indefinitely and stall the whole notebook run.
REQUEST_TIMEOUT = 30

# Scrape Men's data
url_men = "https://en.wikipedia.org/wiki/List_of_Grand_Slam_men%27s_singles_champions"
response_men = requests.get(url_men, headers=headers, timeout=REQUEST_TIMEOUT)
# Fail here on a 4xx/5xx response instead of silently parsing an error page.
response_men.raise_for_status()
soup_men = BeautifulSoup(response_men.content, 'html.parser')

# Scrape Women's data
url_women = "https://en.wikipedia.org/wiki/List_of_Grand_Slam_women%27s_singles_champions"
response_women = requests.get(url_women, headers=headers, timeout=REQUEST_TIMEOUT)
# Same status check as for the men's page.
response_women.raise_for_status()
soup_women = BeautifulSoup(response_women.content, 'html.parser')

In [11]:
def extract_champions_with_countries(soup, table_index, gender, min_year=1968):
    """Collect, per year, the flag label found in each Grand Slam column.

    Args:
        soup: Parsed page; only ``find_all('table', class_='wikitable')`` is
            called on it.
        table_index: Position of the wanted table among the page's
            ``wikitable`` tables.
        gender: Value stored under the ``'Gender'`` key of every record.
        min_year: Rows whose year is lower than this are dropped. The default
            of 1968 keeps only the Open Era.

    Returns:
        A list of dicts, one per accepted row, with the keys ``'Year'``,
        ``'Gender'``, ``'Australian Open'``, ``'French Open'``,
        ``'Wimbledon'`` and ``'US Open'``. Each tournament value is the
        ``alt`` text of the first ``<img>`` in that cell, or ``None`` when the
        cell has no image carrying an ``alt`` attribute.

    Raises:
        IndexError: If ``table_index`` does not address one of the tables
            found on the page.
    """
    # Column order after the year cell: cells[1] .. cells[4].
    tournaments = ('Australian Open', 'French Open', 'Wimbledon', 'US Open')

    tables = soup.find_all('table', class_='wikitable')
    try:
        table = tables[table_index]
    except IndexError:
        raise IndexError(
            "table_index {} is out of range: page has {} wikitable table(s)".format(
                table_index, len(tables)
            )
        ) from None

    data = []
    for row in table.find_all('tr')[1:]:  # Skip header
        cells = row.find_all(['td', 'th'])
        # Need the year cell plus four tournament cells.
        if len(cells) < 5:
            continue

        try:
            year = int(cells[0].get_text().strip())
        except ValueError:
            # First cell is not a plain integer year; skip the row.
            # Only ValueError is caught so interrupts and real bugs surface.
            continue

        if year < min_year:
            continue

        row_data = {'Year': year, 'Gender': gender}
        for tournament, cell in zip(tournaments, cells[1:5]):
            # The country is read from the alt text of the cell's first image.
            img = cell.find('img')
            has_alt = img is not None and 'alt' in img.attrs
            row_data[tournament] = img['alt'] if has_alt else None

        data.append(row_data)

    return data

men_data = extract_champions_with_countries(soup_men, 2, "Men")
women_data = extract_champions_with_countries(soup_women, 1, "Women")

# One list holding the men's records followed by the women's
combined_data = men_data + women_data

# The four tournament keys present in every record
slam_names = ('Australian Open', 'French Open', 'Wimbledon', 'US Open')

# Collapse every label that mentions Yugoslavia into the single name 'Yugoslavia'
for record in combined_data:
    for slam in slam_names:
        label = record.get(slam)
        if label and 'Yugoslavia' in label:
            record[slam] = 'Yugoslavia'

# Tally titles as counts[country][tournament][gender]
country_tournament_gender_counts = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))

for record in combined_data:
    record_gender = record['Gender']
    for slam in slam_names:
        winner_country = record.get(slam)
        if not winner_country:
            # No flag label was found for this tournament in this year
            continue
        country_tournament_gender_counts[winner_country][slam][record_gender] += 1

# Flatten the nested tally into one record per (country, tournament, gender)
heatmap_data = [
    {
        'Country': country_name,
        'Tournament': slam,
        'Gender': gender_label,
        'Titles': title_count,
    }
    for country_name, by_slam in country_tournament_gender_counts.items()
    for slam, by_gender in by_slam.items()
    for gender_label, title_count in by_gender.items()
]

# Write the flattened records to disk as JSON
with open('grand_slam_heatmap_data.json', 'w') as out_file:
    json.dump(heatmap_data, out_file, indent=2)