In [None]:
!pip install wikipedia-api
!pip install requests requests-cache

In [2]:
import os
import re
from concurrent.futures import ThreadPoolExecutor

import numpy as np
import pandas as pd
import requests
import requests_cache
import wikipediaapi
from bs4 import BeautifulSoup

In [None]:
# Root folder for the notebook's input data, relative to the working directory.
DATA_PATH = "./data"
# Sub-folder of DATA_PATH that holds the Olympic dataset files.
# Relies on `os` being imported in the notebook's import cell.
OLYMPIC_DATA_PATH = os.path.join(DATA_PATH, "olympic_sw_1896_2022")

In [3]:
# Read olympic_athletes.csv from the Olympic dataset folder into a DataFrame.
df = pd.read_csv(
    os.path.join(OLYMPIC_DATA_PATH, "olympic_athletes.csv")
)

In [None]:
import time

# Install a global SQLite-backed cache named "wikipedia_cache" for `requests`;
# cached responses expire after 24 hours (86400 seconds).
requests_cache.install_cache(
    "wikipedia_cache",
    backend="sqlite",
    expire_after=24 * 60 * 60,
)


def split_name(athlete_name):
    """Keep only the first two whitespace-separated words of a name.

    Args:
        athlete_name: Name string to shorten.

    Returns:
        The first two words joined by a single space. If the name has fewer
        than two words, the input string is returned exactly as given
        (surrounding whitespace included).
    """
    tokens = athlete_name.split()
    if len(tokens) < 2:
        return athlete_name
    return " ".join(tokens[:2])


def normalize_name(athlete_name):
    """Turn an athlete name into an underscore-separated, title-cased string.

    The name is first shortened with ``split_name``, then title-cased, and
    finally every space is replaced by an underscore.

    Args:
        athlete_name: Name string to normalize.

    Returns:
        The normalized string, e.g. ``"Usain_Bolt"`` for ``"usain BOLT"``.
    """
    return split_name(athlete_name).title().replace(" ", "_")


def get_wiki_page(athlete_name):
    """Look up the English Wikipedia page for an athlete.

    The name is normalized with ``normalize_name`` and used as the page title.

    Args:
        athlete_name: Athlete name string.

    Returns:
        The page's full URL if the page exists, otherwise ``None``.
    """
    page_title = normalize_name(athlete_name)
    # A new client is built on every call, identified by the project's user agent.
    client = wikipediaapi.Wikipedia("DataVisPrj (email@gmail.com)", "en")
    candidate = client.page(page_title)
    return candidate.fullurl if candidate.exists() else None


def extract_birth_country(wikipedia_url, timeout=10):
    """Scrape the birth country from the infobox of a Wikipedia article.

    The page is downloaded, the first ``table`` with class ``infobox`` is
    located, and the text of the element with class ``birthplace`` inside it
    is split on commas. The last comma-separated piece, with bracketed
    markers such as ``[1]`` removed, is returned as the country.

    Args:
        wikipedia_url: URL of the article to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the caller indefinitely.

    Returns:
        The country string, or ``None`` when the request fails, the response
        status is not 200, the page has no infobox or birthplace element, or
        the birthplace text contains no comma.
    """
    # NOTE(review): no User-Agent header is sent here, unlike the wikipediaapi
    # client used elsewhere in this notebook -- confirm Wikipedia accepts the
    # default `requests` user agent for these page fetches.
    try:
        response = requests.get(wikipedia_url, timeout=timeout)
    except requests.RequestException:
        # Treat a network failure like any other "no data" outcome so a single
        # bad request does not abort a long batch of lookups.
        return None
    if response.status_code != 200:
        return None

    soup = BeautifulSoup(response.content, "html.parser")
    infobox = soup.find("table", {"class": "infobox"})
    if infobox is None:
        return None

    birthplace_info = infobox.find(class_="birthplace")
    if birthplace_info is None:
        return None

    parts = birthplace_info.get_text().split(",")
    if len(parts) < 2:
        # A birthplace without a comma cannot be split into place and country.
        return None

    # Drop bracketed markers (e.g. "[1]") from the last comma-separated piece.
    return re.sub(r"\[.*?\]", "", parts[-1].strip()).strip()


def get_athlete_birth_country(athlete_name):
    """Resolve an athlete's birth country via their Wikipedia page.

    Args:
        athlete_name: Athlete name. Values that are not strings (for example
            the float NaN pandas uses for missing CSV cells, or ``None``) and
            blank strings are treated as "unknown".

    Returns:
        The birth country reported by ``extract_birth_country``, or ``None``
        when the name is missing/blank or no Wikipedia page was found.
    """
    # Guard against missing or blank names: they would otherwise fail inside
    # the name normalization (``.split()`` on a non-string) or trigger a
    # pointless lookup for an empty title.
    if not isinstance(athlete_name, str) or not athlete_name.strip():
        return None

    wikipedia_url = get_wiki_page(athlete_name)
    if not wikipedia_url:
        return None
    return extract_birth_country(wikipedia_url)


def fetch_countries_parallel(athletes, max_workers=1):
    """Look up the birth country of every athlete using a thread pool.

    Args:
        athletes: Iterable of athlete names.
        max_workers: Number of worker threads. The default of 1 keeps the
            original behaviour of issuing lookups one at a time; raise it to
            run lookups concurrently.

    Returns:
        A list with one entry per input name, in input order, holding the
        result of ``get_athlete_birth_country`` for that name.
    """
    # Materialize once so any iterable (including a generator) is accepted and
    # the empty case can be answered without starting a thread pool.
    athletes = list(athletes)
    if not athletes:
        return []

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Executor.map yields results in the order of the inputs.
        return list(executor.map(get_athlete_birth_country, athletes))

In [5]:
# Plain Python list of the values in the "athlete_full_name" column, in row order.
athlete_names_list = df["athlete_full_name"].to_list()

In [None]:
# Time the full birth-country lookup over every athlete name.
start_time = time.perf_counter()
birth_countries = fetch_countries_parallel(athlete_names_list)
end_time = time.perf_counter()
# Difference of two perf_counter readings: elapsed time in (fractional) seconds.
print("time ", end_time - start_time)

In [None]:
# Store the lookup results as a new "birth_country" column (mutates `df` in place).
# `birth_countries` was produced from `athlete_names_list`, which was taken from
# df["athlete_full_name"], so the values are expected to line up with the rows.
df["birth_country"] = birth_countries