In [1]:
# Load medals.csv and print the head
from pathlib import Path

try:
    import pandas as pd
except Exception as e:
    raise ImportError("pandas is required to load CSV files. Please install pandas in this notebook kernel.") from e

# Resolve candidate paths relative to the notebook working directory
nb_dir = Path.cwd()
candidates = [
    nb_dir / "medals.csv",
    nb_dir.parent / "medals.csv",
    nb_dir / "data" / "medals.csv",
    nb_dir.parent / "data" / "medals.csv",
]

csv_path = next((p for p in candidates if p.exists()), None)
if csv_path is None:
    tried = "\n".join(str(p) for p in candidates)
    raise FileNotFoundError(f"Could not find medals.csv. Tried:\n{tried}")

df = pd.read_csv(csv_path)
print(f"Loaded: {csv_path}")
print(df.head())

Loaded: c:\Tomas\Materials\Talks\2025\simple-data-exploration\code\medals.csv
           Games  Year     Sport Discipline            Athlete Team Gender  \
0  Athens (1896)  1896  Aquatics   Swimming       Alfred Hajos  HUN    Men   
1  Athens (1896)  1896  Aquatics   Swimming    Otto Herschmann  AUT    Men   
2  Athens (1896)  1896  Aquatics   Swimming   Dimitrios Drivas  GRE    Men   
3  Athens (1896)  1896  Aquatics   Swimming  Ioannis Malokinis  GRE    Men   
4  Athens (1896)  1896  Aquatics   Swimming  Spiridon Chasapis  GRE    Men   

                            Event   Medal  Gold  Silver  Bronze  
0              100m freestyle men    Gold     1       0       0  
1              100m freestyle men  Silver     0       1       0  
2  100m freestyle for sailors men  Bronze     0       0       1  
3  100m freestyle for sailors men    Gold     1       0       0  
4  100m freestyle for sailors men  Silver     0       1       0  


In [None]:
# Aggregate gold medals per athlete and show top
from pathlib import Path
import pandas as pd

# Ensure df is available; if not, try loading medals.csv
if "df" not in globals():
    nb_dir = Path.cwd()
    candidates = [
        nb_dir / "medals.csv",
        nb_dir.parent / "medals.csv",
        nb_dir / "data" / "medals.csv",
        nb_dir.parent / "data" / "medals.csv",
    ]
    csv_path = next((p for p in candidates if p.exists()), None)
    if csv_path is None:
        tried = "\n".join(str(p) for p in candidates)
        raise FileNotFoundError(f"Could not find medals.csv. Tried:\n{tried}")
    df = pd.read_csv(csv_path)

# Create gold indicator if missing
if "Gold" not in df.columns:
    if "Medal" in df.columns:
        df = df.assign(Gold=(df["Medal"].astype(str).
               str.lower() == "gold").astype(int))
    else:
        raise KeyError("Dataset must contain 'Gold' or 'Medal'.")

top_gold = (
    df.groupby("Athlete", as_index=False)["Gold"]
      .sum()
      .sort_values("Gold", ascending=False)
)

print("Top 10 athletes by number of gold medals:")
print(top_gold.head(10).to_string(index=False))



Top 10 athletes by number of gold medals:
        Athlete  Gold
 Michael Phelps    23
     Mark Spitz     9
    Paavo Nurmi     9
Larisa Latynina     9
     Carl Lewis     9
     Usain Bolt     9
       Ray Ewry     8
 Birgit Fischer     8
 Jenny Thompson     8
 Matthew Biondi     8
