In [1]:
import pandas as pd
import numpy as np

In [19]:
df = pd.read_csv(
    "../data/olympic_athlete_events.csv",
    usecols=[
        "Age",
        "Height",
        "Team",
        "Year",
        "Season",
        "City",
        "Sport",
        "Event",
        "Medal",
    ],
    index_col=["Year", "Season", "Sport", "Event"],
)
df = df.sort_index()
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Age,Height,Team,City,Medal
Year,Season,Sport,Event,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1896,Summer,Athletics,"Athletics Men's 1,500 metres",24.0,,United States,Athina,Silver
1896,Summer,Athletics,"Athletics Men's 1,500 metres",,,Greece,Athina,
1896,Summer,Athletics,"Athletics Men's 1,500 metres",22.0,,Australia,Athina,Gold
1896,Summer,Athletics,"Athletics Men's 1,500 metres",23.0,154.0,Germany,Athina,
1896,Summer,Athletics,"Athletics Men's 1,500 metres",21.0,,Greece,Athina,


# Olympic questions
1. What is the average age of winning athletes in summer games held from 1936 to 2000?
2. What team has won the most medals in all archery events?
3. Starting in 1980, what is the average height of the Table Tennis Women's Team event?
4. Starting in 1980, what is the average height of the Men's and Women's Table Tennis event?
5. How tall was the tallest ever tennis player from 1980 to 2016?

In [None]:
# average age of all winning athletes in summer games from 1936 to 2000
(df.loc[(slice(1936, 2000), "Summer"), "Age"]).mean()

np.float64(25.026883940421765)

In [None]:
# what team has won the most medals in all archery events?
(
    df.dropna(subset="Medal").loc[  # only want medal winners, so drop NaNs
        (slice(None), slice(None), "Archery"), "Team"
    ]
).value_counts().head(3)

Team
South Korea    69
Belgium        52
France         48
Name: count, dtype: int64

In [None]:
# starting in 1980 what is the average height of the table tennis women's team event?
df.loc[
    (slice(1980, None), "Summer", "Table Tennis", "Table Tennis Women's Team"), "Height"
].mean()

# could be simplified using the `.xs` cross section method
# cross section grabs all rows that match that criteria
df.xs("Table Tennis Women's Team", level="Event")["Height"].mean()

# if multiple parts of the index are being used these are passed in order, e.g.
# df.xs(("Summer", "Table Tennis Women's Team"), level=("Season", "Event"))["Height"].mean()

np.float64(165.04827586206898)

In [63]:
# Starting in 1980 men and women's table tennis team average heights
df.loc[
    (
        slice(1980, None),
        "Summer",
        "Table Tennis",
        ["Table Tennis Women's Team", "Table Tennis Men's Team"],
    ),
    "Height",
].mean()


np.float64(171.26643598615917)

In [None]:
# tallest ever tennis player from 1980 to 2016
df.loc[(slice(None), slice(None), "Tennis"), "Height"].max()

# syntactic sugar over slice()
from pandas import IndexSlice as idx

df.loc[idx[:, :, "Tennis"], "Height"].max()

np.float64(208.0)

# Extension questions
1. Events occur in Summer or Winter but not both. Remove the "Season" level from the index and then find (again) the tallest tennis player between 1980 and 2016
2. In which city were the most gold medals awarded from 1980 onwards?
3. How many gold medals were received by the United States since 1980? (Use the index to select the values)

In [None]:
# 1. remove the Season level from the index
df = df.reset_index().set_index(["Year", "Sport", "Event"])
# you can just remove one part of the index by passing it to reset_index: `.reset_index("Season")`

In [67]:
# Find the tallest tennis player from 1980 to 2016
df.xs("Tennis", level="Sport")["Height"].max()

np.float64(208.0)

In [None]:
# 2. Which city was the most gold medals awarded from 1980 onwards?
(
    df.loc[1980:]
    .reset_index()
    .set_index("Medal")
    .xs("Gold")["City"]
    .value_counts()
    .head(3)
)
# this works, forgot to add the slice for the year, but is a bit more complex than this:
# df.loc[1980:].loc[lambda df_: df_["Medal"] == "Gold", "City"].value_counts()

City
Beijing           671
Rio de Janeiro    665
Athina            664
Name: count, dtype: int64

In [None]:
# 3. How many gold medals were received by the United States since 1980?
(df.loc[(slice(1980, None)), ["Team", "Medal"]]).query(
    "Team == 'United States' & Medal == 'Gold'"
).count()

# the book's version is this - honestly the query method is easier to understand
# df.loc[1980:].loc[lambda df_: (df_['Team'] == 'United States') & (df_['Medal'] == 'Gold'), 'City'].count()

np.int64(1257)