In [None]:
import polars as pl
from rich import print

# Load the all-time athletics results from the parquet snapshot.
df = pl.read_parquet("../data/alltime_athletics_version_2024-07-09.parquet")

# Keyword arguments shared by the plotting calls in the cells below
# (figure width/height and grid visibility).
plot_kwargs = dict(width=1000, height=500, grid=True)

# What events do we have data for?


In [None]:
# Distinct event names recorded for each sex. Nulls are dropped so the set
# differences below only ever contain real event names.
# NOTE(review): assumes "event" is a string column — confirm.
events_by_sex = {
    sex: set(df.filter(pl.col("sex") == sex)["event"].drop_nulls().to_list())
    for sex in ("female", "male")
}

# `unique()` gives no ordering guarantee, so every list is sorted to keep the
# printed output identical across re-runs.
print("Events:")
print(df["event"].unique().sort().to_list())

print("Events that women do, but men don't:")
print(sorted(events_by_sex["female"] - events_by_sex["male"]))

print("Events that men do, but women don't:")
print(sorted(events_by_sex["male"] - events_by_sex["female"]))


# Get all world records, sorted by when they were achieved


In [None]:
# Rows of standard events whose time equals the fastest time of their
# (sex, event) group, ordered by the date they were achieved.
standard_only = pl.col("event type") == "standard"
# The window minimum is evaluated on the rows that survive the first filter,
# which is why the two filters are applied one after the other.
fastest_in_group = pl.col("result seconds").min().over("sex", "event")

(
    df.filter(standard_only)
    .filter(pl.col("result seconds") == fastest_in_group)
    .sort("date of event")
)


# World record progression of the 10km road


In [None]:
# Step plot of the record progression for a single event, one line per sex.
# The event name is defined once and reused in the title so the title always
# matches the data that is actually filtered.
wr_event = "10km road"

(
    df.filter(pl.col("event") == wr_event)
    # Chronological order is required for the running minimum below.
    .sort("date of event")
    # Fastest time achieved so far (per sex) as of each performance.
    .with_columns(current_wr=pl.col("result seconds").cum_min().over("sex"))
    # Keep only performances that set or equalled the fastest time so far.
    .filter(pl.col("result seconds") == pl.col("current_wr"))
    # Express each of those times as a percentage of the overall fastest
    # time in the data for that sex.
    .with_columns(
        percent_of_wr=100
        * pl.col("result seconds")
        / pl.col("result seconds").min().over("sex")
    )
    .plot.step(
        "date of event",
        "percent_of_wr",
        by="sex",
        xlabel="Date",
        ylabel="Percent of current WR",
        title=f"WR progression in the {wr_event}",
        **plot_kwargs,
    )
)


# How much have times improved since "super shoes" came to market?


In [None]:
# Scatter of the ten fastest 10km road performances of each year, per sex.
(
    df.filter(pl.col("event") == "10km road")
    .with_columns(year=pl.col("date of event").dt.year())
    .with_columns(
        # method="min" yields whole-number ranks where tied times share the
        # best rank; the default "average" produces fractional ranks that the
        # integer cast would silently truncate.
        rank_in_that_year=pl.col("result seconds")
        .rank(method="min")
        .over("sex", "year")
        .cast(pl.Int64)
    )
    # NOTE(review): presumably 2024 is excluded because the data snapshot is
    # dated mid-2024 and the year is incomplete — confirm.
    .filter(pl.col("year") < 2024)
    .filter(pl.col("rank_in_that_year") <= 10)
    # Sort on the rank computed in this cell rather than on a "rank" column
    # that this cell does not define.
    .sort("sex", "year", "rank_in_that_year")
    .plot.scatter(
        "date of event",
        "result seconds",
        by="sex",
        alpha=0.45,
        xlabel="Date",
        ylabel="Result (seconds)",
        title="Yearly top-10 performances in the 10km road",
        **plot_kwargs,
    )
)

# How do pace and distance of the WRs relate?


In [None]:
# Fastest time per (sex, event) among standard events, leaving out events
# whose name contains "hurdles", "walk" or "steeple"; plots the resulting
# speed against distance on a logarithmic x-axis.
event_name = pl.col("event")
seconds = pl.col("result seconds")

(
    df.filter(
        pl.col("event type") == "standard",
        ~event_name.str.contains("hurdles"),
        ~event_name.str.contains("walk"),
        ~event_name.str.contains("steeple"),
        # All predicates of one filter call are evaluated on the same input
        # frame, so this minimum is taken before any rows are removed.
        seconds == seconds.min().over("sex", "event"),
    )
    .with_columns(speed_mps=pl.col("distance") / seconds)
    .sort("sex", "distance")
    .plot("distance", "speed_mps", by="sex", logx=True, **plot_kwargs)
)

# What's the average age of a top-10 performance across long-distance disciplines, by sex?


In [None]:
# Mean age of the yearly top-10 performances for each long-distance event and
# sex, plotted against the event's distance (one line per sex).
(
    df.filter(pl.col("distance type") == "long distance")
    .with_columns(year=pl.col("date of event").dt.year())
    .with_columns(
        # method="min" yields whole-number ranks where tied times share the
        # best rank; the default "average" produces fractional ranks that the
        # integer cast would silently truncate.
        rank_in_that_year=pl.col("result seconds")
        .rank(method="min")
        .over("event", "year", "sex")
        .cast(pl.Int64)
    )
    .filter(pl.col("rank_in_that_year") <= 10)
    # Restrict to the years 2011 through 2023.
    .filter(pl.col("year") > 2010, pl.col("year") < 2024)
    .filter(
        pl.col("event type") == "standard",
        pl.col("event").str.contains("hurdles").not_(),
        pl.col("event").str.contains("walk").not_(),
        pl.col("event").str.contains("steeple").not_(),
    )
    .group_by("event", "sex")
    .agg(
        avg_age=pl.col("age at event in years").mean(),
        # Takes the first distance value seen in each (event, sex) group.
        distance=pl.col("distance").first(),
    )
    .sort("distance")
    .plot.line(
        "distance",
        "avg_age",
        by="sex",
        xlabel="Distance",
        ylabel="Average age (years)",
        title="Average age of yearly top-10 long-distance performances, 2011-2023",
        **plot_kwargs,
    )
)