<a href="https://colab.research.google.com/github/the-bucketless/iihf/blob/main/iihf_stats_and_lineups.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import io
import json

import pandas as pd
import requests

In [18]:
def get_api_df(url):
    """Download a game-state JSON document and flatten it into one dataframe.

    The document's top-level values (excluding any key containing "Periods")
    become game-level columns. Each entry of ``api_json["Periods"]``
    contributes its non-"Actions" values as period-level columns, and every
    item in that period's "Actions" and "IceRingActions" lists becomes one
    row. The three levels are combined with cross joins, so the game- and
    period-level values repeat on every action row.

    Parameters
    ----------
    url : str
        Address of the JSON endpoint.

    Returns
    -------
    pandas.DataFrame
        One row per action, with "." in column names replaced by "_" and an
        ``api_url`` column holding ``url``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    response = requests.get(url)
    # fail with a clear HTTP error rather than trying to parse an error page
    # as JSON further down
    response.raise_for_status()
    api_json = json.loads(response.text)

    periods_df = pd.concat([
        pd.json_normalize({
            # items without "Actions" in the title create a single row
            # of summary data
            # items with "Actions" in the key are lists of dicts
            # these will get split into their own dataframes below and
            # joined to the summary data
            k: v
            for k, v in period.items()
            if "Actions" not in k
        })
        .join(
            # there are two other keys that include "Actions", but they're
            # just subsets of the Actions dict
            pd.concat([
                pd.json_normalize(period["Actions"]),
                pd.json_normalize(period["IceRingActions"])
            ]),
            # column names shared with the period summary get "_Action"
            # appended on the action side
            how="cross", rsuffix="_Action"
        )
        for period in api_json["Periods"]
    ])

    api_df = (
        # the data in "Periods" is what gets thrown together above
        # we can ignore PeriodsGrouped - it's a subset of Periods
        pd.json_normalize({
            k: v
            for k, v in api_json.items()
            if "Periods" not in k
        })
        .join(periods_df, how="cross", rsuffix="_Action")
        .assign(api_url=url)
    )

    # remove periods from column names (json_normalize joins nested keys
    # with ".")
    api_df.columns = [c.replace(".", "_") for c in api_df.columns]

    return api_df

In [19]:
def get_statistics_dfs(url):
    """Scrape skater and goalie tables from a game's statistics page.

    The tables found on the page are used by position: table 0 supplies the
    team names (its ``teams`` column), and the remaining tables are stitched
    together per team by row index.

    Parameters
    ----------
    url : str
        Address of the statistics page.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(skaters, goalies)``. Each frame carries ``team``, ``home`` and
        ``statistics_url`` columns. Two empty frames are returned when the
        page yields fewer than two tables.
    """
    response = requests.get(url)
    try:
        # hand read_html a file-like object: passing the markup as a literal
        # string is deprecated in recent pandas releases
        dfs = pd.read_html(io.StringIO(response.text))
    except ValueError:
        # read_html raises ValueError when the page contains no tables
        dfs = []

    if not len(dfs) > 1:
        return pd.DataFrame(), pd.DataFrame()

    # tables 1 and 5 hold the skater names for the first (home) and second
    # team; the first and fourth columns of each are discarded
    team_skaters = [
        df
        .rename(columns={"j#": "jersey_number"})
        .drop(df.columns[[0, 3]], axis=1)
        .assign(team=dfs[0].teams.iloc[i], home=i == 0)
        for i, df in enumerate(dfs[1:6:4])
    ]

    for i in range(2):
        # the first table in each list is joined as-is; every later one gets
        # a numeric suffix so its columns don't collide
        if i == 0:
            team_dfs = [dfs[2]] + dfs[10::4]
        else:
            team_dfs = [dfs[6]] + dfs[12::4]

        for period, df in enumerate(team_dfs):
            if period > 0:
                df = (
                    df
                    .drop(columns="pos")
                    .add_suffix(f"_{period}")
                )

            team_skaters[i] = team_skaters[i].join(df)

    # goalie names come from tables 3 and 7, their stats from tables 4 and 8
    team_goalies = [
        name_df
        .rename(columns={"j#": "jersey_number"})
        .drop(name_df.columns[[0, 3]], axis=1)
        .assign(team=dfs[0].teams.iloc[i], home=i == 0)
        .join(
            stats_df
            .drop(columns=[c for c in stats_df.columns if "Unnamed" in c])
        )
        for i, (name_df, stats_df) in enumerate(zip(dfs[3:9:4], dfs[4:10:4]))
    ]

    skaters = pd.concat(team_skaters)
    goalies = pd.concat(team_goalies)

    skaters["statistics_url"] = url
    goalies["statistics_url"] = url

    return skaters, goalies

In [20]:
def get_lineups_df(url):
    """Scrape the lineup tables from a game's lineup page into one dataframe.

    Table 0 supplies the team names (its ``teams`` column). The remaining
    tables are assigned to the first (home) or second team by position, and
    each is tagged with its position within that team's run of tables.

    Parameters
    ----------
    url : str
        Address of the lineup page.

    Returns
    -------
    pandas.DataFrame
        All lineup tables stacked, using the first table's column names,
        with ``line_number``, ``team``, ``home`` and ``lineups_url`` columns.
        An empty frame is returned when the page yields fewer than two
        tables.
    """
    response = requests.get(url)
    try:
        # hand read_html a file-like object: passing the markup as a literal
        # string is deprecated in recent pandas releases
        dfs = pd.read_html(io.StringIO(response.text))
    except ValueError:
        # read_html raises ValueError when the page contains no tables
        dfs = []

    if not len(dfs) > 1:
        return pd.DataFrame()

    team_dfs = []
    # number of consecutive tables treated as belonging to one team
    n_team_dfs = len(dfs) // 2
    for i, df in enumerate(dfs[1:]):
        # 0 for the first team's tables, 1 for the second team's
        idx = i // n_team_dfs

        df = (
            df
            .drop(df.columns[0], axis=1)
            .rename(columns={"j#": "jersey_number"})
            .assign(
                line_number=i % n_team_dfs,
                team=dfs[0].teams.iloc[idx],
                home=idx == 0,
            )
        )

        if i > 0:
            # force every table onto the first table's column names so the
            # concat below lines the columns up
            df.columns = team_dfs[0].columns

        team_dfs.append(df)

    lineups = pd.concat(team_dfs)
    lineups["lineups_url"] = url

    return lineups

In [21]:
def get_game_dfs(game_url):
    """Collect the statistics, lineup and API dataframes for one game.

    The statistics and lineup page addresses are derived from ``game_url`` by
    replacing "playbyplay" with "statistics" and "lineup". The API game id is
    read from the first ``game-id="..."`` attribute in the page at
    ``game_url``.

    Parameters
    ----------
    game_url : str
        Address of the game's play-by-play page.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(skaters, goalies, lineups, api_df)``. Every non-empty frame gets a
        ``game_url`` column. ``api_df`` is an empty frame when no game id can
        be found in the page.
    """
    skaters, goalies = get_statistics_dfs(game_url.replace("playbyplay", "statistics"))
    lineups = get_lineups_df(game_url.replace("playbyplay", "lineup"))

    # the game id is only exposed as a game-id="..." attribute in the html;
    # partition reports a missing marker explicitly (empty separator) instead
    # of silently slicing from the wrong offset
    response = requests.get(game_url)
    _, marker, remainder = response.text.partition('game-id="')
    game_id = remainder.partition('"')[0]

    if marker and game_id:
        api_df = get_api_df(f"https://realtime.iihf.com/gamestate/GetLatestState/{game_id}")
    else:
        # no usable id on the page: skip the API call rather than requesting
        # a meaningless address
        api_df = pd.DataFrame()

    for df in (skaters, goalies, lineups, api_df):
        # empty frames are left untouched
        if len(df):
            df["game_url"] = game_url

    return skaters, goalies, lineups, api_df

In [38]:
# play-by-play page of the game to scrape; get_game_dfs derives the
# statistics and lineup page addresses from it
url = "https://www.iihf.com/en/events/2022/ww/gamecenter/playbyplay/37196/12-den-vs-hun"  #@param {type: "string"}

In [39]:
# fetch all four dataframes for the chosen game (performs several HTTP requests)
skaters, goalies, lineups, api_df = get_game_dfs(url)

In [40]:
# hide the last two columns (statistics_url and game_url) for display
skaters.iloc[:, :-2]

Unnamed: 0,jersey_number,name,team,home,pos,g,a,p,pim,ts,+/-,tot,shf,avg,ts_1,tot_1,ts_2,tot_2,ts_3,tot_3
0,14,JENSEN Nicoline,DEN,True,F,0,0,0,2,1,1.0,24:44,39.0,0:38,1,8:46,0,7:10,0,8:48
1,15,REFSGAARD Amanda,DEN,True,D,0,0,0,0,0,1.0,27:35,38.0,0:43,0,6:20,0,9:55,0,11:20
2,19,ASPERUP Josephine,DEN,True,D,0,0,0,0,1,0.0,26:17,38.0,0:41,0,7:58,0,9:27,1,8:52
3,21,WEIS Michelle,DEN,True,F,0,0,0,2,4,0.0,20:04,34.0,0:35,2,7:26,1,5:58,1,6:40
4,50,BAU Mia,DEN,True,F,0,0,0,0,1,0.0,10:21,17.0,0:36,1,4:08,0,2:15,0,3:58
5,4,GLUD Silke,DEN,True,F,0,0,0,4,4,1.0,16:37,23.0,0:43,1,7:21,2,4:10,1,5:06
6,8,PERSSON Josefine,DEN,True,F,0,0,0,2,1,1.0,22:23,32.0,0:41,1,6:52,0,6:17,0,9:14
7,22,SKOTT Sofie,DEN,True,D,0,0,0,0,0,0.0,19:03,26.0,0:43,0,6:48,0,5:47,0,6:28
8,63,JAKOBSEN Josefine,DEN,True,F,0,1,1,2,4,0.0,26:07,34.0,0:46,1,8:00,3,8:31,0,9:36
9,80,OSTERGAARD Julie,DEN,True,F,1,0,1,0,1,1.0,21:46,32.0,0:40,0,8:18,0,5:55,1,7:33


In [41]:
# hide the last two columns (statistics_url and game_url) for display
goalies.iloc[:, :-2]

Unnamed: 0,jersey_number,name,team,home,pos,sog,ga,svs,svs%
0,30,JENSEN Lisa,DEN,True,GK,45,0,45,100.0
1,72,REPSTOCK-ROMME Cassandra,DEN,True,GK,0,0,0,0.0
0,1,NEMETH Aniko,HUN,False,GK,19,1,18,94.74
1,33,REVESZ Zsuzsa,HUN,False,GK,0,0,0,0.0


In [42]:
# hide the last two columns (lineups_url and game_url) for display
lineups.iloc[:, :-2]

Unnamed: 0,jersey_number,Name,Pos,line_number,team,home
0,30.0,JENSEN Lisa,GK,0,DEN,True
1,72.0,REPSTOCK-ROMME Cassandra,GK,0,DEN,True
0,15.0,REFSGAARD Amanda,D,1,DEN,True
1,19.0,ASPERUP Josephine,D,1,DEN,True
2,21.0,WEIS Michelle,F,1,DEN,True
3,14.0,JENSEN Nicoline,F,1,DEN,True
4,50.0,BAU Mia,F,1,DEN,True
0,22.0,SKOTT Sofie,D,2,DEN,True
1,63.0,JAKOBSEN Josefine,F,2,DEN,True
2,4.0,GLUD Silke,F,2,DEN,True


In [43]:
# hide the last two columns (api_url and game_url) for display
api_df.iloc[:, :-2]

Unnamed: 0,GameId,GameNumber,EventId,Gender,Status,IsGameOverridden,IsGameCompleted,Spectators,GameTime_TimedGameStatus,GameTime_PlayTime,...,Scorer_ReportingName,Scorer_FamilyName,Scorer_GivenName,Scorer_InitialName,NewScore_Home,NewScore_Away,Score,ShotType,Coordinates_X,Coordinates_Y
0,10209,12,164,W,Game Completed,False,True,719,Final,0,...,,,,,,,,,,
1,10209,12,164,W,Game Completed,False,True,719,Final,0,...,,,,,,,,,,
2,10209,12,164,W,Game Completed,False,True,719,Final,0,...,,,,,,,,,,
3,10209,12,164,W,Game Completed,False,True,719,Final,0,...,,,,,,,,,,
4,10209,12,164,W,Game Completed,False,True,719,Final,0,...,,,,,,,,,,
5,10209,12,164,W,Game Completed,False,True,719,Final,0,...,,,,,,,,,,
6,10209,12,164,W,Game Completed,False,True,719,Final,0,...,,,,,,,,,,
7,10209,12,164,W,Game Completed,False,True,719,Final,0,...,,,,,,,,,,
8,10209,12,164,W,Game Completed,False,True,719,Final,0,...,,,,,,,,,,
9,10209,12,164,W,Game Completed,False,True,719,Final,0,...,,,,,,,,,,


In [44]:
# api_df has one row per action, so game- and period-level values repeat on
# every row; drop_duplicates collapses them to one row per distinct combination
toi_columns = ["Period", "HomeTeam_LongTeamName", "AwayTeam_LongTeamName", "Statistics_TPP_H", "Statistics_TPP_A"]
api_df[toi_columns].drop_duplicates()

Unnamed: 0,Period,HomeTeam_LongTeamName,AwayTeam_LongTeamName,Statistics_TPP_H,Statistics_TPP_A
0,1,Denmark,Hungary,04:00,02:00
8,2,Denmark,Hungary,02:00,06:22
15,3,Denmark,Hungary,02:00,06:53
