# MoneyPuck - Hockey data

### Step 0: Import libraries

In [18]:
import pandas as pd
import numpy as np

In [None]:
from utils import plot_utils, df_utils

### Step 1: Import the dataset

In [None]:
# Fetch the 2024 regular-season skater summary CSV directly from MoneyPuck.
# NOTE(review): despite its `_df` suffix, `skaters_df` holds the source URL
# string; the loaded DataFrame is `df`.
skaters_df = "https://moneypuck.com/moneypuck/playerData/seasonSummary/2024/regular/skaters.csv"
df = pd.read_csv(filepath_or_buffer=skaters_df)

# Preview the first five rows (the default for `head`).
df.head()

Unnamed: 0,team,season,name,gameId,playerTeam,opposingTeam,home_or_away,gameDate,position,situation,...,unblockedShotAttemptsAgainst,scoreAdjustedUnblockedShotAttemptsAgainst,dZoneGiveawaysAgainst,xGoalsFromxReboundsOfShotsAgainst,xGoalsFromActualReboundsOfShotsAgainst,reboundxGoalsAgainst,totalShotCreditAgainst,scoreAdjustedTotalShotCreditAgainst,scoreFlurryAdjustedTotalShotCreditAgainst,playoffGame
0,NYR,2008,NYR,2008020001,NYR,T.B,AWAY,20081004,Team Level,other,...,1.0,1.0,0.0,0.017,0.0,0.0,0.037,0.037,0.037,0
1,NYR,2008,NYR,2008020001,NYR,T.B,AWAY,20081004,Team Level,all,...,31.0,30.369,5.0,0.396,0.168,0.168,2.917,2.833,2.714,0
2,NYR,2008,NYR,2008020001,NYR,T.B,AWAY,20081004,Team Level,5on5,...,20.0,19.369,3.0,0.237,0.168,0.168,1.862,1.777,1.665,0
3,NYR,2008,NYR,2008020001,NYR,T.B,AWAY,20081004,Team Level,4on5,...,9.0,9.0,1.0,0.124,0.0,0.0,0.795,0.795,0.789,0
4,NYR,2008,NYR,2008020001,NYR,T.B,AWAY,20081004,Team Level,5on4,...,1.0,1.0,1.0,0.019,0.0,0.0,0.224,0.224,0.224,0


### Step 2: Inspect the data

In [None]:
# Report the frame's dimensions via the project helper; the recorded output
# below shows a sample count and a feature count.
df_utils.print_df_size(df)

Number of samples:  218280
Number of features:  111


In [None]:
# Hand the frame to the project helper that saves its column names.
# NOTE(review): destination and format are defined in utils.df_utils and are
# not visible here — presumably a file on disk; confirm before relying on it.
df_utils.save_column_names(df)

In [10]:
# Print pandas' concise summary of the frame: index range, column count,
# dtype breakdown and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 218280 entries, 0 to 218279
Columns: 111 entries, team to playoffGame
dtypes: float64(100), int64(4), object(7)
memory usage: 184.9+ MB


### Step 3: Data cleaning

In [None]:
# Missing values? Delegate the NaN check to the project helper.
# NOTE(review): the helper's behaviour lives in utils.df_utils. The recorded
# output below shows a feature count that differs from the earlier size report,
# so confirm whether it only reports or also drops columns.
df_utils.report_nan(df)

Number of samples:  218280
Number of features:  108


### Step 4: Per-60 scoring rates and their distributions

In [None]:
# Per-60-minute scoring rates for every row.
#
# MoneyPuck reports `icetime` in seconds, so a "per 60 minutes" rate has to be
# scaled by 3600 (the number of seconds in 60 minutes). Multiplying by 60 would
# produce a per-minute rate instead.
SECONDS_PER_60_MIN = 60 * 60

# Rows with zero ice time would divide by zero and yield inf/NaN; mask them to
# NaN up front so the derived rates are simply missing for those rows.
icetime_seconds = df['icetime'].where(df['icetime'] > 0)

# `assign` returns a new frame, so the raw `df` is left untouched.
df_cpy = df.assign(
    goals_per_60=df['I_F_goals'] / icetime_seconds * SECONDS_PER_60_MIN,
    xG_per_60=df['I_F_xGoals'] / icetime_seconds * SECONDS_PER_60_MIN,
)

In [None]:
# Histogram of goals per 60 minutes, grouped by game situation.
# NOTE(review): how `derived_cols_funcs` is applied is defined in
# utils.plot_utils — presumably each callable computes the named column from
# the data it is given; confirm.
# MoneyPuck's `icetime` is in seconds, so a per-60-minute rate scales by 3600
# (seconds in 60 minutes); a factor of 60 would give a per-minute rate.
plot_utils.plot_histograms_by_group(
    df_cpy,
    group_col="situation",
    hist_col="goals_per_60",
    derived_cols_funcs={
        "goals_per_60": lambda d: (d["I_F_goals"] / d["icetime"]) * 3600,
        "xG_per_60": lambda d: (d["I_F_xGoals"] / d["icetime"]) * 3600,
    }
)

In [None]:
# Interactive boxplot of goals per 60 split by position; the situation filter
# starts on "all".
boxplot_options = dict(
    x="position",
    y="goals_per_60",
    filter_col="situation",
    title="Goals per 60 by Position",
    default_filter_value="all",
)
box = plot_utils.create_interactive_boxplot(df=df_cpy, **boxplot_options)

# Render the figure in the notebook.
box.show()

In [None]:
# Interactive scatter of actual vs expected goals per 60, coloured by position
# and filterable by situation (starting on "all").
scatter_tooltip_cols = ['name', 'team', 'position', 'goals_per_60', 'xG_per_60']

chart = plot_utils.create_interactive_scatter(
    df=df_cpy,
    x='xG_per_60',
    y='goals_per_60',
    category='position',
    filter_col='situation',
    tooltip_cols=scatter_tooltip_cols,
    title='Goals per 60 vs xG per 60 by Situation',
    default_filter_value='all',
)

# Render the figure in the notebook.
chart.show()