# MoneyPuck - Hockey data

### Step 0: Import libraries

In [1]:
import pandas as pd
import numpy as np
import os

In [16]:
from utils import plot_utils, df_utils, thinkstats

In [3]:
outputs_dir = 'outputs'

### Step 1: Import the dataset

In [4]:
skaters_df = "https://moneypuck.com/moneypuck/playerData/seasonSummary/2024/regular/skaters.csv"
df = pd.read_csv(skaters_df)
df.head(5)

Unnamed: 0,playerId,season,name,team,position,situation,games_played,icetime,shifts,gameScore,...,OffIce_F_xGoals,OffIce_A_xGoals,OffIce_F_shotAttempts,OffIce_A_shotAttempts,xGoalsForAfterShifts,xGoalsAgainstAfterShifts,corsiForAfterShifts,corsiAgainstAfterShifts,fenwickForAfterShifts,fenwickAgainstAfterShifts
0,8478047,2024,Michael Bunting,NSH,L,other,76,2237.0,37.0,26.19,...,7.28,10.09,72.0,87.0,0.0,0.0,0.0,0.0,0.0,0.0
1,8478047,2024,Michael Bunting,NSH,L,all,76,70819.0,1474.0,43.7,...,161.54,187.75,3221.0,3522.0,0.0,0.0,0.0,0.0,0.0,0.0
2,8478047,2024,Michael Bunting,NSH,L,5on5,76,59813.0,1294.0,43.7,...,112.73,122.08,2661.0,2707.0,0.71,1.71,19.0,43.0,16.0,31.0
3,8478047,2024,Michael Bunting,NSH,L,4on5,76,6.0,2.0,2.58,...,0.2,0.17,4.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0
4,8478047,2024,Michael Bunting,NSH,L,5on4,76,8763.0,141.0,36.88,...,23.81,2.6,311.0,54.0,0.0,0.01,0.0,1.0,0.0,1.0


### Step 2: Read the data

In [5]:
df_utils.print_df_size(df)

Number of samples: 4600
Number of features: 154


In [6]:
df_utils.save_column_names(df)

Saved: names_columns.txt


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Columns: 154 entries, playerId to fenwickAgainstAfterShifts
dtypes: float64(147), int64(3), object(4)
memory usage: 5.4+ MB


### Step 3: Data cleaning

In [8]:
# Missing values?
df_utils.report_nan(df)

No missing values found.


### Step 4: Some stats

Player reference: Leon Draisaitl, 2025 Hart trophy finalist

In [9]:
namePlayer="Leon Draisaitl"

In [10]:
situations = df['situation'].unique()

### I_F_xGoals: Expected Goals

In [11]:
selected_stat = "I_F_xGoals"
i_f_xgoals_dir = os.path.join(outputs_dir, selected_stat)

In [17]:
# Percentile rank, cdf

results = []

for sit in situations:
    df_sit = df[df['situation'] == sit].copy()
    xGoals = df_sit[selected_stat].values

    player_xGoals = df_sit[df_sit["name"]==namePlayer][selected_stat].values
    threshold = player_xGoals[0]
    percentage = thinkstats.percentile_rank(threshold, xGoals)
    results.append({"situation": sit, "xGoals ≤ player": percentage})

    plot_utils.save_cdf(
        values=xGoals, 
        reference_value=threshold, 
        output_dir=os.path.join(i_f_xgoals_dir, f'cdf_{sit}'),
        title=f'I_F_xGoals - {sit}',
        reference_label=f'{namePlayer}'
        )

df_result = pd.DataFrame(results)
df_result


TypeError: save_cdf() got an unexpected keyword argument 'output_dir'. Did you mean 'output_path'?

### Goals per 60

In [None]:
goals_per_60_dir = os.path.join(outputs_dir, 'goals_per_60')
goals_vs_xg_dir=os.path.join(outputs_dir, 'goals_vs_xg')

In [None]:
for sit in situations:
    df_sit = df[df['situation']==sit].copy()
    df_sit['goals_per_60'] = (df_sit['I_F_goals'] / df_sit['icetime']) * 60
    df_sit['xG_per_60'] = (df_sit['I_F_xGoals'] / df_sit['icetime']) * 60

    plot_utils.save_histogram(
        df=df_sit,
        column='goals_per_60',
        output_dir=os.path.join(goals_per_60_dir, 'histograms'),
        filename_prefix=f'histogram_{sit}',
        title=f'Goals per 60 - {sit}',
        xlabel='Goals per 60 minutes',
        ylabel='Number of players',
        color='blue'
    )

    plot_utils.save_boxplot(
        df=df_sit,
        x_column='position',
        y_column='goals_per_60',
        output_dir=os.path.join(goals_per_60_dir, 'boxplots'),
        filename_prefix=f'boxplot_{sit}',
        title=f'Goals per 60 by Position - {sit}',
        xlabel='Position',
        ylabel='Goals per 60 minutes'
    )

    plot_utils.save_scatterplot(
        df=df_sit,
        x_column='xG_per_60',
        y_column='goals_per_60',
        hue_column='position',
        output_dir=os.path.join(goals_vs_xg_dir, 'scatterplots'),
        filename_prefix=f'scatter_{sit}',
        title=f'Goals per 60 vs Expected Goals per 60 - {sit}',
        xlabel='Expected Goals per 60',
        ylabel='Goals per 60',
    )
