# MoneyPuck - Hockey data
Team-level game data covering the 2008-09 through 2024-25 seasons.

### Step 0: Import libraries

In [18]:
import pandas as pd
import numpy as np

In [24]:
from utils import utils

### Step 1: Import the dataset

In [6]:
# MoneyPuck CSV endpoints.
# NOTE: despite the `_df` suffix, every name below holds a URL *string*, not a
# DataFrame. The names are kept unchanged because other cells reference them.
MONEYPUCK_PLAYER_DATA = "https://moneypuck.com/moneypuck/playerData"

# Season used for the season-summary files; the URL identifies a season by its
# starting year (2024 -> the 2024-2025 season).
SEASON = 2024

_GAME_BY_GAME = f"{MONEYPUCK_PLAYER_DATA}/careers/gameByGame"
_SEASON_SUMMARY = f"{MONEYPUCK_PLAYER_DATA}/seasonSummary/{SEASON}/regular"

# All teams dataset
all_teams_df = f"{_GAME_BY_GAME}/all_teams.csv"

# Season 2024-2025: Skaters
skaters_df = f"{_SEASON_SUMMARY}/skaters.csv"

# Season 2024-2025: Goalies
goalies_df = f"{_SEASON_SUMMARY}/goalies.csv"

# Season 2024-2025: Lines
lines_df = f"{_SEASON_SUMMARY}/lines.csv"

# Season 2024-2025: Team level
team_df = f"{_SEASON_SUMMARY}/teams.csv"

# Toronto Maple Leafs
tor_df = f"{_GAME_BY_GAME}/regular/teams/TOR.csv"

In [20]:
# Load the all-teams CSV into a DataFrame. `all_teams_df` is a URL string, so
# this fetches the file over the network every time the cell runs.
df = pd.read_csv(all_teams_df)
# Preview the first five rows.
df.head(5)

Unnamed: 0,team,season,name,gameId,playerTeam,opposingTeam,home_or_away,gameDate,position,situation,...,unblockedShotAttemptsAgainst,scoreAdjustedUnblockedShotAttemptsAgainst,dZoneGiveawaysAgainst,xGoalsFromxReboundsOfShotsAgainst,xGoalsFromActualReboundsOfShotsAgainst,reboundxGoalsAgainst,totalShotCreditAgainst,scoreAdjustedTotalShotCreditAgainst,scoreFlurryAdjustedTotalShotCreditAgainst,playoffGame
0,NYR,2008,NYR,2008020001,NYR,T.B,AWAY,20081004,Team Level,other,...,1.0,1.0,0.0,0.017,0.0,0.0,0.037,0.037,0.037,0
1,NYR,2008,NYR,2008020001,NYR,T.B,AWAY,20081004,Team Level,all,...,31.0,30.369,5.0,0.396,0.168,0.168,2.917,2.833,2.714,0
2,NYR,2008,NYR,2008020001,NYR,T.B,AWAY,20081004,Team Level,5on5,...,20.0,19.369,3.0,0.237,0.168,0.168,1.862,1.777,1.665,0
3,NYR,2008,NYR,2008020001,NYR,T.B,AWAY,20081004,Team Level,4on5,...,9.0,9.0,1.0,0.124,0.0,0.0,0.795,0.795,0.789,0
4,NYR,2008,NYR,2008020001,NYR,T.B,AWAY,20081004,Team Level,5on4,...,1.0,1.0,1.0,0.019,0.0,0.0,0.224,0.224,0.224,0


### Step 2: Inspect the data

In [25]:
# Report the frame's dimensions. `utils.print_df_size` is a project helper not
# shown here; judging by its recorded output it prints the number of samples
# and the number of features.
utils.print_df_size(df)

Number of samples:  218280
Number of features:  111


In [9]:
# Write every column label of `df` to names_columns.txt (in the current working
# directory), one label per line. The file is overwritten on each run.
cols = df.columns.tolist()
# An explicit encoding keeps the file identical across platforms instead of
# depending on the locale default.
with open("names_columns.txt", "w", encoding="utf-8") as f:
    # f-string formatting also tolerates non-string column labels.
    f.writelines(f"{col}\n" for col in cols)

In [10]:
# Print the frame summary: index range, column count, dtype breakdown and
# memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 218280 entries, 0 to 218279
Columns: 111 entries, team to playoffGame
dtypes: float64(100), int64(4), object(7)
memory usage: 184.9+ MB


### Step 3: Data cleaning

In [26]:
# Remove the gameDate, gameId and position columns in a single call, so the
# large frame is copied once instead of three times and is never left
# half-modified if one label is missing.
# NOTE: this cell is not re-runnable on the same `df`: a second run raises
# KeyError because the columns are already gone. Reload `df` first.
df = df.drop(columns=["gameDate", "gameId", "position"])
# Confirm the new shape (three fewer features, same number of samples).
utils.print_df_size(df)

Number of samples:  218280
Number of features:  108


Missing values?

In [27]:
# Count missing values per column, scanning the frame only once, and keep only
# the columns that actually contain NaNs.
nan_counts = df.isna().sum()
nan_counts = nan_counts[nan_counts > 0]

# Print the per-column NaN counts, or a short confirmation when none exist.
if nan_counts.empty:
    print("No missing values.")
else:
    print(nan_counts)

No missing values.


Are the columns "name", "team" and "playerTeam" identical? If so, the duplicates can be dropped.

In [28]:
# "name" is redundant only if it equals "team" on every row. The vectorized
# comparison replaces the builtin sum(), which iterated over the boolean Series
# element by element in Python. A NaN on either side still counts as a
# mismatch, exactly as with the original `!=` test.
if (df["team"] == df["name"]).all():
    df = df.drop(columns="name")
# Show the shape so the drop (or its absence) is visible in the output.
utils.print_df_size(df)

Number of samples:  218280
Number of features:  107


In [29]:
# "playerTeam" is redundant only if it equals "team" on every row. The
# vectorized comparison replaces the builtin sum(), which iterated over the
# boolean Series element by element in Python. A NaN on either side still
# counts as a mismatch, exactly as with the original `!=` test.
if (df["team"] == df["playerTeam"]).all():
    df = df.drop(columns="playerTeam")
# Show the shape so the drop (or its absence) is visible in the output.
utils.print_df_size(df)

Number of samples:  218280
Number of features:  106


In [30]:
# Preview the first five rows after the column drops.
df.head(5)

Unnamed: 0,team,season,opposingTeam,home_or_away,situation,xGoalsPercentage,corsiPercentage,fenwickPercentage,iceTime,xOnGoalFor,...,unblockedShotAttemptsAgainst,scoreAdjustedUnblockedShotAttemptsAgainst,dZoneGiveawaysAgainst,xGoalsFromxReboundsOfShotsAgainst,xGoalsFromActualReboundsOfShotsAgainst,reboundxGoalsAgainst,totalShotCreditAgainst,scoreAdjustedTotalShotCreditAgainst,scoreFlurryAdjustedTotalShotCreditAgainst,playoffGame
0,NYR,2008,T.B,AWAY,other,0.0,0.0,0.0,42.0,0.0,...,1.0,1.0,0.0,0.017,0.0,0.0,0.037,0.037,0.037,0
1,NYR,2008,T.B,AWAY,all,0.4596,0.6408,0.631,3600.0,37.733,...,31.0,30.369,5.0,0.396,0.168,0.168,2.917,2.833,2.714,0
2,NYR,2008,T.B,AWAY,5on5,0.4857,0.6429,0.6364,2283.0,24.802,...,20.0,19.369,3.0,0.237,0.168,0.168,1.862,1.777,1.665,0
3,NYR,2008,T.B,AWAY,4on5,0.0482,0.0909,0.1,468.0,0.64,...,9.0,9.0,1.0,0.124,0.0,0.0,0.795,0.795,0.789,0
4,NYR,2008,T.B,AWAY,5on4,0.7317,0.9524,0.9444,807.0,12.291,...,1.0,1.0,1.0,0.019,0.0,0.0,0.224,0.224,0.224,0


In [31]:
# Re-print the frame summary (column count, dtypes, memory usage) to see the
# effect of the cleaning steps.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 218280 entries, 0 to 218279
Columns: 106 entries, team to playoffGame
dtypes: float64(100), int64(2), object(4)
memory usage: 176.5+ MB


### Step 3