## Star Players

### Insert Star Player Data

In [12]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm

def fetch_player_stats(season):
    url = f"https://www.hockey-reference.com/leagues/NHL_{season}_skaters.html"
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table', {'id': 'stats'})
    
    df = pd.read_html(str(table))[0]
    df.columns = df.columns.droplevel(0) 
    df['Season'] = season
    return df

# List of seasons
seasons = [2023, 2022, 2021, 2020, 2019, 2018]

all_data = pd.DataFrame()

for season in tqdm(seasons):
    print(f"Fetching data for season {season}")
    season_stats = fetch_player_stats(season)
    all_data = pd.concat([all_data, season_stats], ignore_index=True)


all_data.to_csv("nhl_player_stats_last_5_seasons.csv", index=False)
print("Raw data saved to nhl_player_stats_last_5_seasons.csv")
all_data = pd.read_csv("nhl_player_stats_last_5_seasons.csv")
print(all_data.columns)

all_data = all_data.dropna(subset=['Player'])
all_data.fillna(0, inplace=True)  

required_columns = ['G', 'A', 'ATOI', '+/-']
for col in required_columns:
    if col not in all_data.columns:
        all_data[col] = 0

all_data['Points'] = all_data['G'] + all_data['A']

numeric_columns = ['G', 'A', 'Points', 'ATOI', '+/-']
for col in numeric_columns:
    all_data[col] = pd.to_numeric(all_data[col], errors='coerce').fillna(0)

criteria = ['Points', 'ATOI', '+/-']

top_players = pd.DataFrame()

for season in seasons:
    season_data = all_data[all_data['Season'] == season]
    for team in season_data['Tm'].unique():
        team_data = season_data[season_data['Tm'] == team]
        top_team_players = team_data.nlargest(5, criteria)
        top_players = pd.concat([top_players, top_team_players])

top_players.to_csv("nhl_top_players_last_5_seasons.csv", index=False)
print("Top players data saved to nhl_top_players_last_5_seasons.csv")

top_players = pd.read_csv("nhl_top_players_last_5_seasons.csv")

final_columns = ['Player', 'Season', 'Tm', 'Pos', 'GP', 'G', 'A', 'Points', 'ATOI', '+/-']
final_data = top_players[final_columns]


final_data.to_csv("nhl_team_star_players_last_5_seasons.csv", index=False)
print("Final dataset saved to nhl_team_star_players_last_5_seasons.csv")


  0%|                                                     | 0/6 [00:00<?, ?it/s]

Fetching data for season 2023


 17%|███████▌                                     | 1/6 [00:06<00:30,  6.01s/it]

Fetching data for season 2022


 33%|███████████████                              | 2/6 [00:12<00:24,  6.10s/it]

Fetching data for season 2021


 50%|██████████████████████▌                      | 3/6 [00:18<00:17,  5.99s/it]

Fetching data for season 2020


 67%|██████████████████████████████               | 4/6 [00:20<00:09,  4.64s/it]

Fetching data for season 2019


 83%|█████████████████████████████████████▌       | 5/6 [00:23<00:03,  3.93s/it]

Fetching data for season 2018


100%|█████████████████████████████████████████████| 6/6 [00:29<00:00,  4.86s/it]


Raw data saved to nhl_player_stats_last_5_seasons.csv
Index(['Rk', 'Player', 'Age', 'Tm', 'Pos', 'GP', 'G', 'A', 'PTS', '+/-', 'PIM',
       'PS', 'EV', 'PP', 'SH', 'GW', 'EV.1', 'PP.1', 'SH.1', 'S', 'S%', 'TOI',
       'ATOI', 'BLK', 'HIT', 'FOW', 'FOL', 'FO%', 'Season'],
      dtype='object')
Top players data saved to nhl_top_players_last_5_seasons.csv
Final dataset saved to nhl_team_star_players_last_5_seasons.csv


In [48]:
star_player_data = pd.read_csv("nhl_team_star_players_last_5_seasons.csv")

print(star_player_data.head())

             Player  Season   Tm Pos  GP     G     A  Points  ATOI   +/-
0  William Nylander    2023  TOR   C  82  40.0  47.0  4047.0   0.0  10.0
1   Auston Matthews    2023  TOR   C  74  40.0  45.0  4045.0   0.0  31.0
2      John Tavares    2023  TOR   C  80  36.0  44.0  3644.0   0.0  -7.0
3      Mitch Marner    2023  TOR  RW  80  30.0  69.0  3069.0   0.0  18.0
4   Michael Bunting    2023  TOR  LW  82  23.0  26.0  2326.0   0.0  21.0


In [49]:
star_player_data.describe()

Unnamed: 0,Season,G,A,Points,ATOI,+/-
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,2020.52,22.85,29.963,2313.973,0.0,1.527
std,1.712464,9.54351,15.117015,965.373118,0.0,14.853144
min,2018.0,0.0,0.0,0.0,0.0,-45.0
25%,2019.0,17.0,19.0,1718.0,0.0,-7.25
50%,2021.0,22.0,27.0,2218.5,0.0,0.0
75%,2022.0,28.0,39.0,2827.5,0.0,11.0
max,2023.0,64.0,89.0,6489.0,0.0,64.0


In [50]:
import plotly.express as px

fig = px.bar(star_player_data, x='Tm', y='Points', color='Season', title='Total Points by Team and Season')
fig.show()

fig = px.scatter(star_player_data, x='ATOI', y='Points', color='Tm', title='Points vs ATOI', hover_data=['Player'])
fig.show()

### Top 5% Players for Various Metrics

In [51]:
thresholds = star_player_data.groupby('Season')[['Points', 'ATOI', '+/-']].quantile(0.95).reset_index()
print(thresholds)


   Season  Points  ATOI    +/-
0    2018  3960.4   0.0  25.00
1    2019  4147.8   0.0  23.80
2    2020  3539.0   0.0  21.00
3    2021  2630.4   0.0  22.00
4    2022  4263.1   0.0  29.55
5    2023  4252.8   0.0  28.10


In [52]:
data_with_thresholds = star_player_data.merge(thresholds, on='Season', suffixes=('', '_95th'))

top_players = data_with_thresholds[
    (data_with_thresholds['Points'] >= data_with_thresholds['Points_95th']) |
    (data_with_thresholds['ATOI'] >= data_with_thresholds['ATOI_95th']) |
    (data_with_thresholds['+/-'] >= data_with_thresholds['+/-_95th'])
]

print(top_players.head())


             Player  Season   Tm Pos  GP     G     A  Points  ATOI   +/-  \
0  William Nylander    2023  TOR   C  82  40.0  47.0  4047.0   0.0  10.0   
1   Auston Matthews    2023  TOR   C  74  40.0  45.0  4045.0   0.0  31.0   
2      John Tavares    2023  TOR   C  80  36.0  44.0  3644.0   0.0  -7.0   
3      Mitch Marner    2023  TOR  RW  80  30.0  69.0  3069.0   0.0  18.0   
4   Michael Bunting    2023  TOR  LW  82  23.0  26.0  2326.0   0.0  21.0   

   Points_95th  ATOI_95th  +/-_95th  
0       4252.8        0.0      28.1  
1       4252.8        0.0      28.1  
2       4252.8        0.0      28.1  
3       4252.8        0.0      28.1  
4       4252.8        0.0      28.1  


In [53]:
pd.set_option('display.max_rows', 1000)
top_players_sorted = top_players.sort_values(by=['Tm', 'Season'])

top_players_sorted = top_players_sorted[['Tm', 'Season', 'Player', 'Points', 'ATOI', '+/-']]

print(top_players_sorted)

top_players_sorted.to_csv("nhl_top_5_percent_players_sorted.csv", index=False)


      Tm  Season                 Player  Points  ATOI   +/-
955  ANA    2018         Rickard Rakell  3435.0   0.0   6.0
956  ANA    2018            Ondřej Kaše  2018.0   0.0  18.0
957  ANA    2018          Adam Henrique  2016.0   0.0  17.0
958  ANA    2018            Corey Perry  1732.0   0.0  -4.0
959  ANA    2018      Jakob Silfverberg  1723.0   0.0   6.0
680  ANA    2019      Jakob Silfverberg  2419.0   0.0  -9.0
681  ANA    2019         Rickard Rakell  1825.0   0.0 -13.0
682  ANA    2019          Adam Henrique  1824.0   0.0  -5.0
683  ANA    2019           Ryan Getzlaf  1434.0   0.0 -19.0
684  ANA    2019           Nick Ritchie   922.0   0.0   6.0
535  ANA    2020          Adam Henrique  2617.0   0.0  -3.0
536  ANA    2020      Jakob Silfverberg  2118.0   0.0   0.0
537  ANA    2020         Rickard Rakell  1527.0   0.0  -5.0
538  ANA    2020           Ryan Getzlaf  1329.0   0.0 -16.0
539  ANA    2020             Cam Fowler   920.0   0.0   0.0
360  ANA    2021            Max Comtois 

#### Number of Unique Players in Top 5% by Team and by Season

In [54]:
unique_top_players = top_players_sorted.drop_duplicates(subset=['Tm', 'Player'])
unique_player_counts = unique_top_players.groupby(['Season', 'Tm']).size().reset_index(name='UniquePlayerCount')
unique_player_counts_sorted = unique_player_counts.sort_values(by=['Season', 'Tm'])

print(unique_player_counts_sorted)

unique_player_counts_sorted.to_csv("number_of_top_players_by_team_and_season.csv", index=False)

     Season   Tm  UniquePlayerCount
0      2018  ANA                  5
1      2018  ARI                  5
2      2018  BOS                  5
3      2018  BUF                  5
4      2018  CAR                  5
5      2018  CBJ                  5
6      2018  CGY                  5
7      2018  CHI                  5
8      2018  COL                  5
9      2018  DAL                  5
10     2018  DET                  5
11     2018  EDM                  5
12     2018  FLA                  5
13     2018  LAK                  5
14     2018  MIN                  5
15     2018  MTL                  5
16     2018  NJD                  5
17     2018  NSH                  5
18     2018  NYI                  5
19     2018  NYR                  5
20     2018  OTT                  5
21     2018  PHI                  5
22     2018  PIT                  5
23     2018  SJS                  5
24     2018  STL                  5
25     2018  TBL                  5
26     2018  TOR            

### Number of top 100 players based on AAV% of Salary Cap

In [55]:
aav_percent_cap = pd.read_csv("nhl_top100_aav_salary_cap_2018_2024.csv")
aav_percent_cap.head()

Unnamed: 0,Season,Team,Top100SalaryPlayers
0,2018,BOS,6
1,2018,SJS,6
2,2018,WPG,6
3,2018,CHI,5
4,2018,EDM,5


In [56]:
aav_percent_cap.tail()

Unnamed: 0,Season,Team,Top100SalaryPlayers
176,2023,SJS,2
177,2023,ANA,1
178,2023,CHI,1
179,2023,SEA,1
180,2023,WPG,1


## Adding in Rivalry Teams

#### Insert main dataset

In [57]:
nhldata = pd.read_excel("2018-2024 for MODEL.xlsx")

#### Insert known rivalries

In [58]:
rivalries = [
    ('Florida Panthers', 'Tampa Bay Lightning'),
    ('Calgary Flames', 'Edmonton Oilers'),
    ('New Jersey Devils', 'New York Rangers'),
    ('Toronto Maple Leafs', 'Boston Bruins'),
    ('Philadelphia Flyers', 'Pittsburgh Penguins'),
    ('Winnipeg Jets', 'Minnesota Wild'),
    ('New York Islanders', 'New York Rangers'),
    ('Ottawa Senators', 'Toronto Maple Leafs'),
    ('Boston Bruins', 'Montreal Canadiens'),
    ('Boston Bruins', 'Toronto Maple Leafs'),
    ('Detroit Red Wings', 'Toronto Maple Leafs'),
    ('Florida Panthers', 'Tampa Bay Lightning'),
    ('Montreal Canadiens', 'Toronto Maple Leafs'),
    ('Chicago Blackhawks', 'Detroit Red Wings')
]


#### Check for rivalries

In [59]:
def is_rivalry(home_team, away_team, rivalries):
    return any([(home_team, away_team) == rivalry or (away_team, home_team) == rivalry for rivalry in rivalries])


In [60]:
nhldata['Rivalry'] = nhldata.apply(
    lambda row: is_rivalry(row['Home Team'], row['Opponent'], rivalries), axis=1)

nhldata['Rivalry'] = nhldata['Rivalry'].astype(bool)

In [61]:
nhldata = pd.DataFrame(nhldata)

file_path = "/Users/aylaspitz/Desktop/MSDS Capstone/nhldata_rivalries.csv"

nhldata.to_csv(file_path, index=False)


In [62]:
true_count = nhldata['Rivalry'].sum()
print(true_count)

265


## Adding in Home Game Streak

In [63]:
nhldata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15202 entries, 0 to 15201
Data columns (total 29 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   ID              15202 non-null  object        
 1   Team            15202 non-null  object        
 2   GP              15202 non-null  int64         
 3   Date            15202 non-null  datetime64[ns]
 4   Year            480 non-null    float64       
 5   Month           15202 non-null  int64         
 6   Day             15202 non-null  object        
 7   Order of Day    15202 non-null  int64         
 8   Order of Month  15202 non-null  int64         
 9   Full Time       15202 non-null  object        
 10  Opponent        15202 non-null  object        
 11  Home Team       15202 non-null  object        
 12  Capacity        15202 non-null  int64         
 13  GF              15202 non-null  int64         
 14  GF Average      480 non-null    float64       
 15  GA

In [64]:
nhldata = nhldata.sort_values(by=['Team', 'Date'])


In [65]:
nhldata['Is Home Game'] = nhldata['Team'] == nhldata['Home Team']
nhldata['Home Game Streak'] = nhldata.groupby('Team')['Is Home Game'].cumsum() - nhldata.groupby('Team')['Is Home Game'].cumsum().where(~nhldata['Is Home Game']).ffill().fillna(0).astype(int)

In [66]:
print(nhldata[['Team', 'Date', 'Home Team', 'Is Home Game', 'Home Game Streak']].head())

               Team       Date        Home Team  Is Home Game  \
9556  Anaheim Ducks 2018-10-03  San Jose Sharks         False   
9557  Anaheim Ducks 2018-10-06  Arizona Coyotes         False   
9558  Anaheim Ducks 2018-10-08    Anaheim Ducks          True   
9559  Anaheim Ducks 2018-10-10    Anaheim Ducks          True   
9560  Anaheim Ducks 2018-10-13     Dallas Stars         False   

      Home Game Streak  
9556                 0  
9557                 0  
9558                 1  
9559                 2  
9560                 0  


In [67]:
file_path = "/Users/aylaspitz/Desktop/MSDS Capstone/nhldata_rivalries_home_game_streak.csv"
nhldata.to_csv(file_path, index=False)