In [63]:
import requests
import pandas as pd

def get_scoring_seasons(season_start, season_end, timeout=30):
    """Build a per-skater scoring table from aggregated NHL stats API reports.

    Four skater reports (``summary``, ``goalsForAgainst``, ``scoringpergame``
    and ``powerplay``) are requested with ``isAggregate=true``,
    ``gameTypeId=2`` and ``season_start <= seasonId <= season_end``, then
    joined on ``player_id``.

    Args:
        season_start: Lowest ``seasonId`` to include; interpolated into the
            query string as-is (e.g. ``20232024``).
        season_end: Highest ``seasonId`` to include; interpolated as-is.
        timeout: Seconds passed to ``requests.get`` for each of the four
            requests, so a stalled connection fails instead of hanging.

    Returns:
        pandas.DataFrame with one row per skater returned by the ``summary``
        report, ordered by descending ``points``. It holds goals/assists/points
        split by strength (``es``/``pp``/``sh``) with their proportions,
        overtime goals, on-ice goals-for (with and without the skater's own
        goals) and primary-assist counts/proportions. Missing values are
        zero-filled only up to the on-ice merge; the primary-assist columns
        added afterwards may contain NaN (e.g. skaters with no assists).

    Raises:
        requests.HTTPError: If any request returns an error status.
        requests.Timeout: If any request exceeds ``timeout``.
        KeyError: If a report lacks one of the expected columns.
    """
    season_filter = (
        f"cayenneExp=gameTypeId=2%20and%20seasonId%3C={season_end}"
        f"%20and%20seasonId%3E={season_start}"
    )

    def fetch_report(report, sort_keys):
        # `sort` is a URL-encoded JSON list of {"property", "direction"} objects.
        sort = "%5B" + ",".join(
            f"%7B%22property%22:%22{prop}%22,%22direction%22:%22{direction}%22%7D"
            for prop, direction in sort_keys
        ) + "%5D"
        url = (
            f"https://api.nhle.com/stats/rest/en/skater/{report}"
            f"?isAggregate=true&isGame=false&sort={sort}"
            f"&start=0&limit=-1&{season_filter}"
        )
        response = requests.get(url, timeout=timeout)
        # Fail loudly on HTTP errors instead of a confusing KeyError on "data".
        response.raise_for_status()
        return pd.json_normalize(response.json()["data"])

    # --- General scoring summary -------------------------------------------
    scoring_stats_df = fetch_report(
        "summary",
        [("points", "DESC"), ("goals", "DESC"), ("assists", "DESC"), ("playerId", "ASC")],
    ).rename(columns={
        "playerId": "player_id",
        "skaterFullName": "player",
        "positionCode": "position",
        "evGoals": "es_goals",
        "evPoints": "es_points",
        "ppGoals": "pp_goals",
        "ppPoints": "pp_points",
        "shGoals": "sh_goals",
        "shPoints": "sh_points",
        "otGoals": "ot_goals",
    })[[
        "player_id", "player", "position", "goals", "assists", "points",
        "es_goals", "es_points", "pp_goals", "pp_points", "sh_goals", "sh_points", "ot_goals",
    ]].copy()  # copy so the column assignments below never hit a view

    # Assists are not reported per strength; derive them as points - goals.
    for strength in ("es", "pp", "sh"):
        scoring_stats_df[f"{strength}_assists"] = (
            scoring_stats_df[f"{strength}_points"] - scoring_stats_df[f"{strength}_goals"]
        )

    # Share of each total that comes from each game state.
    for strength in ("es", "pp", "sh", "ot"):
        scoring_stats_df[f"{strength}_goals_proportion"] = (
            scoring_stats_df[f"{strength}_goals"] / scoring_stats_df["goals"]
        )
    for stat in ("assists", "points"):
        for strength in ("es", "pp", "sh"):
            scoring_stats_df[f"{strength}_{stat}_proportion"] = (
                scoring_stats_df[f"{strength}_{stat}"] / scoring_stats_df[stat]
            )

    # Collapse every non-"D" position code into "F".
    scoring_stats_df["position"] = scoring_stats_df["position"].where(
        scoring_stats_df["position"] == "D", "F"
    )

    scoring_stats_df = scoring_stats_df.sort_values(by="points", ascending=False)

    # --- On-ice goals-for ---------------------------------------------------
    oi_stats_df = fetch_report(
        "goalsForAgainst",
        [("evenStrengthGoalDifference", "DESC"), ("playerId", "ASC")],
    ).rename(columns={
        "playerId": "player_id",
        "evenStrengthGoalsFor": "oi_es_goals_for",
        "powerPlayGoalFor": "oi_pp_goals_for",
        "shortHandedGoalsFor": "oi_sh_goals_for",
    })[["player_id", "oi_es_goals_for", "oi_pp_goals_for", "oi_sh_goals_for"]]

    # fillna(0) also zeroes the 0/0 proportions computed above.
    scoring_stats_df = scoring_stats_df.merge(oi_stats_df, on="player_id", how="left").fillna(0)

    # On-ice goals-for excluding the skater's own goals.
    for strength in ("es", "pp", "sh"):
        scoring_stats_df[f"oi_{strength}_gf_xskater"] = (
            scoring_stats_df[f"oi_{strength}_goals_for"] - scoring_stats_df[f"{strength}_goals"]
        )

    # --- Primary/secondary assists, all strengths --------------------------
    a1_a2_stats_df = fetch_report(
        "scoringpergame",
        [("pointsPerGame", "DESC"), ("goalsPerGame", "DESC"), ("playerId", "ASC")],
    ).rename(columns={
        "playerId": "player_id",
        "totalPrimaryAssists": "primary_assists",
        "totalSecondaryAssists": "secondary_assists",
    })[["player_id", "primary_assists", "secondary_assists"]].copy()

    a1_a2_stats_df["primary_a_proportion"] = a1_a2_stats_df["primary_assists"] / (
        a1_a2_stats_df["primary_assists"] + a1_a2_stats_df["secondary_assists"]
    )

    scoring_stats_df = scoring_stats_df.merge(
        a1_a2_stats_df.drop(columns="secondary_assists"), on="player_id", how="left"
    )

    # --- Primary/secondary assists, power play -----------------------------
    a1_a2_pp_stats_df = fetch_report(
        "powerplay",
        [("ppTimeOnIce", "DESC"), ("playerId", "ASC")],
    ).rename(columns={
        "playerId": "player_id",
        "ppPrimaryAssists": "pp_primary_assists",
        "ppSecondaryAssists": "pp_secondary_assists",
    })[["player_id", "pp_primary_assists", "pp_secondary_assists"]].copy()

    a1_a2_pp_stats_df["pp_primary_a_proportion"] = a1_a2_pp_stats_df["pp_primary_assists"] / (
        a1_a2_pp_stats_df["pp_primary_assists"] + a1_a2_pp_stats_df["pp_secondary_assists"]
    )

    scoring_stats_df = scoring_stats_df.merge(
        a1_a2_pp_stats_df.drop(columns="pp_secondary_assists"), on="player_id", how="left"
    )

    return scoring_stats_df


In [74]:
import requests
import pandas as pd
import numpy as np
# NOTE: the empty-net (EN) data is misbehaving, so all EN stats are currently missing here
def get_scoring_dates(date_start, date_end, timeout=30):
    """Build a per-skater scoring table for games played in a date window.

    Four skater reports (``summary``, ``goalsForAgainst``, ``scoringpergame``
    and ``powerplay``) are requested with ``isAggregate=true``,
    ``isGame=true`` and ``gameTypeId=2``, filtered to
    ``date_start <= gameDate <= date_end 23:59:59``, then joined on
    ``player_id``.

    Args:
        date_start: Lower bound for ``gameDate``, interpolated into the query
            as-is (e.g. ``"2023-01-01"``).
        date_end: Upper bound date; ``" 23:59:59"`` is appended so the whole
            day is included.
        timeout: Seconds passed to ``requests.get`` for each request, so a
            stalled connection fails instead of hanging.

    Returns:
        pandas.DataFrame with one row per skater, ordered by descending
        ``points``, restricted to the output columns listed at the end of the
        function. Missing values (including 0/0 proportions) are filled
        with 0. ``en_goals`` and ``gw_goals`` are requested but currently
        absent because their source report is not merged.

    Raises:
        requests.HTTPError: If any request returns an error status.
        requests.Timeout: If any request exceeds ``timeout``.
        KeyError: If a report lacks one of the expected columns.
    """
    date_filter = (
        f"cayenneExp=gameDate%3C=%22{date_end}%2023%3A59%3A59%22"
        f"%20and%20gameDate%3E=%22{date_start}%22%20and%20gameTypeId=2"
    )

    def get_api_data(report, sort_keys):
        # `sort` is a URL-encoded JSON list of {"property", "direction"} objects.
        sort = "%5B" + ",".join(
            f"%7B%22property%22:%22{prop}%22,%22direction%22:%22{direction}%22%7D"
            for prop, direction in sort_keys
        ) + "%5D"
        url = (
            f"https://api.nhle.com/stats/rest/en/skater/{report}"
            f"?isAggregate=true&isGame=true&sort={sort}"
            f"&start=0&limit=-1&{date_filter}"
        )
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an error for bad responses
        return pd.json_normalize(response.json()['data'])

    # --- General scoring summary -------------------------------------------
    scoring_stats_data = get_api_data(
        "summary",
        [("points", "DESC"), ("goals", "DESC"), ("assists", "DESC"), ("playerId", "ASC")],
    ).rename(columns={
        'playerId': 'player_id',
        'skaterFullName': 'player',
        'positionCode': 'position',
        'evGoals': 'es_goals',
        'evPoints': 'es_points',
        'ppGoals': 'pp_goals',
        'ppPoints': 'pp_points',
        'shGoals': 'sh_goals',
        'shPoints': 'sh_points',
        'otGoals': 'ot_goals',
    }).filter([
        'player_id', 'player', 'position', 'goals', 'assists', 'points',
        'es_goals', 'es_points', 'pp_goals', 'pp_points', 'sh_goals',
        'sh_points', 'ot_goals',
    ])

    # Assists are not reported per strength; derive them as points - goals.
    for col in ['es', 'pp', 'sh']:
        scoring_stats_data[f'{col}_assists'] = (
            scoring_stats_data[f'{col}_points'] - scoring_stats_data[f'{col}_goals']
        )

    # Share of each total that comes from each game state.
    for col in ['es', 'pp', 'sh']:
        scoring_stats_data[f'{col}_goals_proportion'] = scoring_stats_data[f'{col}_goals'] / scoring_stats_data['goals']
        scoring_stats_data[f'{col}_assists_proportion'] = scoring_stats_data[f'{col}_assists'] / scoring_stats_data['assists']
        scoring_stats_data[f'{col}_points_proportion'] = scoring_stats_data[f'{col}_points'] / scoring_stats_data['points']
    scoring_stats_data['ot_goals_proportion'] = scoring_stats_data['ot_goals'] / scoring_stats_data['goals']

    # Collapse every non-'D' position code into 'F'.
    scoring_stats_data['position'] = np.where(scoring_stats_data['position'] == 'D', 'D', 'F')

    scoring_stats_data = scoring_stats_data.sort_values(by='points', ascending=False)

    # --- On-ice goals-for ---------------------------------------------------
    # Keep only the columns we need from each auxiliary report: merging the
    # full report would add `_x`/`_y` suffixes to any column it shares with
    # the scoring frame, which would then be dropped by the final filter.
    oi_stats_data = get_api_data(
        "goalsForAgainst",
        [("evenStrengthGoalDifference", "DESC"), ("playerId", "ASC")],
    ).rename(columns={
        'playerId': 'player_id',
        'evenStrengthGoalsFor': 'oi_es_goals_for',
        'powerPlayGoalFor': 'oi_pp_goals_for',
        'shortHandedGoalsFor': 'oi_sh_goals_for',
    })[['player_id', 'oi_es_goals_for', 'oi_pp_goals_for', 'oi_sh_goals_for']]

    scoring_stats_data = scoring_stats_data.merge(oi_stats_data, on='player_id', how='left').fillna(0)

    # On-ice goals-for excluding the skater's own goals.
    for col in ['es', 'pp', 'sh']:
        scoring_stats_data[f'oi_{col}_gf_xskater'] = (
            scoring_stats_data[f'oi_{col}_goals_for'] - scoring_stats_data[f'{col}_goals']
        )

    # --- Primary/secondary assists, all strengths --------------------------
    a1_a2_stats_data = get_api_data(
        "scoringpergame",
        [("pointsPerGame", "DESC"), ("goalsPerGame", "DESC"), ("playerId", "ASC")],
    ).rename(columns={
        'playerId': 'player_id',
        'totalPrimaryAssists': 'primary_assists',
        'totalSecondaryAssists': 'secondary_assists',
    })[['player_id', 'primary_assists', 'secondary_assists', 'assists']].copy()

    a1_a2_stats_data['primary_a_proportion'] = a1_a2_stats_data['primary_assists'] / a1_a2_stats_data['assists']

    # This report's own `assists` is only needed for the ratio above.
    scoring_stats_data = scoring_stats_data.merge(
        a1_a2_stats_data.drop(columns=['assists']), on='player_id', how='left'
    ).fillna(0)

    # --- Primary/secondary assists, power play -----------------------------
    a1_a2_pp_stats_data = get_api_data(
        "powerplay",
        [("ppTimeOnIce", "DESC"), ("playerId", "ASC")],
    ).rename(columns={
        'playerId': 'player_id',
        'ppPrimaryAssists': 'pp_primary_assists',
        'ppSecondaryAssists': 'pp_secondary_assists',
    })[['player_id', 'pp_primary_assists', 'pp_secondary_assists', 'ppAssists']].copy()

    a1_a2_pp_stats_data['pp_primary_a_proportion'] = (
        a1_a2_pp_stats_data['pp_primary_assists'] / a1_a2_pp_stats_data['ppAssists']
    )

    scoring_stats_data = scoring_stats_data.merge(
        a1_a2_pp_stats_data.drop(columns=['ppAssists']), on='player_id', how='left'
    ).fillna(0)

    # TODO: empty-net and game-winning goals are not merged yet. The earlier
    # attempt against the `realtime` report (emptyNetGoals -> en_goals,
    # gameWinningGoals -> gw_goals) was disabled; it filtered on seasonId using
    # the date arguments, which may be why it returned nothing -- verify.
    # Until then `filter` below silently skips 'en_goals' and 'gw_goals'.

    # Select final columns for the output
    scoring_stats_data = scoring_stats_data.filter([
        'player_id', 'player', 'position', 'goals', 'assists', 'points', 'primary_assists',
        'secondary_assists', 'primary_a_proportion', 'en_goals', 'gw_goals', 'es_goals',
        'es_assists', 'es_points', 'es_goals_proportion', 'es_assists_proportion', 'es_points_proportion',
        'pp_goals', 'pp_assists', 'pp_points', 'pp_goals_proportion', 'pp_assists_proportion',
        'pp_points_proportion', 'pp_primary_assists', 'pp_secondary_assists', 'pp_primary_a_proportion',
        'sh_goals', 'sh_assists', 'sh_points', 'sh_goals_proportion', 'sh_assists_proportion',
        'sh_points_proportion', 'oi_es_goals_for', 'oi_es_gf_xskater', 'oi_pp_goals_for',
        'oi_pp_gf_xskater', 'oi_sh_goals_for', 'oi_sh_gf_xskater', 'ot_goals', 'ot_goals_proportion'
    ]).sort_values(by='points', ascending=False)

    return scoring_stats_data

# Example usage
# The window variables hold the dates that are actually queried; previously
# they were assigned other values and never used, which made the cell misleading.
date_start = "2023-01-01"
date_end = "2024-04-18"
df = get_scoring_dates(date_start=date_start, date_end=date_end)
print(df.head())


In [61]:
import requests
import pandas as pd

def get_toi_dates(date_start, date_end, rounding=True, timeout=30):
    """Fetch aggregated skater time-on-ice data for a date window.

    Requests the ``timeonice`` skater report with ``isAggregate=true``,
    ``isGame=true`` and ``gameTypeId=2``, filtered to
    ``date_start <= gameDate <= date_end 23:59:59``.

    Args:
        date_start: Lower bound for ``gameDate``, interpolated into the query
            as-is.
        date_end: Upper bound date; ``" 23:59:59"`` is appended so the whole
            day is included.
        rounding: When true, round every column whose name ends in ``_gp`` or
            ``_shift`` to whole numbers.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            fails instead of hanging.

    Returns:
        pandas.DataFrame of the report with TOI columns renamed to
        ``toi_*``/``shifts_gp``/``games_played``, position collapsed to
        ``'F'``/``'D'``, sorted by descending ``toi_gp`` (before rounding),
        plus ``proportion_es/pp/sh/ot`` columns (share of total TOI, rounded
        to 3 decimals). Columns not renamed are passed through unchanged.

    Raises:
        requests.HTTPError: If the request returns an error status.
        requests.Timeout: If the request exceeds ``timeout``.
    """
    toi_stats_url = (
        f"https://api.nhle.com/stats/rest/en/skater/timeonice?"
        f"isAggregate=true&isGame=true&sort=%5B%7B%22property%22:%22timeOnIce%22,%22direction%22:%22DESC%22%7D,%7B%22property%22:%22playerId%22,%22direction%22:%22ASC%22%7D%5D"
        f"&start=0&limit=-1&cayenneExp=gameDate%3C=%22{date_end}%2023%3A59%3A59%22%20and%20gameDate%3E=%22{date_start}%22%20and%20gameTypeId=2"
    )

    response = requests.get(toi_stats_url, timeout=timeout)
    response.raise_for_status()  # Ensure the request was successful
    df = pd.json_normalize(response.json()['data'])

    # `shifts` already has the desired name, so it needs no entry here.
    df = df.rename(columns={
        'playerId': 'player_id',
        'skaterFullName': 'player',
        'positionCode': 'position',
        'gamesPlayed': 'games_played',
        'timeOnIce': 'toi_total',
        'timeOnIcePerGame': 'toi_gp',
        'timeOnIcePerShift': 'toi_shift',
        'shiftsPerGame': 'shifts_gp',
        'evTimeOnIce': 'toi_es_total',
        'evTimeOnIcePerGame': 'toi_es_gp',
        'ppTimeOnIce': 'toi_pp_total',
        'ppTimeOnIcePerGame': 'toi_pp_gp',
        'shTimeOnIce': 'toi_sh_total',
        'shTimeOnIcePerGame': 'toi_sh_gp',
        'otTimeOnIce': 'toi_ot_total',
        'otTimeOnIcePerOtGame': 'toi_ot_per_ot_gp',
    })

    # Collapse every non-'D' position code into 'F'.
    df['position'] = df['position'].where(df['position'] == 'D', 'F')

    # Missing OT-per-OT-game values are treated as 0.
    df['toi_ot_per_ot_gp'] = df['toi_ot_per_ot_gp'].fillna(0)

    df = df.sort_values(by='toi_gp', ascending=False)

    # Share of total TOI spent in each game state.
    for state in ('es', 'pp', 'sh', 'ot'):
        df[f'proportion_{state}'] = (df[f'toi_{state}_total'] / df['toi_total']).round(3)

    if rounding:
        rounding_columns = [col for col in df.columns if col.endswith(('_gp', '_shift'))]
        df[rounding_columns] = df[rounding_columns].round()

    return df




In [2]:
import requests
import pandas as pd
import numpy as np

def get_toi_seasons(season_start, season_end, aggregate_data=False, rounding=True, timeout=30):
    """Fetch skater time-on-ice data for a range of seasons.

    Requests the ``timeonice`` skater report with ``isGame=false`` and
    ``gameTypeId=2`` for ``season_start <= seasonId <= season_end``.

    Args:
        season_start: Lowest ``seasonId`` to include; interpolated into the
            query string as-is (e.g. ``20232024``).
        season_end: Highest ``seasonId`` to include; interpolated as-is.
        aggregate_data: Sent to the API as ``isAggregate``. When false, the
            ``seasonId`` column is additionally renamed to ``season``.
        rounding: When true, round every column whose name ends in ``_gp`` or
            ``_shift`` to whole numbers.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            fails instead of hanging.

    Returns:
        pandas.DataFrame of the report with TOI columns renamed to
        ``toi_*``/``shifts_gp``/``games_played``, position collapsed to
        ``'F'``/``'D'``, sorted by descending ``toi_gp`` (before rounding),
        plus ``proportion_es/pp/sh/ot`` columns (share of total TOI, rounded
        to 3 decimals). Columns not renamed are passed through unchanged.

    Raises:
        requests.HTTPError: If the request returns an error status.
        requests.Timeout: If the request exceeds ``timeout``.
    """
    agg_data_arg = 'true' if aggregate_data else 'false'

    toi_stats_url = (
        f"https://api.nhle.com/stats/rest/en/skater/timeonice?"
        f"isAggregate={agg_data_arg}&isGame=false&sort=%5B%7B%22property%22:%22timeOnIce%22,%22direction%22:%22DESC%22%7D,%7B%22property%22:%22playerId%22,%22direction%22:%22ASC%22%7D%5D"
        f"&start=0&limit=-1&cayenneExp=gameTypeId=2%20and%20seasonId%3C={season_end}%20and%20seasonId%3E={season_start}"
    )

    response = requests.get(toi_stats_url, timeout=timeout)
    response.raise_for_status()  # Ensure we notice bad responses
    df = pd.json_normalize(response.json()['data'])

    # One shared mapping for both modes; `shifts` already has the desired name.
    column_names = {
        'playerId': 'player_id',
        'skaterFullName': 'player',
        'positionCode': 'position',
        'gamesPlayed': 'games_played',
        'timeOnIce': 'toi_total',
        'timeOnIcePerGame': 'toi_gp',
        'timeOnIcePerShift': 'toi_shift',
        'shiftsPerGame': 'shifts_gp',
        'evTimeOnIce': 'toi_es_total',
        'evTimeOnIcePerGame': 'toi_es_gp',
        'ppTimeOnIce': 'toi_pp_total',
        'ppTimeOnIcePerGame': 'toi_pp_gp',
        'shTimeOnIce': 'toi_sh_total',
        'shTimeOnIcePerGame': 'toi_sh_gp',
        'otTimeOnIce': 'toi_ot_total',
        'otTimeOnIcePerOtGame': 'toi_ot_per_ot_gp',
    }
    if not aggregate_data:
        # Only the non-aggregated mode renames the season column.
        column_names['seasonId'] = 'season'
    df = df.rename(columns=column_names)

    # Collapse every non-'D' position code into 'F'.
    df['position'] = df['position'].where(df['position'] == 'D', 'F')

    # Missing OT-per-OT-game values are treated as 0.
    df['toi_ot_per_ot_gp'] = df['toi_ot_per_ot_gp'].fillna(0)

    df = df.sort_values(by='toi_gp', ascending=False)

    # Share of total TOI spent in each game state.
    for state in ('es', 'pp', 'sh', 'ot'):
        df[f'proportion_{state}'] = (df[f'toi_{state}_total'] / df['toi_total']).round(3)

    if rounding:
        rounding_columns = [col for col in df.columns if col.endswith(('_gp', '_shift'))]
        df[rounding_columns] = df[rounding_columns].round()

    return df


In [5]:
import pandas as pd
from scipy.stats import percentileofscore
# Fetch time-on-ice rows for season IDs 20232024 through 20242025
players_df = get_toi_seasons(20232024, 20242025)

# Keep rows with at least 40 games played, restricted to the columns used downstream
columns_to_keep = ['player', 'toi_pp_gp', 'toi_sh_gp', 'toi_es_gp', 'toi_ot_per_ot_gp', 'games_played', 'position']
players_df = players_df.loc[players_df['games_played'] >= 40, columns_to_keep]

# Independent copies per position group so new columns can be added to each safely
forwards_df = players_df.loc[players_df['position'] == 'F'].copy()
defense_df = players_df.loc[players_df['position'] == 'D'].copy()

# Define a function to calculate percentiles for a specific column
def calculate_percentile(df, column):
    """Return each row's percentile rank (0-100] within ``df[column]``.

    Equivalent to ``scipy.stats.percentileofscore(df[column], x)`` with its
    default ``kind='rank'`` for every value ``x`` in the column (ties share
    the average rank), but computed in one vectorised pass instead of one
    full-column scan per row.

    Args:
        df: DataFrame containing ``column``.
        column: Name of the numeric column to rank.

    Returns:
        pandas.Series aligned to ``df.index`` holding the percentile ranks.
        NaN inputs yield NaN and are excluded from the ranking of the others.
    """
    return df[column].rank(method='average', pct=True) * 100

# Add a '<metric>_percentile' column to each position group, ranked within that group
toi_metrics = ('toi_pp_gp', 'toi_sh_gp', 'toi_es_gp', 'toi_ot_per_ot_gp')
for position_df in (forwards_df, defense_df):
    for column in toi_metrics:
        position_df[f'{column}_percentile'] = calculate_percentile(position_df, column)

# Now forwards_df and defense_df contain the percentile rankings for each player in their respective categories


In [7]:

forwards_df

Unnamed: 0,player,toi_pp_gp,toi_sh_gp,toi_es_gp,toi_ot_per_ot_gp,games_played,position,toi_pp_gp_percentile,toi_sh_gp_percentile,toi_es_gp_percentile,toi_ot_per_ot_gp_percentile
31,Mikko Rantanen,276.0,2.0,1097.0,79.0,80,F,100.000000,25.985222,100.000000,89.901478
20,Nathan MacKinnon,270.0,4.0,1095.0,104.0,82,F,99.507389,32.389163,99.753695,98.891626
39,Nikita Kucherov,243.0,0.0,1057.0,70.0,81,F,99.014778,6.773399,99.507389,84.359606
76,Kirill Kaprizov,250.0,2.0,1043.0,112.0,75,F,99.261084,25.985222,99.014778,99.507389
38,Vincent Trocheck,209.0,95.0,984.0,78.0,82,F,92.610837,73.891626,96.428571,89.162562
...,...,...,...,...,...,...,...,...,...,...,...
676,Walker Duehr,6.0,3.0,477.0,0.0,40,F,16.009852,29.310345,1.231527,9.975369
650,Ryan Reaves,3.0,0.0,484.0,0.0,49,F,8.990148,6.773399,1.847291,9.975369
622,Michael Pezzetta,2.0,8.0,458.0,0.0,61,F,5.295567,38.423645,0.738916,9.975369
618,Mark Kastelic,4.0,7.0,453.0,0.0,63,F,12.192118,36.945813,0.492611,9.975369


In [9]:
import hdbscan
import pandas as pd

# Feature matrix for forwards: the four TOI percentile columns
percentile_columns = [
    'toi_pp_gp_percentile',
    'toi_sh_gp_percentile',
    'toi_es_gp_percentile',
    'toi_ot_per_ot_gp_percentile',
]
data_for_clustering = forwards_df.loc[:, percentile_columns]

# Cluster with HDBSCAN and keep the fitted model around for inspection
clusterer = hdbscan.HDBSCAN(min_cluster_size=5, min_samples=1)
cluster_labels = clusterer.fit_predict(data_for_clustering)

# Store each forward's cluster label next to its features
forwards_df['cluster'] = cluster_labels


## Code to make unfilterable chart

In [16]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from umap import UMAP



# Project the four percentile features onto two dimensions with UMAP
percentile_columns = ['toi_pp_gp_percentile', 'toi_sh_gp_percentile', 'toi_es_gp_percentile', 'toi_ot_per_ot_gp_percentile']
umap_model = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric="cosine")
reduced_embeddings = umap_model.fit_transform(forwards_df[percentile_columns])

# Keep the 2D coordinates on the DataFrame so they travel with each player
forwards_df['x'] = reduced_embeddings[:, 0]
forwards_df['y'] = reduced_embeddings[:, 1]


def describe_forward(row):
    """Return the HTML hover label for one forward's row."""
    return f"Player: {row['player']}<br>PP Percentile: {row['toi_pp_gp_percentile']}<br>SH Percentile: {row['toi_sh_gp_percentile']}<br>ES Percentile: {row['toi_es_gp_percentile']}<br>OT Percentile: {row['toi_ot_per_ot_gp_percentile']}"


# Single-trace scatter: every forward is one marker with a hover label
fig = go.Figure()
fig.add_trace(
    go.Scattergl(
        x=forwards_df['x'],
        y=forwards_df['y'],
        mode='markers',
        marker={'size': 5, 'opacity': 0.7},
        text=forwards_df.apply(describe_forward, axis=1),
        hoverinfo='text',
    )
)

# Centered bold title on a clean white template, fixed canvas size
fig.update_layout(
    template="simple_white",
    title=dict(
        text="<b>Player Percentiles Mapped to 2D Space</b>",
        x=0.5,
        xanchor="center",
        yanchor="top",
        font={"size": 22, "color": "Black"},
    ),
    width=1200,
    height=750,
)

# The UMAP axes carry no interpretable scale, so hide them
fig.update_xaxes(visible=False)
fig.update_yaxes(visible=False)

# Write the interactive chart to a standalone HTML file
fig.write_html('forwardclusters.html')


## defense clustering

In [19]:
import hdbscan
import pandas as pd


# Feature matrix for defensemen: the four TOI percentile columns
percentile_columns = [
    'toi_pp_gp_percentile',
    'toi_sh_gp_percentile',
    'toi_es_gp_percentile',
    'toi_ot_per_ot_gp_percentile',
]
data_for_clustering = defense_df.loc[:, percentile_columns]

# Cluster with HDBSCAN and keep the fitted model around for inspection
clusterer = hdbscan.HDBSCAN(min_cluster_size=5, min_samples=1)
cluster_labels = clusterer.fit_predict(data_for_clustering)

# Store each defenseman's cluster label next to its features
defense_df['cluster'] = cluster_labels

## Generate plots with ability to isolate by cluster

In [23]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from umap import UMAP


# Project the four percentile features onto two dimensions with UMAP
percentile_columns = ['toi_pp_gp_percentile', 'toi_sh_gp_percentile', 'toi_es_gp_percentile', 'toi_ot_per_ot_gp_percentile']
umap_model = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric="cosine")
reduced_embeddings = umap_model.fit_transform(defense_df[percentile_columns])

# Keep the 2D coordinates on the DataFrame so they travel with each player
defense_df['x'] = reduced_embeddings[:, 0]
defense_df['y'] = reduced_embeddings[:, 1]


def describe_defenseman(row):
    """Return the HTML hover label for one defenseman's row."""
    return f"Player: {row['player']}<br>PP Percentile: {row['toi_pp_gp_percentile']}<br>SH Percentile: {row['toi_sh_gp_percentile']}<br>ES Percentile: {row['toi_es_gp_percentile']}<br>OT Percentile: {row['toi_ot_per_ot_gp_percentile']}"


# Single-trace scatter: every defenseman is one marker with a hover label
fig = go.Figure()
fig.add_trace(
    go.Scattergl(
        x=defense_df['x'],
        y=defense_df['y'],
        mode='markers',
        marker={'size': 5, 'opacity': 0.7},
        text=defense_df.apply(describe_defenseman, axis=1),
        hoverinfo='text',
    )
)

# Centered bold title on a clean white template, fixed canvas size
fig.update_layout(
    template="simple_white",
    title=dict(
        text="<b>Defence Percentiles Mapped to 2D Space</b>",
        x=0.5,
        xanchor="center",
        yanchor="top",
        font={"size": 22, "color": "Black"},
    ),
    width=1200,
    height=750,
)

# The UMAP axes carry no interpretable scale, so hide them
fig.update_xaxes(visible=False)
fig.update_yaxes(visible=False)

# Write the interactive chart to a standalone HTML file
fig.write_html('defence_clusters.html')


In [27]:
import plotly.graph_objects as go
from umap import UMAP

def plot_hdbscan_clusters(df, cluster_labels):
    """Plot a 2D UMAP embedding of the TOI percentiles, one trace per cluster.

    Args:
        df: DataFrame with a ``player`` column and the four
            ``toi_*_percentile`` columns. It is modified in place: ``x``,
            ``y`` and ``cluster`` columns are added or overwritten.
        cluster_labels: Cluster label per row of ``df``; assigned to
            ``df['cluster']``.

    Returns:
        plotly.graph_objects.Figure with one ``Scattergl`` trace per distinct
        cluster label (in order of first appearance), so individual clusters
        can be toggled from the legend.
    """
    feature_columns = [
        'toi_pp_gp_percentile',
        'toi_sh_gp_percentile',
        'toi_es_gp_percentile',
        'toi_ot_per_ot_gp_percentile',
    ]

    # Embed the percentile features in two dimensions.
    reducer = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine')
    coords = reducer.fit_transform(df[feature_columns])

    df['x'] = coords[:, 0]
    df['y'] = coords[:, 1]
    df['cluster'] = cluster_labels

    def hover_label(player, pp, sh, es, ot):
        # Percentiles are shown with two decimals in the hover box.
        return f"{player}<br>PP: {pp:.2f}<br>SH: {sh:.2f}<br>ES: {es:.2f}<br>OT: {ot:.2f}"

    fig = go.Figure()

    # One trace per cluster so the legend can show or hide each group.
    for cluster in df['cluster'].unique():
        members = df.loc[df['cluster'] == cluster]
        hover_text = list(map(
            hover_label,
            members['player'],
            *(members[name] for name in feature_columns),
        ))
        trace = go.Scattergl(
            x=members['x'],
            y=members['y'],
            mode='markers',
            marker={'size': 8, 'opacity': 0.7},
            name=f'Cluster {cluster}',
            text=hover_text,
        )
        fig.add_trace(trace)

    layout_options = {
        'title': 'HDBSCAN Clusters of NHL Players',
        'xaxis_title': 'UMAP Dimension 1',
        'yaxis_title': 'UMAP Dimension 2',
        'legend_title': 'Cluster',
        'hovermode': 'closest',
        'width': 800,
        'height': 600,
    }
    fig.update_layout(**layout_options)

    return fig

# Forwards: one legend entry per HDBSCAN cluster
fwd_iso_fig = plot_hdbscan_clusters(forwards_df, forwards_df['cluster'])
fwd_iso_fig.write_html('fwd_clusters_iso.html')

# Defensemen: previously this re-plotted forwards_df, so the "def" HTML file
# showed forwards; use the defense frame and its own cluster labels.
def_iso_fig = plot_hdbscan_clusters(defense_df, defense_df['cluster'])
def_iso_fig.write_html('def_clusters_iso.html')