Importing libraries

In [991]:
import csv
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import warnings
import os
import re
import time
import unidecode

warnings.simplefilter(action='ignore', category=FutureWarning)

Function that checks the validity of our player keys

In [856]:
def check_links(ids):
    """Return the player ids whose basketball-reference page does not answer 200.

    Every id is probed with an HTTP HEAD request on
    ``<base_url>/<first character of the id>/<id>.html``. The outcome of each
    probe is printed, and the function sleeps two seconds after every probe.

    Parameters
    ----------
    ids : iterable
        Player ids; each one is converted with ``str`` to build its URL.

    Returns
    -------
    list
        The ids (as passed in, in input order) whose status code was not 200.
    """
    base_url = "https://www.basketball-reference.com/players"
    ids_to_remove = []

    for id_ in ids:
        key = str(id_)
        link = f"{base_url}/{key[0]}/{key}.html"
        status = requests.head(link).status_code
        if status == 200:
            print("Working")
        else:
            print(f"Not working: {link}")
            ids_to_remove.append(id_)
        # Pause between probes.
        time.sleep(2)

    return ids_to_remove

Function that removes accents

In [857]:
def remove_accents(input_str):
    """Strip diacritics from ``input_str`` and return a pure-ASCII string.

    The text is NFKD-normalised, which splits accented characters into a base
    character followed by combining marks; encoding to ASCII with
    ``errors='ignore'`` then drops every code point that is not ASCII.

    Parameters
    ----------
    input_str : str
        Text to clean.

    Returns
    -------
    str
        ``input_str`` without accents or any other non-ASCII character.
    """
    # Imported locally: the notebook's import cell brings in `unidecode`, not
    # the standard-library `unicodedata` module this function relies on, so
    # the original raised NameError on a fresh kernel.
    import unicodedata

    nfkd_form = unicodedata.normalize('NFKD', input_str)

    return nfkd_form.encode('ASCII', 'ignore').decode('ASCII')

Function that extracts all the active player names for a given season

In [886]:
def extract_unique_players(year):
    """Return the distinct player names listed on a season's totals page.

    Parameters
    ----------
    year : int or str
        Season identifier inserted into the ``NBA_{year}_totals.html`` URL.

    Returns
    -------
    list of str
        De-duplicated link texts of the ``data-stat="player"`` cells found in
        the ``totals_stats`` table. The order is unspecified (it comes from a
        set).

    Raises
    ------
    requests.HTTPError
        If the page answers with a 4xx/5xx status.
    ValueError
        If the response contains no ``totals_stats`` table.
    """
    url = f"https://www.basketball-reference.com/leagues/NBA_{year}_totals.html"
    response = requests.get(url)
    # Fail loudly on an error response instead of parsing an error page and
    # crashing later with an AttributeError on None.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table', {'id': 'totals_stats'})
    if table is None:
        raise ValueError(f"No 'totals_stats' table found at {url}")

    # A cell without a link has no player name to read; skip it rather than
    # fail on `td.a` being None.
    players = {
        td.a.get_text()
        for td in table.find_all('td', {'data-stat': 'player'})
        if td.a is not None
    }

    return list(players)

## Injured players IDs extraction

In [1363]:
if os.path.isfile('injuries.txt'):
    # A cached copy exists: load it and skip the scrape entirely.
    injuries_df = pd.read_csv('injuries.txt', sep=' ')
else:
    injuries_data = []
    with open('url_str.txt', 'r') as file:
        url_base = file.read()

    # 442 result pages are requested; the row offset appended to the base URL
    # advances by 25 per page.
    for i in range(442):
        url_str = f"{url_base}{25 * i}"
        req = requests.get(url_str)
        soup = BeautifulSoup(req.content, 'lxml')

        # One list of newline-separated cell texts per left-aligned table row.
        injuries_data.extend(
            item.text.strip().split("\n")
            for item in soup.find_all("tr", {"align": "left"})
        )
        print("Working")

In [1364]:
if not os.path.isfile('injuries.txt'):
    # Turn the scraped rows into a frame and keep only the date, team and
    # "Player In" columns.
    injuries_df = pd.DataFrame(injuries_data)
    injuries_df.columns = ["Date", "Team", "Player Out", "Player In", "Reason"]
    injuries_df = injuries_df.drop(columns=["Player Out", "Reason"])

    # Discard rows whose player cell or team cell is blank.
    injuries_df = injuries_df[
        (injuries_df["Player In"].str.strip() != '')
        & (injuries_df["Team"].str.strip() != '')
    ]
    injuries_df = injuries_df.reset_index(drop=True)

    # Remove the leading bullet characters and spaces in front of the name.
    injuries_df["Player In"] = injuries_df["Player In"].str.lstrip("• ")

In [1365]:
if not os.path.isfile('injuries.txt'):
    # Normalise the player names so they can be turned into ids.
    # `regex` is passed explicitly on every call: its default changed from
    # True to False in pandas 2.0, and with regex=False the parenthesis
    # pattern below would be searched literally and never match.
    injuries_df["Player In"] = (
        injuries_df["Player In"]
        # Drop any parenthesised annotation, e.g. "(...)".
        .str.replace(r'\([^)]*\)', '', regex=True)
        # Plain character removals: dots, apostrophes and hyphens.
        .str.replace('.', '', regex=False)
        .str.replace('\'', '', regex=False)
        .str.replace('-', '', regex=False)
        # Keep only the text in front of the first " /" separator.
        .str.split(" /").str[0]
    )

In [1366]:
if not os.path.isfile('injuries.txt'):
    # Split each name once on whitespace: first token vs. everything after it.
    name_split_df = injuries_df["Player In"].str.split(n=1, expand=True)
    injuries_df["First Name"] = name_split_df[0].str.replace(' ', '')
    injuries_df["Last Name"] = name_split_df[1].str.replace(' ', '')

    # Id = first 5 letters of the last name + first 2 letters of the first
    # name + "01", lower-cased.
    injuries_df["Final ID"] = (
        injuries_df["Last Name"].str[:5] + injuries_df["First Name"].str[:2] + '01'
    ).str.lower()

    # Only the date and the id are kept; rows with a missing value are dropped.
    injuries_df = (
        injuries_df
        .drop(columns=["Player In", "First Name", "Last Name", "Team"])
        .dropna()
    )

Saving the dataframe as txt to avoid having to reload the data

In [884]:
# Write the cache only when it does not exist yet (space-separated, no index column).
if not os.path.isfile('injuries.txt'):
    injuries_df.to_csv(path_or_buf='injuries.txt', sep=' ', index=False)

## Total players IDs extraction

In [887]:
if os.path.isfile('players.txt'):
    # A cached copy exists: load it and skip the scrape.
    players_df = pd.read_csv('players.txt', sep=' ')
else:
    start_year = 2013
    end_year = 2023

    # Collect the names of every season page from start_year to end_year
    # (inclusive), then collapse the names seen in several seasons.
    all_unique_players = []
    for year in range(start_year, end_year + 1):
        all_unique_players += extract_unique_players(year)

    all_unique_players = list(set(all_unique_players))

In [845]:
if not os.path.isfile('players.txt'):
    # Strip dots, hyphens, apostrophes and the " Jr" suffix from every name,
    # in that order.
    all_unique_players = [
        name.replace('.', '').replace('-', '').replace('\'', '').replace(' Jr', '')
        for name in all_unique_players
    ]

In [846]:
if not os.path.isfile('players.txt'):
    # Id = first 5 letters of the last space-separated token + first 2 letters
    # of the first token + "01", lower-cased.
    final_column = [
        (name.split(' ')[-1][:5] + name.split(' ')[0][:2] + '01').lower()
        for name in all_unique_players
    ]

    players_id_df = pd.DataFrame(final_column, columns=['Final ID'])

We chose to remove the duplicates: there are very few of them and keeping them would greatly hurt our model

In [847]:
if not os.path.isfile('players.txt'):
    # Ids that occur more than once (every occurrence counts as a duplicate).
    non_unique_values = (
        players_id_df
        .loc[players_id_df.duplicated('Final ID', keep=False), 'Final ID']
        .unique()
    )
    # Keep only the rows whose id is unambiguous.
    players_df = players_id_df.loc[~players_id_df['Final ID'].isin(non_unique_values)]

In [849]:
if not os.path.isfile('players.txt'):
    # players_df is a filtered view of players_id_df, so the chained-assignment
    # warning is silenced for this single column overwrite.
    with pd.option_context('mode.chained_assignment', None):
        players_df['Final ID'] = players_df['Final ID'].map(remove_accents)

Removing the non-working ids from our selection

In [853]:
if not os.path.isfile('players.txt'):
    # Probe every distinct id and drop the ones whose page does not answer 200.
    ids = players_df['Final ID'].unique().tolist()
    ids_to_remove = check_links(ids)

    players_df = players_df.loc[~players_df['Final ID'].isin(ids_to_remove)]

Removing all the injured players that, for some reason, are not within the broader player set

In [873]:
if not os.path.isfile('players.txt'):
    # Ids present in both tables; injuries of any other id are discarded.
    # NOTE(review): this filter only runs while players.txt is absent, and
    # injuries.txt is written by an earlier cell, so a cached injuries_df is
    # not filtered here — confirm that is acceptable.
    intersection = set(injuries_df['Final ID']) & set(players_df['Final ID'])
    injuries_df = injuries_df.loc[injuries_df['Final ID'].isin(intersection)]

Showing that all injured players are inside the broader set of players with registered statistics

In [889]:
# The ids are consistent when the number of ids shared by both tables equals
# the number of distinct injured ids.
if len(set(injuries_df['Final ID']) & set(players_df['Final ID'])) == injuries_df['Final ID'].nunique(dropna=False):
    print("The ids are correct.")

The ids are correct.


Saving the dataframe as txt to avoid having to reload the data

In [852]:
# Write the cache only when it does not exist yet (space-separated, no index column).
if not os.path.isfile('players.txt'):
    players_df.to_csv(path_or_buf='players.txt', sep=' ', index=False)

## Player game statistics extraction

In [1496]:
# Scrape the per-game logs of the selected players, express each statistic
# relative to the player's season average, and stack everything into stats_df.
# The whole cell is skipped when stats.txt already exists; there is no else
# branch here, so this cell does not assign stats_df in that case.
if not os.path.isfile('stats.txt'):
    base_url = "https://www.basketball-reference.com/players"
    stats = []

    # NOTE(review): only the first 30 ids are processed — this looks like a
    # limit kept from a trial run; confirm before using the full player set.
    for player in list(players_df["Final ID"].iloc[0:30]):
        dfs = []
        # Player overview page: <base>/<first letter of id>/<id>.html
        link = base_url + '/' + player[0] + '/' + player + '.html'
        response = requests.get(link)
        webpage = response.content

        soup = BeautifulSoup(webpage, 'html.parser')
        table = soup.find('table', {'id': 'per_game'})
        # No per-game table on the page: nothing to do for this player.
        if table == None: continue
        avg_stats_df = pd.read_html(str(table))[0]

        # Keep only the rows above the one whose first column reads 'Career'.
        career_index = avg_stats_df[avg_stats_df[avg_stats_df.columns[0]] == 'Career'].index
        if not career_index.empty: avg_stats_df = avg_stats_df.loc[:career_index[0] - 1]

        # Build the game-log season key from the first two and last two
        # characters of the season label, for seasons starting in 2012..2022.
        # Presumably labels look like "2012-13" (giving "2013") — TODO confirm.
        shortened_seasons = [season[:2] + season[-2:] for season in list(avg_stats_df["Season"].unique()) if 2012 <= int(season[:4]) < 2023]
        if shortened_seasons == []: continue

        for season in shortened_seasons:
            link = base_url + '/' + player[0] + '/' + player + '/gamelog/' + season
            response = requests.get(link)
            # Pause after every game-log request.
            time.sleep(3)
            soup = BeautifulSoup(response.content, "html.parser")

            table = soup.find("table", {"id": "pgl_basic"})
            if table == None: continue
            # Header texts without the first header cell; the headers at
            # positions 4 and 6 are renamed to home/away and win/loss
            # (presumably they are blank on the page — TODO confirm).
            columns = [th.getText() for th in table.find("thead").findAll("th")][1:]
            columns[4] = 'H/A'
            columns[6] = 'W/L'

            data_rows = table.find("tbody").findAll("tr")
            data = [[td.getText() for td in data_rows[i].findAll("td")] for i in range(len(data_rows))]

            # Keep only rows that have cells and a non-empty first cell.
            data = [row for row in data if row and row[0] != '']
            df_season = pd.DataFrame(data, columns=columns)
            df_season.insert(0, 'Season', season)
            if len(df_season) == 0: continue
                
            df_season = df_season.drop(columns=['Tm', 'Opp', 'FG%', '3P%', 'FT%', 'W/L'])
            # Encode the venue: '@' becomes '1', an empty cell becomes '0'.
            df_season['H/A'] = df_season['H/A'].replace('@', '1').replace('', '0')
            df_season["+/-"] = df_season["+/-"].str.replace('+', '')

            # Age "<years>-<days>" -> years + days / 365, rounded to one decimal.
            df_season['Age'] = df_season['Age'].apply(lambda x: int(x.split('-')[0]) + int(x.split('-')[1])/365)
            df_season['Age'] = df_season['Age'].round(1)
            # Minutes played "<minutes>:<seconds>" -> fractional minutes.
            df_season['MP'] = df_season['MP'].apply(lambda x: int(x.split(':')[0]) + int(x.split(':')[1])/60)
            # Remaining empty cells count as 0.
            df_season.replace('', 0, inplace=True)

            # Season mean of every statistic except the context columns.
            average_series = df_season.drop(columns=['Season', 'G', 'Date', 'Age', 'H/A', 'GS', '+/-']).astype(float).mean()
            average_df_season = pd.DataFrame(average_series).T

            # Express each game value as a ratio to the season mean; a column
            # whose mean is 0 is set to 0 to avoid dividing by zero.
            for column in average_df_season.columns:
                if average_df_season[column][0] != 0:
                    df_season[column] = (df_season[column].astype(float) / average_df_season[column][0]).round(2)
                else:
                    df_season[column] = 0

            dfs.append(df_season)

        if len(dfs) > 0:
            final_df = pd.concat(dfs, ignore_index=True)
            print("Done for " + str(player))

            # Identify the player and initialise the injury flag to 0.
            final_df.insert(0, 'Player', player)
            final_df.insert(1, 'Inj After', 0)

            stats.append(final_df)

    # NOTE(review): this presumably fails if no player produced any data
    # (empty `stats` list) — confirm pd.concat's behaviour on an empty list.
    stats_df = pd.concat(stats, ignore_index=True)

Done for douglto01
Done for cunnija01
Done for thybuma01
Done for outlatr01
Done for toddis01
Done for mickejo01
Done for howarjo01
Done for sullija01
Done for lowryky01
Done for houstca01
Done for connapa01
Done for poirivi01
Done for jonesje01
Done for alkinra01
Done for kuminjo01
Done for gudurma01
Done for greenaj01
Done for careyve01
Done for cacokde01
Done for tayloty01
Done for terryem01
Done for mitchda01
Done for obriejj01
Done for grantdo01
Done for johnsam01
Done for mathega01
Done for vucevni01


Saving the dataframe as txt to avoid having to reload the data

In [1497]:
if not os.path.isfile('stats.txt'):
    # First run: persist the freshly scraped table.
    stats_df.to_csv('stats.txt', sep=' ', index=False)
else:
    # Cached run: the scraping cell is skipped when stats.txt exists and does
    # not define stats_df, so load the cached table here. Without this, the
    # following cells fail with NameError on a fresh kernel.
    stats_df = pd.read_csv('stats.txt', sep=' ')

## Statistics enhancement

We want to capture the recent trend a given player has followed over his past games. We will add stats for the previous game, as well as averages over the past three and five games. Some injuries develop over time and through accumulated effort, and those variables try to capture that. Beyond the usual efficiency statistics, the *H/A* stat is also interesting, since an accumulation of away games can contribute to fatigue, as can the number of games started (*GS*).

In [1498]:
# Every column from position 6 onwards gets three trend counterparts.
modified_columns = list(stats_df.columns)[6:]

# For each statistic, in order: <name>_Last1, <name>_Last3, <name>_Last5.
trend_columns = [
    column_name + suffix
    for column_name in modified_columns
    for suffix in ("_Last1", "_Last3", "_Last5")
]

For each player and for each season, we will compute those average trend statistics. Each game needs its five predecessors, so we have to drop the first five games of every season (which also removes any season with fewer than six games).

In [1499]:
# Keep only the games whose in-season game counter `G` is greater than 5.
stats_df_above = stats_df.loc[stats_df['G'].astype(float).gt(5)]

In [1500]:
# Row-less frame carrying one column per trend feature.
modified_df = pd.DataFrame()
for column_name in trend_columns:
    modified_df[column_name] = 0

# Append the trend columns to the game table; the new cells (and any other
# missing value in the table) are filled with 0.
stats_df_above = pd.concat([stats_df_above, modified_df], axis=1).fillna(0)

In [1501]:
# Fill the *_Last1 / *_Last3 / *_Last5 columns of every kept game from the
# same player's 1..5 previous games of the same season.
#
# Loop-invariant work is done once up front: the original re-cast `G` to int
# and rebuilt the feature-column list five times for every single row.
feature_columns = list(stats_df.columns)[6:]
game_numbers = stats_df['G'].astype(int)

for idx, game in stats_df_above.iterrows():
    player = game["Player"]
    season = game["Season"]
    count = int(game["G"])

    same_player_season = (stats_df['Player'] == player) & (stats_df['Season'] == season)

    # lagged[k - 1] holds the feature values of the game with counter
    # `count - k`, looked up in the unfiltered table so that games 1..5 of a
    # season are still available as predecessors.
    lagged = [
        stats_df.loc[same_player_season & (game_numbers == count - lag), feature_columns].astype(float)
        for lag in range(1, 6)
    ]
    lag1, lag2, lag3, lag4, lag5 = (frame.values for frame in lagged)

    last1_row = lagged[0].reset_index(drop=True)
    last3_avg = pd.DataFrame(lag1 + lag2 + lag3) / 3
    last5_avg = pd.DataFrame(lag1 + lag2 + lag3 + lag4 + lag5) / 5

    # Name the averaged columns after the statistic they summarise; the
    # "_Last1" rename comes last because the other two derive from its names.
    last3_avg.columns = last1_row.columns + "_Last3"
    last5_avg.columns = last1_row.columns + "_Last5"
    last1_row.columns = last1_row.columns + "_Last1"

    result = pd.concat([last1_row, last3_avg, last5_avg], axis=1)

    # Write the single resulting row into the matching trend columns.
    stats_df_above.loc[idx, result.columns] = result.iloc[0].values

## Linking the injury variable

Now that we have an entire dataset for all individual games played for the period, we must link the injury list to pinpoint what games led to player injuries.

In [1502]:
# Convert the Date column of both tables with pd.to_datetime so that game
# dates and injury dates can be compared chronologically.
stats_df_above['Date'] = pd.to_datetime(stats_df_above['Date'])
injuries_df['Date'] = pd.to_datetime(injuries_df['Date'])

Returns the set of dates of the last game played strictly before each registered injury.

In [1503]:
def gameBeforeInjuryFinder(injury_dates, all_dates):
    """Find, for every injury date, the latest game date strictly before it.

    Parameters
    ----------
    injury_dates : iterable
        Dates on which an injury was registered.
    all_dates : pandas.Series
        Game dates to search; must support ``<`` comparison with the injury
        dates and expose ``.empty`` / ``.max()``.

    Returns
    -------
    set
        The distinct latest-earlier game dates. ``None`` is part of the set
        when at least one injury date has no earlier game.
    """
    def _latest_before(cutoff):
        # Games strictly earlier than the injury date.
        earlier = all_dates[all_dates < cutoff]
        return None if earlier.empty else earlier.max()

    return {_latest_before(cutoff) for cutoff in injury_dates}

Looping through all players on the set.

In [1519]:
# Flag, for every player, the last game played before each registered injury.
# Copy once up front instead of re-copying the whole frame for every player.
stats_df_above = stats_df_above.copy()

for player in list(stats_df_above["Player"].unique()):
    is_player = stats_df_above['Player'] == player

    injury_dates = list(injuries_df.loc[injuries_df['Final ID'] == player, 'Date'])
    all_dates = stats_df_above.loc[is_player, 'Date']

    most_recent_dates_set = gameBeforeInjuryFinder(injury_dates, all_dates)
    # The helper yields None for an injury with no earlier game; there is
    # nothing to flag for those.
    flagged_dates = [date for date in most_recent_dates_set if date is not None]

    # Flag only THIS player's games. The original row-wise apply matched on
    # the date alone, so every other player who happened to play on one of
    # these dates was labelled as injured too.
    stats_df_above.loc[is_player & stats_df_above['Date'].isin(flagged_dates), 'Inj After'] = 1

# NOTE(review): `Inj After` is not part of modified_columns (those start at
# column position 6), so the label computed above is not carried into
# stats_df below — confirm this is intended.
columns_to_select = modified_columns + trend_columns
stats_df = stats_df_above.loc[:, columns_to_select].astype(float).round(2)

In [1520]:
# Write the final feature table only when it does not exist yet
# (space-separated, no index column).
if not os.path.isfile('stats_full.txt'):
    stats_df.to_csv(path_or_buf='stats_full.txt', sep=' ', index=False)