The way this works is that each NBA player has a share price based on the market's
projection of their career stats. One can then trade the stocks depending on if they
think the share price will increase or decrease.

The mojo value formula for NBA career stocks is as follows:
0.0125 steals + 0.0125 blocks + 0.01 points + 0.0075 assists 
+ 0.005 rebounds + 0.005 3pts made + 0.0025 plus minus - 0.0125 turnovers
- 0.01 fg missed - 0.005 ft missed

In [1]:
import timeit
import time
import pandas as pd
from requests import get
from bs4 import BeautifulSoup
import string
import re

import warnings
warnings.filterwarnings('ignore')

First let's create our dataset by scraping all players from basketball reference who debuted after the introduction of the 3 point line.

In [2]:
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
           'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
           'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
           'Accept-Encoding': 'none',
           'Accept-Language': 'en-US,en;q=0.8',
           'Connection': 'keep-alive'}

Define a function to retrieve each feature

In [3]:
def player_debut(soupy_object):
    try:
        debut = soupy_object.find('strong', text=re.compile('NBA Debut')).find_next_sibling('a').text[-4:]
    except:
        debut = None
    return debut

def player_name(soupy_object):
    try:
        name = soupy_object.find('h1').find('span').text
    except:
        name = None
    return name

def player_height(soupy_object):
    try:
        height_parent = soupy_object.find('span', text=re.compile('lb')).parent()
        height = height_parent[0].text
    except:
        height = None
    return height

def player_weight(soupy_object):
    try:
        weight_parent = soupy_object.find('span', text=re.compile('lb')).parent()
        weight = weight_parent[1].text
    except:
        weight = None
    return weight

def player_season(soupy_object):
    try:
        season = []
        seasons = soupy_object.find_all('tr', attrs = {'id':re.compile('totals.')})
        for szn in seasons:
            season.append(szn['id'][-4:])
    except:
        season = None
    return season

def player_age(soupy_object):
    try:
        age = []
        ages = soupy_object.find_all('tr', attrs = {'id':re.compile('totals.')})
        for ag in ages:
            a = ag.find('td', attrs = {'data-stat':'age'}).text
            age.append(a)
    except:
        age = None
    return age

def player_games_played(soupy_object):
    try:
        games_played = []
        gamess = soupy_object.find_all('tr', attrs = {'id':re.compile('totals.')})
        for games in gamess:
            game = games.find('td', attrs = {'data-stat':'g'}).text
            games_played.append(game)
    except:
        games_played = None
    return games_played

def player_games_started(soupy_object):
    try:
        games_started = []
        gamess = soupy_object.find_all('tr', attrs = {'id':re.compile('totals.')})
        for games in gamess:
            game = games.find('td', attrs = {'data-stat':'gs'}).text
            games_started.append(game)
    except:
        games_started = None
    return games_started

def player_minutes_played(soupy_object):
    try:
        mins_played = []
        minss = soupy_object.find_all('tr', attrs = {'id':re.compile('totals.')})
        for mins in minss:
            min = mins.find('td', attrs = {'data-stat':'mp'}).text
            mins_played.append(min)
    except:
        mins_played = None
    return mins_played

def player_fg(soupy_object):
    try:
        field_goals = []
        fgss = soupy_object.find_all('tr', attrs = {'id':re.compile('totals.')})
        for fgs in fgss:
            fg = fgs.find('td', attrs = {'data-stat':'fg'}).text
            field_goals.append(fg)
    except:
        field_goals = None
    return field_goals

def player_fga(soupy_object):
    try:
        field_goals_attempted = []
        fgass = soupy_object.find_all('tr', attrs = {'id':re.compile('totals.')})
        for fgas in fgass:
            fga = fgas.find('td', attrs = {'data-stat':'fga'}).text
            field_goals_attempted.append(fga)
    except:
        field_goals_attempted = None
    return field_goals_attempted

def player_threes(soupy_object):
    try:
        three_pointers = []
        threess = soupy_object.find_all('tr', attrs = {'id':re.compile('totals.')})
        for threes in threess:
            three = threes.find('td', attrs = {'data-stat':'fg3'}).text
            three_pointers.append(three)
    except:
        three_pointers = None
    return three_pointers

def player_threes_attempted(soupy_object):
    try:
        three_pointers_attempted = []
        threeass = soupy_object.find_all('tr', attrs = {'id':re.compile('totals.')})
        for threeas in threeass:
            threea = threeas.find('td', attrs = {'data-stat':'fg3a'}).text
            three_pointers_attempted.append(threea)
    except:
        three_pointers_attempted = None
    return three_pointers_attempted

def player_ft(soupy_object):
    try:
        free_throws = []
        ftss = soupy_object.find_all('tr', attrs = {'id':re.compile('totals.')})
        for fts in ftss:
            ft = fts.find('td', attrs = {'data-stat':'ft'}).text
            free_throws.append(ft)
    except:
        free_throws = None
    return free_throws

def player_fta(soupy_object):
    try:
        free_throws_attempted = []
        ftass = soupy_object.find_all('tr', attrs = {'id':re.compile('totals.')})
        for ftas in ftass:
            fta = ftas.find('td', attrs = {'data-stat':'fta'}).text
            free_throws_attempted.append(fta)
    except:
        free_throws_attempted = None
    return free_throws_attempted

def player_orb(soupy_object):
    try:
        offensive_rebounds = []
        orbss = soupy_object.find_all('tr', attrs = {'id':re.compile('totals.')})
        for orbs in orbss:
            orb = orbs.find('td', attrs = {'data-stat':'orb'}).text
            offensive_rebounds.append(orb)
    except:
        offensive_rebounds = None
    return offensive_rebounds

def player_drb(soupy_object):
    try:
        defensive_rebounds = []
        drbss = soupy_object.find_all('tr', attrs = {'id':re.compile('totals.')})
        for drbs in drbss:
            drb = drbs.find('td', attrs = {'data-stat':'drb'}).text
            defensive_rebounds.append(drb)
    except:
        defensive_rebounds = None
    return defensive_rebounds

def player_assists(soupy_object):
    try:
        assists = []
        astss = soupy_object.find_all('tr', attrs = {'id':re.compile('totals.')})
        for asts in astss:
            ast = asts.find('td', attrs = {'data-stat':'ast'}).text
            assists.append(ast)
    except:
        assists = None
    return assists

def player_steals(soupy_object):
    try:
        steals = []
        stlss = soupy_object.find_all('tr', attrs = {'id':re.compile('totals.')})
        for stls in stlss:
            stl = stls.find('td', attrs = {'data-stat':'stl'}).text
            steals.append(stl)
    except:
        steals = None
    return steals

def player_blocks(soupy_object):
    try:
        blocks = []
        blkss = soupy_object.find_all('tr', attrs = {'id':re.compile('totals.')})
        for blks in blkss:
            blk = blks.find('td', attrs = {'data-stat':'blk'}).text
            blocks.append(blk)
    except:
        blocks = None
    return blocks

def player_turnovers(soupy_object):
    try:
        turnovers = []
        tovss = soupy_object.find_all('tr', attrs = {'id':re.compile('totals.')})
        for tovs in tovss:
            tov = tovs.find('td', attrs = {'data-stat':'tov'}).text
            turnovers.append(tov)
    except:
        turnovers = None
    return turnovers

Compile into a single dataframe

In [4]:
data_list = []
# rate limit needed because basketball reference blocks users that send more than 20 requests per minute
rate_limit = 4
def get_data():
    for letter in string.ascii_lowercase:
        timestart = timeit.default_timer()
        url = f'https://www.basketball-reference.com/players/{letter}'
        req = get(url, headers = headers)
        # print(req)
        soup = BeautifulSoup(req.content, 'html.parser')
        links = soup.find_all('a', href=re.compile(f'players/{letter}'))
        # print(links)

        for k, item in enumerate(links):
            timestart_per_player = timeit.default_timer()
            main_url = 'https://www.basketball-reference.com'
            sub_url = item.get('href')
            print(sub_url)
            data_url = main_url + sub_url
            request = get(data_url, headers=headers)
            soup_get = BeautifulSoup(request.content, 'html.parser')

            debut = player_debut(soup_get)
            # print(debut)
            if (debut == None or int(debut) < 1979):
                timestop_per_player = timeit.default_timer()
                if ((timestop_per_player - timestart_per_player) < rate_limit):
                    time.sleep(rate_limit - timestop_per_player + timestart_per_player)
                continue
            
            name = player_name(soup_get)
            height = player_height(soup_get)
            weight = player_weight(soup_get)
            season = player_season(soup_get)
            age = player_age(soup_get)
            games_played = player_games_played(soup_get)
            games_started = player_games_started(soup_get)
            minutes_played = player_minutes_played(soup_get)
            fg = player_fg(soup_get)
            fga = player_fga(soup_get)
            threes = player_threes(soup_get)
            threes_attempted = player_threes_attempted(soup_get)
            ft = player_ft(soup_get)
            fta = player_fta(soup_get)
            orb = player_orb(soup_get)
            drb = player_drb(soup_get)
            assists = player_assists(soup_get)
            steals = player_steals(soup_get)
            blocks = player_blocks(soup_get)
            turnovers = player_turnovers(soup_get)

            for i in range(len(season)):
                data = {'Name': name, 'Height': height, 'Weight': weight, 'Season': season[i], 'Age': age[i], 'Games_Played': games_played[i],
                        'Game_Started': games_started[i], 'Minutes_Played': minutes_played[i], 'Field_Goals': fg[i], 'Field_Goals_Attempted': fga[i],
                        'Threes_Made': threes[i], 'Threes_Attempted': threes_attempted[i], 'Free_Throws': ft[i], 'Free_Throws_Attempted': fta[i],
                        'Offensive_Rebounds': orb[i], 'Defensive_Rebounds': drb[i], 'Assists': assists[i], 'Steals': steals[i], 'Blocks': blocks[i],
                        'Turnovers': turnovers[i]}
                data_list.append(data)
            # break
            timestop_per_player = timeit.default_timer()
            if ((timestop_per_player - timestart_per_player) < rate_limit):
                time.sleep(rate_limit - timestop_per_player + timestart_per_player)
        # break
        timestop = timeit.default_timer()
        print(f'You scraped player names starting with: {letter}')
        print('Time :', timestop - timestart)
        
    return data_list

nba_df = pd.DataFrame(get_data())
nba_df.head()


/players/a/abdelal01.html
/players/a/abdulza01.html
/players/a/abdulka01.html
/players/a/abdulma02.html
/players/a/abdulta01.html
/players/a/abdursh01.html
/players/a/abernto01.html
/players/a/ablefo01.html
/players/a/abramjo01.html
/players/a/abrinal01.html
/players/a/achiupr01.html
/players/a/ackeral01.html
/players/a/ackerdo01.html
/players/a/acresma01.html
/players/a/actonbu01.html
/players/a/acyqu01.html
/players/a/adamsal01.html
/players/a/adamsdo01.html
/players/a/adamsge01.html
/players/a/adamsha01.html
/players/a/adamsja01.html
/players/a/adamsjo01.html
/players/a/adamsmi01.html
/players/a/adamsst01.html
/players/a/addisra01.html
/players/a/adebaba01.html
/players/a/adelde01.html
/players/a/adelmri01.html
/players/a/adrieje01.html
/players/a/afflaar01.html
/players/a/agbajoc01.html
/players/a/agerma01.html
/players/a/aguirma01.html
/players/a/ahearbl01.html
/players/a/aingeda01.html
/players/a/aitchma01.html
/players/a/ajincal01.html
/players/a/akinhe01.html
/players/a/akognjo

Unnamed: 0,Name,Height,Weight,Season,Age,Games_Played,Game_Started,Minutes_Played,Field_Goals,Field_Goals_Attempted,Threes_Made,Threes_Attempted,Free_Throws,Free_Throws_Attempted,Offensive_Rebounds,Defensive_Rebounds,Assists,Steals,Blocks,Turnovers
0,Alaa Abdelnaby,6-10,240lb,1991,22,43,0,290,55,116,0,0,25,44,27,62,12,4,12,22
1,Alaa Abdelnaby,6-10,240lb,1992,23,71,1,934,178,361,0,0,76,101,81,179,30,25,16,66
2,Alaa Abdelnaby,6-10,240lb,1993,24,75,52,1311,245,473,0,1,88,116,126,211,27,25,26,97
3,Alaa Abdelnaby,6-10,240lb,1993,24,12,0,159,26,56,0,1,12,16,12,25,10,6,4,13
4,Alaa Abdelnaby,6-10,240lb,1993,24,63,52,1152,219,417,0,0,76,100,114,186,17,19,22,84


In [5]:
nba_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29781 entries, 0 to 29780
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Name                   29781 non-null  object
 1   Height                 29781 non-null  object
 2   Weight                 29672 non-null  object
 3   Season                 29781 non-null  object
 4   Age                    29781 non-null  object
 5   Games_Played           29781 non-null  object
 6   Game_Started           29781 non-null  object
 7   Minutes_Played         29781 non-null  object
 8   Field_Goals            29781 non-null  object
 9   Field_Goals_Attempted  29781 non-null  object
 10  Threes_Made            29781 non-null  object
 11  Threes_Attempted       29781 non-null  object
 12  Free_Throws            29781 non-null  object
 13  Free_Throws_Attempted  29781 non-null  object
 14  Offensive_Rebounds     29781 non-null  object
 15  Defensive_Rebounds 

In [6]:
nba_df.to_csv('Datasets/nba_player_data.csv', index_label = False)