In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# using beautiful soup to get copy of rankings webpage
def get_soup(url, timeout=30):
    """Download a web page and parse it with BeautifulSoup.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests.get`` can block forever on an unresponsive host.

    Returns
    -------
    bs4.BeautifulSoup
        The page body parsed with the built-in ``'html.parser'`` backend.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status, so that an error page is
        not silently parsed as if it were the rankings page.
    requests.Timeout
        If the server does not respond within ``timeout`` seconds.
    """
    page = requests.get(url, timeout=timeout)
    # Fail loudly here instead of later with an opaque AttributeError when the
    # expected table is missing from an error page.
    page.raise_for_status()
    return BeautifulSoup(page.text, 'html.parser')

# Download one ratings page per NBA 2K edition, newest first.
# NOTE(review): the soup20 URL is the un-dated landing page, unlike the other
# five season-specific URLs; presumably it served the 2019-2020 ratings when
# this was written -- confirm it still does before re-running.
soup21, soup20, soup19, soup18, soup17, soup16 = map(
    get_soup,
    (
        'https://hoopshype.com/nba2k/2020-2021/',
        'https://hoopshype.com/nba2k/',
        'https://hoopshype.com/nba2k/2018-2019/',
        'https://hoopshype.com/nba2k/2017-2018/',
        'https://hoopshype.com/nba2k/2016-2017/',
        'https://hoopshype.com/nba2k/2015-2016/',
    ),
)

In [3]:
# finding html element on webpage to retrieve rankings
def get_players(soup):
    """Extract player names and 2K ratings from a parsed rankings page.

    The table is looked up inside the ``div`` with class ``wrapper-holder``.
    Only rows of its ``tbody`` that contain a link (``<a>``) are used: the
    link text is the player name and the ``td`` with class ``value`` holds
    the rating.

    Returns a DataFrame with a ``players`` column (stripped strings) and a
    ``rankings`` column converted to ``int``.
    """
    body = soup.find('div', class_='wrapper-holder').find('table').tbody
    # Rows without a link carry no player, so they are left out.
    linked_rows = [row for row in body.find_all('tr') if row.a is not None]

    names = [row.a.text.strip() for row in linked_rows]
    ratings = [row.find('td', class_='value').text.strip() for row in linked_rows]

    nba2k = pd.DataFrame({'players': names, 'rankings': ratings})
    nba2k['rankings'] = nba2k['rankings'].astype(int)
    return nba2k

In [4]:
# Turn each downloaded ratings page into a players/rankings DataFrame.
nba2k21, nba2k20, nba2k19, nba2k18, nba2k17, nba2k16 = map(
    get_players,
    (soup21, soup20, soup19, soup18, soup17, soup16),
)

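Each NBA 2K edition's ratings are paired with the stats from the preceding NBA season:

- NBA 2K21 = 2019-20 season
- NBA 2K20 = 2018-19 season
- NBA 2K19 = 2017-18 season
- NBA 2K18 = 2016-17 season
- NBA 2K17 = 2015-16 season
- NBA 2K16 = 2014-15 season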

In [5]:
# The table on stats.nba.com is interactive, so Selenium is needed to 'fake' browser activity.
# open_nba(season) opens the NBA stats page for the given season string (e.g. '2019-20') and returns the browser.
# get_stats(browser) reads the stats table and returns a list of player names and a list of per-player stat rows.
from selenium import webdriver
from selenium.webdriver.support.ui import Select

def open_nba(season, driver_path='/usr/local/bin/chromedriver'):
    """Open the stats.nba.com traditional player-stats page for one season.

    A Chrome window is started, pointed at the regular-season table for
    ``season`` and the page-size dropdown is switched to 'All' so the full
    table is loaded.

    Parameters
    ----------
    season : str
        Season identifier appended to the URL, e.g. ``'2019-20'``.
    driver_path : str, optional
        Location of the chromedriver executable. Defaults to the path that
        was previously hard-coded, so existing calls behave the same.

    Returns
    -------
    The Selenium Chrome browser, still open. The caller owns it and should
    call ``browser.quit()`` once the table has been read.

    Notes
    -----
    NOTE(review): this uses the Selenium 3 style API (positional driver path,
    ``find_element_by_xpath``); confirm the installed Selenium version still
    provides it.
    """
    browser = webdriver.Chrome(driver_path)
    try:
        browser.get('https://stats.nba.com/players/traditional/?SeasonType=Regular%20Season&sort=PLAYER_NAME&dir=-1&Season='+str(season))
        browser.maximize_window()
        # Element lookups below wait up to 20 seconds for the table to render.
        browser.implicitly_wait(20)
        # get the select tag to change to 'all' to load full table
        page_size = Select(browser.find_element_by_xpath('/html/body/main/div[2]/div/div[2]/div/div/nba-stat-table/div[3]/div/div/select'))
        page_size.select_by_visible_text('All')
    except BaseException:
        # On any failure (including a manual interrupt) no handle is returned,
        # so close the window here instead of leaking a Chrome process.
        browser.quit()
        raise
    return browser

def get_stats(browser):
    """Read the rendered stats table and split it into names and stat rows.

    The text of the element with class ``nba-stat-table__overflow`` is split
    into lines and the first line is discarded (presumably the column
    header). The remaining lines alternate: a player name, then that
    player's values separated by single spaces.

    Returns ``(players, stats)`` where ``players`` is a list of name strings
    and ``stats`` is a list of lists of strings, one inner list per stat line.

    NOTE(review): ``find_element_by_class_name`` is the Selenium 3 style
    lookup; confirm the installed Selenium version still provides it.
    """
    rendered = browser.find_element_by_class_name('nba-stat-table__overflow').text
    body_lines = rendered.split('\n')[1:]

    # Even positions hold names, odd positions hold the matching stat line.
    players = body_lines[0::2]
    stats = [stat_line.split(' ') for stat_line in body_lines[1::2]]
    return players, stats

In [6]:
# Load the 2019-20 stats page (the season paired with the NBA 2K21 ratings).
# NOTE(review): the browser is not closed in this cell -- consider
# browser_21.quit() once the table text has been read.
browser_21 = open_nba('2019-20')
# Split the rendered table into player names and per-player stat rows.
players_21, stats_21 = get_stats(browser_21)

In [7]:
# Load the 2018-19 stats page (the season paired with the NBA 2K20 ratings).
browser_20 = open_nba('2018-19')
# Split the rendered table into player names and per-player stat rows.
players_20, stats_20 = get_stats(browser_20)

In [8]:
# Load the 2017-18 stats page (the season paired with the NBA 2K19 ratings).
browser_19 = open_nba('2017-18')
# Split the rendered table into player names and per-player stat rows.
players_19, stats_19 = get_stats(browser_19)

In [9]:
# Load the 2016-17 stats page (the season paired with the NBA 2K18 ratings).
browser_18 = open_nba('2016-17')
# Split the rendered table into player names and per-player stat rows.
players_18, stats_18 = get_stats(browser_18)

In [10]:
# Load the 2015-16 stats page (the season paired with the NBA 2K17 ratings).
browser_17 = open_nba('2015-16')
# Split the rendered table into player names and per-player stat rows.
players_17, stats_17 = get_stats(browser_17)

In [11]:
# Load the 2014-15 stats page (the season paired with the NBA 2K16 ratings).
browser_16 = open_nba('2014-15')
# Split the rendered table into player names and per-player stat rows.
players_16, stats_16 = get_stats(browser_16)

In [12]:
# creates pandas DataFrame from given player and stats list, and indicates season from season str
def create_nba_df(players, stats, season):
    """Assemble one season's scraped values into a typed DataFrame.

    Parameters
    ----------
    players : list of str
        Player names, one per row; becomes the ``PLAYER`` column.
    stats : list of list of str
        One row of raw string values per player. Position 0 is the team,
        positions 1-4 are parsed as ``int`` (AGE, GP, W, L) and positions
        5-27 are parsed as ``float`` (MIN through +/-).
    season : str
        Season label repeated on every row in the ``SEASON`` column.

    Returns
    -------
    pandas.DataFrame
        Columns in the order PLAYER, TEAM, AGE, SEASON, GP, W, L, then the
        float stat columns.
    """
    float_labels = ('MIN', 'PTS', 'FGM', 'FGA', 'FG%', '3PM', '3PA', '3P%',
                    'FTM', 'FTA', 'FT%', 'OREB', 'DREB', 'REB', 'AST', 'TOV',
                    'STL', 'BLK', 'PF', 'FP', 'DD2', 'TD3', '+/-')
    # (column label, converter) listed in the positional order of a stat row;
    # None means the raw string is kept as-is.
    layout = [('TEAM', None), ('AGE', int), ('GP', int), ('W', int), ('L', int)]
    layout.extend((label, float) for label in float_labels)

    columns = {'PLAYER': players}
    for position, (label, convert) in enumerate(layout):
        if convert is None:
            columns[label] = [row[position] for row in stats]
        else:
            columns[label] = [convert(row[position]) for row in stats]
        if label == 'AGE':
            # SEASON sits right after AGE in the column order.
            columns['SEASON'] = season

    return pd.DataFrame(columns)
    

In [13]:
# Build one stats DataFrame per season from the scraped names and stat rows.
nba_21, nba_20, nba_19, nba_18, nba_17, nba_16 = (
    create_nba_df(names, stat_rows, season_label)
    for names, stat_rows, season_label in (
        (players_21, stats_21, '2019-20'),
        (players_20, stats_20, '2018-19'),
        (players_19, stats_19, '2017-18'),
        (players_18, stats_18, '2016-17'),
        (players_17, stats_17, '2015-16'),
        (players_16, stats_16, '2014-15'),
    )
)

In [14]:
# The fuzzy match has problems with JR Smith, so his name is set to
# 'J.R. Smith' by hand on both the 2K side and the stats side.
# NOTE(review): the matching 2K row is overwritten as a whole, so the rating
# (74 here, 75 below) is hard-coded rather than kept from the scrape --
# confirm it equals the scraped value.
nba2k21.loc[nba2k21['players'] == 'JR Smith'] = 'J.R. Smith', 74
# NOTE(review): 203 is a hard-coded row label, presumably JR Smith's row in
# nba_21 -- verify; a re-scrape with a different row order would rename
# another player instead.
nba_21.loc[203, 'PLAYER'] = 'J.R. Smith'

nba2k19.loc[nba2k19['players'] == 'JR Smith'] = 'J.R. Smith', 75
# NOTE(review): same caveat for the hard-coded row label 196 in nba_19.
nba_19.loc[196, 'PLAYER'] = 'J.R. Smith'

In [15]:
from fuzzymatcher import link_table, fuzzy_left_join

def fuzzy_join(nba_stats, nba_2k, min_score=0.20):
    """Attach 2K ratings to a season's stats by fuzzy-matching player names.

    Parameters
    ----------
    nba_stats : pandas.DataFrame
        Season stats; matched on its ``PLAYER`` column.
    nba_2k : pandas.DataFrame
        2K ratings; matched on its ``players`` column.
    min_score : float, optional
        Rows are kept only when ``best_match_score`` is strictly greater than
        this value. Defaults to 0.20, the threshold that was previously
        hard-coded. Rows whose score is missing fail the comparison and are
        dropped as well.

    Returns
    -------
    pandas.DataFrame
        The joined rows that passed the threshold, without the matcher's
        bookkeeping columns (``best_match_score``, ``__id_left``,
        ``__id_right``) and without the duplicate ``players`` name column.
    """
    joined = fuzzy_left_join(nba_stats, nba_2k, ['PLAYER'], ['players'])
    confident = joined[joined['best_match_score'] > min_score]
    # Return a new frame rather than dropping in place: an in-place drop on a
    # filtered frame can trigger pandas' SettingWithCopyWarning.
    return confident.drop(
        columns=['best_match_score', '__id_left', '__id_right', 'players']
    )

# Pair every season's stats frame with the matching 2K ratings frame.
nba_21_df, nba_20_df, nba_19_df, nba_18_df, nba_17_df, nba_16_df = (
    fuzzy_join(stats_frame, ratings_frame)
    for stats_frame, ratings_frame in zip(
        (nba_21, nba_20, nba_19, nba_18, nba_17, nba_16),
        (nba2k21, nba2k20, nba2k19, nba2k18, nba2k17, nba2k16),
    )
)

In [16]:
# Stack the six per-season frames (newest season first) into one table.
# NOTE(review): concat is called without ignore_index, so row labels from the
# individual frames are kept and likely repeat across seasons -- confirm
# nothing downstream relies on unique labels.
nba_df =  pd.concat([nba_21_df, nba_20_df, nba_19_df, nba_18_df, nba_17_df, nba_16_df])
# Write the combined table to disk; note the file name has no '.csv'
# extension and the row labels are written as the first column.
nba_df.to_csv('nba_rankings_2014-2020')

In [17]:
# Display the combined frame as the cell output.
nba_df

Unnamed: 0,PLAYER,TEAM,AGE,SEASON,GP,W,L,MIN,PTS,FGM,...,AST,TOV,STL,BLK,PF,FP,DD2,TD3,+/-,rankings
0,Aaron Gordon,ORL,24,2019-20,62,30,32,32.5,14.4,5.4,...,3.7,1.6,0.8,0.6,2.0,31.9,20.0,1.0,-1.1,80.0
1,Aaron Holiday,IND,23,2019-20,66,42,24,24.5,9.5,3.5,...,3.4,1.3,0.8,0.2,1.8,19.3,3.0,0.0,1.7,76.0
2,Abdel Nader,OKC,26,2019-20,55,37,18,15.8,6.3,2.2,...,0.7,0.8,0.4,0.4,1.4,11.1,0.0,0.0,-1.5,71.0
3,Adam Mokoka,CHI,21,2019-20,11,3,8,10.2,2.9,1.1,...,0.4,0.2,0.4,0.0,1.5,5.5,0.0,0.0,4.5,68.0
4,Admiral Schofield,WAS,23,2019-20,33,9,24,11.2,3.0,1.1,...,0.5,0.2,0.2,0.1,1.5,6.3,0.0,0.0,-1.7,71.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
632,Will Barton,DEN,24,2014-15,58,30,28,17.0,6.8,2.6,...,1.4,1.1,0.8,0.3,1.2,14.7,2.0,0.0,-0.5,73.0
643,Wilson Chandler,DEN,28,2014-15,78,28,50,31.7,13.9,5.4,...,1.7,1.4,0.7,0.4,3.0,25.7,8.0,0.0,-2.8,77.0
645,Zach LaVine,MIN,20,2014-15,77,15,62,24.7,10.1,3.7,...,3.6,2.5,0.7,0.1,2.1,18.8,4.0,0.0,-7.0,73.0
646,Zach Randolph,MEM,33,2014-15,71,51,20,32.4,16.1,6.4,...,2.2,2.2,1.0,0.2,2.5,33.3,38.0,0.0,4.9,84.0
