# All-Time Scoring Leaders 

In this notebook I scrape data from Wikipedia and Basketball Reference to get the season-by-season points of the 50 highest scorers in NBA history, from 1954 to 2023.

## Dependencies

In [1]:
import time
from io import StringIO
from pathlib import Path
from urllib.request import urlopen

import pandas as pd
from bs4 import BeautifulSoup

from modules.datasets import BReferenceScraper

## Web Scraping Data
### Wikipedia All-Time List

In [2]:
def all_time_leaders(url: str, output_path=None) -> list:
    """Return the player names found in the leaders table at ``url``.

    The page is downloaded, every ``<tr>`` element is collected, and a fixed
    window of rows is read. The second ``<td>`` of each row in that window is
    taken as the player name and stripped of annotation marks and newlines.

    Args:
        url: Address of the page that holds the leaders table.
        output_path: Currently unused; kept so existing calls keep working.

    Returns:
        The cleaned names in page order (at most 50 entries).

    Raises:
        IndexError: If a row inside the window has fewer than two ``<td>``
            cells.
    """
    # Close the HTTP response as soon as the document has been parsed.
    with urlopen(url) as response:
        bs = BeautifulSoup(response, 'lxml')

    # Skip the first <tr>, then keep positions 4..53 of the remainder.
    # NOTE(review): this window is tied to the page layout at the time of
    # writing; confirm it still brackets the intended rows.
    window = bs.find_all('tr')[1:][4:54]

    cleaned = []
    for tr in window:
        cells = [td.get_text() for td in tr.find_all('td')]
        player = cells[1]
        # Remove footnote/annotation marks and the trailing newline.
        for mark in ('*', '\n', '^', '§', '†'):
            player = player.replace(mark, '')
        cleaned.append(player)
    return cleaned

In [3]:
# Player names from the Wikipedia career scoring list, kept in sorted order.
names = sorted(
    all_time_leaders(
        "https://en.wikipedia.org/wiki/List_of_National_Basketball_Association_career_scoring_leaders"
    )
)

### Headshots

In [6]:
# One headshot lookup per player, in the same order as `names`.
headshot_urls = list(map(BReferenceScraper.player_headshot, names))


Adrian Dantley
https://www.basketball-reference.com/players/d/dantlad01.html

Alex English
https://www.basketball-reference.com/players/e/englial01.html

Allen Iverson
https://www.basketball-reference.com/players/i/iversal01.html

Antawn Jamison
https://www.basketball-reference.com/players/j/jamisan01.html

Bob Pettit
https://www.basketball-reference.com/players/p/pettibo01.html

Carmelo Anthony
https://www.basketball-reference.com/players/a/anthoca01.html

Charles Barkley
https://www.basketball-reference.com/players/b/barklch01.html

Chris Paul
https://www.basketball-reference.com/players/p/paulch01.html

Clyde Drexler
https://www.basketball-reference.com/players/d/drexlcl01.html

David Robinson
https://www.basketball-reference.com/players/r/robinda01.html

DeMar DeRozan
https://www.basketball-reference.com/players/d/derozde01.html

Dirk Nowitzki
https://www.basketball-reference.com/players/n/nowitdi01.html

Dominique Wilkins
https://www.basketball-reference.com/players/w/wilkido01.h

### Basketball Reference

In [9]:
def breference_url(name: str, suffix: str = '01') -> str:
    """Build the Basketball Reference player-page URL for ``name``.

    The page id is the first five letters of the second name token, followed
    by the first two letters of the first token and ``suffix``. Hyphens are
    treated as spaces and apostrophes are dropped before tokenising, and
    everything is lower-cased.

    Args:
        name: Player name with at least two tokens, e.g. ``"Shaquille O'Neal"``.
        suffix: Trailing id digits. NOTE(review): the site appears to use
            suffixes other than ``'01'`` when several players share the same
            id stem; verify the generated URL for such players.

    Returns:
        The player-page URL.

    Raises:
        ValueError: If ``name`` contains fewer than two tokens.
    """
    # split() without an argument collapses repeated, leading and trailing
    # whitespace; split(' ') would produce empty tokens and break indexing.
    tokens = name.replace('-', ' ').replace("'", '').lower().split()
    if len(tokens) < 2:
        raise ValueError(f"expected a first and a last name, got {name!r}")
    first, last = tokens[0], tokens[1]
    return (
        'https://www.basketball-reference.com/players/'
        + last[0] + '/' + last[:5] + first[:2] + suffix + '.html'
    )

def scrap_seasons(name: str, delay: float = 10) -> pd.DataFrame:
    """Scrape the first table on a player's Basketball Reference page.

    Args:
        name: Player name; passed to ``breference_url`` to locate the page.
        delay: Seconds to sleep before the request, spacing out successive
            calls to the site.

    Returns:
        The rows of the first ``<table>`` on the page that have an ``Age``
        value and whose ``Tm`` value does not start with ``"Did"``, with a
        ``Name`` column set to ``name``.

    Raises:
        IndexError: If the page contains no ``<table>`` element.
    """
    time.sleep(delay)
    # Close the HTTP response as soon as the document has been parsed.
    with urlopen(breference_url(name)) as response:
        bs = BeautifulSoup(response, 'lxml')

    table = bs.find_all('table')[0]
    # Wrap the markup in a file-like object: passing literal HTML as a plain
    # string to read_html is deprecated in recent pandas releases.
    seasons = pd.read_html(StringIO(str(table)))[0]

    # Keep only rows that carry an age.
    # NOTE(review): rows without one are presumably summary rows; confirm.
    seasons = seasons[~seasons.Age.isna()]
    # Drop rows whose team cell starts with "Did" (e.g. "Did Not Play" notes).
    seasons = seasons[~seasons.Tm.str.startswith("Did")]
    # assign() returns a new frame, so the filtered slice is never written to
    # in place (avoids SettingWithCopyWarning).
    return seasons.assign(Name=name)

def scrap_seasons_all(
    names: list,
    output_path: str = '../data/breference/raw/all_time_leaders.csv',
) -> pd.DataFrame:
    """Scrape season stats for every player and save them as one CSV.

    Args:
        names: Player names; each one is passed to ``scrap_seasons``.
        output_path: Destination CSV file. Missing parent directories are
            created so a long scrape is not lost at the final write.

    Returns:
        The per-player tables stacked in the order of ``names`` (each keeps
        its own row index), or an empty DataFrame when ``names`` is empty.
    """
    frames = []
    for name in names:
        # Progress indicator: each player takes a while to fetch.
        print(name)
        frames.append(scrap_seasons(name))

    # Concatenate once instead of re-copying the growing frame every
    # iteration; pd.concat rejects an empty list, hence the guard.
    df = pd.concat(frames) if frames else pd.DataFrame()

    destination = Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(destination, index=False)
    return df

# Scrape every player in `names` (one request per player, each preceded by a
# pause inside scrap_seasons) and write the combined table to the raw CSV.
df = scrap_seasons_all(names)
df

Kareem Abdul-Jabbar
LeBron James
Karl Malone
Kobe Bryant
Michael Jordan
Dirk Nowitzki
Wilt Chamberlain
Shaquille O'Neal
Carmelo Anthony
Moses Malone
Elvin Hayes
Hakeem Olajuwon
Oscar Robertson
Kevin Durant
Dominique Wilkins
Tim Duncan
Paul Pierce
John Havlicek
Kevin Garnett
Vince Carter
Alex English
Reggie Miller
Jerry West
Patrick Ewing
Ray Allen
Allen Iverson
James Harden
Russell Westbrook
Charles Barkley
Robert Parish
Adrian Dantley
Dwyane Wade
Elgin Baylor
Clyde Drexler
Gary Payton
Larry Bird
Hal Greer
Chris Paul
DeMar DeRozan
Stephen Curry
Walt Bellamy
Pau Gasol
Bob Pettit
David Robinson
George Gervin
LaMarcus Aldridge
Mitch Richmond
Joe Johnson
Tom Chambers
Antawn Jamison


Unnamed: 0,Season,Age,Tm,Lg,Pos,G,GS,MP,FG,FGA,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Name
0,1969-70,22.0,MIL,NBA,C,82.0,,43.1,11.4,22.1,...,,,14.5,4.1,,,,3.5,28.8,Kareem Abdul-Jabbar
1,1970-71,23.0,MIL,NBA,C,82.0,,40.1,13.0,22.5,...,,,16.0,3.3,,,,3.2,31.7,Kareem Abdul-Jabbar
2,1971-72,24.0,MIL,NBA,C,81.0,,44.2,14.3,24.9,...,,,16.6,4.6,,,,2.9,34.8,Kareem Abdul-Jabbar
3,1972-73,25.0,MIL,NBA,C,76.0,,42.8,12.9,23.3,...,,,16.1,5.0,,,,2.7,30.2,Kareem Abdul-Jabbar
4,1973-74,26.0,MIL,NBA,C,81.0,,43.8,11.7,21.7,...,3.5,11.0,14.5,4.8,1.4,3.5,,2.9,27.0,Kareem Abdul-Jabbar
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13,2009-10,33.0,CLE,NBA,PF,25.0,23.0,32.4,6.5,13.4,...,1.5,6.2,7.7,1.3,1.1,0.5,1.2,2.7,15.8,Antawn Jamison
14,2010-11,34.0,CLE,NBA,PF,56.0,38.0,32.9,6.7,15.6,...,1.5,5.2,6.7,1.7,0.9,0.5,1.4,2.4,18.0,Antawn Jamison
15,2011-12,35.0,CLE,NBA,PF,65.0,65.0,33.1,6.5,16.1,...,1.9,4.4,6.3,2.0,0.8,0.7,1.4,2.5,17.2,Antawn Jamison
16,2012-13,36.0,LAL,NBA,PF,76.0,6.0,21.5,3.5,7.6,...,1.4,3.3,4.8,0.7,0.4,0.3,0.7,1.6,9.4,Antawn Jamison


## Preprocessing Data

In [7]:
# Load the raw season table from the CSV that scrap_seasons_all writes.
df = pd.read_csv('../data/breference/raw/all_time_leaders.csv')
df.head()

Unnamed: 0,Season,Age,Tm,Lg,Pos,G,GS,MP,FG,FGA,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Name
0,1969-70,22.0,MIL,NBA,C,82.0,,43.1,11.4,22.1,...,,,14.5,4.1,,,,3.5,28.8,Kareem Abdul-Jabbar
1,1970-71,23.0,MIL,NBA,C,82.0,,40.1,13.0,22.5,...,,,16.0,3.3,,,,3.2,31.7,Kareem Abdul-Jabbar
2,1971-72,24.0,MIL,NBA,C,81.0,,44.2,14.3,24.9,...,,,16.6,4.6,,,,2.9,34.8,Kareem Abdul-Jabbar
3,1972-73,25.0,MIL,NBA,C,76.0,,42.8,12.9,23.3,...,,,16.1,5.0,,,,2.7,30.2,Kareem Abdul-Jabbar
4,1973-74,26.0,MIL,NBA,C,81.0,,43.8,11.7,21.7,...,3.5,11.0,14.5,4.8,1.4,3.5,,2.9,27.0,Kareem Abdul-Jabbar


In [8]:
# Approximate each row's season point total as points-per-game times games played.
df['PTS'] = df['PTS'].astype(float)
df['G'] = df['G'].astype(int)
# Column arithmetic yields the same products as a row-wise apply without a
# Python-level loop over rows; astype(int) truncates the fractional part.
df['Total Points'] = (df['PTS'] * df['G']).astype(int)
df_totals = df[['Name', 'Season', 'Total Points']]
df_totals

Unnamed: 0,Name,Season,Total Points
0,Kareem Abdul-Jabbar,1969-70,2361
1,Kareem Abdul-Jabbar,1970-71,2599
2,Kareem Abdul-Jabbar,1971-72,2818
3,Kareem Abdul-Jabbar,1972-73,2295
4,Kareem Abdul-Jabbar,1973-74,2187
...,...,...,...
858,Antawn Jamison,2009-10,395
859,Antawn Jamison,2010-11,1008
860,Antawn Jamison,2011-12,1118
861,Antawn Jamison,2012-13,714


In [10]:
# Wide table: one row per player, one column per season, values accumulated
# left to right so each column holds the running career total.
# A player-season can occupy several rows in the scraped table (e.g. a season
# split between teams; NOTE(review): this looks like one combined row plus one
# row per team, confirm against the raw CSV). The default aggfunc ('mean')
# would average those rows; 'max' keeps the largest, i.e. the combined total.
df_pivot = (
    df_totals
    .pivot_table(index='Name', columns='Season', values='Total Points', aggfunc='max')
    .fillna(0)
    .astype(int)
    .cumsum(axis=1)
)
# Attach headshots by player name rather than by position, so the result does
# not depend on `names` and the pivot index happening to share the same order
# and length. Players without a known headshot get a missing value.
headshot_by_name = dict(zip(names, headshot_urls))
df_pivot['headshot'] = [headshot_by_name.get(player) for player in df_pivot.index]

In [11]:
# Save the cumulative-points table (players as rows, seasons as columns, plus
# the headshot column); the player-name index is written as the first column.
df_pivot.to_csv('../data/breference/refined/all_time_leaders.csv')