This is a code review of Josh Cohen's original NBA Analysis:
https://github.com/joshc3453/NBA_Analysis/blob/main/nba_scraper_players_22_23_clean.ipynb

This is for a Buffalo Data Science talk about Bad Data Science Coding Practices.

# NBA 22-23 Analysis & Classification

In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

pd.set_option('display.max_columns', None)

# Web Scraping & Merging Datasets

In [2]:
url = 'https://www.basketball-reference.com/leagues/NBA_2023_totals.html'
html = urlopen(url)
soup = BeautifulSoup(html, features='lxml')

In [3]:
headers = [th.getText() for th in soup.findAll('tr')[0].findAll('th')]
rows = soup.findAll('tr')[1:]
rows_data = [[td.getText() for td in rows[i].findAll('td')]
                     for i in range(len(rows))]
headers = headers[1:]

In [4]:
nba = pd.DataFrame(rows_data, columns = headers)
nba = nba.mask(nba.eq('None')).dropna()
nba = nba.apply(pd.to_numeric, errors='ignore')

In [None]:
# @badcode
# Mid-code import
# This may cause a reproducibility issue - people may not be able to run your code because they won't
# have access to your Google Drive.  Furthermore, there is no insight on how NBA_22_23.xlsx was generated.

# @badcode
# Bad variable name df2 does not tell me what the data actually is

from google.colab import drive
drive.mount('/content/gdrive')

mypath='/content/gdrive/MyDrive/'

fn = "NBA_22_23.xlsx"
df2 = pd.read_excel(mypath+fn)

In [None]:
nba = nba.merge(df2, how='left', left_on='Player', right_on='player')

In [None]:
# Dropping duplicate column 'player' and 'pos'
nba.drop(['player', 'pos'], axis=1, inplace=True)

# Feature Engineering

In [5]:
# @badcode
# Inconsistent naming convention - the 'PG' in PPG & MPG is uppercase, while it is lower case for 3Ppg, TRBpg, ASTpg

# @badcode
# Magic numbers - 0.44 in 0.44*nba['FTA']; 2 in 2*nba['TSA']

# @badcode
# DRY violation a lot of division by nba['G']

nba['PPG'] = nba['PTS']/nba['G']
nba['3Ppg'] = nba['3P']/nba['G']
nba['TRBpg'] = nba['TRB']/nba['G']
nba['ASTpg'] = nba['AST']/nba['G']
nba['TSA'] = nba['FGA']+(0.44*nba['FTA'])
nba['TS%'] = nba['PTS']/(2*nba['TSA'])
nba['MPG'] = nba['MP']/nba['G']

In [6]:
# @refactored
for col in ('PTS', '3P', 'TRB', 'AST', 'MP'):
    nba[f'{col}pg'] = nba[col] / nba['G']
    
nba['TSA'] = nba['FGA']+(0.44*nba['FTA'])
nba['TS%'] = nba['PTS']/(2*nba['TSA'])

In [None]:
# @badcode
# Useless comment - Adjusting decimal displays

# @badcode
# DRY violation

# @badcode
# Redundant code
# nba['number'].round(decimals=0) is redundant because it is converted to int in the next cell

# Adjusting decimal displays
nba['PPG'] = nba['PPG'].round(decimals=2)
nba['3Ppg'] = nba['3Ppg'].round(decimals=3)
nba['TRBpg'] = nba['TRBpg'].round(decimals=2)
nba['ASTpg'] = nba['ASTpg'].round(decimals=2)
nba['number'] = nba['number'].round(decimals=0)
nba['TS%'] = nba['TS%'].round(decimals=3)
nba['MPG'] = nba['MPG'].round(decimals=2)

In [7]:
# @refactored
cols_to_round = dict.fromkeys(('PPG', 'TRBpg', 'ASTpg', 'MPG'), 2)
cols_to_round.update({'3Ppg': 3, 'TS%': 3})
nba = nba.round(cols_to_round)

In [None]:
# @badcode
# Useless comment - Converting dtypes

# @badcode
# Confounding / similar naming
# height_inches vs height_in

# @badcode
# Inconsistent naming
# the unit of measurement is spelled out in height_feet, height_inches, but abbreviation is used for height_in, weight_lbs

# @badcode
# DRY Violation

# Converting dtypes
nba['number'] = nba['number'].astype('Int64')
nba['height_feet'] = nba['height_feet'].astype('Int64')
nba['height_inches'] = nba['height_inches'].astype('Int64')
nba['height_in'] = nba['height_in'].astype('Int64')
nba['weight_lbs'] = nba['weight_lbs'].astype('Int64')
nba['birth_day'] = nba['birth_day'].astype('Int64')
nba['birth_year'] = nba['birth_year'].astype('Int64')

In [None]:
# @refactored
int_cols = ['number', 'height_feet', 'height_inches', 'height_in', 'weight_lbs', 'birth_day', 'birth_year']
nba[int_cols] = nba[int_cols].astype('Int64')

In [None]:
# Renaming similarly-named column
nba.rename(columns={"height_in": "total_height_in"}, inplace=True)

In [None]:
nba.head()

# Data Analysis

## Team Points Per Game

In this analysis, I will extract team points per game using only individual player data. This will be accomplished by grouping the dataframe by team and finding the maximum amount of games played by a player for that team. This should give us the total games played by that team. Note: This assumes that at least one player from each team has played in every game so far this season.
I then divide the team's total points by the games played to get the team's points per game.

In [None]:
# @badcode
# Useless comments

# @badcode
# We only want to calculate Team Points Per Game
# However, we are creating unneccessary dataframes using nba.groupby(by='Tm').max() and nba.groupby(by='Tm').sum().
# In the 2 groupby examples, we are calculating the max and sum for all numeric columns.  
# This is overkill for just calculating Team Points Per Game

# Finding the max games played by a player on each team
team_max_df = nba.groupby(by='Tm').max()
# Finding the total points scored by each team
team_sum_df = nba.groupby(by='Tm').sum()
# Creating a points per game feature for the dataframe
team_sum_df['team_ppg'] = team_sum_df['PTS']/team_max_df['G'].values
# Formatting
team_sum_df['team_ppg'] = team_sum_df['team_ppg'].round(decimals=1)
team_sum_df.reset_index(inplace=True)
# Viewing teams and points per game, sorted
team_sum_df = team_sum_df[['Tm', 'team_ppg']].sort_values(by='team_ppg', ascending=False)
team_sum_df

In [None]:
# @badcode
# Bad assumption
# We assume 'Note: This assumes that at least one player from each team has played in every game so far this season.'
# Because of this assumption, in the code above, we use team_max_df['G'].values in the denominator to calculate Team Points Per Game

# In an NBA season, all teams should play the same amount of games.  However, when we check the code for the denominator,
# our assumption is incorrect.
nba.groupby('Tm')['G'].max()

In [8]:
# @refactored
n_games_per_team_per_season = 82
team_ppg = (nba.groupby(by='Tm')['PTS'].sum() / n_games_per_team_per_season).sort_values(ascending=False).round(1)
team_ppg

# TOT stands for Two Other Teams - meaning a player played for two or more teams that year

Tm
TOT    395.1
SAC    120.7
GSW    118.9
ATL    118.4
BOS    117.9
OKC    117.5
LAL    117.2
UTA    117.1
MIL    116.9
MEM    116.9
IND    116.3
NYK    116.0
DEN    115.8
MIN    115.8
PHI    115.2
NOP    114.4
DAL    114.2
PHO    113.6
LAC    113.6
POR    113.4
BRK    113.4
WAS    113.2
CHI    113.1
SAS    113.0
TOR    112.9
CLE    112.3
ORL    111.4
CHO    111.0
HOU    110.7
DET    110.3
MIA    109.5
Name: PTS, dtype: float64

## Top 5 Player Statistics

Top 5 player statistics uses a minimum game threshold of over 13 games. At this point in the season, 13 seems to capture active players while weeding out less active players. We don't want players with only a few games played to skew or inflate our results. Additionally, the results below are exactly inline with ESPN's statistics.

### Points Per Game

In [None]:
# @badcode
# DRY violation
# The same code pattern is repeated throughout

# @badcode
# DRY violation
# top_5_<<stat_name>> is used as the variable name.  What happens if you are asked to get the top 3 or top 10?
# You would have to change it in the code, as well as the variable name

# @badcode
# DRY violation
# 17 and 13 are repeated multiple times - they should be assigned to a variable

# @badcode
# Inconsistent code / magic number - In the text, we mention that we use 13 as a minimum game threshold.
# However, we see that 17 is also used.

top_5_ppg = nba[['Player', 'G', 'Tm', 'PPG']]
top_5_ppg[top_5_ppg['G']>17].sort_values('PPG', ascending=False)[:5]

### Assists

In [None]:
top_5_ast = nba[['Player', 'G', 'Tm', 'ASTpg']]
top_5_ast[top_5_ast['G']>17].sort_values('ASTpg', ascending=False)[:5]

### 3 Pointers Made

In [None]:
top_5_3p = nba[['Player', 'G', 'Tm', '3Ppg']]
top_5_3p[top_5_3p['G']>13].sort_values('3Ppg', ascending=False)[:5]

### Rebounds

In [None]:
top_5_reb = nba[['Player', 'G', 'Tm', 'TRBpg']]
top_5_reb[top_5_reb['G']>13].sort_values('TRBpg', ascending=False)[:5]

In [9]:
# @refactored code
min_games_played_threshold = 13
top_n = 5
eligible_candidates = nba[nba['G'] > min_games_played_threshold]

top_players = {
    stat: nba[['Player', 'G', 'Tm', stat]].sort_values(stat, ascending=False).head(top_n)
    for stat in ('PPG', 'ASTpg', '3Ppg', 'TRBpg')
}

In [10]:
top_players['PPG']

Unnamed: 0,Player,G,Tm,PPG
191,Joel Embiid,66,PHI,33.08
166,Luka Dončić,66,DAL,32.39
387,Damian Lillard,58,POR,32.17
217,Shai Gilgeous-Alexander,68,OKC,31.4
12,Giannis Antetokounmpo,63,MIL,31.1


In [11]:
top_players['ASTpg']

Unnamed: 0,Player,G,Tm,ASTpg
254,James Harden,58,PHI,10.66
248,Tyrese Haliburton,56,IND,10.45
701,Trae Young,73,ATL,10.15
330,Nikola Jokić,69,DEN,9.83
164,Spencer Dinwiddie,26,BRK,9.08


## Free Throw Analysis

I first create a new dataframe, "ft_df" to analyze player free throws. Next, I create a new feature called "points_forgone" which is a measure of free throws made minus free throws attempted. Since every free throw counts for one point each, one missed free throw equals one missed point, or a point forgone.
> All Games qualifier is on pace for at least 125 made free throws

In [None]:
ft_df = nba[['Player', 'FT', 'FTA', 'FT%']]
ft_df['points_forgone'] = ft_df['FTA']-ft_df['FT']

In [None]:
# @badcode
# Useless comment - a better comment would be the rationale behind using a minimum of 30 free throw attempts

# @badcode
# Magic number - 30

# @badcode
# DRY violation - 30 & 5

# A look at the top 5 players by free throw percentage with
# a minimum of 30 free throw attempts.
ft_df[ft_df['FTA']>30].sort_values('FT%', ascending=False)[:5]

In [None]:
ft_df.sort_values('points_forgone', ascending=False)[:5]

In [None]:
ft_df[ft_df['FTA'] > 30]['FT%'].mean()

In [None]:
# @badcode
# Magic number - it's bad practice to refer to columns by their integer location (i.e. 0 & 28).
# What happens if the order of your columns changed, or if the columns were added or removed?
# You would then have to change your code.

# @badcode
# DRY violation - sort_values is called twice.  It's expensive and unnecessary.

# Finding the player on each team with the most points scored
for i in nba['Tm'].unique():
  print('*****')
  print('Team:', i)
  print(nba[nba['Tm']==i].sort_values(by='PTS', ascending=False).iloc[0][0], ':',
        nba[nba['Tm']==i].sort_values(by='PTS', ascending=False).iloc[0][28],
        'pts')
print('*****')

In [62]:
# @badcode
# DRY violation - this code is very similar as the code above

top_player_list = []
for i in nba['Tm'].unique():
  top_player_list.append(nba[nba['Tm']==i].sort_values(by='PTS', ascending=False).iloc[0][0])

In [63]:
# @badcode
# The comment and code logic is wrong - notice that there are multiple players from the same team
# ex. TOT, BRK, etc.  However, we only want to grab the top points scorer from each team.
# Therefore, each team should only have 1 entry.

# This query returns a dataframe of the top player for each team (PTS)
top_player_df = nba.query("Player == @top_player_list")
top_player_df

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,PPG,3Ppg,TRBpg,ASTpg,TSA,TS%,MPG,PTSpg,MPpg
2,Bam Adebayo,C,25,MIA,75,75,2598,602,1114,0.54,1,12,0.083,601,1102,0.545,0.541,324,402,0.806,184,504,688,240,88,61,187,208,1529,20.39,0.013,9.17,3.2,1290.88,0.592,34.64,20.386667,34.64
12,Giannis Antetokounmpo,PF,28,MIL,63,63,2024,707,1278,0.553,47,171,0.275,660,1107,0.596,0.572,498,772,0.645,137,605,742,359,52,51,246,197,1959,31.1,0.746,11.78,5.7,1617.68,0.605,32.13,31.095238,32.126984
65,Bojan Bogdanović,PF,33,DET,59,59,1893,430,882,0.488,145,353,0.411,285,529,0.539,0.57,268,303,0.884,36,187,223,152,34,8,135,113,1273,21.58,2.458,3.78,2.58,1015.32,0.627,32.08,21.576271,32.084746
68,Devin Booker,SG,26,PHO,53,53,1835,527,1067,0.494,111,316,0.351,416,751,0.554,0.546,306,358,0.855,46,194,240,293,51,18,145,159,1471,27.75,2.094,4.53,5.53,1224.52,0.601,34.62,27.754717,34.622642
79,Mikal Bridges,SF-SG,26,TOT,83,83,2963,593,1267,0.468,169,442,0.382,424,825,0.514,0.535,316,353,0.895,79,285,364,273,91,61,127,159,1671,20.13,2.036,4.39,3.29,1422.32,0.587,35.7,20.13253,35.698795
80,Mikal Bridges,SF,26,PHO,56,56,2040,354,764,0.463,101,261,0.387,253,503,0.503,0.529,156,174,0.897,56,187,243,201,65,45,78,115,965,17.23,1.804,4.34,3.59,840.56,0.574,36.43,17.232143,36.428571
81,Mikal Bridges,SG,26,BRK,27,27,923,239,503,0.475,68,181,0.376,171,322,0.531,0.543,160,179,0.894,23,98,121,72,26,16,49,44,706,26.15,2.519,4.48,2.67,581.76,0.607,34.19,26.148148,34.185185
166,Luka Dončić,PG,23,DAL,66,66,2391,719,1449,0.496,185,541,0.342,534,908,0.588,0.56,515,694,0.742,54,515,569,529,90,33,236,166,2138,32.39,2.803,8.62,8.02,1754.36,0.609,36.23,32.393939,36.227273
180,Kevin Durant,PF,34,TOT,47,47,1672,483,862,0.56,93,230,0.404,390,632,0.617,0.614,307,334,0.919,17,296,313,235,34,67,156,99,1366,29.06,1.979,6.66,5.0,1008.96,0.677,35.57,29.06383,35.574468
181,Kevin Durant,PF,34,BRK,39,39,1403,410,734,0.559,71,189,0.376,339,545,0.622,0.607,267,286,0.934,14,248,262,207,32,57,136,92,1158,29.69,1.821,6.72,5.31,859.84,0.673,35.97,29.692308,35.974359


In [None]:
# @refactored
top_scorers = nba.sort_values('PTS', ascending=False)
best_scorers_by_team = top_scorers.groupby('Tm').head(1)
for Team, player_name, pts in best_scorers_by_team[['Tm', 'Player', 'PTS']].to_records(index=False):
    print(f'*****\n{Team=}\n{player_name} : {pts=}')

In [65]:
best_scorers_by_team

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,PPG,3Ppg,TRBpg,ASTpg,TSA,TS%,MPG,PTSpg,MPpg
613,Jayson Tatum,SF,24,BOS,74,74,2732,727,1559,0.466,240,686,0.35,487,873,0.558,0.543,531,622,0.854,78,571,649,342,78,51,213,160,2225,30.07,3.243,8.77,4.62,1832.68,0.607,36.92,30.067568,36.918919
191,Joel Embiid,C,28,PHI,66,66,2284,728,1328,0.548,66,200,0.33,662,1128,0.587,0.573,661,771,0.857,113,557,670,274,66,112,226,205,2183,33.08,1.0,10.15,4.15,1667.24,0.655,34.61,33.075758,34.606061
166,Luka Dončić,PG,23,DAL,66,66,2391,719,1449,0.496,185,541,0.342,534,908,0.588,0.56,515,694,0.742,54,515,569,529,90,33,236,166,2138,32.39,2.803,8.62,8.02,1754.36,0.609,36.23,32.393939,36.227273
217,Shai Gilgeous-Alexander,PG,24,OKC,68,68,2416,704,1381,0.51,58,168,0.345,646,1213,0.533,0.531,669,739,0.905,59,270,329,371,112,65,192,192,2135,31.4,0.853,4.84,5.46,1706.16,0.626,35.53,31.397059,35.529412
12,Giannis Antetokounmpo,PF,28,MIL,63,63,2024,707,1278,0.553,47,171,0.275,660,1107,0.596,0.572,498,772,0.645,137,605,742,359,52,51,246,197,1959,31.1,0.746,11.78,5.7,1617.68,0.605,32.13,31.095238,32.126984
185,Anthony Edwards,SG,21,MIN,79,79,2842,707,1541,0.459,213,578,0.369,494,963,0.513,0.528,319,422,0.756,47,411,458,350,125,58,259,186,1946,24.63,2.696,5.8,4.43,1726.68,0.564,35.97,24.632911,35.974684
534,Julius Randle,PF,28,NYK,77,77,2737,658,1432,0.459,218,636,0.343,440,796,0.553,0.536,402,531,0.757,141,626,767,316,49,21,216,233,1936,25.14,2.831,9.96,4.1,1665.64,0.581,35.55,25.142857,35.545455
443,Donovan Mitchell,SG,26,CLE,68,68,2432,679,1402,0.484,245,635,0.386,434,767,0.566,0.572,319,368,0.867,63,226,289,301,99,27,180,168,1922,28.26,3.603,4.25,4.43,1563.92,0.614,35.76,28.264706,35.764706
701,Trae Young,PG,24,ATL,73,73,2541,597,1390,0.429,154,460,0.335,443,930,0.476,0.485,566,639,0.886,56,161,217,741,80,9,300,104,1914,26.22,2.11,2.97,10.15,1671.16,0.573,34.81,26.219178,34.808219
374,Zach LaVine,SG,27,CHI,77,77,2768,673,1388,0.485,204,544,0.375,469,844,0.556,0.558,363,428,0.848,42,303,345,327,69,18,194,159,1913,24.84,2.649,4.48,4.25,1576.32,0.607,35.95,24.844156,35.948052


In [68]:
# Original code dataframe
top_player_df['Tm'].duplicated().sum()

4

In [67]:
# Refactored code dataframe
best_scorers_by_team['Tm'].duplicated().sum()

0

# Data Visualization

In [None]:
plt.hist(nba[nba['3PA']>50]['3P%'], facecolor='orange', edgecolor='brown', bins=20)
plt.title('NBA 3 Point Percentage for Players with over 50 Three Point Attempts')
plt.xlabel('3P%')
plt.ylabel('Frequency')
plt.show()
print(f'Average 3P%: ', {nba[nba['3PA']>50]['3P%'].mean()*100})

# Machine Learning

In this section, I will build a machine learning classification model to try to predict which position a player plays based on their player statistics. I begin by dropping unncessary columns to reduce noise from the data. For example, age should not have any effect on which position a player plays. Additionally, I will remove shot percentages in favor or shot attempts and shots made (For example, I will drop 3P% and keep 3P and 3PA).  I will also drop Offensive Rebounds (ORB) and Defensive Rebounds (DRB) and keep Total Rebounds (TRB)

In [71]:
# @badcode
# Mid-code violation
# Imports should be at the top

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [69]:
# @badcode
# This should be in the Feature Engineering section

nba['BLKpg']=nba['BLK']/nba['G']
nba['PFpg']=nba['PF']/nba['G']
nba['TOVpg']=nba['TOV']/nba['G']

In [None]:
ml_df = nba[['PPG', 'TRBpg', 'ASTpg', 'BLKpg', 'TOVpg', 'PFpg',
             'total_height_in', 'weight_lbs', 'Pos']]

In [None]:
X = ml_df.iloc[:, :-1]
y = ml_df.iloc[:, -1:]

In [None]:
# @badcode
# This should have a seed via the random_state argument to ensure repeatability

X_train, X_test, y_train, y_test = train_test_split(
     X, y, test_size=0.30)

In [None]:
# @badcode
# DRY violation

# @badcode
# Not good practice to re-use variables

dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)

predictions = dtc.predict(X_test)
cm = confusion_matrix(y_test, predictions, labels=dtc.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                               display_labels=dtc.classes_)
acc = acc = accuracy_score(y_test, predictions)
disp.plot()
plt.show()
print('Decision Tree Classifier Accuracy Score: ', acc)

In [None]:
knc = KNeighborsClassifier(n_neighbors=2)
knc.fit(X_train, y_train)

predictions = knc.predict(X_test)
cm = confusion_matrix(y_test, predictions, labels=knc.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                               display_labels=knc.classes_)
disp.plot()

plt.show()
acc = acc = accuracy_score(y_test, predictions)
print('K Nearest Neighbor Classifier Accuracy Score: ', acc)

In [None]:
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)

predictions = rfc.predict(X_test)
cm = confusion_matrix(y_test, predictions, labels=rfc.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                               display_labels=rfc.classes_)
disp.plot()

plt.show()
acc = acc = accuracy_score(y_test, predictions)
print('Random Forest Accuracy Score: ', acc)

In [None]:
gnb = GaussianNB()
gnb.fit(X_train, y_train)

predictions = gnb.predict(X_test)
cm = confusion_matrix(y_test, predictions, labels=gnb.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                               display_labels=gnb.classes_)
disp.plot()
plt.show()
acc = acc = accuracy_score(y_test, predictions)
print('Gaussian Naive Bayes Accuracy Score: ', acc)

In [None]:
ml_df.corr()

In [None]:
sns.boxplot(nba['Pos'], nba['total_height_in'])

In [None]:
plt.figure(figsize=(12,8))

plt.subplot(2, 2, 1)
fig = sns.boxplot(data=nba, x='Pos', y='weight_lbs')
fig.set_title('Weight (lbs.)')


plt.subplot(2, 2, 2)
fig = sns.boxplot(data=nba, x='Pos', y='total_height_in')
fig.set_title('Height (in.)')

plt.subplot(2, 2, 3)
fig = sns.boxplot(data=nba, x='Pos', y='TRBpg')
fig.set_title('Rebounds per game')

plt.subplot(2, 2, 4)
fig = sns.boxplot(data=nba, x='Pos', y='BLKpg')
fig.set_title('Blocks per game')

plt.tight_layout()