In [18]:
import re
import time
import tabulate

In [1]:
import pandas as pd
import requests
import urllib.request
from bs4 import BeautifulSoup
import numpy as np
import unicodedata
import datetime as dt
import lxml
from urllib.request import urlopen

'''
Get NBA team names and put them in a list format
'''
nba_tables = pd.read_html('https://en.wikipedia.org/wiki/National_Basketball_Association#Teams', index_col=0, header=0)
teams = nba_tables[3]


In [4]:
teams.reset_index(inplace=True)
teams = teams[teams.Team != 'Western Conference']
teams = teams[teams.Team != 'Eastern Conference']

teams_list = list(teams.Team)
clean_teams = [team.replace(' ', '_') for team in teams_list]


In [6]:
clean_teams

['Boston_Celtics',
 'Brooklyn_Nets',
 'New_York_Knicks',
 'Philadelphia_76ers',
 'Toronto_Raptors',
 'Chicago_Bulls',
 'Cleveland_Cavaliers',
 'Detroit_Pistons',
 'Indiana_Pacers',
 'Milwaukee_Bucks',
 'Atlanta_Hawks',
 'Charlotte_Hornets',
 'Miami_Heat',
 'Orlando_Magic',
 'Washington_Wizards',
 'Denver_Nuggets',
 'Minnesota_Timberwolves',
 'Oklahoma_City_Thunder',
 'Portland_Trail_Blazers',
 'Utah_Jazz',
 'Golden_State_Warriors',
 'Los_Angeles_Clippers',
 'Los_Angeles_Lakers',
 'Phoenix_Suns',
 'Sacramento_Kings',
 'Dallas_Mavericks',
 'Houston_Rockets',
 'Memphis_Grizzlies',
 'New_Orleans_Pelicans',
 'San_Antonio_Spurs']

In [7]:
nba_players = []

In [8]:
'''
Get roster table from each team's wiki page
'''
for team in clean_teams:
    url = f'https://en.wikipedia.org/wiki/{team}'
    html = urlopen(url)
    soup = BeautifulSoup(html, 'lxml')
    tables = soup.find_all('table', {"class": "toccolours"})
    table = tables[0]
    tbody = table.find_all('tbody')[0]
    actualtable = tbody.find_all('table')[0]
    actualrows = actualtable.find_all('tr')[1:]
    for tr in actualrows:
        td = tr.find_all('td')
        row = [unicodedata.normalize('NFKD', i.text.strip()) for i in td]   # gets rid of the weird \x0 spaces
        row.append(team.replace('_', ' '))          # add column for team.  may just want to use 2nd index later 
        nba_players.append(row)

In [25]:
headers = ['position', 'number', 'name', 'height', 'weight', 'dob', 'college', 'team']
nba_df = pd.DataFrame(data=nba_players, columns=headers)
nba_df.dob = nba_df.dob.str.replace('–', '-')       # need to find a way around those long hyphens besides manually
nba_df.dob = pd.to_datetime(nba_df.dob)


In [27]:
#teamz.head(5).to_markdown(tablefmt='grid')

#nba_df.head(3).to_markdown()

In [52]:
'''
YOUNGEST TEAM IN NBA - METHOD #1: 
converts date of birth to nanoseconds in order to run the mean, then converts back to dt
'''
nba_df['dob_ns'] = nba_df.dob.values.astype(np.int64)
pd.to_datetime(nba_df.groupby(by='team')['dob_ns'].mean()).sort_values(ascending=False)


'| team                  |      dob_ns |\n|:----------------------|------------:|\n| Oklahoma City Thunder | 9.23845e+17 |\n| San Antonio Spurs     | 9.10374e+17 |\n| Orlando Magic         | 9.06307e+17 |\n| Indiana Pacers        | 8.98105e+17 |\n| Houston Rockets       | 8.91841e+17 |'

In [56]:
'''
YOUNGEST TEAM IN NBA - METHOD #2: 
subtracts each player's DoB from right now which creates a timedelta object, then calculate a year figure in int format
'''
nba_df['current_age'] = nba_df.dob.apply(lambda x: (dt.datetime.now() - x).days / 365.25)
youngest_teams = nba_df.groupby('team')['current_age'].mean()
youngest_teams = youngest_teams.sort_values(ascending=True).head(5)


In [60]:
youngest_teams = pd.DataFrame(data=youngest_teams)

In [63]:
youngest_teams.current_age = youngest_teams.current_age.round(2)

In [64]:
youngest_teams.to_markdown()

Unnamed: 0_level_0,current_age
team,Unnamed: 1_level_1
Oklahoma City Thunder,23.35
San Antonio Spurs,23.78
Orlando Magic,23.91
Indiana Pacers,24.17
Houston Rockets,24.37


In [52]:
# Top 5 youngest teams
#nba_df.groupby(by="team")['current_age'].mean().sort_values()

team
Oklahoma City Thunder    23.601884
Memphis Grizzlies        24.079398
San Antonio Spurs        24.211620
Orlando Magic            24.418036
Detroit Pistons          24.463552
Name: current_age, dtype: float64

In [53]:
# Top 5 oldest teams
nba_df.groupby('team')['current_age'].mean().sort_values(ascending=False).head()

team
Los Angeles Lakers    30.184483
Milwaukee Bucks       29.009069
Miami Heat            28.908483
Brooklyn Nets         28.655957
Phoenix Suns          28.266538
Name: current_age, dtype: float64

In [None]:
nba_df['height_in'] = nba_df['height'].apply(lambda x: x.split('(')[0])
nba_df.height_in = nba_df['height'].apply(lambda x: (int(x.split(' ')[0]) * 12) + (int(x.split(' ')[2])))
nba_df['weight_int'] = nba_df['weight'].apply(lambda x: int(x.split(' ')[0]))
nba_df['bmi'] = (703 * nba_df.weight_int) / (nba_df.height_in**2)

In [54]:
# Top 5 lightest teams
nba_df.groupby('team')['weight_int'].mean().sort_values(ascending=True).head()

team
Golden State Warriors     209.411765
Indiana Pacers            209.875000
Toronto Raptors           209.941176
Minnesota Timberwolves    210.437500
Oklahoma City Thunder     210.764706
Name: weight_int, dtype: float64

In [55]:
# Top 5 heaviest teams
nba_df.groupby('team')['weight_int'].mean().sort_values(ascending=False)

team
Utah Jazz                 224.437500
Orlando Magic             223.625000
Brooklyn Nets             221.647059
Miami Heat                221.235294
Milwaukee Bucks           220.875000
Boston Celtics            220.294118
Dallas Mavericks          220.117647
Phoenix Suns              217.764706
Philadelphia 76ers        217.705882
Sacramento Kings          217.500000
Washington Wizards        216.882353
Chicago Bulls             216.647059
San Antonio Spurs         216.647059
Detroit Pistons           216.000000
Cleveland Cavaliers       215.882353
Denver Nuggets            215.647059
Los Angeles Lakers        215.470588
New York Knicks           215.470588
Los Angeles Clippers      215.235294
New Orleans Pelicans      214.333333
Houston Rockets           213.875000
Atlanta Hawks             212.750000
Memphis Grizzlies         212.176471
Portland Trail Blazers    211.736842
Charlotte Hornets         211.470588
Oklahoma City Thunder     210.764706
Minnesota Timberwolves    210.437

In [58]:
# Top 5 tallest teams
nba_df.groupby('team')['height_in'].mean().sort_values(ascending=False).head(5)

team
Orlando Magic            79.625000
Sacramento Kings         79.375000
Oklahoma City Thunder    79.352941
Washington Wizards       78.823529
Charlotte Hornets        78.764706
Detroit Pistons          78.687500
Dallas Mavericks         78.588235
Memphis Grizzlies        78.529412
Chicago Bulls            78.529412
New York Knicks          78.470588
Name: height_in, dtype: float64

In [57]:
# Top 10 colleges by representation
nba_df.pivot_table(index='college', aggfunc='size').sort_values(ascending=False).head(10)

college
Kentucky               28
Duke                   21
Texas                  14
North Carolina         12
UCLA                   12
Kansas                 10
Michigan               10
Southern California    10
Washington             10
Arizona                 9
dtype: int64

In [22]:
nba_df.to_csv('nba_df.csv')

In [3]:
nba_df = pd.read_csv('nba_df.csv')