In [1]:
import pandas as pd
import numpy as np
import os
import time
import pickle
from bs4 import BeautifulSoup

## Basic Functions

In [2]:
def read_html(filename):
    """Parse an HTML file from disk into a BeautifulSoup tree.

    Args:
        filename: Path of the HTML file to read.

    Returns:
        The ``BeautifulSoup`` object built with the ``html.parser`` backend.
    """
    # The context manager closes the handle even if parsing raises; the
    # original left the file open until garbage collection.
    with open(filename) as html_file:
        return BeautifulSoup(html_file, "html.parser")

In [3]:
def dump_pickle(file, filename):
    """Serialize an object to disk with pickle.

    Args:
        file: The Python object to serialize (despite the name, this is the
            payload, not a file handle).
        filename: Destination path. It is opened in binary write mode, so an
            existing file at that path is overwritten.
    """
    # `with` guarantees the handle is closed even when pickling fails.
    with open(filename, 'wb') as outfile:
        pickle.dump(file, outfile)

def load_pickle(filename):
    """Load and return the object stored in a pickle file.

    Args:
        filename: Path of the pickle file; opened in binary read mode.

    Returns:
        The object reconstructed by ``pickle.load``.
    """
    # NOTE(security): unpickling can execute arbitrary code, so only use this
    # on files produced by a trusted source (e.g. dump_pickle in this notebook).
    # `with` guarantees the handle is closed even when unpickling fails.
    with open(filename, 'rb') as infile:
        return pickle.load(infile)

## Content

In [4]:
def check_content(league, season, team_id):
    """Print the table layout of one team's offense and defense pages.

    For each side, reads ``data_{league}_{season}/{team_id}_{side}.html``,
    prints the team name, the number of table elements, and the index and
    title of every table that has a ``TierHeader`` row.

    Args:
        league: League label used in the data directory name.
        season: Season used in the data directory name.
        team_id: Team identifier used as the file-name prefix.

    Returns:
        dict: ``{'offense': [...], 'defense': [...]}`` holding the stripped
        titles of the titled tables whose index is greater than 10.
    """
    # Titled tables at or below this index are not collected; in the pages
    # inspected so far index 10 is the 'Overall' table and play types follow.
    last_non_playtype_idx = 10

    directory = f'data_{league}_{season}'
    playtypes_dict = {}

    for side in ['offense', 'defense']:
        soup = read_html(f'{directory}/{team_id}_{side}.html')
        # Split on ' - ' (with spaces), as get_team_playtype_df does, so that
        # team names containing a hyphen are not truncated.
        team = soup.find_all('h1')[0].text.split(' - ')[-1].strip()
        print(f'{team} - {side}')

        tables = soup.find_all('table')
        print('# of tables:', len(tables))

        playtypes = []
        for idx, table in enumerate(tables):
            header_rows = table.find_all('tr', {'class': 'TierHeader'})
            if not header_rows:
                # Tables without a TierHeader row carry no title; skip them.
                continue
            playtype = header_rows[0].find_all('td')[0].text
            print(idx, playtype)
            if idx > last_non_playtype_idx:
                playtypes.append(playtype.strip())

        playtypes_dict[side] = playtypes
        print()

    return playtypes_dict

In [5]:
# Inspect the table layout of one CBA team page (2021 season, team id '528').
check_content('cba', 2021, '528')

Shanghai Bilibili Sharks - offense
# of tables: 22
1 Overall Offense
2 Play Types
3 Offense Including Passes
4 During Pass Out Situations
5 During Trapping Situations
6 Shot Attempt - Half Court
7 Catch and Shoot - Half Court
8 Dribble Jumper - Half Court
9 Jump Shot Range - half court
10  Overall
11  Transition
12  Spot Up
13  P&R Ball Handler
14  Cut
15  Offensive Rebounds (put backs)
16  Post-Up
17  P&R Roll Man
18  Isolation
19  Off Screen
20  Hand Off
21  Miscellaneous

Shanghai Bilibili Sharks - defense
# of tables: 18
1 Overall Defense
2 Play Types
3 Offense Including Passes
4 During Pass Out Situations
5 During Trapping Situations
6 Shot Attempt - Half Court
7 Catch and Shoot - Half Court
8 Dribble Jumper - Half Court
9 Jump Shot Range - half court
10  Overall
11  Spot Up
12  P&R Ball Handler
13  Post-Up
14  P&R Roll Man
15  Off Screen
16  Isolation
17  Hand Off



{'offense': ['Transition',
  'Spot Up',
  'P&R Ball Handler',
  'Cut',
  'Offensive Rebounds (put backs)',
  'Post-Up',
  'P&R Roll Man',
  'Isolation',
  'Off Screen',
  'Hand Off',
  'Miscellaneous'],
 'defense': ['Spot Up',
  'P&R Ball Handler',
  'Post-Up',
  'P&R Roll Man',
  'Off Screen',
  'Isolation',
  'Hand Off']}

In [6]:
# Inspect the table layout of one NBA team page (2021 season, team id '1').
check_content('nba', 2021, '1')

Boston Celtics - offense
# of tables: 23
1 Overall Offense
2 Play Types
3 Offense Including Passes
4 During Pass Out Situations
5 During Trapping Situations
6 Shot Attempt - Half Court
7 Catch and Shoot - Half Court
8 Dribble Jumper - Half Court
9 Jump Shot Range - half court
10  Overall
11  Spot Up
12  P&R Ball Handler
13  Transition
14  Isolation
15  Cut
16  Offensive Rebounds (put backs)
17  Off Screen
18  P&R Roll Man
19  Post-Up
20  Hand Off
21  Miscellaneous
22  #7 Jaylen Brown

Boston Celtics - defense
# of tables: 18
1 Overall Defense
2 Play Types
3 Offense Including Passes
4 During Pass Out Situations
5 During Trapping Situations
6 Shot Attempt - Half Court
7 Catch and Shoot - Half Court
8 Dribble Jumper - Half Court
9 Jump Shot Range - half court
10  Overall
11  Spot Up
12  P&R Ball Handler
13  Isolation
14  P&R Roll Man
15  Hand Off
16  Post-Up
17  Off Screen



{'offense': ['Spot Up',
  'P&R Ball Handler',
  'Transition',
  'Isolation',
  'Cut',
  'Offensive Rebounds (put backs)',
  'Off Screen',
  'P&R Roll Man',
  'Post-Up',
  'Hand Off',
  'Miscellaneous',
  '#7 Jaylen Brown'],
 'defense': ['Spot Up',
  'P&R Ball Handler',
  'Isolation',
  'P&R Roll Man',
  'Hand Off',
  'Post-Up',
  'Off Screen']}

In [7]:
# Inspect the table layout of one Euroleague team page (2020 season, team id '521').
check_content('euroleague', 2020, '521')

Gasteiz - offense
# of tables: 22
1 Overall Offense
2 Play Types
3 Offense Including Passes
4 During Pass Out Situations
5 During Trapping Situations
6 Shot Attempt - Half Court
7 Catch and Shoot - Half Court
8 Dribble Jumper - Half Court
9 Jump Shot Range - half court
10  Overall
11  Spot Up
12  P&R Ball Handler
13  Transition
14  Cut
15  Post-Up
16  Off Screen
17  Offensive Rebounds (put backs)
18  P&R Roll Man
19  Isolation
20  Hand Off
21  Miscellaneous

Gasteiz - defense
# of tables: 18
1 Overall Defense
2 Play Types
3 Offense Including Passes
4 During Pass Out Situations
5 During Trapping Situations
6 Shot Attempt - Half Court
7 Catch and Shoot - Half Court
8 Dribble Jumper - Half Court
9 Jump Shot Range - half court
10  Overall
11  Spot Up
12  P&R Ball Handler
13  Isolation
14  Post-Up
15  P&R Roll Man
16  Off Screen
17  Hand Off



{'offense': ['Spot Up',
  'P&R Ball Handler',
  'Transition',
  'Cut',
  'Post-Up',
  'Off Screen',
  'Offensive Rebounds (put backs)',
  'P&R Roll Man',
  'Isolation',
  'Hand Off',
  'Miscellaneous'],
 'defense': ['Spot Up',
  'P&R Ball Handler',
  'Isolation',
  'Post-Up',
  'P&R Roll Man',
  'Off Screen',
  'Hand Off']}

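In the pages checked above, the play-type tables start at index 11 on both sides and end at index 21 (offense) or 17 (defense); the NBA offense page has one extra table at index 22 that is not a play type. Therefore, define a dictionary that gives, for each side, the exclusive upper bound on the table index of the play-type tables.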

In [8]:
# Exclusive upper bound, per side, on the index of the play-type tables.
# get_team_playtype_df consumes it as range(11, cap). In the pages inspected
# above, offense play types end at table 21 and defense play types at table 17.
side_idx_cap = {'offense': 22, 'defense': 18}

## Data Prep

In [9]:
def get_playtype_df(tables, idx, including_team=False):
    """Convert one HTML stats table into a per-player DataFrame.

    The first ``tr`` of the table supplies the column names; its first cell
    (stripped) is taken as the play-type title and that column is renamed to
    ``'Player'``. Rows whose ``Points`` cell is ``'-'`` are dropped.

    Args:
        tables: Sequence of table elements exposing ``find_all``.
        idx: Position of the table to convert within ``tables``.
        including_team: When False (default) data rows start at the third
            ``tr``; when True they start at the second. Presumably the second
            row is the team aggregate -- confirm against the source pages.

    Returns:
        pandas.DataFrame with columns ``Player`` and ``{playtype}_Poss``,
        ``{playtype}_Points``, ``{playtype}_PPP``, ``{playtype}_Rank``,
        ``{playtype}_Rating``. ``Poss``/``Points`` are integers, ``PPP`` is
        float, and ``Rank`` is a string with any ``%`` removed.
    """
    rows = [[cell.text for cell in row.find_all('td')]
            for row in tables[idx].find_all('tr')]
    header = rows[0]
    header[0] = header[0].strip()
    playtype = header[0]
    first_data_row = 1 if including_team else 2

    df = pd.DataFrame(rows[first_data_row:], columns=header)\
           .rename(columns={playtype: 'Player'})
    # Take an explicit copy of the filtered rows: assigning columns on the bare
    # boolean-indexed slice triggers pandas' SettingWithCopyWarning.
    df = df[df['Points'] != '-'].copy()

    for col in ['Poss', 'Points']:
        df[col] = df[col].astype(int)
    df['PPP'] = df['PPP'].astype(float)
    # Rank stays a string; only the percent sign is removed.
    df['Rank'] = df['Rank'].str.replace('%', '', regex=False)

    output_cols = ['Player', 'Poss', 'Points', 'PPP', 'Rank', 'Rating']
    output = df[output_cols].rename(columns={col: f'{playtype}_{col}'
                                             for col in output_cols
                                             if col != 'Player'})
    return output

In [10]:
def get_team_playtype_df(soup, cap):
    """Build one team's wide per-player play-type frame from a parsed page.

    Table 10 is used as the base frame; tables 11 .. cap-1 are left-merged
    onto it by ``Player``. A ``{col}%`` share column is added for every
    possession column other than ``Overall_Poss``.

    Args:
        soup: Parsed page exposing ``find_all('h1')`` and ``find_all('table')``.
        cap: Exclusive upper bound on the index of the tables to merge.

    Returns:
        pandas.DataFrame with one row per player of the base table, a ``Team``
        column, the merged play-type columns and the ``...Poss%`` columns.
        Missing possession counts and shares are filled with 0.

    Side effects:
        Prints a message when any player's ``Overall_Poss`` differs from the
        sum of that player's play-type possessions.
    """
    team = soup.find_all('h1')[0].text.split(' - ')[-1].strip()
    tables = soup.find_all('table')

    # Get data
    playtype_df = get_playtype_df(tables, 10)
    playtype_df['Team'] = team

    for i in range(11, cap):
        playtype_df = playtype_df.merge(get_playtype_df(tables, i),
                                        how='left', on='Player')

    # Issues with NBA: some pages carry an extra table at index `cap`. Its
    # possessions (second column), keyed by its first column, are subtracted
    # from the matching players' overall possessions.
    if len(tables) > cap:
        extra_poss = {row[0]: int(row[1])
                      for row in get_playtype_df(tables, cap, including_team=True).values}
        playtype_df['Overall_Poss'] = playtype_df['Overall_Poss'] - \
            playtype_df['Player'].map(lambda player: extra_poss.get(player, 0))

    # Test if possession sums up right. Count the players whose difference is
    # nonzero: the previous check (number of distinct differences != 1) missed
    # the case where every player is off by the same amount, and reported a
    # spurious issue for an empty frame.
    poss_cols = [col for col in playtype_df.columns
                 if 'Poss' in col and col != 'Overall_Poss']
    unaccounted = playtype_df['Overall_Poss'] - playtype_df[poss_cols].sum(axis=1)
    n_players = int((unaccounted != 0).sum())
    if n_players:
        print(f'Issue found with {team}: {n_players} players with unequal possessions')

    # Get Poss% columns
    for col in poss_cols:
        share = playtype_df[col] / playtype_df['Overall_Poss']
        playtype_df[f'{col}%'] = share.fillna(0)
        playtype_df[col] = playtype_df[col].fillna(0)

    return playtype_df

In [11]:
def get_league_playtype_df(directory, side, side_idx_cap=side_idx_cap):
    """Concatenate the play-type frames of every team file for one side.

    Args:
        directory: Folder holding the saved team pages.
        side: Substring selecting the files to read (e.g. ``'offense'``); it
            is also the key looked up in ``side_idx_cap``.
        side_idx_cap: Mapping from side to the exclusive upper table index
            passed to ``get_team_playtype_df``.

    Returns:
        pandas.DataFrame stacking the per-team frames in file-name order.

    Raises:
        ValueError: If no file name in ``directory`` contains ``side``.
    """
    team_playtype_dfs = []
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the result (and of any CSV written from it) reproducible.
    for file in sorted(os.listdir(directory)):
        if side in file:
            soup = read_html(f'{directory}/{file}')
            team_playtype_dfs.append(get_team_playtype_df(soup, side_idx_cap[side]))
    if not team_playtype_dfs:
        # pd.concat would raise a ValueError here anyway; say which inputs were missing.
        raise ValueError(f"No files containing '{side}' found in '{directory}'")
    return pd.concat(team_playtype_dfs)

In [12]:
def get_season_playtype_df(season, league):
    """Collect the offense and defense play-type frames for one season.

    Prints the season and then each side name as it is processed, reads the
    frames from ``data_{league}_{season}`` and tags each with a ``Season``
    column.

    Args:
        season: Season value; used in the directory name and stored in the
            ``Season`` column.
        league: League label used in the directory name.

    Returns:
        dict: ``{'offense': DataFrame, 'defense': DataFrame}``.
    """
    print(season)
    source_dir = f'data_{league}_{season}'
    frames_by_side = {}
    for side_name in ('offense', 'defense'):
        print(side_name)
        side_frame = get_league_playtype_df(source_dir, side_name)
        side_frame['Season'] = season
        frames_by_side[side_name] = side_frame
    return frames_by_side

### CBA

In [13]:
# Build the offense/defense frames for every CBA season from 2017 through 2021
# (the range end is exclusive), keyed by season.
cba_dfs = {season: get_season_playtype_df(season, 'cba')
           for season in range(2017, 2022)}

2017
offense
defense
2018
offense
defense
2019
offense
defense
2020
offense
defense
2021
offense
defense


In [14]:
# The 2021 frames define the column set and order applied to every season.
cba_cols = {type_: cba_dfs[2021][type_].columns
            for type_ in ['offense', 'defense']}
for type_ in ['offense', 'defense']:
    # Stack all seasons with a fresh 0..n-1 index and write one CSV per side;
    # the index itself is not written to the file.
    pd.concat(
        [cba_dfs[season][type_][cba_cols[type_]] for season in range(2017, 2022)],
        ignore_index=True,
    ).to_csv(f'cba_{type_}.csv', index=False)

### Euroleague

In [15]:
# euroleague_dfs = {season: get_season_playtype_df(season, 'euroleague')
#                   for season in range(2016, 2021)}

In [16]:
# euroleague_cols = {type_: euroleague_dfs[2020][type_].columns
#                    for type_ in ['offense', 'defense']}
# for type_ in ['offense', 'defense']:
#     pd.concat([euroleague_dfs[season][type_][euroleague_cols[type_]] for season in range(2016, 2021)])\
#       .reset_index().drop('index', axis=1)\
#       .to_csv(f'euroleague_{type_}.csv', index=False)

### NBA

In [17]:
# Build the offense/defense frames for every NBA season from 2016 through 2021
# (the range end is exclusive), keyed by season.
nba_dfs = {season: get_season_playtype_df(season, 'nba')
           for season in range(2016, 2022)}

2016
offense
Issue found with Memphis Grizzlies: 9 players with unequal possessions
Issue found with Washington Wizards: 11 players with unequal possessions
Issue found with Charlotte Hornets: 6 players with unequal possessions
Issue found with San Antonio Spurs: 4 players with unequal possessions
Issue found with Atlanta Hawks: 7 players with unequal possessions
Issue found with Boston Celtics: 7 players with unequal possessions
Issue found with Philadelphia 76ers: 8 players with unequal possessions
Issue found with Golden State Warriors: 9 players with unequal possessions
Issue found with Oklahoma City Thunder: 7 players with unequal possessions
Issue found with Cleveland Cavaliers: 8 players with unequal possessions
Issue found with Detroit Pistons: 7 players with unequal possessions
Issue found with Dallas Mavericks: 9 players with unequal possessions
Issue found with Denver Nuggets: 8 players with unequal possessions
Issue found with Chicago Bulls: 7 players with unequal possessio

Issue found with Chicago Bulls: 10 players with unequal possessions
Issue found with Houston Rockets: 10 players with unequal possessions
Issue found with Brooklyn Nets: 7 players with unequal possessions
Issue found with Phoenix Suns: 12 players with unequal possessions
Issue found with Milwaukee Bucks: 8 players with unequal possessions
Issue found with New Orleans Pelicans: 10 players with unequal possessions
Issue found with Portland Trail Blazers: 12 players with unequal possessions
defense
Issue found with Utah Jazz: 17 players with unequal possessions
Issue found with Memphis Grizzlies: 25 players with unequal possessions
Issue found with Toronto Raptors: 20 players with unequal possessions
Issue found with Los Angeles Lakers: 21 players with unequal possessions
Issue found with Washington Wizards: 24 players with unequal possessions
Issue found with Charlotte Hornets: 15 players with unequal possessions
Issue found with Atlanta Hawks: 21 players with unequal possessions
Issue f

Issue found with Minnesota Timberwolves: 15 players with unequal possessions
Issue found with Portland Trail Blazers: 16 players with unequal possessions
Issue found with Indiana Pacers: 19 players with unequal possessions
Issue found with Orlando Magic: 27 players with unequal possessions
Issue found with Miami Heat: 20 players with unequal possessions
2021
offense
Issue found with Memphis Grizzlies: 8 players with unequal possessions
Issue found with Utah Jazz: 10 players with unequal possessions
Issue found with Los Angeles Lakers: 8 players with unequal possessions
Issue found with Washington Wizards: 6 players with unequal possessions
Issue found with Charlotte Hornets: 6 players with unequal possessions
Issue found with Atlanta Hawks: 7 players with unequal possessions
Issue found with Philadelphia 76ers: 9 players with unequal possessions
Issue found with Los Angeles Clippers: 14 players with unequal possessions
Issue found with Sacramento Kings: 11 players with unequal possessi

In [18]:
# The 2021 frames define the column set and order applied to every season.
nba_cols = {type_: nba_dfs[2021][type_].columns
            for type_ in ['offense', 'defense']}
for type_ in ['offense', 'defense']:
    # Stack all seasons with a fresh 0..n-1 index and write one CSV per side;
    # the index itself is not written to the file.
    pd.concat(
        [nba_dfs[season][type_][nba_cols[type_]] for season in range(2016, 2022)],
        ignore_index=True,
    ).to_csv(f'nba_{type_}.csv', index=False)

### NBL

In [19]:
# nbl_dfs = {season: get_season_playtype_df(season, 'nbl')
#            for season in range(2016, 2021)}

In [20]:
# nbl_cols = {type_: nbl_dfs[2020][type_].columns
#             for type_ in ['offense', 'defense']}
# for type_ in ['offense', 'defense']:
#     pd.concat([nbl_dfs[season][type_][nbl_cols[type_]] for season in range(2016, 2021)])\
#       .reset_index().drop('index', axis=1)\
#       .to_csv(f'nbl_{type_}.csv', index=False)

### NCAA 

In [21]:
# ncaa_dfs = {season: get_season_playtype_df(season, 'ncaa')
#             for season in range(2016, 2021)}

In [22]:
# ncaa_cols = {type_: ncaa_dfs[2020][type_].columns
#              for type_ in ['offense', 'defense']}
# for type_ in ['offense', 'defense']:
#     pd.concat([ncaa_dfs[season][type_][ncaa_cols[type_]] for season in range(2016, 2021)])\
#       .reset_index().drop('index', axis=1)\
#       .to_csv(f'ncaa_{type_}.csv', index=False)