In [1]:
import pandas as pd
import numpy as np
import os
import time
import pickle
from bs4 import BeautifulSoup

## Basic Functions

In [2]:
def read_html(filename):
    """Parse an HTML file from disk into a BeautifulSoup tree.

    Parameters
    ----------
    filename : str or path-like
        Path of the HTML file to read (opened in text mode with the
        platform default encoding, as before).

    Returns
    -------
    bs4.BeautifulSoup
        The document parsed with the built-in ``"html.parser"`` backend.
    """
    # Parse inside the context manager so the handle is closed
    # deterministically; previously it stayed open until garbage collection.
    with open(filename) as html_file:
        return BeautifulSoup(html_file, "html.parser")

In [3]:
def dump_pickle(file, filename):
    """Serialize an object to disk with :mod:`pickle`.

    Parameters
    ----------
    file : object
        The Python object to serialize (despite the name, this is the
        payload, not a file handle).
    filename : str or path-like
        Destination path; it is opened in binary write mode, so an existing
        file is overwritten.

    Returns
    -------
    None
    """
    # The context manager closes the handle even if pickle.dump raises
    # (e.g. on an unpicklable object), which the manual close() did not.
    with open(filename, 'wb') as outfile:
        pickle.dump(file, outfile)

def load_pickle(filename):
    """Load and return a pickled object from disk.

    Parameters
    ----------
    filename : str or path-like
        Path of the pickle file; it is opened in binary read mode.

    Returns
    -------
    object
        Whatever object was stored in the file.

    Notes
    -----
    Unpickling can execute arbitrary code, so only load files from a
    trusted source.
    """
    # The context manager closes the handle even if pickle.load raises
    # (e.g. on a truncated file), which the manual close() did not.
    with open(filename, 'rb') as infile:
        return pickle.load(infile)

## Team and Season Cumulative Tables

In [4]:
def get_team_cumulative_df(soup, season):
    """Turn the first HTML table of a parsed page into a DataFrame.

    Every ``<tr>`` of the first ``<table>`` is read as a list of its
    ``<td>`` texts. The first row supplies the column names, the last row
    is discarded (presumably a totals/footer row -- confirm against the
    scraped pages), and everything in between becomes the data.

    Parameters
    ----------
    soup : object exposing ``find_all`` (e.g. a BeautifulSoup document)
        Parsed page containing at least one table.
    season : any
        Value stored in the added ``Season`` column.

    Returns
    -------
    pandas.DataFrame
        The table with two extra columns, ``Season`` and ``Team``. ``Team``
        holds the original header of the first column, and that first
        column is renamed to ``Player``.
    """
    first_table = soup.find_all('table')[0]

    rows = []
    for tr in first_table.find_all('tr'):
        rows.append([cell.text for cell in tr.find_all('td')])

    # Body is everything between the header row and the final row.
    body = rows[1:-1]
    header = rows[0]
    frame = pd.DataFrame(body, columns=header)

    frame['Season'] = season
    # The header of the first column is used as the team name.
    team_name = frame.columns[0]
    frame['Team'] = team_name
    return frame.rename(columns={team_name: 'Player'})

In [5]:
def get_season_cumulative_df(season, league):
    """Combine every box-score page of one league season into one DataFrame.

    Looks in the directory ``data_{league}_{season}`` (relative to the
    current working directory), parses each file whose name contains
    ``'box'`` with ``read_html`` and ``get_team_cumulative_df``, and
    concatenates the resulting frames.

    Parameters
    ----------
    season : any
        Season identifier; used in the directory name and passed through to
        ``get_team_cumulative_df``.
    league : str
        League identifier used in the directory name (e.g. ``'nba'``).

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of the per-file frames, in sorted file-name
        order. The index is not reset.

    Raises
    ------
    FileNotFoundError
        If the season directory does not exist (raised by ``os.listdir``).
    ValueError
        If the directory contains no file whose name contains ``'box'``.
    """
    print(season)  # progress marker: one line per processed season
    directory = f'data_{league}_{season}'

    # os.listdir returns entries in arbitrary order; sort so the row order
    # of the result is reproducible across machines and runs.
    box_files = sorted(name for name in os.listdir(directory) if 'box' in name)
    if not box_files:
        # pd.concat([]) would also raise ValueError, but with the opaque
        # message "No objects to concatenate".
        raise ValueError(
            f"no files with 'box' in their name found in {directory!r}"
        )

    team_cumulative_dfs = [
        get_team_cumulative_df(read_html(os.path.join(directory, name)), season)
        for name in box_files
    ]
    return pd.concat(team_cumulative_dfs)

### CBA

In [6]:
# One combined cumulative frame per CBA season (2017 through 2020),
# keyed by the season year.
cba_dfs = dict(
    (year, get_season_cumulative_df(year, 'cba'))
    for year in range(2017, 2021)
)

2017
2018
2019
2020


### Euroleague

In [7]:
# One combined cumulative frame per Euroleague season (2016 through 2020),
# keyed by the season year.
euroleague_dfs = dict(
    (year, get_season_cumulative_df(year, 'euroleague'))
    for year in range(2016, 2021)
)

2016
2017
2018
2019
2020


In [8]:
# Offense table to be enriched with the scraped cumulative stats.
s_df = pd.read_csv('euroleague_offense.csv')

# Join key: drop the first space-separated token of the name (a jersey
# number such as "#36" in the rows displayed below), glue the remaining
# tokens together, lowercase, and strip apostrophes and periods.
s_df['Player_Alt'] = s_df['Player'].apply(
    lambda name: ''.join(name.split(' ')[1:]).lower().replace("'", '').replace('.', '')
)

# Stack all scraped seasons and build the same join key on that side.
c_df = pd.concat([euroleague_dfs[year] for year in range(2016, 2021)])
c_df['Player_Alt'] = c_df['Player'].apply(
    lambda name: ''.join(name.split(' ')[1:]).lower().replace("'", '').replace('.', '')
)

# Left join keeps every offense row; rows without a cumulative match end up
# with NaN in the columns coming from c_df.
temp = s_df.merge(
    c_df.drop(columns='Player'),
    how='left',
    on=['Player_Alt', 'Team', 'Season'],
)

# Show the offense rows for which no cumulative stats were found.
temp.loc[temp['Min'].isna(), ['Player', 'Min', 'Team', 'Season', 'Overall_Poss']]

Unnamed: 0,Player,Min,Team,Season,Overall_Poss
79,#36 Alex Suarez,,Real Madrid,2016,1


In [9]:
# Redo the left join of the offense table with the cumulative stats, drop
# the helper join key, and write the result to CSV without the index.
(
    s_df
    .merge(
        c_df.drop(columns='Player'),
        how='left',
        on=['Player_Alt', 'Team', 'Season'],
    )
    .drop(columns='Player_Alt')
    .to_csv('euroleague_offense_cumulative.csv', index=False)
)

### NBA

In [10]:
# One combined cumulative frame per NBA season (2016 through 2020),
# keyed by the season year.
nba_dfs = dict(
    (year, get_season_cumulative_df(year, 'nba'))
    for year in range(2016, 2021)
)

2016
2017
2018
2019
2020


In [11]:
# Offense table to be enriched with the scraped cumulative stats.
s_df = pd.read_csv('nba_offense.csv')

# Join key: drop the first space-separated token of the name (a jersey
# number such as "#5" in the rows displayed below), glue the remaining
# tokens together, lowercase, and strip apostrophes and periods.
s_df['Player_Alt'] = s_df['Player'].apply(
    lambda name: ''.join(name.split(' ')[1:]).lower().replace("'", '').replace('.', '')
)

# Stack all scraped seasons and build the same join key on that side.
c_df = pd.concat([nba_dfs[year] for year in range(2016, 2021)])
c_df['Player_Alt'] = c_df['Player'].apply(
    lambda name: ''.join(name.split(' ')[1:]).lower().replace("'", '').replace('.', '')
)

# Left join keeps every offense row; rows without a cumulative match end up
# with NaN in the columns coming from c_df.
temp = s_df.merge(
    c_df.drop(columns='Player'),
    how='left',
    on=['Player_Alt', 'Team', 'Season'],
)

# Show the offense rows for which no cumulative stats were found.
temp.loc[temp['Min'].isna(), ['Player', 'Min', 'Team', 'Season', 'Overall_Poss']]

Unnamed: 0,Player,Min,Team,Season,Overall_Poss
1410,#5 Richard Solomon,,Oklahoma City Thunder,2018,1


In [12]:
# Redo the left join of the offense table with the cumulative stats, drop
# the helper join key, and write the result to CSV without the index.
(
    s_df
    .merge(
        c_df.drop(columns='Player'),
        how='left',
        on=['Player_Alt', 'Team', 'Season'],
    )
    .drop(columns='Player_Alt')
    .to_csv('nba_offense_cumulative.csv', index=False)
)

### NBL

In [13]:
# One combined cumulative frame per NBL season (2016 through 2020),
# keyed by the season year.
nbl_dfs = dict(
    (year, get_season_cumulative_df(year, 'nbl'))
    for year in range(2016, 2021)
)

2016
2017
2018
2019
2020


In [14]:
# Offense table to be enriched with the scraped cumulative stats.
s_df = pd.read_csv('nbl_offense.csv')

# Join key: drop the first space-separated token of the name (a jersey
# number such as "#32" in the rows displayed below), glue the remaining
# tokens together, lowercase, and strip apostrophes and periods.
s_df['Player_Alt'] = s_df['Player'].apply(
    lambda name: ''.join(name.split(' ')[1:]).lower().replace("'", '').replace('.', '')
)

# Stack all scraped seasons and build the same join key on that side.
c_df = pd.concat([nbl_dfs[year] for year in range(2016, 2021)])
c_df['Player_Alt'] = c_df['Player'].apply(
    lambda name: ''.join(name.split(' ')[1:]).lower().replace("'", '').replace('.', '')
)

# Left join keeps every offense row; rows without a cumulative match end up
# with NaN in the columns coming from c_df.
temp = s_df.merge(
    c_df.drop(columns='Player'),
    how='left',
    on=['Player_Alt', 'Team', 'Season'],
)

# Show the offense rows for which no cumulative stats were found.
temp.loc[temp['Min'].isna(), ['Player', 'Min', 'Team', 'Season', 'Overall_Poss']]

Unnamed: 0,Player,Min,Team,Season,Overall_Poss
293,#32 Deng Acouth,,Sydney Kings,2018,1


In [15]:
# Redo the left join of the offense table with the cumulative stats, drop
# the helper join key, and write the result to CSV without the index.
(
    s_df
    .merge(
        c_df.drop(columns='Player'),
        how='left',
        on=['Player_Alt', 'Team', 'Season'],
    )
    .drop(columns='Player_Alt')
    .to_csv('nbl_offense_cumulative.csv', index=False)
)