In [3]:
import pandas as pd 
import numpy as np 
from tqdm import tqdm 
from plotnine import *
import plotly
import matplotlib.pyplot as plt 
from pandas_profiling import ProfileReport
from bball_reference import BBallScrape

# Raise pandas' display limits so frames are shown with up to 10000 rows and
# 10000 columns instead of being truncated.
pd.set_option("display.max_rows", 10000, "display.max_columns", 10000)


In [4]:
def FormatTeams(df):
    """Normalize the ``team`` column of ``df`` in place and return ``df``.

    Each team name is stripped of surrounding whitespace, has its spaces
    replaced with underscores, is lower-cased, and has every literal ``*``
    removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``team`` column. The column is overwritten on the
        frame that is passed in.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    df['team'] = (
        df['team']
        .str.strip()
        .str.replace(' ', '_', regex=False)
        .str.lower()
        # Literal match: the previous pattern '\*' only removed the asterisk
        # when pandas interpreted it as a regex, which is no longer the
        # default in pandas 2.x.
        .str.replace('*', '', regex=False)
    )

    return df

In [5]:

def ConvertCols(x):
    """Convert a column of ``'wins-losses'`` strings into win fractions.

    Missing entries are filled with the placeholder record ``'100000-11'``
    before parsing, so they come out as ``100000 / 100011`` rather than NaN.

    Parameters
    ----------
    x : pandas.Series
        Strings of the form ``'<wins>-<losses>'``, possibly with missing
        values.

    Returns
    -------
    numpy.ndarray
        ``wins / (wins + losses)`` for every element of ``x``.
    """
    records = x.fillna('100000-11').str.split('-').to_list()
    # Built-in float: the ``np.float`` alias was removed in NumPy 1.24.
    column = np.array(records).astype(float)
    wins = column.T[0]
    losses = column.T[1]

    return wins / np.add(wins, losses)

In [6]:
def ReadData(first_year, last_year):
    """Load standings, team stats and opponent stats for a range of seasons.

    The three cached workbooks under ``csv_files/`` are read if they exist.
    When a workbook is missing, every season from ``first_year`` to
    ``last_year`` is scraped with ``BBallScrape``, the per-season frames are
    concatenated, and the results are written to the cache workbooks.

    Afterwards all column names are lower-cased, ``standings.year`` is set
    from ``standings.year_x``, and each frame is restricted to rows with
    ``first_year <= year <= last_year``.

    Parameters
    ----------
    first_year, last_year : int
        Inclusive bounds of the seasons to return.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(team, opponent, standings)``.

    Notes
    -----
    NOTE(review): frames read back from Excel carry the saved index as an
    extra column (``'unnamed: 0'`` after lower-casing) that freshly scraped
    frames may not have; code elsewhere in this notebook drops that column by
    name. Confirm both paths yield the schema callers expect.
    """
    import os

    try:
        standings = pd.read_excel('csv_files/standings.xlsx')
        team = pd.read_excel('csv_files/team_stats.xlsx')
        opponent = pd.read_excel('csv_files/opponent_stats.xlsx')

    except FileNotFoundError:
        # Only a missing cache triggers a scrape; any other read error (for
        # example a missing Excel engine) is surfaced instead of hidden.
        standings_parts, team_parts, opponent_parts = [], [], []

        # ``first_year`` is always scraped, even when ``last_year`` is
        # smaller, matching the original behaviour.
        years = [first_year] + list(range(first_year + 1, last_year + 1))

        for year in tqdm(years):
            scrape = BBallScrape(year)

            standings_parts.append(scrape.Standings())
            team_year, opponent_year = scrape.TeamStats()
            team_parts.append(team_year)
            opponent_parts.append(opponent_year)

        # One concat per table replaces DataFrame.append, which was removed
        # in pandas 2.0.
        standings = pd.concat(standings_parts)
        team = pd.concat(team_parts)
        opponent = pd.concat(opponent_parts)

        os.makedirs('csv_files', exist_ok=True)
        standings.to_excel('csv_files/standings.xlsx')
        team.to_excel('csv_files/team_stats.xlsx')
        opponent.to_excel('csv_files/opponent_stats.xlsx')

    standings.columns = standings.columns.str.lower()
    team.columns = team.columns.str.lower()
    opponent.columns = opponent.columns.str.lower()

    standings['year'] = standings.year_x

    def SubsetYears(frame):
        """Keep the rows whose ``year`` lies in the requested range."""
        return frame[(frame.year >= first_year) & (frame.year <= last_year)]

    team = SubsetYears(team)
    opponent = SubsetYears(opponent)
    standings = SubsetYears(standings)

    return team, opponent, standings

#team, opponent, standings = ReadData(2000, 2019)


In [7]:
def FormatStandings(standings):
    """Turn the raw standings table into a team-indexed frame of win fractions.

    Steps, in order:

    1. Drop rows whose ``th`` cell contains ``'Division'``.
    2. Set ``year`` from ``year_x`` (as int) and drop the summary columns.
    3. Normalize team names with ``FormatTeams`` and drop all-empty columns.
    4. Convert every column between the first and the last one from
       ``'wins-losses'`` strings to fractions with ``ConvertCols``.
    5. Replace the placeholder fraction with NaN, index by ``team`` and drop
       rows whose index is missing.

    Parameters
    ----------
    standings : pandas.DataFrame
        Raw standings with lower-cased column names, including ``th``,
        ``team``, ``year_x`` and the columns dropped below.

    Returns
    -------
    pandas.DataFrame
        Indexed by team name, with ``year`` followed by the converted columns.
    """
    f_standings = standings[~standings.th.str.contains('Division', na=False)].copy()

    f_standings['year'] = f_standings.year_x.astype(int)
    f_standings = f_standings.drop(columns=['year_x', 'year_y', 'th',
                                            'w', 'l', 'w/l%', 'gb', 'ps/g',
                                            'pa/g', 'srs', 'playoffs', 'unnamed: 0'])

    f_standings.columns = f_standings.columns.str.lower()

    f_standings = FormatTeams(f_standings)

    f_standings = f_standings.dropna(how='all', axis=1)

    # iloc[:, 1:-1] skips the first and last columns.
    # NOTE(review): presumably these are ``team`` and ``year`` - confirm.
    f_standings = pd.concat([
        f_standings[['team', 'year']],
        f_standings.iloc[:, 1:-1].transform(ConvertCols),
    ], axis=1)

    # ConvertCols maps missing records to a placeholder fraction. The value at
    # row 0 / column 2 is taken to be that placeholder and is replaced by NaN
    # everywhere.
    # NOTE(review): this assumes that cell was missing in the raw data -
    # confirm.
    # ``np.nan`` replaces the ``np.NaN`` alias, which was removed in NumPy 2.0.
    f_standings = f_standings.replace(f_standings.iloc[0, 2], np.nan)
    f_standings.index = f_standings.team
    f_standings = f_standings.drop(columns=['team'])
    f_standings = f_standings[~f_standings.index.isna()]

    return f_standings

#standings = FormatStandings(standings) 

In [8]:
def MatchColNames(df, year):
    """Rename one season's columns after the rows that are missing in them.

    The rows of ``df`` with ``df.year == year`` are copied and columns that
    are entirely NaN for that season are dropped. For every column of ``df``,
    the index label of the first row of the season that is NaN in that column
    is collected; columns without such a row, or that were dropped, are
    skipped. The season's columns are then renamed to ``['year'] + labels``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``year`` column.
        NOTE(review): the renaming assumes ``year`` is the first surviving
        column - confirm against the caller.
    year : int
        Season to extract.

    Returns
    -------
    pandas.DataFrame
        The season's rows with renamed columns.

    Raises
    ------
    ValueError
        Raised by pandas when the number of collected labels plus one does
        not match the number of surviving columns.
    """
    df_year = df[df.year == year].copy()

    df_year = df_year.dropna(how='all', axis=1)

    names = []

    for col in df:
        try:
            names.append(df_year[df_year[col].isna()].index.to_list()[0])
        except (KeyError, IndexError):
            # KeyError: the column was all-NaN this season and was dropped.
            # IndexError: no row of this season is NaN in the column.
            continue

    df_year.columns = ['year'] + names
    return df_year

def FilterCols(df):
    """Apply ``MatchColNames`` to every season in ``df`` and stack the results.

    Every season after the first is printed as it is processed.

    NOTE(review): a later cell in this notebook defines another function with
    the same name; once that cell has run, the name no longer refers to this
    function.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``year`` column, in the shape ``MatchColNames`` expects.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of the per-season frames.
    """
    years = df.year.unique()
    frames = [MatchColNames(df, years[0])]

    for year in years[1:]:
        print(year)
        frames.append(MatchColNames(df, year))

    # One concat replaces repeated DataFrame.append, which was removed in
    # pandas 2.0.
    return pd.concat(frames)

#standings = FilterCols(standings) 

In [9]:
from sklearn.preprocessing import MinMaxScaler

def FormatStats(df, offense):
    """Standardize per-season stats and prefix them as offense or defense.

    Side effects on ``df`` (kept on purpose, since the caller's frame is
    reused after this call elsewhere in the notebook):

    * rows containing any NaN are dropped in place;
    * a ``playoffs`` column is added (1 when the team name contains ``*``);
    * team names are normalized by ``FormatTeams``.

    The numeric columns, excluding ``'unnamed: 0'``, ``playoffs``, ``g`` and
    ``mp``, are z-scored within each ``year`` and then min-max scaled across
    all rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Stats with ``team``, ``year``, ``'unnamed: 0'``, ``g`` and ``mp``
        columns plus the numeric stat columns.
    offense : bool
        Truthy to prefix the scaled columns with ``'offense_'``, otherwise
        ``'defense_'``.

    Returns
    -------
    pandas.DataFrame
        ``team`` and ``year`` followed by the prefixed, scaled stat columns.
    """
    df.dropna(how='any', inplace=True)
    # Flag playoff teams before FormatTeams removes the asterisk.
    df['playoffs'] = df.team.str.contains('*', regex=False).astype('int')
    FormatTeams(df)

    # Restrict to numeric columns: computing a mean over the string ``team``
    # column raises on pandas >= 2, where such columns are no longer dropped
    # silently.
    numeric = (
        df.drop(columns=['unnamed: 0', 'playoffs', 'g', 'mp'])
        .select_dtypes(include=['number', 'bool'])
    )
    scaled = numeric.groupby(['year']).transform(
        lambda x: (x - x.mean()) / (x.std())
    )
    scaled[scaled.columns] = MinMaxScaler().fit_transform(scaled)

    prefix = 'offense_' if offense else 'defense_'
    scaled.columns = [prefix + str(col) for col in scaled.columns]

    return pd.concat([df[['team', 'year']], scaled], axis=1)

#team = FormatStats(team)
#opponent = FormatStats(opponent) 

In [10]:
def TeamWin(x):
    """Bucket a win fraction into one of three classes.

    Returns 0 when ``x < 0.4``, 1 when ``x <= 0.6`` (and not below 0.4), and
    2 in every other case.
    """
    if x < 0.4:
        return 0
    return 1 if x <= 0.6 else 2

In [11]:
def GamePredictions(standings, team_data):
    """Build one labelled row per unordered team pair and season.

    For every season in ``standings`` and every ordered pair of index labels
    ``(first, second)``, the value stored in column ``first`` at row
    ``second`` is recorded as ``team_win``. Pairs whose value is missing are
    dropped, only the first row of each unordered ``{team, opponent, year}``
    combination is kept, and ``team_data`` is merged in twice: once on
    ``team``/``year`` and once on ``opponent``/``year``. Finally ``team_win``
    is bucketed with ``TeamWin``.

    Parameters
    ----------
    standings : pandas.DataFrame
        Indexed by team name, with a ``year`` column and one column per team
        name.
    team_data : pandas.DataFrame
        Per-team, per-season features with ``team`` and ``year`` columns.

    Returns
    -------
    pandas.DataFrame
        ``team``, ``opponent``, ``year``, ``team_win`` plus the columns of
        ``team_data`` for both sides (pandas adds ``_x`` / ``_y`` suffixes).
    """
    records = []

    for year in standings.year.unique():
        standings_year = standings[standings.year == year]
        team_names = standings_year.index.unique().to_numpy()

        # Loop-invariant (column, row) -> value lookup, built once per season
        # instead of once per pair.
        lookup = standings_year.unstack()

        for first_name in team_names:
            for second_name in team_names:
                try:
                    team_win = lookup[first_name][second_name]
                except KeyError:
                    # No column for ``first_name`` this season: report the
                    # pair and let dropna() discard it below.
                    print(year, first_name, second_name)
                    team_win = None

                records.append({
                    'team': first_name,
                    'opponent': second_name,
                    'year': int(year),
                    'team_win': team_win,
                })

    # Build the frame once; appending row by row is quadratic and
    # DataFrame.append was removed in pandas 2.0.
    game_predictions = pd.DataFrame(
        records, columns=['team', 'opponent', 'year', 'team_win']
    )

    game_predictions = game_predictions.dropna()

    # Year is made a string so it can be sorted together with the two names
    # into an order-independent key for the pair.
    game_predictions['year'] = game_predictions.year.astype(str)
    game_predictions['both_teams'] = game_predictions.apply(
        lambda x: sorted(x[['team', 'opponent', 'year']]), axis=1
    )
    pair_key = game_predictions.both_teams.apply(tuple)
    game_predictions = game_predictions.groupby(pair_key).first().reset_index(drop=True)
    game_predictions = game_predictions.drop(columns=['both_teams'])
    game_predictions['year'] = game_predictions.year.astype(int)

    game_predictions = game_predictions.merge(
        team_data, on=['team', 'year'])
    game_predictions = game_predictions.merge(
        team_data, left_on=['opponent', 'year'], right_on=['team', 'year']
    )
    # The second merge duplicates ``team``; keep the left-hand copy.
    game_predictions = game_predictions.drop(columns=['team_y'])
    game_predictions = game_predictions.rename({'team_x': 'team'}, axis='columns')

    game_predictions['team_win'] = game_predictions.team_win.transform(TeamWin)

    return game_predictions

#game_predictions = GamePredictionY(standings) 

In [None]:
# End-to-end pipeline: load the three tables, reshape the standings and build
# the pairwise prediction frames.
team, opponent, standings = ReadData(2000, 2019) 
standings = FormatStandings(standings) 
# FormatStats also modifies the frame it is given (drops NaN rows in place,
# adds ``playoffs``, normalizes team names), so ``team`` and ``opponent`` are
# changed by these two calls.
team_formatted = FormatStats(team, True)
opponent_formatted = FormatStats(opponent, False) 
standings = FilterCols(standings) 
# NOTE(review): the modified raw frames ``team`` / ``opponent`` are passed
# here, not ``team_formatted`` / ``opponent_formatted`` - confirm which was
# intended.
game_predictions_offense = GamePredictions(standings, team) 
game_predictions_defense = GamePredictions(standings, opponent) 




2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019


In [None]:
import seaborn as sns

# Column position that splits each prediction frame into its first
# ("Team ...") and second ("Opponent ...") heatmap.
offense_defense_cutoff = 25

fig, ax = plt.subplots(2, 2, figsize=(15, 15))

# (frame slice, target axes, title) for each of the four heatmaps.
heatmap_panels = [
    (game_predictions_offense.iloc[:, :offense_defense_cutoff], ax[0, 0], 'Team Offense'),
    (game_predictions_offense.iloc[:, offense_defense_cutoff:], ax[1, 0], 'Opponent Offense'),
    (game_predictions_defense.iloc[:, :offense_defense_cutoff], ax[0, 1], 'Team Defense'),
    (game_predictions_defense.iloc[:, offense_defense_cutoff:], ax[1, 1], 'Opponent Defense'),
]

for panel_frame, panel_ax, panel_title in heatmap_panels:
    # Correlate numeric columns only: the slices include string columns such
    # as ``team`` and ``opponent``, which make DataFrame.corr() raise on
    # pandas >= 2.
    panel_corr = panel_frame.select_dtypes(include=['number', 'bool']).corr()
    sns.heatmap(panel_corr, ax=panel_ax, cmap='BuPu_r').set_title(panel_title)

fig.tight_layout()

plt.show()


In [None]:
# Preview the first rows of the offense prediction frame.
game_predictions_offense.head()

In [None]:
# Preview the offense frame without the columns whose name contains '2' or
# 'a_' (the same two filters the FilterCols defined below applies).
cols = [col for col in game_predictions_offense.columns if '2' not in col] 
cols = [col for col in cols if 'a_' not in col] 
game_predictions_offense[cols].head()

def FilterCols(df):
    '''Remove 2point stats, and all attempts from data.

    Columns whose name contains '2' or 'a_' are dropped, then every '_x' and
    '_y' is removed from the remaining column names. The input frame is left
    untouched; a new frame is returned.

    NOTE(review): this definition reuses the name of a function defined in an
    earlier cell and replaces it once this cell has run.
    '''
    def _strip_suffixes(name):
        # Same two replacements, applied in the same order.
        return name.replace('_x', '').replace('_y', '')

    kept = [
        name for name in df.columns
        if '2' not in name and 'a_' not in name
    ]

    return df[kept].rename(columns=_strip_suffixes)

# Apply the column filter defined above to both prediction frames, then join
# them on the pair / season / label columns.
predictions_offense_filtered = FilterCols(game_predictions_offense) 
predictions_defense_filtered = FilterCols(game_predictions_defense) 
predictions_total = predictions_offense_filtered.merge(
    predictions_defense_filtered, on = ['team', 'opponent', 'year', 'team_win']
)

# Persist the three frames under csv_files/.
predictions_offense_filtered.to_csv('csv_files/game_predictions_offense.csv') 
predictions_defense_filtered.to_csv('csv_files/game_predictions_defense.csv') 
# NOTE(review): the merged frame is written to a path that has no ``.csv``
# extension and is named "offense" - confirm the intended file name.
predictions_total.to_csv('csv_files/game_predictions_offense')

In [None]:
# Preview the first rows of the filtered defense prediction frame.
predictions_defense_filtered.head()

In [None]:
# Per-season mean of the numeric columns of ``team``. numeric_only=True keeps
# the string ``team`` column out of the mean, which raises on pandas >= 2.
fg_by_year = team.groupby(['year']).mean(numeric_only=True).reset_index()

# Keyword arguments make the call independent of plotnine's positional order
# for ``data`` and ``mapping``. The axis labels were swapped: ``year`` is on
# the x axis.
# NOTE(review): the plotted column is ``fg`` while the label says "Attempted"
# and the title says "Percent" - confirm the wording.
(
    ggplot(data=fg_by_year, mapping=aes(x='year', y='fg'))
    + geom_line()
    + geom_point(color='green')
    + labs(
        x='Year', y='Field Goals Attempted', title='Average Field Goal Percent By Year'
    )
)

In [None]:
# Per-season mean of the numeric columns of ``team_formatted``.
# numeric_only=True keeps the string ``team`` column out of the mean, which
# raises on pandas >= 2.
offense_fg_by_year = team_formatted.groupby(['year']).mean(numeric_only=True).reset_index()

# Keyword arguments make the call independent of plotnine's positional order
# for ``data`` and ``mapping``. The axis labels were swapped: ``year`` is on
# the x axis.
# NOTE(review): the plotted column is ``offense_fg`` while the label says
# "Attempted" and the title says "Percent" - confirm the wording.
(
    ggplot(data=offense_fg_by_year, mapping=aes(x='year', y='offense_fg'))
    + geom_line()
    + geom_point(color='green')
    + labs(
        x='Year', y='Field Goals Attempted', title='Average Field Goal Percent By Year'
    )
)


In [None]:
# Preview the first rows of ``team`` as it stands after the cells above.
team.head()