In [1]:
import pandas as pd

# Load data

In [2]:
awards_players_df = pd.read_csv('../data/awards_players.csv'); awards_players_df.Name = "awards_players"
coaches_df = pd.read_csv('../data/coaches.csv'); coaches_df.Name = "coaches"
players_teams_df = pd.read_csv('../data/players_teams.csv'); players_teams_df.Name = "players_teams"
players_df = pd.read_csv('../data/players.csv'); players_df.Name = "players"
series_post_df = pd.read_csv('../data/series_post.csv'); series_post_df.Name = "series_post"
teams_post_df = pd.read_csv('../data/teams_post.csv'); teams_post_df.Name = "teams_post"
teams_df = pd.read_csv('../data/teams.csv'); teams_df.Name = "teams"

season11_coaches_df = pd.read_csv('../data/season11/coaches.csv'); season11_coaches_df.Name = "season11_coaches"
season11_players_teams_df = pd.read_csv('../data/season11/players_teams.csv'); season11_players_teams_df.Name = "season11_players_teams"
season11_teams_df = pd.read_csv('../data/season11/teams.csv'); season11_teams_df.Name = "season11_teams"

dfs = [awards_players_df, coaches_df, players_teams_df, players_df, series_post_df, teams_post_df, teams_df, season11_coaches_df, season11_players_teams_df, season11_teams_df]

# Clean data

### Drop noisy data

In [3]:
players_df.drop(players_df[players_df['height'] <= 0].index, inplace = True)

### Remove duplicates


In [4]:
for df in dfs:
    df.drop_duplicates(inplace=True)
    display(f"Dropping dupplicates from dataframe {df.Name}...")

'Dropping dupplicates from dataframe awards_players...'

'Dropping dupplicates from dataframe coaches...'

'Dropping dupplicates from dataframe players_teams...'

'Dropping dupplicates from dataframe players...'

'Dropping dupplicates from dataframe series_post...'

'Dropping dupplicates from dataframe teams_post...'

'Dropping dupplicates from dataframe teams...'

'Dropping dupplicates from dataframe season11_coaches...'

'Dropping dupplicates from dataframe season11_players_teams...'

'Dropping dupplicates from dataframe season11_teams...'

### Drop unnecessary columns

In [5]:
players_df.drop(['birthDate', 'deathDate', 'college', 'collegeOther'], axis=1, inplace=True)
teams_df.drop(["franchID", "firstRound", "semis", "finals", "attend", "name", "arena"], axis=1, inplace=True);
season11_teams_df.drop(["lgID", "franchID", "name", "arena"], axis=1, inplace=True)
teams_post_df.drop([], axis=1, inplace=True)
series_post_df.drop([], axis=1, inplace=True)
players_teams_df.drop([], axis=1, inplace=True)
awards_players_df.drop(['award'], axis=1, inplace=True)
coaches_df.drop([], axis=1, inplace=True)

### Drop single value columns

In [6]:
needed_features = ['year']

def drop_single_value_columns(df):
    for column in df.columns:
        if len(df[column].unique()) == 1 and column not in needed_features:
            display(f"Dropping column {column} from dataframe {df.Name}")

            df.drop(column, axis=1, inplace=True)

for df in dfs:
    drop_single_value_columns(df)

'Dropping column lgID from dataframe awards_players'

'Dropping column lgID from dataframe coaches'

'Dropping column lgID from dataframe players_teams'

'Dropping column firstseason from dataframe players'

'Dropping column lastseason from dataframe players'

'Dropping column lgIDWinner from dataframe series_post'

'Dropping column lgIDLoser from dataframe series_post'

'Dropping column lgID from dataframe teams_post'

'Dropping column lgID from dataframe teams'

'Dropping column divID from dataframe teams'

'Dropping column seeded from dataframe teams'

'Dropping column tmORB from dataframe teams'

'Dropping column tmDRB from dataframe teams'

'Dropping column tmTRB from dataframe teams'

'Dropping column opptmORB from dataframe teams'

'Dropping column opptmDRB from dataframe teams'

'Dropping column opptmTRB from dataframe teams'

'Dropping column lgID from dataframe season11_coaches'

'Dropping column stint from dataframe season11_coaches'

'Dropping column stint from dataframe season11_players_teams'

'Dropping column lgID from dataframe season11_players_teams'

### Drop rows with missing values

In [7]:
players_df.drop(players_df[players_df['height'] == 0].index, inplace = True)
players_df.drop(players_df[players_df['weight'] == 0].index, inplace = True)

### Rename columns and replace values

In [8]:
players_df.rename(columns={'bioID': 'playerID', 'stint': 'player_stint'}, inplace=True)
players_df.replace('F-C', 'C-F', inplace=True)
players_df.replace('F-G', 'G-F', inplace=True)

coaches_df.rename(columns={'won': 'coach_wins', 'lost': 'coach_losses', 'post_wins': 'coach_post_wins', 'post_losses': 'coach_post_losses', 'stint': 'coach_stint'}, inplace=True)

teams_df.rename(columns={'GP': 'team_GP', 'rank': 'current_year_rank'}, inplace=True)

players_teams_df.rename(columns={'GP': 'player_team_GP'}, inplace=True)

teams_post_df.rename(columns={'W': 'team_post_wins', 'L': 'team_post_losses'}, inplace=True)

series_post_df.rename(columns={'W': 'series_post_wins', 'L': 'series_post_lost'}, inplace=True)

### Add new columns

In [9]:
for year, lg_data in players_teams_df.groupby('year'):
    lg_ast = lg_data['assists'].sum() + lg_data['PostAssists'].sum()
    lg_fg = lg_data['fgMade'].sum() + lg_data['PostfgMade'].sum()
    lg_ft = lg_data['ftMade'].sum() + lg_data['PostftMade'].sum()
    lg_to = lg_data['turnovers'].sum() + lg_data['PostTurnovers'].sum()
    lg_fga = lg_data['fgAttempted'].sum() + lg_data['PostfgAttempted'].sum()
    lg_fta = lg_data['ftAttempted'].sum() + lg_data['PostftAttempted'].sum()
    lg_trb = lg_data['rebounds'].sum() + lg_data['PostRebounds'].sum()
    lg_orb = lg_data['oRebounds'].sum() + lg_data['PostoRebounds'].sum()
    lg_pf = lg_data['PF'].sum() + lg_data['PostPF'].sum()
    lg_pts = lg_data['points'].sum() + lg_data['PostPoints'].sum()

    factor = 2 / 3 - ((0.5 * lg_ast / lg_fg) / (2 * lg_fg / lg_ft))
    vop = lg_pts / (lg_fga - lg_orb + lg_to + 0.44 * lg_fta)
    drbp = (lg_trb - lg_orb) / lg_trb

    for tmID, tm_data in lg_data.groupby('tmID'):
        tm_ast = tm_data['assists'].sum() + tm_data['PostAssists'].sum()
        tm_fg = tm_data['fgMade'].sum() + tm_data['PostfgMade'].sum()

        for player, player_data in tm_data.groupby('playerID'):
            min = player_data['minutes'].sum() + player_data['PostMinutes'].sum()
            _3p = player_data['threeMade'].sum() + player_data['PostthreeMade'].sum()
            ast = player_data['assists'].sum() + player_data['PostAssists'].sum()
            fg = player_data['fgMade'].sum() + player_data['PostfgMade'].sum()
            ft = player_data['ftMade'].sum() + player_data['PostftMade'].sum()
            to = player_data['turnovers'].sum() + player_data['PostTurnovers'].sum()
            fga = player_data['fgAttempted'].sum() + player_data['PostfgAttempted'].sum()
            fta = player_data['ftAttempted'].sum() + player_data['PostftAttempted'].sum()
            trb = player_data['rebounds'].sum() + player_data['PostRebounds'].sum()
            orb = player_data['oRebounds'].sum() + player_data['PostoRebounds'].sum()
            stl = player_data['steals'].sum() + player_data['PostSteals'].sum()
            blk = player_data['blocks'].sum() + player_data['PostBlocks'].sum()
            pf = player_data['PF'].sum() + player_data['PostPF'].sum()

            uper = 1 / min * (_3p + (1.5 * ast) +
                              ((2 - factor * tm_ast / tm_fg) * fg) +
                              (0.5 * ft * (2 - 1 / 3 * tm_ast / tm_fg)) -
                              (vop * to) - (vop * drbp * (fga - fg)) -
                              (vop * 0.44 * (0.44 + (0.56 * drbp)) *
                               (fta - ft)) + (vop * (1 - drbp) * (trb - orb)) +
                              (vop * drbp * orb) + (vop * stl) +
                              (vop * drbp * blk) -
                              (pf *
                               (lg_ft / lg_pf - 0.44 * lg_fta / lg_pf * vop)))

            players_teams_df.loc[(players_teams_df['year'] == year) &
                                 (players_teams_df['playerID'] == player) &
                                 (players_teams_df['tmID'] == tmID),
                                 'uper'] = uper

    lg_uper = players_teams_df[players_teams_df['year'] == year]['uper'].mean()
    players_teams_df.loc[
        players_teams_df['year'] == year,
        'per'] = players_teams_df[players_teams_df['year'] == year]['uper'] * (
            15 / lg_uper)

players_teams_df['per'].describe()


  uper = 1 / min * (_3p + (1.5 * ast) +
  uper = 1 / min * (_3p + (1.5 * ast) +
  uper = 1 / min * (_3p + (1.5 * ast) +
  uper = 1 / min * (_3p + (1.5 * ast) +


count    1874.000000
mean       15.000000
std         8.880203
min      -123.636226
25%        11.204041
50%        15.593892
75%        19.789928
max        73.331826
Name: per, dtype: float64

In [10]:
awards_players_df['award'] = True

main_df = pd.concat([season11_teams_df, teams_df])
main_df.dropna(inplace=True, axis=1)

# show the first 5 rows of the new teams dataframe
display(main_df.head())

all_players_teams_df = pd.concat([season11_players_teams_df, players_teams_df])
all_players_teams_df.dropna(inplace=True, axis=1)

all_coaches_df = pd.concat([season11_coaches_df, coaches_df])
all_coaches_df.dropna(inplace=True, axis=1)

for tmID, team_data in main_df.groupby('tmID'):
    for year in team_data['year'].values:
        current_year_data = team_data[team_data['year'] == year]

        # get players that played in the previous year for the team
        player_ids = all_players_teams_df[
            (all_players_teams_df['tmID'] == tmID)
            & (all_players_teams_df['year'] == year)]['playerID'].values
        players = players_df[players_df['playerID'].isin(player_ids)]
        players_teams = players_teams_df[players_teams_df['playerID'].isin(
            player_ids)]

        main_df.loc[((main_df['tmID'] == tmID) & (main_df['year'] == year)),
                    'avg_per'] = players_teams[players_teams['year'] <
                                               year]['per'].mean()
        main_df.loc[((main_df['tmID'] == tmID) & (main_df['year'] == year)),
                    'avg_height'] = players['height'].mean()
        # main_df.loc[((main_df['tmID'] == tmID) & (main_df['year'] == year)),
        #             'avg_weight'] = players['weight'].mean()

        # register average height and weight for each position for each team
        # for position, players_per_position in players.groupby('pos'):
        #     main_df.loc[((main_df['tmID'] == tmID) &
        #                  (main_df['year'] == year)),
        #                 f'avg_height_pos_{position}'] = players_per_position[
        #                     'height'].mean()

        #     players_teams = players_teams_df[
        #         (players_teams_df['playerID'].isin(
        #             players_per_position['playerID'].values))
        #         & (players_teams_df['year'] == year)]
        #     main_df.loc[((main_df['tmID'] == tmID) & (main_df['year'] == year)), f'avg_weight_pos_{position}'] = players_per_position['weight'].mean()
        #     main_df.loc[
        #         (main_df['tmID'] == tmID) & (main_df['year'] == year),
        #         f'avg_per_pos_{position}'] = players_teams['per'].mean()

        # count number of players awarded in the previous year
        # this gave the same accuracies but with a leakage somewhere
        players_awards = awards_players_df[
            (awards_players_df['playerID'].isin(player_ids))
            & (awards_players_df['year'] < year)]
        main_df.loc[((main_df['tmID'] == tmID) & (main_df['year'] == year)),
                    'players_awarded'] = players_awards['award'].count()

        # count number of coaches awarded in the previous year
        # coaches = all_coaches_df[(all_coaches_df['tmID'] == tmID)
        #                          & (all_coaches_df['year'] == year)]
        # coaches_ids = coaches['coachID'].values
        # coaches_awards = awards_players_df[
        #     (awards_players_df['playerID'].isin(coaches_ids))
        #     & (awards_players_df['year'] < year)]
        # main_df.loc[((main_df['tmID'] == tmID) & (main_df['year'] == year)),
        #             'coaches_awarded'] = coaches_awards['award'].count()


for tmID, team_data in teams_df.groupby('tmID'):
    for year in team_data['year'].values:
        next_year = year + 1

        current_year_data = team_data[team_data['year'] == year]

        main_df.loc[((main_df['tmID'] == tmID) & (main_df['year'] == year)), 'playoff'] = current_year_data['playoff'].values

        # register previous year rank for each team
        # current_year_rank_series_data = current_year_data['current_year_rank'].values
        # current_year_rank = current_year_rank_series_data[0] if len(current_year_rank_series_data) > 0 else -1
        # main_df.loc[((main_df['tmID'] == tmID) & (main_df['year'] == next_year)), 'previous_year_rank'] = current_year_rank

        # shift numerical stats one year forward
        # for column in teams_df.loc[:,'o_fgm':'d_pts']:
        #     main_df.loc[((main_df['tmID'] == tmID) & (main_df['year'] == next_year)), f'previous_year_{column}'] = current_year_data[column].values[0] if len(current_year_data[column].values) > 0 else -1

        # add previous year win percentage
        # previous_year_win_percentage = current_year_data['won'].sum() / current_year_data['team_GP'].sum()
        # main_df.loc[((main_df['tmID'] == tmID) & (main_df['year'] == next_year)), 'previous_year_win_percentage'] = previous_year_win_percentage

        # previous_year_post_win_percentage = teams_post_df.loc[((teams_post_df['tmID'] == tmID) & (teams_post_df['year'] == year)), 'team_post_wins'].sum() / (teams_post_df.loc[((teams_post_df['tmID'] == tmID) & (teams_post_df['year'] == year)), 'team_post_wins'].sum() + teams_post_df.loc[((teams_post_df['tmID'] == tmID) & (teams_post_df['year'] == year)), 'team_post_losses'].sum())
        # main_df.loc[((main_df['tmID'] == tmID) & (main_df['year'] == next_year)), 'previous_year_post_win_percentage'] = previous_year_post_win_percentage

        # calculate AVG PPG for previous year
        # previous_year_avg_ppg = current_year_data['o_pts'].sum() / current_year_data['team_GP'].sum()
        # main_df.loc[((main_df['tmID'] == tmID) & (main_df['year'] == next_year)), 'previous_year_avg_ppg'] = previous_year_avg_ppg

        # register previous year coach stats
        # current_year_coach = coaches_df[(coaches_df['tmID'] == tmID) & (coaches_df['year'] == year)]

main_df['avg_per'].fillna(15, inplace=True)

main_df.sort_values(by=['tmID', 'year'], inplace=True)


Unnamed: 0,year,tmID,confID
0,11,ATL,EA
1,11,CHI,EA
2,11,CON,EA
3,11,IND,EA
4,11,LAS,WE


# Drop irrelevant columns
### After merging

In [11]:
# main_df = main_df.drop(["rank", "firstRound", "semis", "finals", "attend", "stint_x", "stint_y", "birthDate", "deathDate", "height", "weight", "award"], axis=1)
# main_df['award'].fillna(False, inplace=True) # mark non-award winners as False
# main_df['team_post_wins'].fillna(0, inplace=True)
# main_df['team_post_losses'].fillna(0, inplace=True)teams_df

# Drop the first year of each team
# main_df.dropna(inplace=True, subset=['previous_year_rank'])

# main_df.dropna(inplace=True)

main_df

Unnamed: 0,year,tmID,confID,avg_per,avg_height,players_awarded,playoff
0,9,ATL,EA,15.180034,73.214286,2.0,N
1,10,ATL,EA,19.189636,72.538462,5.0,Y
0,11,ATL,EA,16.671915,72.916667,3.0,
2,1,CHA,EA,15.000000,71.692308,0.0,N
3,2,CHA,EA,17.270528,71.500000,0.0,Y
...,...,...,...,...,...,...,...
138,7,WAS,EA,18.873522,71.833333,2.0,Y
139,8,WAS,EA,18.314123,72.600000,2.0,N
140,9,WAS,EA,18.359449,71.625000,2.0,N
141,10,WAS,EA,17.856622,72.166667,0.0,Y


# Export clean data to a .CSV file

In [12]:
main_df.to_csv("../data/clean/main_df.csv", index=False)
main_df.head(20)

Unnamed: 0,year,tmID,confID,avg_per,avg_height,players_awarded,playoff
0,9,ATL,EA,15.180034,73.214286,2.0,N
1,10,ATL,EA,19.189636,72.538462,5.0,Y
0,11,ATL,EA,16.671915,72.916667,3.0,
2,1,CHA,EA,15.0,71.692308,0.0,N
3,2,CHA,EA,17.270528,71.5,0.0,Y
4,3,CHA,EA,15.845493,71.307692,0.0,Y
5,4,CHA,EA,17.544791,71.181818,0.0,Y
6,5,CHA,EA,16.630257,72.0,0.0,N
7,6,CHA,EA,17.237768,72.0625,0.0,N
8,7,CHA,EA,16.498319,72.916667,0.0,N


### Information about the resulting dataset

In [13]:
main_df.info()
main_df.describe()

<class 'pandas.core.frame.DataFrame'>
Index: 154 entries, 0 to 11
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   year             154 non-null    int64  
 1   tmID             154 non-null    object 
 2   confID           154 non-null    object 
 3   avg_per          154 non-null    float64
 4   avg_height       154 non-null    float64
 5   players_awarded  154 non-null    float64
 6   playoff          142 non-null    object 
dtypes: float64(3), int64(1), object(3)
memory usage: 9.6+ KB


Unnamed: 0,year,avg_per,avg_height,players_awarded
count,154.0,154.0,154.0,154.0
mean,5.746753,17.191409,71.782845,2.032468
std,3.192233,1.929389,1.33603,3.043087
min,1.0,9.973721,66.0,0.0
25%,3.0,15.747074,71.551948,0.0
50%,6.0,17.243668,72.0,1.0
75%,8.75,18.474363,72.491667,2.75
max,11.0,23.207324,73.5,16.0
