In [1]:
from pathlib import Path

import pandas as pd

# Load data

In [2]:
def _read_named_csv(path, name):
    """Read the CSV at ``path`` and tag the resulting dataframe with ``name``.

    The label is stored on the ``Name`` attribute of the dataframe object; the
    later clean-up cells read it back to label their progress messages.
    """
    frame = pd.read_csv(path)
    frame.Name = name
    return frame


# Historical tables
awards_players_df = _read_named_csv('../data/awards_players.csv', "awards_players")
coaches_df = _read_named_csv('../data/coaches.csv', "coaches")
players_teams_df = _read_named_csv('../data/players_teams.csv', "players_teams")
players_df = _read_named_csv('../data/players.csv', "players")
series_post_df = _read_named_csv('../data/series_post.csv', "series_post")
teams_post_df = _read_named_csv('../data/teams_post.csv', "teams_post")
teams_df = _read_named_csv('../data/teams.csv', "teams")

# Season 11 tables (kept in their own sub-folder)
season11_coaches_df = _read_named_csv('../data/season11/coaches.csv', "season11_coaches")
season11_players_teams_df = _read_named_csv('../data/season11/players_teams.csv', "season11_players_teams")
season11_teams_df = _read_named_csv('../data/season11/teams.csv', "season11_teams")

# Every loaded frame, in load order, for the clean-up steps that apply to all of them
dfs = [
    awards_players_df,
    coaches_df,
    players_teams_df,
    players_df,
    series_post_df,
    teams_post_df,
    teams_df,
    season11_coaches_df,
    season11_players_teams_df,
    season11_teams_df,
]

# Clean data

### Drop noisy data

In [3]:
# Player rows whose recorded height is zero or negative are removed in place.
nonpositive_height = players_df['height'] <= 0
players_df.drop(index=players_df[nonpositive_height].index, inplace=True)

### Remove duplicates


In [4]:
# Remove exact duplicate rows from every loaded dataframe, in place, and report
# how many rows each one lost so the effect of this step is visible in the output.
for df in dfs:
    rows_before = len(df)
    df.drop_duplicates(inplace=True)
    display(f"Dropped {rows_before - len(df)} duplicate row(s) from dataframe {df.Name}")

'Dropping dupplicates from dataframe awards_players...'

'Dropping dupplicates from dataframe coaches...'

'Dropping dupplicates from dataframe players_teams...'

'Dropping dupplicates from dataframe players...'

'Dropping dupplicates from dataframe series_post...'

'Dropping dupplicates from dataframe teams_post...'

'Dropping dupplicates from dataframe teams...'

'Dropping dupplicates from dataframe season11_coaches...'

'Dropping dupplicates from dataframe season11_players_teams...'

'Dropping dupplicates from dataframe season11_teams...'

### Drop unnecessary columns

In [5]:
# Drop the columns considered unnecessary for the rest of the notebook.
# Each call raises KeyError if a listed column is absent (e.g. when this cell
# is run a second time on the already-trimmed frames).
players_df.drop(columns=['birthDate', 'deathDate', 'college', 'collegeOther'], inplace=True)
teams_df.drop(columns=["franchID", "firstRound", "semis", "finals", "attend", "name", "arena"], inplace=True)
season11_teams_df.drop(columns=["lgID", "franchID", "name", "arena"], inplace=True)
awards_players_df.drop(columns=['award'], inplace=True)

# teams_post_df, series_post_df, players_teams_df and coaches_df keep all of
# their columns at this stage; nothing is dropped from them here.

### Drop single-value columns

In [6]:
# Columns that must survive even when they hold a single value in a dataframe.
needed_features = ['year']

def drop_single_value_columns(df, keep=None):
    """Drop, in place, every column of ``df`` that holds exactly one distinct value.

    Missing values count as a value, so a column that is entirely NaN is
    treated as constant and dropped. One message per dropped column is shown
    with ``display``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to prune; it is modified in place. Its ``Name`` attribute, when
        present, is used to label the messages.
    keep : collection of column labels, optional
        Columns that are never dropped. Defaults to the module-level
        ``needed_features`` list.

    Returns
    -------
    None
    """
    protected = needed_features if keep is None else keep
    frame_name = getattr(df, 'Name', '<unnamed>')

    # Decide first, drop once: the frame is not modified while it is inspected.
    constant_columns = [
        column
        for column in df.columns
        if column not in protected and df[column].nunique(dropna=False) == 1
    ]

    for column in constant_columns:
        display(f"Dropping column {column} from dataframe {frame_name}")

    if constant_columns:
        df.drop(columns=constant_columns, inplace=True)

# Prune constant columns from each loaded dataframe in turn.
for frame in dfs:
    drop_single_value_columns(frame)

'Dropping column lgID from dataframe awards_players'

'Dropping column lgID from dataframe coaches'

'Dropping column lgID from dataframe players_teams'

'Dropping column firstseason from dataframe players'

'Dropping column lastseason from dataframe players'

'Dropping column lgIDWinner from dataframe series_post'

'Dropping column lgIDLoser from dataframe series_post'

'Dropping column lgID from dataframe teams_post'

'Dropping column lgID from dataframe teams'

'Dropping column divID from dataframe teams'

'Dropping column seeded from dataframe teams'

'Dropping column tmORB from dataframe teams'

'Dropping column tmDRB from dataframe teams'

'Dropping column tmTRB from dataframe teams'

'Dropping column opptmORB from dataframe teams'

'Dropping column opptmDRB from dataframe teams'

'Dropping column opptmTRB from dataframe teams'

'Dropping column lgID from dataframe season11_coaches'

'Dropping column stint from dataframe season11_coaches'

'Dropping column stint from dataframe season11_players_teams'

'Dropping column lgID from dataframe season11_players_teams'

### Drop rows with missing values (height or weight recorded as 0)

In [7]:
# A height or weight of exactly 0 is treated here as a missing measurement;
# rows with either one are removed in place.
zero_height_or_weight = (players_df['height'] == 0) | (players_df['weight'] == 0)
players_df.drop(index=players_df[zero_height_or_weight].index, inplace=True)

### Rename columns and replace values

In [8]:
# Players: align the key column name with the other tables.
players_df.rename(
    columns={
        'bioID': 'playerID',
        'stint': 'player_stint',
    },
    inplace=True,
)

# Players: collapse the two spellings of each hybrid position into one.
for old_pos, new_pos in (('F-C', 'C-F'), ('F-G', 'G-F')):
    players_df.replace(old_pos, new_pos, inplace=True)

# Coaches: prefix the win/loss and stint columns so they stay distinguishable.
coaches_df.rename(
    columns={
        'won': 'coach_wins',
        'lost': 'coach_losses',
        'post_wins': 'coach_post_wins',
        'post_losses': 'coach_post_losses',
        'stint': 'coach_stint',
    },
    inplace=True,
)

# Teams: games played and rank get explicit names.
teams_df.rename(
    columns={
        'GP': 'team_GP',
        'rank': 'current_year_rank',
    },
    inplace=True,
)

# Player-team stints: games played gets its own name, distinct from the team one.
players_teams_df.rename(
    columns={'GP': 'player_team_GP'},
    inplace=True,
)

# Post-season team totals.
teams_post_df.rename(
    columns={
        'W': 'team_post_wins',
        'L': 'team_post_losses',
    },
    inplace=True,
)

# Post-season series results.
series_post_df.rename(
    columns={
        'W': 'series_post_wins',
        'L': 'series_post_lost',
    },
    inplace=True,
)

### Add new columns

In [9]:
# Flag every row of the awards table; counting this column later counts award rows.
awards_players_df['award'] = True

# Stack the season-11 teams on top of the historical teams. Any column that
# contains a missing value after stacking is dropped, which removes the columns
# that are not fully populated in both sources.
main_df = pd.concat([season11_teams_df, teams_df])
main_df.dropna(inplace=True, axis=1)

# show the first 5 rows of the new teams dataframe
display(main_df.head())

all_players_teams_df = pd.concat([season11_players_teams_df, players_teams_df])
all_players_teams_df.dropna(inplace=True, axis=1)

all_coaches_df = pd.concat([season11_coaches_df, coaches_df])
all_coaches_df.dropna(inplace=True, axis=1)

# --- Roster-based features, computed per team and season -------------------
for tmID, team_data in main_df.groupby('tmID'):
    for year in team_data['year'].values:
        # Rows of main_df for this team in this season; built once and reused
        # for every assignment below (tmID/year are never modified here).
        team_year_rows = (main_df['tmID'] == tmID) & (main_df['year'] == year)

        # players on this team's roster in this same season
        roster_rows = (all_players_teams_df['tmID'] == tmID) & (all_players_teams_df['year'] == year)
        player_ids = all_players_teams_df[roster_rows]['playerID'].values
        players = players_df[players_df['playerID'].isin(player_ids)]

        # register average height and weight for each position for each team
        for position, players_per_position in players.groupby('pos'):
            main_df.loc[team_year_rows, f'avg_height_pos_{position}'] = players_per_position['height'].mean()
            main_df.loc[team_year_rows, f'avg_weight_pos_{position}'] = players_per_position['weight'].mean()

        # count award rows earned in this same season by the rostered players
        # NOTE(review): the original author noted "this gave the same accuracies
        # but with a leakage somewhere"; same-season awards are a candidate for
        # that leak — confirm whether the previous season (year - 1) was intended.
        players_awards = awards_players_df[(awards_players_df['playerID'].isin(player_ids)) & (awards_players_df['year'] == year)]
        main_df.loc[team_year_rows, 'players_awarded'] = players_awards['award'].count()


# --- Target and lagged team statistics -------------------------------------
# Statistic columns carried over to the following season (loop-invariant).
lagged_stat_columns = list(teams_df.loc[:, 'o_fgm':'min'].columns)

for tmID, team_data in teams_df.groupby('tmID'):
    for year in team_data['year'].values:
        next_year = year + 1

        # Never empty: `year` is taken from this very group.
        current_year_data = team_data[team_data['year'] == year]

        current_rows = (main_df['tmID'] == tmID) & (main_df['year'] == year)
        next_rows = (main_df['tmID'] == tmID) & (main_df['year'] == next_year)

        # the playoff outcome belongs to the season it was played in
        main_df.loc[current_rows, 'playoff'] = current_year_data['playoff'].values

        # this season's rank becomes next season's "previous year" rank
        main_df.loc[next_rows, 'previous_year_rank'] = current_year_data['current_year_rank'].values[0]

        # shift numerical stats one year forward
        for column in lagged_stat_columns:
            main_df.loc[next_rows, f'previous_year_{column}'] = current_year_data[column].values[0]

        # average points per game this season, stored on next season's row
        previous_year_avg_ppg = current_year_data['o_pts'].sum() / current_year_data['team_GP'].sum()
        main_df.loc[next_rows, 'previous_year_avg_ppg'] = previous_year_avg_ppg

        # TODO: register previous year coach stats (not implemented)
        # current_year_coach = coaches_df[(coaches_df['tmID'] == tmID) & (coaches_df['year'] == year)]

main_df.sort_values(by=['tmID', 'year'], inplace=True)


Unnamed: 0,year,tmID,confID
0,11,ATL,EA
1,11,CHI,EA
2,11,CON,EA
3,11,IND,EA
4,11,LAS,WE


# Drop irrelevant columns
### After merging

In [10]:
# NOTE(review): every clean-up step below is disabled. Some of them refer to
# columns such as "stint_x"/"stint_y" that look like leftovers from an earlier
# merge-based approach — confirm they still apply before re-enabling any of it.
# main_df = main_df.drop(["rank", "firstRound", "semis", "finals", "attend", "stint_x", "stint_y", "birthDate", "deathDate", "height", "weight", "award"], axis=1)
# main_df['award'].fillna(False, inplace=True) # mark non-award winners as False
# main_df['team_post_wins'].fillna(0, inplace=True)
# main_df['team_post_losses'].fillna(0, inplace=True)

# Drop the first year of each team
# main_df.dropna(inplace=True, subset=['previous_year_rank'])

# main_df.dropna(inplace=True)

# Show the assembled dataframe as the cell output.
main_df

Unnamed: 0,year,tmID,confID,avg_height_pos_C,avg_weight_pos_C,avg_height_pos_C-F,avg_weight_pos_C-F,avg_height_pos_F,avg_weight_pos_F,avg_height_pos_G,...,previous_year_lost,previous_year_team_GP,previous_year_homeW,previous_year_homeL,previous_year_awayW,previous_year_awayL,previous_year_confW,previous_year_confL,previous_year_min,previous_year_avg_ppg
0,9,ATL,EA,79.0,218.0,76.00,186.500000,74.400000,175.200000,68.750000,...,,,,,,,,,,
1,10,ATL,EA,77.0,158.0,77.00,190.000000,74.400000,169.600000,69.000000,...,30.0,34.0,1.0,16.0,3.0,14.0,2.0,18.0,6825.0,74.529412
0,11,ATL,EA,77.5,234.0,77.00,190.000000,74.666667,173.333333,69.500000,...,16.0,34.0,12.0,5.0,6.0,11.0,10.0,12.0,6950.0,84.147059
2,1,CHA,EA,76.0,215.0,74.50,182.500000,71.666667,154.666667,69.000000,...,,,,,,,,,,
3,2,CHA,EA,77.0,219.5,75.00,182.500000,70.500000,156.500000,68.833333,...,24.0,32.0,5.0,11.0,3.0,13.0,5.0,16.0,6475.0,68.312500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138,7,WAS,EA,79.0,170.0,75.50,192.500000,71.000000,171.500000,69.250000,...,18.0,34.0,10.0,7.0,6.0,11.0,9.0,11.0,6900.0,66.558824
139,8,WAS,EA,78.0,210.0,75.50,192.500000,72.600000,172.400000,69.250000,...,16.0,34.0,13.0,4.0,5.0,12.0,12.0,8.0,6850.0,80.794118
140,9,WAS,EA,75.0,190.0,74.75,190.500000,72.750000,170.250000,68.333333,...,18.0,34.0,8.0,9.0,8.0,9.0,8.0,12.0,6900.0,76.029412
141,10,WAS,EA,75.0,190.0,75.00,189.666667,72.666667,178.000000,68.000000,...,24.0,34.0,6.0,11.0,4.0,13.0,6.0,14.0,6825.0,69.647059


# Export clean data to a .CSV file

In [11]:
# Write the cleaned table to disk; index=False keeps the row labels out of the file.
clean_csv_path = Path("../data/clean/main_df.csv")
# to_csv does not create missing folders, so make sure the target directory
# exists (e.g. on a fresh checkout) before writing.
clean_csv_path.parent.mkdir(parents=True, exist_ok=True)
main_df.to_csv(clean_csv_path, index=False)

# Preview the first rows of what was exported.
main_df.head(20)

Unnamed: 0,year,tmID,confID,avg_height_pos_C,avg_weight_pos_C,avg_height_pos_C-F,avg_weight_pos_C-F,avg_height_pos_F,avg_weight_pos_F,avg_height_pos_G,...,previous_year_lost,previous_year_team_GP,previous_year_homeW,previous_year_homeL,previous_year_awayW,previous_year_awayL,previous_year_confW,previous_year_confL,previous_year_min,previous_year_avg_ppg
0,9,ATL,EA,79.0,218.0,76.0,186.5,74.4,175.2,68.75,...,,,,,,,,,,
1,10,ATL,EA,77.0,158.0,77.0,190.0,74.4,169.6,69.0,...,30.0,34.0,1.0,16.0,3.0,14.0,2.0,18.0,6825.0,74.529412
0,11,ATL,EA,77.5,234.0,77.0,190.0,74.666667,173.333333,69.5,...,16.0,34.0,12.0,5.0,6.0,11.0,10.0,12.0,6950.0,84.147059
2,1,CHA,EA,76.0,215.0,74.5,182.5,71.666667,154.666667,69.0,...,,,,,,,,,,
3,2,CHA,EA,77.0,219.5,75.0,182.5,70.5,156.5,68.833333,...,24.0,32.0,5.0,11.0,3.0,13.0,5.0,16.0,6475.0,68.3125
4,3,CHA,EA,76.0,204.666667,74.0,187.5,70.5,156.5,68.333333,...,14.0,32.0,11.0,5.0,7.0,9.0,15.0,6.0,6500.0,64.21875
5,4,CHA,EA,76.0,199.0,74.0,202.5,71.666667,162.666667,68.8,...,14.0,32.0,11.0,5.0,7.0,9.0,12.0,9.0,6450.0,70.03125
6,5,CHA,EA,75.333333,183.0,75.0,220.0,72.2,164.6,69.2,...,16.0,34.0,13.0,4.0,5.0,12.0,12.0,12.0,6850.0,65.205882
7,6,CHA,EA,75.0,202.0,75.333333,185.0,73.5,162.25,68.5,...,18.0,34.0,10.0,7.0,6.0,11.0,8.0,12.0,6900.0,61.529412
8,7,CHA,EA,76.0,202.0,76.0,160.0,72.666667,162.333333,68.333333,...,28.0,34.0,5.0,12.0,1.0,16.0,4.0,16.0,6945.0,61.617647


### Information about the resulting dataset

In [12]:
# Print the schema overview: column names, non-null counts and dtypes.
main_df.info()

# Leave the summary statistics as the last expression so they render as a table.
summary_statistics = main_df.describe()
summary_statistics

<class 'pandas.core.frame.DataFrame'>
Int64Index: 154 entries, 0 to 11
Data columns (total 57 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   year                   154 non-null    int64  
 1   tmID                   154 non-null    object 
 2   confID                 154 non-null    object 
 3   avg_height_pos_C       141 non-null    float64
 4   avg_weight_pos_C       141 non-null    float64
 5   avg_height_pos_C-F     132 non-null    float64
 6   avg_weight_pos_C-F     132 non-null    float64
 7   avg_height_pos_F       153 non-null    float64
 8   avg_weight_pos_F       153 non-null    float64
 9   avg_height_pos_G       154 non-null    float64
 10  avg_weight_pos_G       154 non-null    float64
 11  avg_height_pos_G-F     122 non-null    float64
 12  avg_weight_pos_G-F     122 non-null    float64
 13  players_awarded        154 non-null    float64
 14  playoff                142 non-null    object 
 15  previou

Unnamed: 0,year,avg_height_pos_C,avg_weight_pos_C,avg_height_pos_C-F,avg_weight_pos_C-F,avg_height_pos_F,avg_weight_pos_F,avg_height_pos_G,avg_weight_pos_G,avg_height_pos_G-F,...,previous_year_lost,previous_year_team_GP,previous_year_homeW,previous_year_homeL,previous_year_awayW,previous_year_awayL,previous_year_confW,previous_year_confL,previous_year_min,previous_year_avg_ppg
count,154.0,141.0,141.0,132.0,132.0,153.0,153.0,154.0,154.0,122.0,...,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0
mean,5.746753,73.948582,195.804019,75.357955,186.931818,73.498584,173.412418,68.473405,148.355398,71.247678,...,16.616541,33.338346,10.18797,6.481203,6.533835,10.135338,10.601504,10.526316,6737.909774,70.87764
std,3.192233,10.358431,17.543369,0.997226,10.694852,0.868665,8.059695,0.985062,5.484231,1.25409,...,5.098606,0.94458,3.05046,3.019135,2.76764,2.787369,3.535405,3.560258,197.229733,6.334401
min,1.0,9.0,158.0,72.0,160.0,70.5,154.666667,65.5,131.0,69.0,...,4.0,32.0,1.0,0.0,1.0,3.0,2.0,2.0,6400.0,56.9375
25%,3.0,75.5,185.0,75.0,180.75,73.0,168.666667,68.0,145.05,70.333333,...,13.0,32.0,8.0,4.0,5.0,8.0,8.0,8.0,6500.0,66.088235
50%,6.0,76.333333,193.0,75.25,187.0,73.5,173.25,68.5,148.55,71.5,...,16.0,34.0,11.0,6.0,6.0,10.0,11.0,10.0,6825.0,69.764706
75%,8.75,77.0,204.5,76.0,192.3125,74.0,177.666667,69.25,152.3,72.0,...,20.0,34.0,12.0,8.0,8.0,12.0,13.0,13.0,6875.0,74.823529
max,11.0,79.0,250.0,78.0,239.0,75.5,207.5,70.666667,159.5,75.0,...,30.0,34.0,16.0,16.0,13.0,16.0,19.0,19.0,7025.0,92.823529
