In [153]:
from pathlib import Path

import pandas as pd

# Load data

In [154]:
def _load_table(table_name):
    """Read ``../data/<table_name>.csv`` and tag the frame with its table name.

    The name is stored as a plain ``Name`` attribute on the DataFrame object;
    later cells read it back to build their log messages.
    """
    frame = pd.read_csv(f'../data/{table_name}.csv')
    frame.Name = table_name
    return frame


awards_players_df = _load_table("awards_players")
coaches_df = _load_table("coaches")
players_teams_df = _load_table("players_teams")
players_df = _load_table("players")
series_post_df = _load_table("series_post")
teams_post_df = _load_table("teams_post")
teams_df = _load_table("teams")

# Every loaded table, in load order. The cleaning cells iterate over this list
# and mutate the frames in place, so it must keep referencing these objects.
dfs = [
    awards_players_df,
    coaches_df,
    players_teams_df,
    players_df,
    series_post_df,
    teams_post_df,
    teams_df,
]

# Clean data

### Drop noisy data

In [155]:
# Rows whose recorded height is zero or negative are treated as noise.
# The drop is done in place so that the `dfs` list keeps pointing at the
# cleaned frame (rebinding `players_df` would leave `dfs` holding the old one).
invalid_height_rows = players_df.loc[players_df['height'].le(0)].index
players_df.drop(index=invalid_height_rows, inplace=True)

### Remove duplicates


In [156]:
# Remove fully duplicated rows from every loaded table, keeping the first
# occurrence. Each frame is modified in place; the message is shown after
# the drop has already happened.
for df in dfs:
    df.drop_duplicates(keep='first', inplace=True)
    display("Dropping dupplicates from dataframe {}...".format(df.Name))

'Dropping dupplicates from dataframe awards_players...'

'Dropping dupplicates from dataframe coaches...'

'Dropping dupplicates from dataframe players_teams...'

'Dropping dupplicates from dataframe players...'

'Dropping dupplicates from dataframe series_post...'

'Dropping dupplicates from dataframe teams_post...'

'Dropping dupplicates from dataframe teams...'

### Drop unneeded columns

In [157]:
# Columns dropped up front, one statement per table. All drops are in place
# so the frames referenced from `dfs` are the ones being changed.
players_df.drop(
    columns=['birthDate', 'deathDate', 'college', 'collegeOther'],
    inplace=True,
)
teams_df.drop(
    columns=[
        "confID", "franchID", "rank", "firstRound", "semis",
        "finals", "attend", "name", "arena",
    ],
    inplace=True,
)

# Nothing is dropped from these tables yet; the empty lists are placeholders
# that keep one entry per table.
teams_post_df.drop(columns=[], inplace=True)
series_post_df.drop(columns=[], inplace=True)
players_teams_df.drop(columns=[], inplace=True)

awards_players_df.drop(columns=['award'], inplace=True)

coaches_df.drop(columns=[], inplace=True)

### Drop single value columns

In [158]:
def drop_single_value_columns(df):
    """Drop, in place, every column of ``df`` that holds exactly one distinct value.

    Distinct values are counted with NaN included, so a column that is
    entirely NaN is dropped, while a column with one real value plus NaN
    (two distinct values) is kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to prune. It is modified in place. If the frame carries a
        ``Name`` attribute it is used in the log messages; otherwise the
        placeholder ``"<unnamed>"`` is used instead of raising.

    Returns
    -------
    None
    """
    # `Name` is an ad-hoc attribute set by the loading code; it is not part of
    # the DataFrame API, so do not assume every frame has it.
    frame_name = getattr(df, "Name", "<unnamed>")

    # Decide what to drop before touching the frame, then drop once.
    constant_columns = [
        column
        for column in df.columns
        if df[column].nunique(dropna=False) == 1
    ]

    for column in constant_columns:
        display(f"Dropping column {column} from datarame {frame_name}")

    df.drop(columns=constant_columns, inplace=True)

# Prune single-value columns from every loaded table. The helper drops
# columns in place, so the frames referenced by `dfs` are changed directly.
for df in dfs:
    drop_single_value_columns(df)

'Dropping column lgID from datarame awards_players'

'Dropping column lgID from datarame coaches'

'Dropping column lgID from datarame players_teams'

'Dropping column firstseason from datarame players'

'Dropping column lastseason from datarame players'

'Dropping column lgIDWinner from datarame series_post'

'Dropping column lgIDLoser from datarame series_post'

'Dropping column lgID from datarame teams_post'

'Dropping column lgID from datarame teams'

'Dropping column divID from datarame teams'

'Dropping column seeded from datarame teams'

'Dropping column tmORB from datarame teams'

'Dropping column tmDRB from datarame teams'

'Dropping column tmTRB from datarame teams'

'Dropping column opptmORB from datarame teams'

'Dropping column opptmDRB from datarame teams'

'Dropping column opptmTRB from datarame teams'

### Rename columns

In [159]:
# (frame, {old column name: new column name}) pairs, applied in this order.
# The renames give columns table-specific names before the tables are merged.
# NOTE(review): rename() skips keys that are not existing columns, so the
# 'stint' entry for players_df only has an effect if that table actually has
# a 'stint' column - TODO confirm; players_teams_df may have been intended.
_column_renames = [
    (players_df, {'bioID': 'playerID', 'stint': 'player_stint'}),
    (coaches_df, {
        'won': 'coach_wins',
        'lost': 'coach_losses',
        'post_wins': 'coach_post_wins',
        'post_losses': 'coach_post_losses',
        'stint': 'coach_stint',
    }),
    (teams_df, {'GP': 'team_GP'}),
    (players_teams_df, {'GP': 'player_team_GP'}),
    (teams_post_df, {'W': 'team_post_wins', 'L': 'team_post_losses'}),
    (series_post_df, {'W': 'series_post_wins', 'L': 'series_post_lost'}),
]

for _frame, _mapping in _column_renames:
    # In place, so the objects held in `dfs` see the new column names too.
    _frame.rename(columns=_mapping, inplace=True)

# Merge relevant data

In [160]:
# Build the combined frame with a chain of left joins that starts from
# teams_df, so every teams_df row is kept at each step.
# NOTE(review): a left join repeats the left-hand row once per matching
# right-hand row, so a team-season presumably appears once per coach/player
# combination - confirm this fan-out is intended.
# NOTE(review): 'award' is dropped from awards_players_df in an earlier cell;
# if only the join keys remain, the awards join adds no columns and can only
# repeat rows - TODO confirm.
main_df = (
    teams_df
    .merge(coaches_df, on=['year', 'tmID'], how='left')            # + coaches
    .merge(teams_post_df, on=['year', 'tmID'], how='left')         # + team post-season
    .merge(players_teams_df, on=['year', 'tmID'], how='left')      # + player-team rows
    .merge(awards_players_df, on=['playerID', 'year'], how='left') # + player awards
    .merge(players_df, on=['playerID'], how='left')                # + player attributes
)

# Preview the merged frame.
main_df.head()

Unnamed: 0,year,tmID,playoff,o_fgm,o_fga,o_ftm,o_fta,o_3pm,o_3pa,o_oreb,...,PostfgAttempted,PostfgMade,PostftAttempted,PostftMade,PostthreeAttempted,PostthreeMade,PostDQ,pos,height,weight
0,9,ATL,N,895,2258,542,725,202,598,340,...,0,0,0,0,0,0,0,C,79.0,218
1,9,ATL,N,895,2258,542,725,202,598,340,...,0,0,0,0,0,0,0,F-G,72.0,140
2,9,ATL,N,895,2258,542,725,202,598,340,...,0,0,0,0,0,0,0,F-C,77.0,190
3,9,ATL,N,895,2258,542,725,202,598,340,...,0,0,0,0,0,0,0,G,69.0,147
4,9,ATL,N,895,2258,542,725,202,598,340,...,0,0,0,0,0,0,0,F,75.0,175


# Drop irrelevant columns
### After merging

In [161]:
# main_df = main_df.drop(["rank", "firstRound", "semis", "finals", "attend", "stint_x", "stint_y", "birthDate", "deathDate", "height", "weight", "award"], axis=1)

# Export clean data to a .CSV file

In [162]:
# Write the merged frame to disk without the row index.
# to_csv does not create missing directories, so make sure the target folder
# exists first; otherwise a fresh checkout without ../data/clean fails here.
clean_csv_path = Path("../data/clean/main_df.csv")
clean_csv_path.parent.mkdir(parents=True, exist_ok=True)
main_df.to_csv(clean_csv_path, index=False)

# Preview the first 20 exported rows.
main_df.head(20)

Unnamed: 0,year,tmID,playoff,o_fgm,o_fga,o_ftm,o_fta,o_3pm,o_3pa,o_oreb,...,PostfgAttempted,PostfgMade,PostftAttempted,PostftMade,PostthreeAttempted,PostthreeMade,PostDQ,pos,height,weight
0,9,ATL,N,895,2258,542,725,202,598,340,...,0,0,0,0,0,0,0,C,79.0,218
1,9,ATL,N,895,2258,542,725,202,598,340,...,0,0,0,0,0,0,0,F-G,72.0,140
2,9,ATL,N,895,2258,542,725,202,598,340,...,0,0,0,0,0,0,0,F-C,77.0,190
3,9,ATL,N,895,2258,542,725,202,598,340,...,0,0,0,0,0,0,0,G,69.0,147
4,9,ATL,N,895,2258,542,725,202,598,340,...,0,0,0,0,0,0,0,F,75.0,175
5,9,ATL,N,895,2258,542,725,202,598,340,...,0,0,0,0,0,0,0,G,66.0,143
6,9,ATL,N,895,2258,542,725,202,598,340,...,0,0,0,0,0,0,0,G,68.0,143
7,9,ATL,N,895,2258,542,725,202,598,340,...,0,0,0,0,0,0,0,F,74.0,180
8,9,ATL,N,895,2258,542,725,202,598,340,...,0,0,0,0,0,0,0,F,76.0,170
9,9,ATL,N,895,2258,542,725,202,598,340,...,0,0,0,0,0,0,0,F,73.0,185


### Information about the resulting dataset

In [163]:
# info() prints the column list with non-null counts and dtypes; describe()
# is the last expression of the cell, so its summary table is the cell output.
# NOTE(review): the recorded output shows height with min 9.0 and weight with
# min 0.0 - presumably noisy rows that survive the earlier height filter;
# confirm whether the cleaning step should be stricter.
main_df.info()
main_df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2160 entries, 0 to 2159
Data columns (total 94 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   year                2160 non-null   int64  
 1   tmID                2160 non-null   object 
 2   playoff             2160 non-null   object 
 3   o_fgm               2160 non-null   int64  
 4   o_fga               2160 non-null   int64  
 5   o_ftm               2160 non-null   int64  
 6   o_fta               2160 non-null   int64  
 7   o_3pm               2160 non-null   int64  
 8   o_3pa               2160 non-null   int64  
 9   o_oreb              2160 non-null   int64  
 10  o_dreb              2160 non-null   int64  
 11  o_reb               2160 non-null   int64  
 12  o_asts              2160 non-null   int64  
 13  o_pf                2160 non-null   int64  
 14  o_stl               2160 non-null   int64  
 15  o_to                2160 non-null   int64  
 16  o_blk 

Unnamed: 0,year,o_fgm,o_fga,o_ftm,o_fta,o_3pm,o_3pa,o_oreb,o_dreb,o_reb,...,PostPF,PostfgAttempted,PostfgMade,PostftAttempted,PostftMade,PostthreeAttempted,PostthreeMade,PostDQ,height,weight
count,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,...,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0
mean,5.349537,859.920833,2039.74537,487.872222,652.553704,154.270833,454.681944,330.469444,732.106944,1062.576389,...,3.659722,12.050926,5.062037,3.611111,2.782407,2.843056,0.991667,0.026852,71.730093,167.073611
std,2.887989,84.889513,173.368002,71.320194,85.155917,43.861896,116.50994,40.154703,82.425646,104.442487,...,6.597573,24.154401,10.674382,8.448611,6.716372,7.698095,2.956207,0.172767,5.779155,25.002189
min,1.0,647.0,1740.0,333.0,469.0,62.0,205.0,242.0,537.0,793.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0
25%,3.0,794.0,1910.0,435.0,582.0,124.0,377.0,301.0,653.0,969.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,69.0,150.0
50%,5.0,862.0,2031.0,482.0,651.0,157.0,443.0,333.5,724.0,1074.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,72.0,165.0
75%,8.0,912.0,2175.0,539.0,721.0,177.0,524.0,357.0,792.0,1143.0,...,5.0,14.0,5.0,3.0,2.0,1.0,0.0,0.0,75.0,183.0
max,10.0,1128.0,2485.0,668.0,882.0,283.0,802.0,452.0,931.0,1311.0,...,43.0,188.0,82.0,68.0,62.0,85.0,32.0,2.0,80.0,253.0
