In [129]:
from pathlib import Path

import pandas as pd

# Load data

In [130]:
# Read each raw CSV into its own DataFrame and tag it with an ad-hoc `Name`
# attribute, which later cells use to label their log messages.
awards_players_df = pd.read_csv('../data/awards_players.csv')
awards_players_df.Name = "awards_players"

coaches_df = pd.read_csv('../data/coaches.csv')
coaches_df.Name = "coaches"

players_teams_df = pd.read_csv('../data/players_teams.csv')
players_teams_df.Name = "players_teams"

players_df = pd.read_csv('../data/players.csv')
players_df.Name = "players"

series_post_df = pd.read_csv('../data/series_post.csv')
series_post_df.Name = "series_post"

teams_post_df = pd.read_csv('../data/teams_post.csv')
teams_post_df.Name = "teams_post"

teams_df = pd.read_csv('../data/teams.csv')
teams_df.Name = "teams"

# All tables in one list so the cleaning cells can loop over them. The list
# holds the same objects as the variables above, so in-place cleaning is
# visible through both.
dfs = [
    awards_players_df,
    coaches_df,
    players_teams_df,
    players_df,
    series_post_df,
    teams_post_df,
    teams_df,
]

# Clean data

### Drop noisy data

In [131]:
# Remove players whose recorded height is zero or negative. The drop is done
# in place so the object referenced from `dfs` is the one that gets cleaned.
players_df.drop(index=players_df.index[players_df['height'] <= 0], inplace=True)

### Remove duplicates


In [132]:
# Remove fully duplicated rows from every loaded table, in place.
for table in dfs:
    table.drop_duplicates(inplace=True)
    # The message is shown after the duplicates have already been removed.
    display(f"Dropping dupplicates from dataframe {table.Name}...")

'Dropping dupplicates from dataframe awards_players...'

'Dropping dupplicates from dataframe coaches...'

'Dropping dupplicates from dataframe players_teams...'

'Dropping dupplicates from dataframe players...'

'Dropping dupplicates from dataframe series_post...'

'Dropping dupplicates from dataframe teams_post...'

'Dropping dupplicates from dataframe teams...'

### Drop unneeded columns

In [133]:
# Columns to discard from each table before merging. An empty list is a
# placeholder: nothing is dropped from that table.
columns_to_drop = [
    (players_df, ['birthDate', 'deathDate', 'collegeOther']),
    (teams_df, ["confID", "franchID", "rank", "firstRound", "semis", "finals", "attend", "name", "arena"]),
    (teams_post_df, []),
    (series_post_df, []),
    (players_teams_df, []),
    (awards_players_df, ['award']),
    (coaches_df, []),
]

# Drop in place so the objects referenced from `dfs` are the ones modified.
for frame, unneeded in columns_to_drop:
    frame.drop(columns=unneeded, inplace=True)

### Drop single value columns

In [134]:
def drop_single_value_columns(df):
    """Drop, in place, every column of ``df`` that holds one distinct value.

    NaN counts as a value: an all-NaN column is dropped, while a column that
    mixes a single constant with NaN is kept. One message is displayed per
    dropped column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is mutated and nothing is returned. If it carries
        a ``Name`` attribute, that label is used in the messages; otherwise a
        placeholder label is shown.
    """
    # `Name` is an ad-hoc attribute that is not guaranteed to be present
    # (e.g. on frames produced by operations returning a new DataFrame), so
    # fall back to a placeholder instead of raising AttributeError.
    frame_label = getattr(df, "Name", "<unnamed>")

    # Decide what to drop before mutating the frame.
    constant_columns = [
        column for column in df.columns
        if df[column].nunique(dropna=False) == 1
    ]

    for column in constant_columns:
        display(f"Dropping column {column} from datarame {frame_label}")

    if constant_columns:
        df.drop(columns=constant_columns, inplace=True)

# Run the single-value-column cleanup on every table in `dfs`.
for df in dfs:
    drop_single_value_columns(df)

'Dropping column lgID from datarame awards_players'

'Dropping column lgID from datarame coaches'

'Dropping column lgID from datarame players_teams'

'Dropping column firstseason from datarame players'

'Dropping column lastseason from datarame players'

'Dropping column lgIDWinner from datarame series_post'

'Dropping column lgIDLoser from datarame series_post'

'Dropping column lgID from datarame teams_post'

'Dropping column lgID from datarame teams'

'Dropping column divID from datarame teams'

'Dropping column seeded from datarame teams'

'Dropping column tmORB from datarame teams'

'Dropping column tmDRB from datarame teams'

'Dropping column tmTRB from datarame teams'

'Dropping column opptmORB from datarame teams'

'Dropping column opptmDRB from datarame teams'

'Dropping column opptmTRB from datarame teams'

### Rename columns

In [135]:
# Give generically named columns a table-specific name before the merge.
# Keys missing from a frame are silently ignored by `rename`.

# NOTE(review): 'stint' is renamed on players_df here; if that column actually
# lives in players_teams_df this entry is a no-op -- confirm against the CSVs.
players_df.rename(
    columns={'bioID': 'playerID', 'stint': 'player_stint'},
    inplace=True,
)

coaches_df.rename(
    columns={
        'won': 'coach_wins',
        'lost': 'coach_losses',
        'post_wins': 'coach_post_wins',
        'post_losses': 'coach_post_losses',
        'stint': 'coach_stint',
    },
    inplace=True,
)

teams_df.rename(columns={'GP': 'team_GP'}, inplace=True)

players_teams_df.rename(columns={'GP': 'player_team_GP'}, inplace=True)

teams_post_df.rename(
    columns={'W': 'team_post_wins', 'L': 'team_post_losses'},
    inplace=True,
)

# NOTE(review): 'series_post_lost' breaks the '*_losses' pattern used for the
# other tables; left unchanged so existing column names stay stable.
series_post_df.rename(
    columns={'W': 'series_post_wins', 'L': 'series_post_lost'},
    inplace=True,
)

# Merge relevant data

In [136]:
# Build the analysis table by left-joining every related table onto teams_df.
# Left joins keep every row from the left side; wherever the right table has
# several matches for a key, the left row is repeated once per match.
main_df = (
    teams_df
    # coaches for each (year, team)
    .merge(coaches_df, on=['year', 'tmID'], how='left')
    # post-season team results for each (year, team)
    .merge(teams_post_df, on=['year', 'tmID'], how='left')
    # one row per player who played for the team that year
    .merge(players_teams_df, on=['year', 'tmID'], how='left')
    # NOTE(review): 'award' is dropped from awards_players_df in an earlier
    # cell, so this join may add no new columns while still repeating rows
    # for players with several awards in a year -- confirm this is intended.
    .merge(awards_players_df, on=['playerID', 'year'], how='left')
    # static player attributes
    .merge(players_df, on=['playerID'], how='left')
)

# Preview the merged frame
main_df.head()

Unnamed: 0,year,tmID,franchID,playoff,o_fgm,o_fga,o_ftm,o_fta,o_3pm,o_3pa,...,PostfgMade,PostftAttempted,PostftMade,PostthreeAttempted,PostthreeMade,PostDQ,pos,height,weight,college
0,9,ATL,ATL,N,895,2258,542,725,202,598,...,0,0,0,0,0,0,C,79.0,218,Duke
1,9,ATL,ATL,N,895,2258,542,725,202,598,...,0,0,0,0,0,0,F-G,72.0,140,
2,9,ATL,ATL,N,895,2258,542,725,202,598,...,0,0,0,0,0,0,F-C,77.0,190,
3,9,ATL,ATL,N,895,2258,542,725,202,598,...,0,0,0,0,0,0,G,69.0,147,Michigan State
4,9,ATL,ATL,N,895,2258,542,725,202,598,...,0,0,0,0,0,0,F,75.0,175,Pepperdine


# Drop irrelevant columns
### After merging

In [137]:
# main_df = main_df.drop(["rank", "firstRound", "semis", "finals", "attend", "stint_x", "stint_y", "birthDate", "deathDate", "height", "weight", "award"], axis=1)

# Export clean data to a .CSV file

In [138]:
# Export the merged frame. `to_csv` does not create missing parent folders,
# so make sure the output directory exists before writing.
clean_csv_path = Path("../data/clean/main_df.csv")
clean_csv_path.parent.mkdir(parents=True, exist_ok=True)
main_df.to_csv(clean_csv_path, index=False)

# Preview the first rows of what was written
main_df.head(20)

Unnamed: 0,year,tmID,franchID,playoff,o_fgm,o_fga,o_ftm,o_fta,o_3pm,o_3pa,...,PostfgMade,PostftAttempted,PostftMade,PostthreeAttempted,PostthreeMade,PostDQ,pos,height,weight,college
0,9,ATL,ATL,N,895,2258,542,725,202,598,...,0,0,0,0,0,0,C,79.0,218,Duke
1,9,ATL,ATL,N,895,2258,542,725,202,598,...,0,0,0,0,0,0,F-G,72.0,140,
2,9,ATL,ATL,N,895,2258,542,725,202,598,...,0,0,0,0,0,0,F-C,77.0,190,
3,9,ATL,ATL,N,895,2258,542,725,202,598,...,0,0,0,0,0,0,G,69.0,147,Michigan State
4,9,ATL,ATL,N,895,2258,542,725,202,598,...,0,0,0,0,0,0,F,75.0,175,Pepperdine
5,9,ATL,ATL,N,895,2258,542,725,202,598,...,0,0,0,0,0,0,G,66.0,143,North Carolina
6,9,ATL,ATL,N,895,2258,542,725,202,598,...,0,0,0,0,0,0,G,68.0,143,Louisiana Tech
7,9,ATL,ATL,N,895,2258,542,725,202,598,...,0,0,0,0,0,0,F,74.0,180,North Carolina
8,9,ATL,ATL,N,895,2258,542,725,202,598,...,0,0,0,0,0,0,F,76.0,170,Purdue
9,9,ATL,ATL,N,895,2258,542,725,202,598,...,0,0,0,0,0,0,F,73.0,185,UC Santa Barbara


### Information about the resulting dataset

In [141]:
# `info()` prints a per-column summary; `describe()` is the cell's last
# expression, so its summary-statistics table is rendered as the output.
main_df.info()
main_df.describe()

      year tmID franchID playoff  o_fgm  o_fga  o_ftm  o_fta  o_3pm  o_3pa   
1284     1  ORL      CON       Y    833   1911    397    546    145    424  \
1285     1  ORL      CON       Y    833   1911    397    546    145    424   
1286     1  ORL      CON       Y    833   1911    397    546    145    424   
1287     1  ORL      CON       Y    833   1911    397    546    145    424   
1288     1  ORL      CON       Y    833   1911    397    546    145    424   
...    ...  ...      ...     ...    ...    ...    ...    ...    ...    ...   
1976     3  UTA      SAS       Y    843   1911    643    844     89    247   
1977     3  UTA      SAS       Y    843   1911    643    844     89    247   
1978     3  UTA      SAS       Y    843   1911    643    844     89    247   
1979     3  UTA      SAS       Y    843   1911    643    844     89    247   
1980     3  UTA      SAS       Y    843   1911    643    844     89    247   

      ...  PostfgMade  PostftAttempted  PostftMade  PostthreeAt