In [8]:
import pandas as pd
import numpy as np

# select data > clean data > construct data > integrate data > format data

# data
# • ... quality
#       – accuracy, completeness, consistency, timeliness, believability, interpretability
# • … cleaning
#       – e.g. missing/noisy values, outliers
# • integration from multiple sources
#       – entity identification problem is challenging
# • reduction
#       – curse of dimensionality and dimensionality reduction
#       – numerosity reduction
# • transformation and discretization

In [99]:
# LOAD DATA
awards_players_df = pd.read_csv('../data/awards_players.csv')
coaches_df = pd.read_csv('../data/coaches.csv')
players_teams_df = pd.read_csv('../data/players_teams.csv')
players_df = pd.read_csv('../data/players.csv')
series_post_df = pd.read_csv('../data/series_post.csv')
teams_post_df = pd.read_csv('../data/teams_post.csv')
teams_df = pd.read_csv('../data/teams.csv')

In [110]:
# CLEAN DATA

# REMOVE DUPLICATES IF THERE ARE ANY
awards_players_df = awards_players_df.drop_duplicates()
coaches_df = coaches_df.drop_duplicates()
players_teams_df = players_teams_df.drop_duplicates()
players_df = players_df.drop_duplicates()
series_post_df = series_post_df.drop_duplicates()
teams_post_df = teams_post_df.drop_duplicates()
teams_df = teams_df.drop_duplicates()

# REPLACE NaN VALUES WITH 0
# Convert "playoff" column to binary (Y: 1, N: 0)
teams_df["playoff"] = teams_df["playoff"].map({"Y": 1, "N": 0})

# Convert "firstRound", "semis", "finals" columns to binary (W: 1, L: -1, NQ: 0)
teams_df["firstRound"] = teams_df["firstRound"].map({"W": 1, "L": -1, np.nan: 0})
teams_df["semis"] = teams_df["semis"].map({"W": 1, "L": -1, np.nan: 0})
teams_df["finals"] = teams_df["finals"].map({"W": 1, "L": -1, np.nan: 0})

players_df.rename(columns={'bioID': 'playerID'}, inplace=True)
print(awards_players_df.columns)


Index(['playerID', 'award', 'year'], dtype='object')


In [101]:
# SELECT RELEVANT DATA
teams_df = teams_df.drop(["divID", "lgID", "seeded"], axis=1)
players_df = players_df.drop(["collegeOther", "deathDate"], axis=1)
coaches_df = coaches_df.drop(["lgID"], axis=1)
series_post_df = series_post_df.drop(["lgIDWinner", "lgIDLoser"], axis=1)
players_teams_df = players_teams_df.drop(["lgID"], axis=1)
teams_post_df = teams_post_df.drop(["lgID"], axis=1)
awards_players_df = awards_players_df.drop(["lgID"], axis=1)



# etc...

In [111]:
# MERGE DATA

# Merge teams_df with coaches_df
df = pd.merge(teams_df, coaches_df, on=['year', 'tmID'], how='left')

# Merge with players_teams_df
df = pd.merge(df, players_teams_df, on=['year', 'tmID'], how='left')

# Merge with players_df
df = pd.merge(df, players_df, on=['playerID'], how='left')

# Merge with awards_players_df
df = pd.merge(df, awards_players_df, on=['playerID','year'], how='left')

# Merge with series_post_df
df = pd.merge(df, series_post_df, on=['year'], how='left')

# Merge teams_df with coaches_df
df = pd.merge(df, teams_post_df, on=['year', 'tmID'], how='left')


# Display the resulting merged data frame
df.head()

Unnamed: 0,year,tmID,franchID,confID,rank,playoff,firstRound,semis,finals,name,...,birthDate,award,round,series,tmIDWinner,tmIDLoser,W_x,L_x,W_y,L_y
0,9,ATL,ATL,EA,7,,,,,Atlanta Dream,...,1985-04-04,,FR,A,SAS,SAC,2,1,,
1,9,ATL,ATL,EA,7,,,,,Atlanta Dream,...,1985-04-04,,FR,B,LAS,SEA,2,1,,
2,9,ATL,ATL,EA,7,,,,,Atlanta Dream,...,1985-04-04,,FR,C,DET,IND,2,1,,
3,9,ATL,ATL,EA,7,,,,,Atlanta Dream,...,1985-04-04,,FR,D,NYL,CON,2,1,,
4,9,ATL,ATL,EA,7,,,,,Atlanta Dream,...,1985-04-04,,CF,E,SAS,LAS,2,1,,


In [112]:
# EXPORT CLEAN DATA TO A .CSV FILE
df = df.drop_duplicates()
df.to_csv("../data/clean/main_df.csv", index=False)

display(df)


Unnamed: 0,year,tmID,franchID,confID,rank,playoff,firstRound,semis,finals,name,...,birthDate,award,round,series,tmIDWinner,tmIDLoser,W_x,L_x,W_y,L_y
0,9,ATL,ATL,EA,7,,,,,Atlanta Dream,...,1985-04-04,,FR,A,SAS,SAC,2,1,,
1,9,ATL,ATL,EA,7,,,,,Atlanta Dream,...,1985-04-04,,FR,B,LAS,SEA,2,1,,
2,9,ATL,ATL,EA,7,,,,,Atlanta Dream,...,1985-04-04,,FR,C,DET,IND,2,1,,
3,9,ATL,ATL,EA,7,,,,,Atlanta Dream,...,1985-04-04,,FR,D,NYL,CON,2,1,,
4,9,ATL,ATL,EA,7,,,,,Atlanta Dream,...,1985-04-04,,CF,E,SAS,LAS,2,1,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15115,10,WAS,WAS,EA,4,,,,,Washington Mystics,...,1976-05-10,,FR,C,IND,WAS,2,0,0.0,2.0
15116,10,WAS,WAS,EA,4,,,,,Washington Mystics,...,1976-05-10,,FR,D,DET,ATL,2,0,0.0,2.0
15117,10,WAS,WAS,EA,4,,,,,Washington Mystics,...,1976-05-10,,CF,E,PHO,LAS,2,1,0.0,2.0
15118,10,WAS,WAS,EA,4,,,,,Washington Mystics,...,1976-05-10,,CF,F,IND,DET,2,1,0.0,2.0
