In [3]:
from pathlib import Path

import numpy as np
import pandas as pd

# Data preparation steps:
# select data > clean data > construct data > integrate data > format data

# Data preparation topics:
# • data quality
#       – accuracy, completeness, consistency, timeliness, believability, interpretability
# • data cleaning
#       – e.g. missing/noisy values, outliers
# • data integration from multiple sources
#       – the entity identification problem is challenging
# • data reduction
#       – curse of dimensionality and dimensionality reduction
#       – numerosity reduction
# • data transformation and discretization

In [27]:
# LOAD DATA
# All raw tables are read from the same folder. Keeping that folder in a single
# constant means the location only has to be changed in one place.
DATA_DIR = Path("../data")

awards_players_df = pd.read_csv(DATA_DIR / "awards_players.csv")
coaches_df = pd.read_csv(DATA_DIR / "coaches.csv")
players_teams_df = pd.read_csv(DATA_DIR / "players_teams.csv")
players_df = pd.read_csv(DATA_DIR / "players.csv")
series_post_df = pd.read_csv(DATA_DIR / "series_post.csv")
teams_post_df = pd.read_csv(DATA_DIR / "teams_post.csv")
teams_df = pd.read_csv(DATA_DIR / "teams.csv")

In [28]:
# CLEAN DATA

# Remove rows that are exact duplicates of another row in each table.
awards_players_df = awards_players_df.drop_duplicates()
coaches_df = coaches_df.drop_duplicates()
players_teams_df = players_teams_df.drop_duplicates()
players_df = players_df.drop_duplicates()
series_post_df = series_post_df.drop_duplicates()
teams_post_df = teams_post_df.drop_duplicates()
teams_df = teams_df.drop_duplicates()

# TODO: replacing NaN values with 0 in players_df was planned here but is not
# implemented yet.

# Encode the "playoff" column as binary (Y -> 1, N -> 0).
# Series.map turns every value missing from the mapping into NaN, so the
# identity entries (1 -> 1, 0 -> 0) are what keep an already-encoded column
# intact when this cell is executed more than once.
playoff_codes = {"Y": 1, "N": 0, 1: 1, 0: 0}
teams_df["playoff"] = teams_df["playoff"].map(playoff_codes)

# players_df is later joined on "playerID", so rename its "bioID" column.
# Reassigning (instead of inplace=True) keeps the cell free of hidden mutation.
players_df = players_df.rename(columns={"bioID": "playerID"})

In [29]:
# DROP IRRELEVANT DATA
# Discard the columns that are not carried into the merged dataset.
# NOTE(review): the teams columns "seeded", "firstRound", "semis" and "finals"
# look like post-season outcomes that would leak the "playoff" target — confirm.
teams_df = teams_df.drop(
    columns=["divID", "lgID", "seeded", "firstRound", "semis", "finals"]
)
players_df = players_df.drop(
    columns=["collegeOther", "deathDate", "firstseason", "lastseason"]
)
coaches_df = coaches_df.drop(columns=["lgID"])
series_post_df = series_post_df.drop(columns=["lgIDWinner", "lgIDLoser"])
players_teams_df = players_teams_df.drop(columns=["lgID"])
teams_post_df = teams_post_df.drop(columns=["lgID"])
awards_players_df = awards_players_df.drop(columns=["lgID"])

# TODO: decide whether "stint" should also be dropped from players_teams_df
# and coaches_df.

In [30]:
# MERGE DATA

# Build one wide frame by left-joining every table onto teams_df, in order:
#   1. coaches_df         on (year, tmID)
#   2. teams_post_df      on (year, tmID)
#   3. players_teams_df   on (year, tmID)
#   4. awards_players_df  on (playerID, year)
#   5. players_df         on playerID
# Left joins keep every row coming from the left side; a row is repeated
# whenever the right-hand table holds several matches for its key.
df = (
    teams_df
    .merge(coaches_df, how="left", on=["year", "tmID"])
    .merge(teams_post_df, how="left", on=["year", "tmID"])
    .merge(players_teams_df, how="left", on=["year", "tmID"])
    .merge(awards_players_df, how="left", on=["playerID", "year"])
    .merge(players_df, how="left", on=["playerID"])
)

# series_post_df is deliberately left out for now: the only join tried so far
# used "year" alone as the key, which would pair each row with every series
# played in that year.
# df = pd.merge(df, series_post_df, on=['year'], how='left')

# Preview the first rows of the merged frame.
df.head()

Unnamed: 0,year,tmID,franchID,confID,rank,playoff,name,o_fgm,o_fga,o_ftm,...,PostftMade,PostthreeAttempted,PostthreeMade,PostDQ,award,pos,height,weight,college,birthDate
0,9,ATL,ATL,EA,7,0,Atlanta Dream,895,2258,542,...,0,0,0,0,,C,79.0,218,Duke,1985-04-04
1,9,ATL,ATL,EA,7,0,Atlanta Dream,895,2258,542,...,0,0,0,0,,F-G,72.0,140,,1982-03-13
2,9,ATL,ATL,EA,7,0,Atlanta Dream,895,2258,542,...,0,0,0,0,,F-C,77.0,190,,1982-03-03
3,9,ATL,ATL,EA,7,0,Atlanta Dream,895,2258,542,...,0,0,0,0,,G,69.0,147,Michigan State,1983-06-17
4,9,ATL,ATL,EA,7,0,Atlanta Dream,895,2258,542,...,0,0,0,0,,F,75.0,175,Pepperdine,1983-03-21


In [31]:
# EXPORT CLEAN DATA TO A .CSV FILE
# Remove exact duplicate rows from the merged frame before writing it out.
df = df.drop_duplicates()

# DataFrame.to_csv does not create missing folders, so make sure the target
# directory exists first; otherwise the export fails when "../data/clean" has
# not been created yet (e.g. on a fresh checkout).
clean_csv_path = Path("../data/clean/main_df.csv")
clean_csv_path.parent.mkdir(parents=True, exist_ok=True)

# index=False keeps the row index out of the exported file.
df.to_csv(clean_csv_path, index=False)

display(df)


Unnamed: 0,year,tmID,franchID,confID,rank,playoff,name,o_fgm,o_fga,o_ftm,...,PostftMade,PostthreeAttempted,PostthreeMade,PostDQ,award,pos,height,weight,college,birthDate
0,9,ATL,ATL,EA,7,0,Atlanta Dream,895,2258,542,...,0,0,0,0,,C,79.0,218,Duke,1985-04-04
1,9,ATL,ATL,EA,7,0,Atlanta Dream,895,2258,542,...,0,0,0,0,,F-G,72.0,140,,1982-03-13
2,9,ATL,ATL,EA,7,0,Atlanta Dream,895,2258,542,...,0,0,0,0,,F-C,77.0,190,,1982-03-03
3,9,ATL,ATL,EA,7,0,Atlanta Dream,895,2258,542,...,0,0,0,0,,G,69.0,147,Michigan State,1983-06-17
4,9,ATL,ATL,EA,7,0,Atlanta Dream,895,2258,542,...,0,0,0,0,,F,75.0,175,Pepperdine,1983-03-21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2155,10,WAS,WAS,EA,4,1,Washington Mystics,933,2205,580,...,5,0,0,0,Most Improved Player,F-C,74.0,184,Maryland,1986-10-27
2156,10,WAS,WAS,EA,4,1,Washington Mystics,933,2205,580,...,0,2,1,0,,F,73.0,185,UC Santa Barbara,1983-08-10
2157,10,WAS,WAS,EA,4,1,Washington Mystics,933,2205,580,...,3,0,0,1,,C-F,75.0,185,North Carolina State,1976-05-03
2158,10,WAS,WAS,EA,4,1,Washington Mystics,933,2205,580,...,0,0,0,0,,F,73.0,172,Baylor,1984-02-14
