In [None]:
from pathlib import Path

import pandas as pd

# select data > clean data > construct data > integrate data > format data

# Data preparation topics
# • data quality
#       – accuracy, completeness, consistency, timeliness, believability, interpretability
# • data cleaning
#       – e.g. missing/noisy values, outliers
# • data integration from multiple sources
#       – the entity identification problem is challenging
# • data reduction
#       – curse of dimensionality and dimensionality reduction
#       – numerosity reduction
# • data transformation and discretization

# Load data

In [None]:
# Read every raw CSV table into its own DataFrame.
# The names on the left and the paths on the right are listed in the same
# order; paths are relative to the notebook's working directory.
(
    awards_players_df,
    coaches_df,
    players_teams_df,
    players_df,
    series_post_df,
    teams_post_df,
    teams_df,
) = map(
    pd.read_csv,
    (
        '../data/awards_players.csv',
        '../data/coaches.csv',
        '../data/players_teams.csv',
        '../data/players.csv',
        '../data/series_post.csv',
        '../data/teams_post.csv',
        '../data/teams.csv',
    ),
)

# Clean data

In [None]:
# Remove exact duplicate rows from every raw table.
awards_players_df = awards_players_df.drop_duplicates()
coaches_df = coaches_df.drop_duplicates()
players_teams_df = players_teams_df.drop_duplicates()
players_df = players_df.drop_duplicates()
series_post_df = series_post_df.drop_duplicates()
teams_post_df = teams_post_df.drop_duplicates()
teams_df = teams_df.drop_duplicates()

# Encode the "playoff" flag as binary (Y -> 1, N -> 0).
# Series.map turns every value missing from the mapping into NaN, so the
# identity entries for 1 and 0 keep this cell safe to re-run: a second
# execution leaves the already-encoded column unchanged instead of
# replacing the whole column with NaN.
teams_df["playoff"] = teams_df["playoff"].map({"Y": 1, "N": 0, 1: 1, 0: 0})

# Rename columns so that the player key is called 'playerID' in players_df
# and the two 'stint' columns get table-specific names.
# Assignment is used instead of inplace=True; rename ignores labels that are
# not present, so these lines are also safe to re-run.
players_df = players_df.rename(columns={'bioID': 'playerID'})
coaches_df = coaches_df.rename(columns={'stint': 'coachStint'})
players_teams_df = players_teams_df.rename(columns={'stint': 'playerStint'})

###  Drop irrelevant data

In [None]:
# Drop the columns treated as irrelevant from each table.
# `drop(columns=...)` is the keyword form of `drop(..., axis=1)`.
teams_df = teams_df.drop(columns=["divID", "lgID", "seeded"])
coaches_df = coaches_df.drop(columns=["lgID"])
players_df = players_df.drop(
    columns=["collegeOther", "firstseason", "lastseason", "birthDate", "deathDate"]
)
awards_players_df = awards_players_df.drop(columns=["lgID"])
# NOTE(review): drop() raises KeyError when a listed label is absent — confirm
# that players_teams.csv really contains "height" and "weight" columns.
players_teams_df = players_teams_df.drop(
    columns=["lgID", "playerStint", "height", "weight"]
)
teams_post_df = teams_post_df.drop(columns=["lgID"])
series_post_df = series_post_df.drop(columns=["lgIDWinner", "lgIDLoser"])

# TODO: decide whether the stint columns should be dropped from players_teams and coaches

### Drop columns that contain only 0 values in teams_df

In [None]:
# Flag the columns of teams_df in which every single value equals 0.
# A NaN compares unequal to 0, so a column containing NaN is never flagged.
all_zero_mask = teams_df.eq(0).all()
zero_columns = teams_df.columns[all_zero_mask]
print(zero_columns)

# Remove the flagged columns.
teams_df = teams_df.drop(columns=zero_columns)

# Merge data

In [None]:
# Build one wide frame by left-joining the other tables onto teams_df.
# Join order and keys:
#   1. coaches_df         on (year, tmID)
#   2. teams_post_df      on (year, tmID)
#   3. players_teams_df   on (year, tmID)
#   4. awards_players_df  on (playerID, year)
#   5. players_df         on playerID
# A left join repeats a left-hand row once per matching right-hand row, so
# the row count grows wherever a key has several matches.
# series_post_df is not merged here; the draft join on 'year' was left disabled.
main_df = (
    teams_df
    .merge(coaches_df, on=['year', 'tmID'], how='left')
    .merge(teams_post_df, on=['year', 'tmID'], how='left')
    .merge(players_teams_df, on=['year', 'tmID'], how='left')
    .merge(awards_players_df, on=['playerID', 'year'], how='left')
    .merge(players_df, on=['playerID'], how='left')
)

# Preview the merged frame.
main_df.head()

## Export train data

## Export test data

# Export clean data to a .CSV file

In [None]:
# Remove exact duplicate rows from the merged frame before exporting it.
main_df = main_df.drop_duplicates()

# to_csv does not create missing directories, so make sure the output folder
# exists first; otherwise a fresh checkout without ../data/clean fails here.
clean_csv_path = Path("../data/clean/main_df.csv")
clean_csv_path.parent.mkdir(parents=True, exist_ok=True)

# index=False keeps the DataFrame index out of the file.
main_df.to_csv(clean_csv_path, index=False)

# Preview what was written.
main_df.head()

### Information about the resulting dataset

In [None]:
# Print the per-column summary; info() writes to stdout.
main_df.info()

# Compute the descriptive statistics and leave them as the cell's last
# expression so the notebook renders them as a table.
summary_stats = main_df.describe()
summary_stats