In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

# select data > clean data > construct data > integrate data > format data

# Data preprocessing topics:
# • data quality
#       – accuracy, completeness, consistency, timeliness, believability, interpretability
# • data cleaning
#       – e.g. missing/noisy values, outliers
# • data integration from multiple sources
#       – entity identification problem is challenging
# • data reduction
#       – curse of dimensionality and dimensionality reduction
#       – numerosity reduction
# • data transformation and discretization

# Load data

In [2]:
# Every table is read with pandas' default CSV parsing options.
load_csv = pd.read_csv

# Team and post-season tables (grouped by file name).
teams_df = load_csv('../data/teams.csv')
teams_post_df = load_csv('../data/teams_post.csv')
series_post_df = load_csv('../data/series_post.csv')

# Player tables.
players_df = load_csv('../data/players.csv')
players_teams_df = load_csv('../data/players_teams.csv')
awards_players_df = load_csv('../data/awards_players.csv')

# Coaches table.
coaches_df = load_csv('../data/coaches.csv')

# Clean data

In [3]:
# Remove exact duplicate rows from every raw table (a no-op when there are none).
awards_players_df = awards_players_df.drop_duplicates()
coaches_df = coaches_df.drop_duplicates()
players_teams_df = players_teams_df.drop_duplicates()
players_df = players_df.drop_duplicates()
series_post_df = series_post_df.drop_duplicates()
teams_post_df = teams_post_df.drop_duplicates()
teams_df = teams_df.drop_duplicates()

# Encode "playoff" as binary (Y -> 1, N -> 0).
# The identity entries for 1 and 0 keep this cell safe to re-run: without them,
# a second execution would map the already-encoded values to NaN.
# Any value outside this mapping still becomes NaN, exactly as before.
# `assign` returns a new frame rather than writing into the de-duplicated one.
teams_df = teams_df.assign(
    playoff=teams_df["playoff"].map({"Y": 1, "N": 0, 1: 1, 0: 0})
)

# Rename columns without `inplace=True` so each statement produces a new frame:
#   - players_df: 'bioID' becomes 'playerID'
#   - coaches_df / players_teams_df: 'stint' becomes 'coachStint' / 'playerStint',
#     so the two columns stay distinguishable by name
players_df = players_df.rename(columns={'bioID': 'playerID'})
coaches_df = coaches_df.rename(columns={'stint': 'coachStint'})
players_teams_df = players_teams_df.rename(columns={'stint': 'playerStint'})

###  Drop irrelevant data

In [4]:
# Tables from which only the lgID-style columns are removed.
coaches_df = coaches_df.drop(columns=["lgID"])
awards_players_df = awards_players_df.drop(columns=["lgID"])
players_teams_df = players_teams_df.drop(columns=["lgID"])
teams_post_df = teams_post_df.drop(columns=["lgID"])
series_post_df = series_post_df.drop(columns=["lgIDWinner", "lgIDLoser"])

# Team table: also loses "divID" and "seeded".
teams_df = teams_df.drop(columns=["divID", "lgID", "seeded"])

# Player table: unused college, season-range, and birth/death date columns.
players_df = players_df.drop(
    columns=["collegeOther", "firstseason", "lastseason", "birthDate", "deathDate"]
)

# Open question: should the stint columns of players_teams_df and coaches_df be dropped too?

### Drop columns that contain only 0 values in teams_df

In [5]:
# Boolean mask over columns: True where every entry of the column equals 0.
is_all_zero = teams_df.eq(0).all()
zero_columns = teams_df.columns[is_all_zero]
print(zero_columns)

# Remove those all-zero columns from the team table.
teams_df = teams_df.drop(columns=zero_columns)

Index(['tmORB', 'tmDRB', 'tmTRB', 'opptmORB', 'opptmDRB', 'opptmTRB'], dtype='object')


# Merge data

In [10]:
# Build main_df by left-joining every other table onto teams_df, so each
# teams_df row is kept even when a joined table has no matching key.
team_season_keys = ['year', 'tmID']

main_df = (
    teams_df
    # coaches, matched on year + team
    .merge(coaches_df, on=team_season_keys, how='left')
    # post-season team table, matched on year + team
    .merge(teams_post_df, on=team_season_keys, how='left')
    # player rows per team, matched on year + team
    .merge(players_teams_df, on=team_season_keys, how='left')
    # awards, matched on player + year
    .merge(awards_players_df, on=['playerID', 'year'], how='left')
    # player table, matched on player
    .merge(players_df, on=['playerID'], how='left')
)

# series_post_df is not merged; the earlier attempt below is left disabled.
# df = pd.merge(df, series_post_df, on=['year'], how='left')

# Preview the first rows of the merged frame.
main_df.head()

Unnamed: 0,year,tmID,franchID,confID,rank,playoff,firstRound,semis,finals,name,...,PostftAttempted,PostftMade,PostthreeAttempted,PostthreeMade,PostDQ,award,pos,height,weight,college
0,9,ATL,ATL,EA,7,0,,,,Atlanta Dream,...,0,0,0,0,0,,C,79.0,218,Duke
1,9,ATL,ATL,EA,7,0,,,,Atlanta Dream,...,0,0,0,0,0,,F-G,72.0,140,
2,9,ATL,ATL,EA,7,0,,,,Atlanta Dream,...,0,0,0,0,0,,F-C,77.0,190,
3,9,ATL,ATL,EA,7,0,,,,Atlanta Dream,...,0,0,0,0,0,,G,69.0,147,Michigan State
4,9,ATL,ATL,EA,7,0,,,,Atlanta Dream,...,0,0,0,0,0,,F,75.0,175,Pepperdine


# Export clean data to a .CSV file

In [7]:
# Drop exact duplicate rows from the merged frame before persisting it.
main_df = main_df.drop_duplicates()

# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists first; otherwise the export fails when it is missing.
clean_csv_path = Path("../data/clean/main_df.csv")
clean_csv_path.parent.mkdir(parents=True, exist_ok=True)

# index=False: the row index is not written as a column.
main_df.to_csv(clean_csv_path, index=False)

main_df.head()

Unnamed: 0,year,tmID,franchID,confID,rank,playoff,firstRound,semis,finals,name,...,PostftAttempted,PostftMade,PostthreeAttempted,PostthreeMade,PostDQ,award,pos,height,weight,college
0,9,ATL,ATL,EA,7,0,,,,Atlanta Dream,...,0,0,0,0,0,,C,79.0,218,Duke
1,9,ATL,ATL,EA,7,0,,,,Atlanta Dream,...,0,0,0,0,0,,F-G,72.0,140,
2,9,ATL,ATL,EA,7,0,,,,Atlanta Dream,...,0,0,0,0,0,,F-C,77.0,190,
3,9,ATL,ATL,EA,7,0,,,,Atlanta Dream,...,0,0,0,0,0,,G,69.0,147,Michigan State
4,9,ATL,ATL,EA,7,0,,,,Atlanta Dream,...,0,0,0,0,0,,F,75.0,175,Pepperdine


### Information about the resulting dataset

In [8]:
# Print the structural overview: index range, column count, dtypes, memory usage.
main_df.info()

# Summary statistics from describe(); the trailing expression is rendered as the cell output.
main_df_summary = main_df.describe()
main_df_summary

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2160 entries, 0 to 2159
Columns: 105 entries, year to college
dtypes: float64(3), int64(89), object(13)
memory usage: 1.7+ MB


Unnamed: 0,year,rank,playoff,o_fgm,o_fga,o_ftm,o_fta,o_3pm,o_3pa,o_oreb,...,PostPF,PostfgAttempted,PostfgMade,PostftAttempted,PostftMade,PostthreeAttempted,PostthreeMade,PostDQ,height,weight
count,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,...,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0,2160.0
mean,5.349537,4.205093,0.538426,859.920833,2039.74537,487.872222,652.553704,154.270833,454.681944,330.469444,...,3.659722,12.050926,5.062037,3.611111,2.782407,2.843056,0.991667,0.026852,71.730093,167.073611
std,2.887989,2.126624,0.498637,84.889513,173.368002,71.320194,85.155917,43.861896,116.50994,40.154703,...,6.597573,24.154401,10.674382,8.448611,6.716372,7.698095,2.956207,0.172767,5.779155,25.002189
min,1.0,1.0,0.0,647.0,1740.0,333.0,469.0,62.0,205.0,242.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0
25%,3.0,2.0,0.0,794.0,1910.0,435.0,582.0,124.0,377.0,301.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,69.0,150.0
50%,5.0,4.0,1.0,862.0,2031.0,482.0,651.0,157.0,443.0,333.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,72.0,165.0
75%,8.0,6.0,1.0,912.0,2175.0,539.0,721.0,177.0,524.0,357.0,...,5.0,14.0,5.0,3.0,2.0,1.0,0.0,0.0,75.0,183.0
max,10.0,8.0,1.0,1128.0,2485.0,668.0,882.0,283.0,802.0,452.0,...,43.0,188.0,82.0,68.0,62.0,85.0,32.0,2.0,80.0,253.0
