# Competition data

In this section we will be including the new dataset provided for the competition from which we will try to predict which teams will be going into the playoffs.


In [1]:
players_teams_csv_file_path = "./datasets/original/players_teams.csv"
coaches_csv_file_path = "./datasets/original/coaches.csv"
teams_csv_file_path = "./datasets/original/teams.csv"
awards_players_csv_file_path = "./datasets/original/awards_players.csv"

aggregated_table_csv_file_path = "./datasets/generated/aggregated_table.csv"

players_teams_selected_attributes = ["playerID", "year", "tmID", "points", "rebounds", "assists", "steals", "blocks", "turnovers", "fgAttempted", "fgMade", "ftAttempted", "ftMade", "threeAttempted", "threeMade"]
coaches_selected_attributes = ["coachID", "year", "tmID", "stint", "won", "lost"]
teams_selected_attributes = ["year", "tmID", "won", "lost", "playoff"]
awards_players_selected_attributes = ["playerID", "year"]

teams_competition_table_path = "./datasets/competition_final/original/teams.csv"
players_teams_competition_table_path = "./datasets/competition_final/original/players_teams.csv"
coaches_competition_table_path = "./datasets/competition_final/original/coaches.csv"

aggregated_attributes = ["tmID", "year", "playoff", "averageWinRate", "averagePoints", "averageRebounds", "averageAssists", "averageSteals", "averageBlocks", "averageTurnovers", "averageFGRatio", "averageFTRatio", "averageThreeRatio", "coachWinRate", "numberOfAwardedPlayers", "listOfPlayers"]


In [24]:
import pandas as pd

initial_df_teams_players = pd.read_csv(players_teams_csv_file_path)
df_teams_players = initial_df_teams_players[players_teams_selected_attributes]

initial_df_coaches = pd.read_csv(coaches_csv_file_path)
df_coaches = initial_df_coaches[coaches_selected_attributes]

initial_df_teams = pd.read_csv(teams_csv_file_path)
df_teams = initial_df_teams[teams_selected_attributes]

initial_df_awards_players = pd.read_csv(awards_players_csv_file_path)
df_awards_players = initial_df_awards_players[awards_players_selected_attributes]

# COMPETITION DATA
initial_df_competition_coaches = pd.read_csv(coaches_competition_table_path)
df_competition_coaches = initial_df_competition_coaches[list(set(initial_df_competition_coaches) & set(coaches_selected_attributes))]

initial_df_competition_teams = pd.read_csv(teams_competition_table_path)
df_competition_teams = initial_df_competition_teams[list(set(initial_df_competition_teams) & set(teams_selected_attributes))]

initial_df_competition_teams_players = pd.read_csv(players_teams_competition_table_path)
df_competition_teams_players = initial_df_competition_teams_players[list(set(initial_df_competition_teams_players) & set(players_teams_selected_attributes))]


First we have to combine the new and old datasets

In [25]:
df_teams = pd.concat([df_teams, df_competition_teams]) 
df_coaches = pd.concat([df_coaches, df_competition_coaches])
df_teams_players = pd.concat([df_teams_players, df_competition_teams_players])

df_teams_players.tail()


Unnamed: 0,playerID,year,tmID,points,rebounds,assists,steals,blocks,turnovers,fgAttempted,fgMade,ftAttempted,ftMade,threeAttempted,threeMade
143,wrighmo01w,11,MIN,,,,,,,,,,,,
144,wrighta01w,11,SEA,,,,,,,,,,,,
145,youngso01w,11,SAS,,,,,,,,,,,,
146,youngta01w,11,CHI,,,,,,,,,,,,
147,zellosh01w,11,IND,,,,,,,,,,,,


Select only the columns we want.

In [27]:
df_final_temp = df_teams[["tmID", "year", "playoff"]]
df_final_temp = df_final_temp.sort_values(["tmID", "year"], ascending=False)

df_final_temp["players"] = "" 
df_final_temp = df_final_temp.reset_index()
df_final_temp = df_final_temp.drop("index", axis=1)
df_final_temp.head()

for index, row in df_final_temp.iterrows():
  currentYear = row["year"]
  currentTeamId = row["tmID"]
  
  mask = df_teams_players[(df_teams_players["year"] == currentYear ) & (df_teams_players["tmID"] == currentTeamId)]
  
  df_final_temp["players"][index] = mask["playerID"].values


df_final_temp.head(20)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final_temp["players"][index] = mask["playerID"].values


Unnamed: 0,tmID,year,playoff,players
0,WAS,11,,"[ajavoma01w, anosini01w, cheekjo01w, chriska02..."
1,WAS,10,Y,"[ajavoma01w, beardal01w, blueni01w, colemma01w..."
2,WAS,9,N,"[beardal01w, blueni01w, currimo01w, gardnan01w..."
3,WAS,8,N,"[beardal01w, blueni01w, currimo01w, goringi01w..."
4,WAS,7,Y,"[beardal01w, blueni01w, byearla01w, jamesta01w..."
5,WAS,6,N,"[amachma01w, beardal01w, brownki01w, choneka01..."
6,WAS,5,Y,"[beardal01w, brownki01w, choneka01w, dalesst01..."
7,WAS,4,N,"[brownki01w, burgean01w, dalesst01w, henniso01..."
8,WAS,3,Y,"[brownki01w, bullevi01w, burgean01w, dalesst01..."
9,WAS,2,N,"[aldrima01w, bauerca01w, bullevi01w, burgean01..."


Calculate all the averages for all tables, so we can populate missing values later.

In [28]:
df_teams_players_average = df_teams_players.groupby("year").agg("mean", numeric_only=True)
df_teams_average = df_teams.groupby("year").agg("mean", numeric_only=True)
df_coaches_average = df_coaches.groupby("year").agg("mean", numeric_only=True)

df_teams_players_average = df_teams_players_average.reset_index()
df_teams_average = df_teams_average.reset_index()
df_coaches_average = df_coaches_average.reset_index()

df_teams_average.head(10)

Unnamed: 0,year,won,lost
0,1,16.0,16.0
1,2,16.0,16.0
2,3,16.0,16.0
3,4,17.0,17.0
4,5,17.0,17.0
5,6,17.0,17.0
6,7,17.0,17.0
7,8,17.0,17.0
8,9,17.0,17.0
9,10,17.0,17.0


Execute aggregation algorithm.

In [30]:
year_offset = 1

# loop all temp rows
df_final = df_final_temp
new_columns = ["averageWinRate", "averagePoints", "averageRebounds", "averageAssists", "averageSteals", "averageBlocks", "averageTurnovers", "averageFGRatio", "averageFTRatio", "averageThreeRatio", "coachWinRate", "numberOfAwardedPlayers"]
df_final[new_columns] = ""

for idx in range(len(df_final)):
  row = df_final.iloc[idx]
    
  currentYear = row["year"]
  currentTeamId = row["tmID"]
  players = row["players"]
  coach_id = df_coaches[(df_coaches["tmID"] == currentTeamId) & (df_coaches["year"] == currentYear)]["coachID"].values[0]
  
  if(currentYear == 1):
    continue
  
  previousYearExists = any((df_final["year"] == currentYear - year_offset) & (df_final["tmID"] == currentTeamId))
  
  # for all players get data from currentYear - year_offset
  players_data = df_teams_players[(df_teams_players["playerID"].isin(players)) & (df_teams_players["year"] == currentYear - year_offset)]
  teams_data = df_teams[(df_teams["tmID"] == currentTeamId) & (df_teams["year"] == currentYear - year_offset)]
  coach_data = df_coaches[(df_coaches["coachID"] == coach_id) & (df_coaches["year"] == currentYear - year_offset)]
  awards_data = df_awards_players[(df_awards_players["playerID"].isin(players)) & (df_awards_players["year"] < currentYear)]
  
  # Populate missing teams_players data
  missing_players = []
  
  for player in list(players):
    if not player in players_data["playerID"].values:
      missing_players.append(player)
  
  last_year_players_average = df_teams_players_average[df_teams_players_average["year"] == (currentYear - year_offset)]
  last_year_players_average["tmID"] = currentTeamId
  
  # populate players_data with avg players data
  for player in missing_players:
    last_year_players_average["playerID"] = player
    players_data = players_data.append(last_year_players_average)
    
  # Populate missing coaches data
  if len(coach_data) == 0:
    last_year_coaches_average = df_coaches_average[df_coaches_average["year"] == (currentYear - year_offset)]
    last_year_coaches_average["tmID"] = currentTeamId
    last_year_coaches_average["coachID"] = coach_id
    
    coach_data = coach_data.append(last_year_coaches_average)
  
  # Populate missing teams data
  if len(teams_data) == 0:
    last_year_teams_average = df_teams_average[df_teams_average["year"] == (currentYear - year_offset)]
    last_year_teams_average["tmID"] = currentTeamId
    
    teams_data = teams_data.append(last_year_teams_average)
  
  # calculate attributes
  averagePoints = players_data["points"].mean()
  averageRebounds = players_data["rebounds"].mean()
  averageAssists = players_data["assists"].mean()
  averageSteals = players_data["steals"].mean()
  averageBlocks = players_data["blocks"].mean()
  averageTurnovers = players_data["turnovers"].mean()
  averageFGRatio = (players_data["fgMade"] / players_data["fgAttempted"]).mean()
  averageFTRatio = (players_data["ftMade"] / players_data["ftAttempted"]).mean()
  averageThreeRatio = (players_data["threeMade"] / players_data["threeAttempted"]).mean()

  averageWinRate = (teams_data["won"] / (teams_data["won"] + teams_data["lost"])).mean()
  coachWinRate = (coach_data["won"] / (coach_data["won"] + coach_data["lost"])).mean()
  numberOfAwardedPlayers = len(awards_data)
  
  df_final["averagePoints"][idx] = averagePoints
  df_final["averageRebounds"][idx] = averageRebounds
  df_final["averageAssists"][idx] = averageAssists
  df_final["averageSteals"][idx] = averageSteals
  df_final["averageBlocks"][idx] = averageBlocks
  df_final["averageTurnovers"][idx] = averageTurnovers
  df_final["averageFGRatio"][idx] = averageFGRatio
  df_final["averageFTRatio"][idx] = averageFTRatio
  df_final["averageThreeRatio"][idx] = averageThreeRatio
  df_final["averageWinRate"][idx] = averageWinRate
  df_final["coachWinRate"][idx] = coachWinRate
  df_final["numberOfAwardedPlayers"][idx] = numberOfAwardedPlayers
  
df_final = df_final[df_final["year"] > year_offset]
df_final = df_final.drop("players", axis=1)

# write to JSON
df_json = df_final.to_json(orient="records")

new_file = open("./datasets/competition_final/generated/aggregated_competition_data.json", "w")
new_file.writelines(df_json)
new_file.close()

df_final.head(10)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  last_year_players_average["tmID"] = currentTeamId
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  last_year_players_average["playerID"] = player
  players_data = players_data.append(last_year_players_average)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  last_year_players_average["playerID"] = play

Unnamed: 0,tmID,year,playoff,averageWinRate,averagePoints,averageRebounds,averageAssists,averageSteals,averageBlocks,averageTurnovers,averageFGRatio,averageFTRatio,averageThreeRatio,coachWinRate,numberOfAwardedPlayers
0,WAS,11,,0.470588,244.629371,122.927739,47.384615,29.587413,12.310023,50.428904,0.418658,0.749509,0.280573,0.5,2
1,WAS,10,Y,0.294118,189.512143,84.572143,42.982857,19.696786,7.073929,44.290357,0.408655,0.717564,0.306577,0.5,0
2,WAS,9,N,0.470588,189.708943,82.593336,36.207412,20.147909,8.820809,38.090105,0.402735,0.800336,0.276818,0.5,2
3,WAS,8,N,0.529412,239.624444,95.275556,50.680741,25.66,10.042222,44.559259,0.434105,0.760861,0.303131,0.529412,2
4,WAS,7,Y,0.470588,216.456929,88.321161,42.381086,23.959738,8.20412,39.441011,0.422553,0.743379,0.337256,0.470588,2
5,WAS,6,N,0.5,193.30631,92.482727,39.100875,21.625518,11.273146,39.394288,0.429846,0.706172,0.334885,0.4375,1
6,WAS,5,Y,0.264706,240.308333,107.468519,54.680556,23.091667,9.691667,44.64537,0.409827,0.702331,0.30017,0.5,1
7,WAS,4,N,0.53125,167.989247,78.873425,43.032873,17.364055,6.389555,35.870968,0.401379,0.727574,0.310486,0.53125,1
8,WAS,3,Y,0.3125,157.101648,91.43956,37.078297,21.982143,11.538462,39.072802,0.387412,0.690721,0.276616,0.5,0
9,WAS,2,N,0.4375,216.313874,92.934066,39.79739,23.455357,11.973214,44.57967,0.439097,0.696891,0.319172,0.5,0


Split finished aggregated data for the 11th year which we have to predict.

In [38]:
json_data_path = "./datasets/competition_final/generated/aggregated_competition_data.json"

df = pd.read_json(json_data_path)

df_original_data = df[df["year"] < 11]
df_competition_data = df[df["year"] == 11]

# write to JSON
df_og_json = df_original_data.to_json(orient="records")
new_file = open("./datasets/competition_final/generated/final_year_1_to_10_data.json", "w")
new_file.writelines(df_og_json)
new_file.close()

df_cd_json = df_competition_data.to_json(orient="records")
new_file = open("./datasets/competition_final/generated/final_year_11_data.json", "w")
new_file.writelines(df_cd_json)
new_file.close()

df_competition_data.head(20)

len(df_competition_data)

12