In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load Data

In [2]:
# Read the raw team-records CSV and preview the first rows.
raw_csv_path = "../raw_data/Team_Records_NBA_until_2017.csv"
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,Season,Lg,Team,W,L,W/L%,Finish,SRS,Pace,Rel_Pace,ORtg,Rel_ORtg,DRtg,Rel_DRtg,Playoffs,Coaches,Top WS
0,2017-18,NBA,Boston Celtics,29,10,0.744,1,4.38,95.4,-1.7,108.0,0.2,102.8,-5.0,,B. Stevens (29-10),K. Irving (5.7)
1,2016-17,NBA,Boston Celtics*,53,29,0.646,1,2.25,96.8,0.4,111.2,2.4,108.4,-0.4,Lost E. Conf. Finals,B. Stevens (53-29),I. Thomas (12.5)
2,2015-16,NBA,Boston Celtics*,48,34,0.585,2,2.84,98.5,2.7,106.8,0.4,103.6,-2.8,Lost E. Conf. 1st Rnd.,B. Stevens (48-34),I. Thomas (9.7)
3,2014-15,NBA,Boston Celtics*,40,42,0.488,2,-0.4,95.8,1.9,104.7,-0.9,104.5,-1.1,Lost E. Conf. 1st Rnd.,B. Stevens (40-42),T. Zeller (6.5)
4,2013-14,NBA,Boston Celtics,25,57,0.305,4,-4.97,93.3,-0.6,102.9,-3.8,107.7,1.0,,B. Stevens (25-57),B. Bass (5.1)


## Format of the data:

- Win/loss records per team per season, up to the 2017-18 season
- We only need NBA data, so earlier league formats such as BAA or ABA are excluded
- We only need Season, Team, W, L columns

## Preprocessing

In [8]:
# Keep only the NBA rows (the raw file also contains other leagues, e.g. BAA/ABA).
# .copy() turns the filtered result into an independent frame, so the column
# assignments below do not write into a slice of the raw data
# (avoids SettingWithCopyWarning and ambiguous view/copy behavior).
df = df[df["Lg"] == "NBA"].copy()

# "Season" is formatted like "2017-18"; keep only the four-character start year.
# The vectorized .str accessor replaces the per-row lambda; the value stays a string.
df["Season"] = df["Season"].str[:4]

# The source data has no ties column, so add one filled with zeros to match
# the output schema used further down.
df["T"] = 0

## Create new dataframe with desired output

In [9]:
# Empty frame that declares the column layout of the prepared output.
output_columns = ["Year", "Teams", "Wins", "Losses", "Ties", "#Games"]
df_nba_data = pd.DataFrame(columns=output_columns)

In [10]:
# Collapse the per-team rows into one record per season. Each record holds the
# season's start year plus parallel lists of teams, wins, losses and ties.
grouped = df.groupby('Season')

season_records = []
for season, season_rows in grouped:
    wins = season_rows['W'].tolist()
    losses = season_rows['L'].tolist()
    ties = season_rows['T'].tolist()

    season_records.append({
        "Year": season,
        "Teams": season_rows['Team'].tolist(),
        "Wins": wins,
        "Losses": losses,
        "Ties": ties,
        # Game count is taken from the first team listed for the season.
        # NOTE(review): assumes every team played the same number of games
        # in a given season -- confirm for early/shortened seasons.
        "#Games": wins[0] + losses[0] + ties[0],
    })

# Build the frame once from the collected records instead of calling pd.concat
# inside the loop: that was quadratic, concatenated onto an empty frame, and left
# every row with index label 0. `columns=` pins the column order even when
# there are no records.
df_nba_data = pd.DataFrame(
    season_records,
    columns=["Year", "Teams", "Wins", "Losses", "Ties", "#Games"],
)

In [12]:
# Persist the prepared per-season table as Parquet, without the row index.
prepared_parquet_path = "../prepared_data/NBA_data.parquet"
df_nba_data.to_parquet(prepared_parquet_path, index=False)