# Clean dataset
Exploration of the Kaggle dataset [NBA Players stats since 1950](https://www.kaggle.com/drgilermo/nba-players-stats).

In [1]:
import pandas as pd
from numpy import floor

## Load players dataset

In [2]:
players_df = pd.read_csv('../data/raw/Players.csv', header=0, index_col=0)
players_df.head()

Unnamed: 0_level_0,Player,height,weight,collage,born,birth_city,birth_state
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,Curly Armstrong,180.0,77.0,Indiana University,1918.0,,
1,Cliff Barker,188.0,83.0,University of Kentucky,1921.0,Yorktown,Indiana
2,Leo Barnhorst,193.0,86.0,University of Notre Dame,1924.0,,
3,Ed Bartels,196.0,88.0,North Carolina State University,1925.0,,
4,Ralph Beard,178.0,79.0,University of Kentucky,1927.0,Hardinsburg,Kentucky


In [3]:
players_df.describe()

Unnamed: 0,height,weight,born
count,3921.0,3921.0,3921.0
mean,198.704922,94.783219,1962.37975
std,9.269761,12.039515,20.33491
min,160.0,60.0,1913.0
25%,190.0,86.0,1948.0
50%,198.0,95.0,1964.0
75%,206.0,102.0,1979.0
max,231.0,163.0,1997.0


In [4]:
players_df.isnull().sum()

Player           1
height           1
weight           1
collage        349
born             1
birth_city     470
birth_state    483
dtype: int64

In [5]:
players_df.loc[players_df.isnull()['Player'],:]

Unnamed: 0_level_0,Player,height,weight,collage,born,birth_city,birth_state
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
223,,,,,,,


There is one player whose values are all null, so we will remove this player.

In [6]:
players_df.drop(223, axis=0, inplace=True)

In [7]:
players_df.nunique()

Player         3921
height           28
weight           76
collage         422
born             84
birth_city     1264
birth_state     128
dtype: int64

The player names are unique so we will use this as the index.

In [8]:
players_df.set_index('Player', inplace=True)
players_df.head()

Unnamed: 0_level_0,height,weight,collage,born,birth_city,birth_state
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Curly Armstrong,180.0,77.0,Indiana University,1918.0,,
Cliff Barker,188.0,83.0,University of Kentucky,1921.0,Yorktown,Indiana
Leo Barnhorst,193.0,86.0,University of Notre Dame,1924.0,,
Ed Bartels,196.0,88.0,North Carolina State University,1925.0,,
Ralph Beard,178.0,79.0,University of Kentucky,1927.0,Hardinsburg,Kentucky


## Load season stats dataset

In [9]:
season_stats_df = pd.read_csv('../data/raw/Seasons_Stats.csv', header=0, index_col=0)
season_stats_df.head(20)

Unnamed: 0_level_0,Year,Player,Pos,Age,Tm,G,GS,MP,PER,TS%,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1950.0,Curly Armstrong,G-F,31.0,FTW,63.0,,,,0.368,...,0.705,,,,176.0,,,,217.0,458.0
1,1950.0,Cliff Barker,SG,29.0,INO,49.0,,,,0.435,...,0.708,,,,109.0,,,,99.0,279.0
2,1950.0,Leo Barnhorst,SF,25.0,CHS,67.0,,,,0.394,...,0.698,,,,140.0,,,,192.0,438.0
3,1950.0,Ed Bartels,F,24.0,TOT,15.0,,,,0.312,...,0.559,,,,20.0,,,,29.0,63.0
4,1950.0,Ed Bartels,F,24.0,DNN,13.0,,,,0.308,...,0.548,,,,20.0,,,,27.0,59.0
5,1950.0,Ed Bartels,F,24.0,NYK,2.0,,,,0.376,...,0.667,,,,0.0,,,,2.0,4.0
6,1950.0,Ralph Beard,G,22.0,INO,60.0,,,,0.422,...,0.762,,,,233.0,,,,132.0,895.0
7,1950.0,Gene Berce,G-F,23.0,TRI,3.0,,,,0.275,...,0.0,,,,2.0,,,,6.0,10.0
8,1950.0,Charlie Black,F-C,28.0,TOT,65.0,,,,0.346,...,0.651,,,,163.0,,,,273.0,661.0
9,1950.0,Charlie Black,F-C,28.0,FTW,36.0,,,,0.362,...,0.632,,,,75.0,,,,140.0,382.0


In [10]:
season_stats_df.describe()

Unnamed: 0,Year,Age,G,GS,MP,PER,TS%,3PAr,FTr,ORB%,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
count,24624.0,24616.0,24624.0,18233.0,24138.0,24101.0,24538.0,18839.0,24525.0,20792.0,...,23766.0,20797.0,20797.0,24312.0,24624.0,20797.0,20797.0,19645.0,24624.0,24624.0
mean,1992.594989,26.664405,50.83711,23.593375,1209.720317,12.479071,0.493001,0.158604,0.325455,6.181565,...,0.719279,62.18921,147.199404,224.637381,114.852623,39.897052,24.47026,73.939832,116.339222,510.11635
std,17.429594,3.841892,26.496161,28.632387,941.146575,6.039014,0.094469,0.187495,0.218971,4.872685,...,0.141824,67.324881,145.921912,228.190203,135.863913,38.713053,36.935084,67.713803,84.791873,492.922981
min,1950.0,18.0,1.0,0.0,0.0,-90.6,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1981.0,24.0,27.0,0.0,340.0,9.8,0.458,0.005,0.208,2.6,...,0.657,12.0,33.0,51.0,19.0,9.0,3.0,18.0,39.0,106.0
50%,1996.0,26.0,58.0,8.0,1053.0,12.7,0.506,0.064,0.296,5.4,...,0.743,38.0,106.0,159.0,68.0,29.0,11.0,55.0,109.0,364.0
75%,2007.0,29.0,75.0,45.0,1971.0,15.6,0.544,0.288,0.4,9.0,...,0.808,91.0,212.0,322.0,160.0,60.0,29.0,112.0,182.0,778.0
max,2017.0,44.0,88.0,83.0,3882.0,129.1,1.136,1.0,6.0,100.0,...,1.0,587.0,1111.0,2149.0,1164.0,301.0,456.0,464.0,386.0,4029.0


There are several rows with all null values, so we will remove these.

In [11]:
season_stats_df.dropna(how='all', inplace=True)

In [12]:
season_stats_df.isnull().sum()

Year          0
Player        0
Pos           0
Age           8
Tm            0
G             0
GS         6391
MP          486
PER         523
TS%          86
3PAr       5785
FTr          99
ORB%       3832
DRB%       3832
TRB%       3053
AST%       2069
STL%       3832
BLK%       3832
TOV%       5042
USG%       4984
blanl     24624
OWS          39
DWS          39
WS           39
WS/48       523
blank2    24624
OBPM       3827
DBPM       3827
BPM        3827
VORP       3827
FG            0
FGA           0
FG%          99
3P         5697
3PA        5697
3P%        9208
2P            0
2PA           0
2P%         128
eFG%         99
FT            0
FTA           0
FT%         858
ORB        3827
DRB        3827
TRB         312
AST           0
STL        3827
BLK        3827
TOV        4979
PF            0
PTS           0
dtype: int64

In [13]:
print('{no_rows} total rows'.format(no_rows=len(season_stats_df)))
print('{no_rows} unique combinations of Year, Player, Pos, and Tm'.format(no_rows=len(season_stats_df.groupby(['Year', 'Player', 'Pos', 'Tm']))))

24624 total rows
24624 unique combinations of Year, Player, Pos, and Tm


Each row is a unique combination of year, player, position, and team.

## Player decade
We are only interested in player from 1970 onwards, so we will remove the rows we are not interested in. In addition, we are only interested in the decades in which each player played, so we will extract this information and add it to the players data frame.

In [14]:
season_stats_df = season_stats_df[season_stats_df['Year'] >= 1970]

In [15]:
decade_mapping = {
    70: '70s',
    80: '80s',
    90: '90s',
    0: '00s',
    10: '10s'
}

def extract_decade(years):
    decades = years.apply(lambda year: int(floor((year % 100) / 10) * 10)) \
                   .unique() \
                   .tolist()
            
    decades = list(map(lambda decade: decade_mapping[decade], decades))
    
    return decades

In [16]:
player_decades = season_stats_df.groupby('Player')['Year'].apply(extract_decade)
player_decades.name = 'decades'

players_df = pd.concat([players_df, player_decades], join='inner', axis=1)

In [17]:
players_df.head()

Unnamed: 0_level_0,height,weight,collage,born,birth_city,birth_state,decades
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Freddie Lewis,188.0,88.0,Eastern Kentucky University,1921.0,Brooklyn,New York,[70s]
Bob Cousy*,185.0,79.0,College of the Holy Cross,1928.0,New York,New York,[70s]
Richie Guerin*,193.0,88.0,Iona College,1932.0,New York,New York,[70s]
Jim Paxson,198.0,90.0,University of Dayton,1932.0,Pennville,Indiana,"[80s, 90s]"
Hot Rod,193.0,83.0,,1935.0,,,"[80s, 90s]"


Note that we did an inner join, so the player now in the players data frame are only the players with records in both the players and season stats datasets.

## Player positions
There are five primary positions in basketball:

* Point guard (PG)
* Shooting guard (SG)
* Small forward (SF)
* Power forward (PF)
* Centre (C)

However, there are many different positions values in the dataset, all of which are combinations of the above five positions.

In [18]:
season_stats_df['Pos'].unique()

array(['PF', 'C', 'PG', 'SF', 'G', 'SG', 'SF-PF', 'F', 'G-F', 'PF-SF',
       'F-C', 'SF-SG', 'C-F', 'SG-PG', 'C-PF', 'PF-C', 'SF-PG', 'C-SF',
       'SG-SF', 'PG-SG', 'PG-SF', 'SG-PF'], dtype=object)

We will create a mapping from the positions in the dataset to the default five. Each player has potentially played in multiple positions throughout their career. For the puposes of choosing a dream team, we will say a player is eligible for a position if they have played there at least once.

In [19]:
position_mapping = {
    'G-F': ['PG', 'SG', 'SF', 'PF'],
    'F': ['SF', 'PF'],
    'G': ['PG', 'SG'],
    'F-C': ['SF', 'PF', 'C'],
    'F-G': ['PG', 'SG', 'SF', 'PF'],
    'C-F': ['SF', 'PF', 'C'],
    'PF-C': ['PF', 'C'],
    'SF-SG': ['SF', 'SG'],
    'C-PF': ['PF', 'C'],
    'SG-SF': ['SF', 'SG'],
    'PF-SF': ['PF', 'SF'],
    'SF-PF': ['SF', 'PF'],
    'SG-PG': ['SG', 'PG'],
    'SF-PG': ['SF', 'PG'],
    'C-SF': ['C', 'SF'],
    'PG-SG': ['PG', 'SG'],
    'PG-SF': ['PG', 'SF'],
    'SG-PF': ['SG', 'PF']
}

season_stats_df['Pos'] = season_stats_df['Pos'].apply(lambda pos: position_mapping[pos] if pos in position_mapping else [pos])

Next we will determine the positions each player has played and add that information to the players data frame.

In [20]:
players_df['positions'] = season_stats_df.groupby('Player')['Pos'].apply(lambda df: list(set([val for sublist in df.values for val in sublist])))

players_df.head()

Unnamed: 0_level_0,height,weight,collage,born,birth_city,birth_state,decades,positions
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Freddie Lewis,188.0,88.0,Eastern Kentucky University,1921.0,Brooklyn,New York,[70s],[PG]
Bob Cousy*,185.0,79.0,College of the Holy Cross,1928.0,New York,New York,[70s],[PG]
Richie Guerin*,193.0,88.0,Iona College,1932.0,New York,New York,[70s],[SG]
Jim Paxson,198.0,90.0,University of Dayton,1932.0,Pennville,Indiana,"[80s, 90s]","[SG, PG]"
Hot Rod,193.0,83.0,,1935.0,,,"[80s, 90s]","[PF, C]"


Quick sanitiy check to ensure we only have the standard five positions.

In [21]:
print(set([position for sublist in players_df['positions'] for position in sublist]))

{'C', 'PF', 'PG', 'SG', 'SF'}


We do! Lets move on.

## Available stats
There are many columns in this dataset, whose meanings can be found [here](https://www.basketball-reference.com/about/glossary.html). In the first instance we will only consider stats available for all players when choosing our dream team. These are:

| Code   | Description   |
|:------ |:------------- |
| G      | Games |
| MP     | Minutes Played |
| OWS    | Offensive Win Shares (see [calculating win shares](https://www.basketball-reference.com/about/ws.html) for more information) |
| DWS    | Defensive Win Shares (see [calculating win shares](https://www.basketball-reference.com/about/ws.html) for more information) |
| WS     | Win Shares; an estimate of the number of wins contributed by a player (see [calculating win shares](https://www.basketball-reference.com/about/ws.html) for more information) |
| FG     | Field Goals (includes both 2-point field goals and 3-point field goals) |
| FGA    | Field Goal Attempts (includes both 2-point field goal attempts and 3-point field goal attempts) |
| 2P     | 2-Point Field Goals |
| 2PA    | 2-Point Field Goal Attempts |
| FT     | Free Throws |
| FTA    | Free Throw Attempts 
| TRB    | Total Rebounds |
| AST    | Assists |
| PF     | Personal Fouls |
| PTS    | Points |

Notice that some incomplete columns can be calculated from the complete columns:

* Field goal percentage (FG%) = FG / FGA.
* 2-point field goal percentage (2P%) = 2P / 2PA.
* 3-point field goals (3P) = FG - 2P.
* 3-point field goal attempts (3PA) = FGA - 2PA.
* 3-point field goal percentage (3P%) = 3P / 3PA.
* Free throw percentage (FT%) = FT / FTA.

Some other metrics potentially of interest are:

* Assists per game (APG) = AST / G.
* Points per game (PPG) = PTS / G.
* Personal fouls per game (PFPG) = PF / G.
* [Win shares per 48 minutes](https://www.basketball-reference.com/leaders/ws_per_48_career.html) (WS48) = 48 * WS / MP

We will go ahead and caculate these metrics, but first lets drop the non-complete columns. In addition, we're not worried about the players' teams and ages, so we can also drop those columns. We have already calculated the decades each player played in, and their positions, so these columns can go also.

In [22]:
season_stats_df.dropna(how='any', axis=1, inplace=True)
season_stats_df.drop(['Year', 'Tm', 'Age', 'Pos'], inplace=True, axis=1)

season_stats_df.head()

Unnamed: 0_level_0,Player,G,MP,OWS,DWS,WS,FG,FGA,2P,2PA,FT,FTA,TRB,AST,PF,PTS
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2867,Zaid Abdul-Aziz,80.0,1637.0,0.0,2.0,2.0,237.0,546.0,237.0,546.0,119.0,185.0,603.0,62.0,167.0,593.0
2868,Kareem Abdul-Jabbar*,82.0,3534.0,9.3,4.5,13.8,938.0,1810.0,938.0,1810.0,485.0,743.0,1190.0,337.0,283.0,2361.0
2869,Rick Adelman,35.0,717.0,-0.2,0.9,0.7,96.0,247.0,96.0,247.0,68.0,91.0,81.0,113.0,90.0,260.0
2870,Lucius Allen,81.0,1817.0,1.2,1.0,2.2,306.0,692.0,306.0,692.0,182.0,249.0,211.0,342.0,201.0,794.0
2871,Wally Anderzunas,44.0,370.0,-0.4,0.3,-0.2,65.0,166.0,65.0,166.0,29.0,46.0,82.0,9.0,47.0,159.0


We must now group by player, creating a per-player row where the column values are combined additively. We can then join this into the players data frame.

In [23]:
season_stats_by_player = season_stats_df.groupby('Player') \
                                        .apply(lambda df: df.iloc[:, 1:].sum())

In [24]:
players_df = pd.concat([players_df, season_stats_by_player], join='inner', axis=1)

players_df.head()

Unnamed: 0_level_0,height,weight,collage,born,birth_city,birth_state,decades,positions,G,MP,...,FG,FGA,2P,2PA,FT,FTA,TRB,AST,PF,PTS
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Freddie Lewis,188.0,88.0,Eastern Kentucky University,1921.0,Brooklyn,New York,[70s],[PG],32.0,552.0,...,81.0,199.0,81.0,199.0,62.0,77.0,47.0,56.0,58.0,224.0
Bob Cousy*,185.0,79.0,College of the Holy Cross,1928.0,New York,New York,[70s],[PG],7.0,34.0,...,1.0,3.0,1.0,3.0,3.0,3.0,5.0,10.0,11.0,5.0
Richie Guerin*,193.0,88.0,Iona College,1932.0,New York,New York,[70s],[SG],8.0,64.0,...,3.0,11.0,3.0,11.0,1.0,1.0,2.0,12.0,9.0,7.0
Jim Paxson,198.0,90.0,University of Dayton,1932.0,Pennville,Indiana,"[80s, 90s]","[SG, PG]",829.0,22158.0,...,4682.0,9431.0,4579.0,8975.0,2079.0,2572.0,1638.0,2376.0,1515.0,11546.0
Hot Rod,193.0,83.0,,1935.0,,,"[80s, 90s]","[PF, C]",887.0,26327.0,...,3688.0,7681.0,3686.0,7662.0,2406.0,3315.0,5998.0,1592.0,2253.0,9784.0


One final cleaning step. If a player has played zero minutes they're unlikely to make the dream team, so lets drop these players.

In [25]:
players_df = players_df.loc[players_df['MP'] > 0, :]

We can now calculate the additional metrics discussed above.

In [26]:
players_df['FG%'] = players_df.loc[:, ['FG', 'FGA']].apply(lambda row: row['FG'] / row['FGA'] if row['FGA'] != 0 else 0, axis=1)
players_df['2P%'] = players_df.loc[:, ['2P', '2PA']].apply(lambda row: row['2P'] / row['2PA'] if row['2PA'] != 0 else 0, axis=1)
players_df['3P'] = players_df.loc[:, ['FG', '2P']].apply(lambda row: row['FG'] - row['2P'], axis=1)
players_df['3PA'] = players_df.loc[:, ['FGA', '2PA']].apply(lambda row: row['FGA'] - row['2PA'], axis=1)
players_df['3P%'] = players_df.loc[:, ['3P', '3PA']].apply(lambda row: row['3P'] / row['3PA'] if row['3PA'] != 0 else 0, axis=1)
players_df['FT%'] = players_df.loc[:, ['FT', 'FTA']].apply(lambda row: row['FT'] / row['FTA'] if row['FTA'] != 0 else 0, axis=1)
players_df['APG'] = players_df.loc[:, ['AST', 'G']].apply(lambda row: row['AST'] / row['G'] if row['G'] != 0 else 0, axis=1)
players_df['PPG'] = players_df.loc[:, ['PTS', 'G']].apply(lambda row: row['PTS'] / row['G'] if row['G'] != 0 else 0, axis=1)
players_df['PFPG'] = players_df.loc[:, ['PF', 'G']].apply(lambda row: row['PF'] / row['G'] if row['G'] != 0 else 0, axis=1)
players_df['WS48'] = players_df.loc[:, ['WS', 'MP']].apply(lambda row: 48 * row['WS'] / row['MP'], axis=1)

Furthermore, we only need to keep the relative metrics. For example, we only need FG%, not FG and FGA.

In [27]:
players_df.drop(['FG', 'FGA', '2P', '2PA', '3P', '3PA', 'FT', 'FTA', 'AST', 'PF', 'PTS'], axis=1, inplace=True)

players_df.head()

Unnamed: 0_level_0,height,weight,collage,born,birth_city,birth_state,decades,positions,G,MP,...,WS,TRB,FG%,2P%,3P%,FT%,APG,PPG,PFPG,WS48
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Freddie Lewis,188.0,88.0,Eastern Kentucky University,1921.0,Brooklyn,New York,[70s],[PG],32.0,552.0,...,0.5,47.0,0.407035,0.407035,0.0,0.805195,1.75,7.0,1.8125,0.043478
Bob Cousy*,185.0,79.0,College of the Holy Cross,1928.0,New York,New York,[70s],[PG],7.0,34.0,...,0.1,5.0,0.333333,0.333333,0.0,1.0,1.428571,0.714286,1.571429,0.141176
Richie Guerin*,193.0,88.0,Iona College,1932.0,New York,New York,[70s],[SG],8.0,64.0,...,-0.1,2.0,0.272727,0.272727,0.0,1.0,1.5,0.875,1.125,-0.075
Jim Paxson,198.0,90.0,University of Dayton,1932.0,Pennville,Indiana,"[80s, 90s]","[SG, PG]",829.0,22158.0,...,55.0,1638.0,0.496448,0.510195,0.225877,0.80832,2.866104,13.927624,1.827503,0.119144
Hot Rod,193.0,83.0,,1935.0,,,"[80s, 90s]","[PF, C]",887.0,26327.0,...,70.6,5998.0,0.480146,0.481075,0.105263,0.725792,1.794814,11.03044,2.540023,0.12872


Lets store this as an interim dataset for later use.

In [28]:
players_df.to_csv('../data/interim/player_data_cleaned.csv', header=True, index=True, sep=',')