In [366]:
import pandas as pd
import numpy as np
import os
import plotly.express as px

In [367]:
# Read the three raw CSV inputs from the notebook's working directory.
players, pstats, sstats = map(
    pd.read_csv,
    ('Players.csv', 'player_data.csv', 'Seasons_Stats.csv'),
)

### Data Cleaning
First I removed every player whose name is shared with another player, because the tables are later joined on the player's name.

In [368]:
# Collect every name that appears more than once in pstats; such players
# cannot be told apart by name alone.
repeated_rows = pstats['name'].duplicated()
duplicate_names = pstats.loc[repeated_rows, 'name'].drop_duplicates().to_list()
# Two names are excluded by hand as well.
# NOTE(review): the reason for these manual additions is not visible here.
duplicate_names.extend(['John Lucas', 'John Lucas III'])

# Remove the ambiguous names from both the player table and the season table.
pstats = pstats[~pstats['name'].isin(duplicate_names)]
sstats = sstats[~sstats['Player'].isin(duplicate_names)]

# Drop players with a missing height or a missing weight from both tables.
height_missing = pstats['height'].isna()
weight_missing = pstats['weight'].isna()
no_height = pstats.loc[height_missing, 'name'].to_list()
no_weight = pstats.loc[weight_missing, 'name'].to_list()

# Filtering once against the combined list gives the same rows as filtering
# by each list in turn.
incomplete_names = no_height + no_weight
pstats = pstats[~pstats['name'].isin(incomplete_names)]
sstats = sstats[~sstats['Player'].isin(incomplete_names)]

Keep only the player, year, age, PER and minutes played (MP) columns, and drop rows containing NaN values.

In [369]:
# Restrict the season table to the columns used below, then discard any row
# that has a missing value in one of them.
season_columns = ['Player', 'Year', 'Age', 'PER', 'MP']
sstats = sstats.loc[:, season_columns].dropna()

Some players played for multiple teams in one season. To fix this issue I collapsed each player-season into a single row, taking the average of PER weighted by minutes played (MP).

In [370]:
# Minutes played per row, captured before `sstats` is overwritten below so the
# weights always come from the un-aggregated table. (The previous version read
# them from `df`, a name that is never defined in this notebook.)
minutes_played = sstats['MP']


def w_avg(x):
    """Return the mean of ``x`` weighted by minutes played.

    Parameters
    ----------
    x : pandas.Series
        One (Player, Year) group of a column of ``sstats``. Its index labels
        are used to look up the matching ``MP`` values.

    Returns
    -------
    float
        The MP-weighted average of ``x``, or the plain mean when the group's
        minutes sum to zero.
    """
    weights = minutes_played.loc[x.index]
    # np.average raises ZeroDivisionError when the weights sum to zero.
    if weights.sum() == 0:
        return x.mean()
    return np.average(x, weights=weights)


# Age is averaged normally; PER is averaged with minutes played as weights.
f = {'Age': ['mean'], 'PER': w_avg}

sstats = sstats.groupby(['Player', 'Year']).agg(f)
# The aggregation spec produces two-level column labels; keep only the top
# level so the columns are simply 'Age' and 'PER'.
sstats.columns = sstats.columns.droplevel(level=1)
sstats = sstats.reset_index()

Now I will add the players' attributes (position, height and weight) to the table.

In [371]:
# Change height and weight to metric
CM_PER_INCH = 2.54
# Exact pound-to-kilogram factor; the rounded 0.45 used before understated
# every weight by about 0.8%. Assumes `weight` is recorded in pounds.
KG_PER_LB = 0.45359237

# The height string is parsed positionally: the first character is the number
# of feet, the character at index 1 is a separator, and the rest is inches.
feet = pstats['height'].str[:1].astype(int)
inches = pstats['height'].str[2:].astype(int)

pstats = pstats.assign(
    height_metric=(feet * 12 + inches) * CM_PER_INCH,  # centimetres
    weight_metric=pstats['weight'] * KG_PER_LB,        # kilograms
    Player=pstats['name'],                             # join key matching sstats
)

# Attach each player's attributes to every one of their season rows. The left
# join keeps seasons whose player has no attribute row (attributes become NaN).
data = pd.merge(sstats, pstats, on='Player', how='left')[
    ['Player', 'Year', 'Age', 'PER', 'position', 'height_metric', 'weight_metric']
]

Here is a plot of PER over time for the ten players with the longest careers (measured by number of seasons in the data).

In [375]:
# Count the season rows per player (non-null Age entries) and keep the ten
# players with the most.
seasons_per_player = data.groupby('Player')['Age'].count()
long_c = seasons_per_player.sort_values(ascending=False).head(10).index.to_list()
df_x = data[data['Player'].isin(long_c)]

# One PER-versus-year line per selected player.
fig = px.line(df_x, x='Year', y='PER', color='Player')
fig.show()