In [1]:
import pandas as pd
import numpy as np
import os
import plotly.express as px

In [2]:
players = pd.read_csv('Players.csv')
pstats = pd.read_csv('player_data.csv')
sstats = pd.read_csv('Seasons_Stats.csv')

### Data Cleaning
First I removed players with the same name.

In [3]:
duplicate_names = pstats['name'][pstats['name'].duplicated()].drop_duplicates().to_list()
duplicate_names.append('John Lucas')
duplicate_names.append('John Lucas III')
pstats = pstats.loc[~pstats['name'].isin(duplicate_names)]
sstats = sstats.loc[~sstats['Player'].isin(duplicate_names)]

# Drop Players with no height or no weight
no_height = pstats.loc[pstats['height'].isna()]['name'].to_list()
no_weight = pstats.loc[pstats['weight'].isna()]['name'].to_list()
pstats = pstats.loc[~pstats['name'].isin(no_height)]
sstats = sstats.loc[~sstats['Player'].isin(no_height)]
pstats = pstats.loc[~pstats['name'].isin(no_weight)]
sstats = sstats.loc[~sstats['Player'].isin(no_weight)]

Select only PER and MP and drop NaN values

In [4]:
sstats = sstats[['Player','Year','Age','PER','MP']].dropna(how='any')

Some players played for multiple teams in one season. To fix this issue I took the weighted average of the PER and minutes played

In [5]:
w_avg = lambda x: np.average(x, weights=sstats.loc[x.index, "MP"])
f = {'Age': ['mean'],'PER': w_avg}

sstats = sstats.groupby(['Player','Year']).agg(f)
sstats.columns = sstats.columns.droplevel(level=1)
sstats.reset_index(inplace=True)

Now I will add the players attributes to the table.

In [6]:
# Change height and weight to metric
pstats = pstats.assign(height_metric=pstats.height.str[:1].astype(int) * 30.48 + pstats.height.str[2:].astype(int) * 30.48 / 12)
pstats['weight_metric'] = pstats.weight * 0.45

pstats['Player'] = pstats.name

data = pd.merge(sstats, pstats, on='Player', how='left')[['Player','Year','Age','PER','position','height_metric','weight_metric','year_start']]

Here is a plot of PER over time, for the players with the longest careers

In [114]:
long_c = data.groupby(['Player']).count().sort_values(by='Age', ascending=False).head(10).index.to_list()
df_x = data.loc[data.Player.isin(long_c)]

fig = px.line(df_x, x = 'Age', y='PER', color='Player')
fig.show()

In [10]:
x = df.groupby(['Age','ys']).mean()
x.reset_index(inplace=True)
fig = px.line(x, x = 'Age', y='PER', color='ys')
fig.show()

In [11]:
x = df.groupby(['Age']).mean()
x.reset_index(inplace=True)
fig = px.line(x, x = 'Age', y='PER')
fig.show()

In [12]:
y= df.groupby(['Age']).count()
fig = px.line(y, x = y.index, y='Player')
fig.show()

In [46]:
y= df.groupby(['year_start']).mean()
fig = px.line(y, x = y.index, y='PER')
fig.show()

## Feature Engineering
Lets begin by creating lagging PER variables

In [145]:
df = data.sort_values(by='Year')

df = df.loc[df.Age <= 35]
df = df.dropna(how='any')
df['PER_Lag_1'] = df.groupby(['Player'])['PER'].shift()
df['PER_Lag_2'] = df.groupby(['Player'])['PER_Lag_1'].shift()
df['PER_Lag_3'] = df.groupby(['Player'])['PER_Lag_2'].shift()
df['PER_Lag_4'] = df.groupby(['Player'])['PER_Lag_3'].shift()
df['PER_Lag_5'] = df.groupby(['Player'])['PER_Lag_4'].shift()
df['Career_Length'] = df['Year']-df['year_start']
df = df.fillna(0)

df['PER_Diff_1'] = np.where(df['PER_Lag_2']!=0, df['PER_Lag_1']-df['PER_Lag_2'],0)
df['PER_Diff_2'] = np.where(df['PER_Lag_3']!=0, df['PER_Lag_2']-df['PER_Lag_3'],0)
df['PER_Diff_3'] = np.where(df['PER_Lag_4']!=0, df['PER_Lag_3']-df['PER_Lag_4'],0)
df['PER_Diff_4'] = np.where(df['PER_Lag_5']!=0, df['PER_Lag_4']-df['PER_Lag_5'],0)

df['S_age'] = df['Age']-df['Career_Length']
df['ys'] = np.where(df['S_age']<=21,'1','0')
df.to_csv('clean_data.csv')
x = df.groupby(['Age']).mean()


## Fitting a model


In [158]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators = 25, random_state = 42)

train = df.loc[df['Year']<2010]
test = df.loc[df['Year']>=2010]

feature_list = list(train.drop('PER', axis = 1).columns)

train_labels = np.array(train['PER'])
train_features= np.array(train.drop(['PER','Player', 'position'], axis = 1))
test_labels = np.array(test['PER'])
test_features= np.array(test.drop(['PER','Player', 'position'], axis = 1))



rf.fit(train_features, train_labels)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=25,
                      n_jobs=None, oob_score=False, random_state=42, verbose=0,
                      warm_start=False)

In [148]:
x = df.groupby(['Age']).mean()
x.reset_index(inplace=True)
fig = px.line(x, x = 'Age', y='PER')
fig.show()

In [138]:
test['PER_est'] = rf.predict(np.array(test.drop(['PER','Player', 'position'], axis = 1)))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [160]:
predictions = rf.predict(test_features)

errors = abs(predictions - test_labels)

print('Mean Absolute Error:', round(np.mean(errors), 2), '')


Mean Absolute Error: 1.7 


In [161]:

mape = 100 * (errors / test_labels)

accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')


Accuracy: -inf %.



divide by zero encountered in true_divide



In [149]:
df['PER_est'] = rf.predict(np.array(df.drop(['PER','Player', 'position'], axis = 1)))

In [150]:
x = df.groupby(['Age']).mean()
x.reset_index(inplace=True)
fig = px.line(x, x = 'Age', y='PER_est')
fig.show()


In [151]:
x = df.groupby(['Age']).mean()
x.reset_index(inplace=True)
fig = px.line(x, x = 'Age', y='PER_est')
fig.show()

In [165]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
x = df.loc[df.Player == 'Kobe Bryant']
# Create figure with secondary y-axis
fig = make_subplots(specs=[[{"secondary_y": True}]])

# Add traces
fig.add_trace(
    go.Scatter(x=x['Year'], y=x['PER'], name="yaxis data"),
    secondary_y=False,
)

fig.add_trace(
    go.Scatter(x=x['Year'], y=x['PER_est'], name="yaxis2 data"),
    secondary_y=False,
)

# Add figure title
fig.update_layout(
    title_text="Double Y Axis Example"
)

# Set x-axis title
fig.update_xaxes(title_text="xaxis title")

# Set y-axes titles
fig.update_yaxes(title_text="<b>primary</b> yaxis title", secondary_y=False)
fig.update_yaxes(title_text="<b>secondary</b> yaxis title", secondary_y=True)

fig.show()

In [155]:
x = df.loc[df.Player == 'Kobe Bryant']
x.reset_index(inplace=True)
fig = px.line(x, x = 'Age', y='PER_est')
fig.show()