In [1]:
import pandas as pd
import numpy as np
import os
import plotly.express as px

In [2]:
players = pd.read_csv('Players.csv')
pstats = pd.read_csv('player_data.csv')
sstats = pd.read_csv('Seasons_Stats.csv')

### Data Cleaning
First I removed players with the same name.

In [14]:
duplicate_names = pstats['name'][pstats['name'].duplicated()].drop_duplicates().tolist()
duplicate_names.append('John Lucas')
duplicate_names.append('John Lucas III')
pstats = pstats.loc[~pstats['name'].isin(duplicate_names)]
sstats = sstats.loc[~sstats['Player'].isin(duplicate_names)]

# Drop Players with no height or no weight
no_height = pstats.loc[pstats['height'].isna()]['name'].tolist()
no_weight = pstats.loc[pstats['weight'].isna()]['name'].tolist()
pstats = pstats.loc[~pstats['name'].isin(no_height)]
sstats = sstats.loc[~sstats['Player'].isin(no_height)]
pstats = pstats.loc[~pstats['name'].isin(no_weight)]
sstats = sstats.loc[~sstats['Player'].isin(no_weight)]

Select only PER and MP and drop NaN values

In [15]:
sstats = sstats[['Player','Year','Age','PER','MP']].dropna(how='any')

KeyError: "['MP'] not in index"

Some players played for multiple teams in one season. To fix this issue I took the weighted average of the PER and minutes played

In [5]:
w_avg = lambda x: np.average(x, weights=sstats.loc[x.index, "MP"])
f = {'Age': ['mean'],'PER': w_avg}

sstats = sstats.groupby(['Player','Year']).agg(f)
sstats.columns = sstats.columns.droplevel(level=1)
sstats.reset_index(inplace=True)

Now I will add the players attributes to the table.

In [6]:
# Change height and weight to metric
pstats = pstats.assign(height_metric=pstats.height.str[:1].astype(int) * 30.48 + pstats.height.str[2:].astype(int) * 30.48 / 12)
pstats['weight_metric'] = pstats.weight * 0.45

pstats['Player'] = pstats.name

data = pd.merge(sstats, pstats, on='Player', how='left')[['Player','Year','Age','PER','position','height_metric','weight_metric','year_start']]

ValueError: cannot convert float NaN to integer

Here is a plot of PER over time, for the players with the longest careers

In [None]:
long_c = data.groupby(['Player']).count().sort_values(by='Age', ascending=False).head(10).index.to_list()
df_x = data.loc[data.Player.isin(long_c)]

fig = px.line(df_x, x = 'Age', y='PER', color='Player')
fig.show()

## Feature Engineering
Lets begin by creating lagging PER variables

In [11]:
df = data.sort_values(by='Year')

df = df.loc[df.Age <= 35]
df = df.dropna(how='any')
df['PER_Lag_1'] = df.groupby(['Player'])['PER'].shift()
df['PER_Lag_2'] = df.groupby(['Player'])['PER_Lag_1'].shift()
df['PER_Lag_3'] = df.groupby(['Player'])['PER_Lag_2'].shift()
df['PER_Lag_4'] = df.groupby(['Player'])['PER_Lag_3'].shift()
df['PER_Lag_5'] = df.groupby(['Player'])['PER_Lag_4'].shift()
df['Career_Length'] = df['Year']-df['year_start']
df = df.fillna(0)

df['PER_Diff_1'] = np.where(df['PER_Lag_2']!=0, df['PER_Lag_1']-df['PER_Lag_2'],0)
df['PER_Diff_2'] = np.where(df['PER_Lag_3']!=0, df['PER_Lag_2']-df['PER_Lag_3'],0)
df['PER_Diff_3'] = np.where(df['PER_Lag_4']!=0, df['PER_Lag_3']-df['PER_Lag_4'],0)
df['PER_Diff_4'] = np.where(df['PER_Lag_5']!=0, df['PER_Lag_4']-df['PER_Lag_5'],0)

df['S_age'] = df['Age']-df['Career_Length']
df['ys'] = np.where(df['S_age']<=21,'1','0')
df.to_csv('clean_data.csv')
x = df.groupby(['Age']).mean()


x = df.loc[df.PER <= 0]['Player'].drop_duplicates().to_list()

df = df.loc[~df.Player.isin(x)]

## Fitting a model


In [12]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_log_error

def rmsle(ytrue, ypred):
    return np.sqrt(mean_squared_log_error(ytrue, ypred))

In [13]:
df['PER_est'] = 0
mean_error = []
for year in range(2000,2018):
    train = df.loc[df['Year']<year]
    test = df.loc[df['Year']==year]
    
    rf = RandomForestRegressor(n_estimators = 25, random_state = 42)

    train_labels = np.array(train['PER'])
    train_features= np.array(train.drop(['PER','Player', 'position'], axis = 1))
    test_labels = np.array(test['PER'])
    test_features= np.array(test.drop(['PER','Player', 'position'], axis = 1))
    
    naive = np.array(test['PER_Lag_1'])
    rf.fit(train_features, train_labels)
    df['PER_est'] = rf.predict(np.array(df.drop(['PER','Player', 'position'], axis = 1)))
    p = rf.predict(test_features)
    error = rmsle(test_labels, p)
    print(error)
    mean_error.append(error)
print('-------------')
print(sum(mean_error)/len(mean_error))

0.36791171257418953
0.38454386415123204
0.3383705229671013
0.31360486206726257
0.40942208871178476
0.41089055155993315
0.3518443512086867
0.3282252631391113
0.39858149405661525
0.4089224307803588
0.3913170192057869
0.44727050486451747
0.4032743708521196
0.40037123869244273
0.4576543421387735
0.4182169948685234
0.3672185721397222
0.37625219960668543
-------------
0.3874384657547137


In [20]:
df.loc[df.Year==2016].sort_values(by='PER_est', ascending=False).head(20)['Player']

10966              Kevin Durant
16221         Russell Westbrook
3351                 Chris Paul
17706    Thanasis Antetokounmpo
17143             Stephen Curry
8367               James Harden
11746              LeBron James
857               Anthony Davis
10579             Kawhi Leonard
4789           DeMarcus Cousins
14936          Rakeem Christmas
2047           Boban Marjanovic
2453                Brook Lopez
11436         LaMarcus Aldridge
11390                Kyle Lowry
1750              Blake Griffin
5084             Derrick Favors
252                Al Jefferson
599              Andre Drummond
6281                Enes Kanter
Name: Player, dtype: object

In [11]:
predictions = rf.predict(test_features)

errors = abs(predictions - test_labels)

print('Mean Absolute Error:', round(np.mean(errors), 2), '')


Mean Absolute Error: 3.37 


In [12]:

mape = 100 * (errors / test_labels)

accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')


Accuracy: -inf %.



divide by zero encountered in true_divide



In [13]:
df['PER_est'] = rf.predict(np.array(df.drop(['PER','Player', 'position'], axis = 1)))

In [23]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
x = df.loc[df.Player == 'Kevin Garnett']

def plotter(i):
    x = df.loc[df.Player == i]
    # Create figure with secondary y-axis
    fig = make_subplots(specs=[[{"secondary_y": True}]])

    # Add traces
    fig.add_trace(
        go.Scatter(x=x['Year'], y=x['PER'], name="yaxis data"),
        secondary_y=False,
    )

    fig.add_trace(
        go.Scatter(x=x['Year'], y=x['PER_est'], name="yaxis2 data"),
        secondary_y=False,
    )

    # Add figure title
    fig.update_layout(
        title_text=i
    )

    # Set x-axis title
    fig.update_xaxes(title_text="xaxis title")

    # Set y-axes titles
    fig.update_yaxes(title_text="<b>primary</b> yaxis title", secondary_y=False)
    fig.update_yaxes(title_text="<b>secondary</b> yaxis title", secondary_y=True)

    fig.show()

In [24]:
import time

bla = ['Kevin Garnett', 'Kobe Bryant', 'Tim Duncan', 'Kevin Garnett', 'Dirk Nowitzki']
for i in bla:
    
    plotter(i)
