In [151]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots

import composeml as cp
import featuretools as ft
from tqdm import tqdm, tqdm_notebook

from autofeat import FeatureSelector, AutoFeatRegressor

import warnings
warnings.filterwarnings('ignore')

In [152]:
data = pd.read_csv("../data/data.csv")
data.drop([
    'team', 
    'opponent', 
    'first_name',
    'last_name',
], axis=1, inplace=True)

data.drop_duplicates(inplace=True)

qb = data[data['QB'] == 1]
rb = data[data['RB'] == 1]
wr = data[data['WR'] == 1]
te = data[data['TE'] == 1]

In [153]:
fig = go.Figure()

layout = go.Layout(title='Fantasy Points by Position')

rb_points = go.Histogram(
    x = data[data['RB'] == 1]['fantasy_half_ppr'],nbinsx=100,
    name='running_backs'
)

wr_points = go.Histogram(
    x = data[data['WR'] == 1]['fantasy_half_ppr'],
    name='wide_receivers'
)

te_points = go.Histogram(
    x = data[data['TE'] == 1]['fantasy_half_ppr'],
    name='tight_ends'
)

qb_points = go.Histogram(
    x = data[data['QB'] == 1]['fantasy_half_ppr'],
    name='quarterbacks'
)

fig.add_trace(rb_points)
fig.add_trace(wr_points)
fig.add_trace(te_points)
fig.add_trace(qb_points)

fig.update_layout(layout)

fig.show()

In [154]:
correlation_data = data.corr()

qb_corr = qb.corr()
rb_corr = rb.corr()
wr_corr = wr.corr()
te_corr = te.corr()

qb_corr.sort_values(by='fantasy_half_ppr', ascending=False, inplace=True)
rb_corr.sort_values(by='fantasy_half_ppr', ascending=False, inplace=True)
wr_corr.sort_values(by='fantasy_half_ppr', ascending=False, inplace=True)
te_corr.sort_values(by='fantasy_half_ppr', ascending=False, inplace=True)

qb_corr = qb_corr[~qb_corr['fantasy_half_ppr'].isna()]
rb_corr = rb_corr[~rb_corr['fantasy_half_ppr'].isna()]
wr_corr = wr_corr[~wr_corr['fantasy_half_ppr'].isna()]
te_corr = te_corr[~te_corr['fantasy_half_ppr'].isna()]

In [155]:
fig = px.imshow(correlation_data, title='Correlation Heatmap')
fig.show()

In [156]:
qb_corr

Unnamed: 0,season,week,completions,attempts,passing_yards,passing_tds,interceptions,sack_fumbles_lost,passing_air_yards,passing_yards_after_catch,...,height,years_exp,rookie_year,offense_snaps,offense_pct,QB,RB,TE,WR,fantasy_half_ppr
fantasy_half_ppr,0.016339,-0.070453,0.679129,0.602086,0.80279,0.836562,-0.029932,-0.029277,0.564438,0.670693,...,-0.096087,0.100015,-0.085203,0.642664,0.600792,,,,,1.0
passing_tds,-0.010898,-0.057031,0.482225,0.400189,0.600419,1.0,-0.025545,0.006387,0.384485,0.49131,...,-0.053121,0.151172,-0.144184,0.413865,0.399777,,,,,0.836562
passing_yards,-0.045654,-0.090732,0.896547,0.84566,1.0,0.600419,0.185921,0.114647,0.762187,0.8484,...,-0.018685,0.192436,-0.197292,0.794986,0.738708,,,,,0.80279
passing_first_downs,-0.026058,-0.082533,0.909031,0.855601,0.92246,0.601631,0.181865,0.110826,0.731509,0.767411,...,-0.012113,0.198004,-0.19395,0.801814,0.714161,,,,,0.769514
completions,-0.026314,-0.086242,1.0,0.940072,0.896547,0.482225,0.223864,0.13257,0.724427,0.786759,...,0.006078,0.205992,-0.201431,0.848748,0.775884,,,,,0.679129
passing_yards_after_catch,-0.040161,-0.075657,0.786759,0.718165,0.8484,0.49131,0.135516,0.093111,0.479693,1.0,...,-0.032565,0.186079,-0.189051,0.685857,0.646064,,,,,0.670693
offense_snaps,-0.066802,-0.06939,0.848748,0.884822,0.794986,0.413865,0.237865,0.138932,0.751011,0.685857,...,-0.00253,0.099175,-0.120402,1.0,0.919971,,,,,0.642664
dakota,0.020911,-0.059461,0.354459,0.12672,0.494371,0.60422,-0.268485,-0.101401,0.108568,0.406246,...,-0.096141,0.135259,-0.115739,0.230927,0.230481,,,,,0.641897
passing_epa,0.021153,-0.050313,0.267103,0.073227,0.45849,0.601305,-0.42966,-0.227062,0.081565,0.383015,...,-0.082099,0.153614,-0.132568,0.144504,0.10157,,,,,0.622009
attempts,-0.06645,-0.069872,0.940072,1.0,0.84566,0.400189,0.326873,0.15049,0.845517,0.718165,...,0.044626,0.164779,-0.180775,0.884822,0.804969,,,,,0.602086


In [157]:
titles = ['[QB] Correlation Heatmap', 'Correlation Data']
fig = make_subplots(
    rows=1, 
    cols=2, 
    specs=[[{"type": "histogram"}, {"type": "table"}]],
    subplot_titles=titles
)

fig.add_heatmap(z=qb_corr, x=qb_corr.columns, y=qb_corr.index)

fig.add_table(
    cells={
        'values': [qb_corr.index.tolist(), qb_corr['fantasy_half_ppr'].values.tolist()]
    },
    header={'values': ['Correlation']},
    row=1,
    col=2, 
)

In [158]:
titles = ['[RB] Correlation Heatmap', 'Correlation Data']
fig = make_subplots(
    rows=1, 
    cols=2, 
    specs=[[{"type": "histogram"}, {"type": "table"}]],
    subplot_titles=titles
)

fig.add_heatmap(z=rb_corr, x=rb_corr.columns, y=rb_corr.index)

fig.add_table(
    cells={
        'values': [rb_corr.index.tolist(), rb_corr['fantasy_half_ppr'].values.tolist()]
    },
    header={'values': ['Correlation']},
    row=1,
    col=2, 
)

In [159]:
titles = ['[WR] Correlation Heatmap', 'Correlation Data']
fig = make_subplots(
    rows=1, 
    cols=2, 
    specs=[[{"type": "histogram"}, {"type": "table"}]],
    subplot_titles=titles
)

fig.add_heatmap(z=wr_corr, x=wr_corr.columns, y=wr_corr.index)

fig.add_table(
    cells={
        'values': [wr_corr.index.tolist(), wr_corr['fantasy_half_ppr'].values.tolist()]
    },
    header={'values': ['Correlation']},
    row=1,
    col=2, 
)

In [160]:
titles = ['[TE] Correlation Heatmap', 'Correlation Data']
fig = make_subplots(
    rows=1, 
    cols=2, 
    specs=[[{"type": "histogram"}, {"type": "table"}]],
    subplot_titles=titles
)

fig.add_heatmap(z=te_corr, x=te_corr.columns, y=te_corr.index)

fig.add_table(
    cells={
        'values': [te_corr.index.tolist(), te_corr['fantasy_half_ppr'].values.tolist()]
    },
    header={'values': ['Correlation']},
    row=1,
    col=2, 
)

In [161]:
data['time_index'] = data['season'].astype(str) + '-01' + '-' + data['week'].astype(str)
data['time_index'] = pd.to_datetime(data['time_index'])
data.sort_values(by=['player_id', 'time_index'], ascending=[True, True], inplace=True)
data.head()

Unnamed: 0,player_id,season,week,completions,attempts,passing_yards,passing_tds,interceptions,sack_fumbles_lost,passing_air_yards,...,years_exp,rookie_year,offense_snaps,offense_pct,QB,RB,TE,WR,fantasy_half_ppr,time_index
0,00-0006101,2013,1,0,0,0.0,0,0.0,0,0.0,...,16,1997,51.0,0.91,0,0,1,0,11.1,2013-01-01
2005,00-0006101,2013,2,0,0,0.0,0,0.0,0,0.0,...,16,1997,62.0,0.97,0,0,1,0,5.3,2013-01-02
2719,00-0006101,2013,3,0,0,0.0,0,0.0,0,0.0,...,16,1997,61.0,0.86,0,0,1,0,4.4,2013-01-03
4735,00-0006101,2013,4,0,0,0.0,0,0.0,0,0.0,...,16,1997,74.0,0.97,0,0,1,0,32.9,2013-01-04
6800,00-0006101,2013,5,0,0,0.0,0,0.0,0,0.0,...,16,1997,73.0,0.96,0,0,1,0,14.7,2013-01-05


### Shifting fantasy points by 1 week

In [168]:
dataframes_by_player_id = []
player_ids = data['player_id'].unique()
for player in tqdm_notebook(player_ids):
    player_df = data[data['player_id'] == player]
    player_df.sort_values(by='time_index', ascending=True, inplace=True)
    player_df['fantasy_half_ppr_shifted'] = player_df['fantasy_half_ppr'].shift(-1)
    player_df = player_df[~player_df['fantasy_half_ppr_shifted'].isna()]
    player_df = player_df[
        (player_df['player_id'] == player_df['player_id']) & 
        (player_df['season'] == player_df['season']) & 
        (player_df['week'] == player_df['week']) & 
        (player_df['fantasy_half_ppr'] != player_df['fantasy_half_ppr_shifted'])
    ]
    
    dataframes_by_player_id.append(player_df)
    
shifted = pd.concat(dataframes_by_player_id, axis=0)
shifted.reset_index(inplace=True, drop=True)

shifted['player_id_time_index'] = shifted['player_id'].astype(str) + '_' + shifted['time_index'].astype(str)
shifted.drop('fantasy_half_ppr', axis=1, inplace=True)

  0%|          | 0/884 [00:00<?, ?it/s]

In [171]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30899 entries, 0 to 30898
Data columns (total 45 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   player_id                    30899 non-null  object        
 1   season                       30899 non-null  int64         
 2   week                         30899 non-null  int64         
 3   completions                  30899 non-null  int64         
 4   attempts                     30899 non-null  int64         
 5   passing_yards                30899 non-null  float64       
 6   passing_tds                  30899 non-null  int64         
 7   interceptions                30899 non-null  float64       
 8   sack_fumbles_lost            30899 non-null  int64         
 9   passing_air_yards            30899 non-null  float64       
 10  passing_yards_after_catch    30899 non-null  float64       
 11  passing_first_downs          30899 non-nu

In [177]:
columns_to_drop = [
    'player_id',
    'player_id_time_index',
    'time_index'
]

target = 'fantasy_half_ppr_shifted'

In [None]:
# autofeat with different number of feature engineering steps
# 3 are perfect

X = shifted.drop(columns_to_drop + [target], axis=1)
y = shifted[target]

for steps in range(5):
    np.random.seed(55)
    print(f"### AutoFeat with {steps} feateng_steps")
    
    afreg = AutoFeatRegressor(
        verbose=1, 
        feateng_steps=steps
    )
    df = afreg.fit_transform(X, y)
    r2 = afreg.score(X, y)
    
    print("## Final R^2: %.4f" % r2)
    plt.figure()
    plt.scatter(afreg.predict(X), y, s=2);
    plt.title("%i FE steps (R^2: %.4f; %i new features)" % (steps, r2, len(afreg.new_feat_cols_)))

### AutoFeat with 0 feateng_steps
[AutoFeat] The 0 step feature engineering process could generate up to 42 features.
[AutoFeat] With 30899 data points this new feature matrix would use about 0.01 gb of space.
[featsel] Scaling data...done.
[featsel] Feature selection run 1/5
[featsel] Feature selection run 2/5
[featsel] Feature selection run 3/5
[featsel] Feature selection run 4/5
[featsel] Feature selection run 5/5
[featsel] 24 features after 5 feature selection runs
[featsel] 16 features after correlation filtering
[featsel] 15 features after noise filtering
[AutoFeat] Final dataframe with 42 feature columns (0 new).
[AutoFeat] Training final regression model.
[AutoFeat] Trained model: largest coefficients:
1.7110983594815838
7.097227 * dakota
7.082992 * offense_pct
5.086710 * QB
-1.487423 * TE
0.893807 * RB
0.854341 * passing_tds
0.568743 * rushing_tds
0.238784 * carries
0.230781 * targets
0.190194 * receiving_tds
0.099215 * receiving_first_downs
0.053308 * receiving_epa
0.033799 *