## WR Modelling

In [22]:
import numpy as np
import pandas as pd

In [4]:
df = pd.read_csv('prospects_with_FF_stats.csv')

In [6]:
# drop unneeded columns
data = df.drop(columns = ['Unnamed: 0', 'Position', 'Birth_Date', 'Conference', 'Draft_Pick', 'College', 'Breakout_Year',
                         'Top_100_2011', 'RK_19', 'RK_18', 'RK_17', 'RK_16', 'RK_15', 'RK_14', 'RK_13', 'RK_12', 'RK_11', 
                         'Top12_Seasons', 'Top24_Seasons', 'Top36_Seasons', 'Fantasy_Cat', 'RK_FirstYr', 'RK_SecondYr',
                         'RK_ThirdYr', 'Class'])

In [9]:
# split data
data_1920 = data[(data['Draft_Year'] == 2019) | (data['Draft_Year'] == 2020)]
data_mod = data[data['Draft_Year'] < 2019]

In [12]:
data_mod = data_mod.drop(columns = ['Draft_Year', 'Fantasy_Class'])

In [51]:
X = data_mod.drop(columns = ['3Year_Hit'])
y = data_mod['3Year_Hit']

In [13]:
import sklearn
from sklearn.model_selection import train_test_split

In [15]:
# split dataset into training and test sets
random_state = 12493
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size = 0.3, random_state = random_state)

### XGBoost Random Forest Classifier

In [20]:
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV

In [23]:
# create a dictionary of parameters 
param_grid = {'n_estimators':np.arange(100, 1050, 50),
              'learning_rate':np.arange(0.1, 1.7, 0.1),
              'max_depth':[1, 2, 3],
              'gamma':np.arange(0, 5.25, 0.25)}

# create a Gradient Boost classifier object
clf_xgb_obj = xgb.XGBClassifier(random_state = random_state)

In [24]:
# search through a random selection of the combinations in the grid
clf_xgb_gs = RandomizedSearchCV(clf_xgb_obj, param_grid, cv = 5, scoring = 'roc_auc', refit = True, n_jobs = -1, verbose = 4)
clf_xgb_gs.fit(X_train.drop(columns = ['Player']), y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    9.1s
[Parallel(n_jobs=-1)]: Done  40 out of  50 | elapsed:   15.9s remaining:    3.9s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   17.7s finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
          estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=3131, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1),
          fit_params=None, iid='warn', n_iter=10, n_jobs=-1,
          param_distributions={'n_estimators': array([ 100,  150,  200,  250,  300,  350,  400,  450,  500,  550,  600,
        650,  700,  750,  800,  850,  900,  950, 1000]), 'learning_rate': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. , 1.1, 1.2, 1.3,
       1.4, 1.5, 1.6]), 'max_depth': [1, 2], 'gamma': array([0.  , 0.25, 0.5 , 0.75, 1.  , 1.25, 1.5 , 1.75, 2.  , 2.25, 2.5 ,
       2.75, 3.  , 3.25, 3.5 , 3.75,

In [26]:
# predict the training set to check for overfitting
y_preds_xgb_train = clf_xgb_gs.best_estimator_.predict(X_train.drop(columns = ['Player']))
y_probs_xgb_train = clf_xgb_gs.best_estimator_.predict_proba(X_train.drop(columns = ['Player']))[:,1]

In [27]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [28]:
confusion_matrix(y_train, y_preds_xgb_train)

array([[482,   1],
       [ 29,  18]], dtype=int64)

In [29]:
# predict classes and probabilities for test set using the best model
y_preds_xgb_best = clf_xgb_gs.best_estimator_.predict(X_test.drop(columns = ['Player']))
y_probs_xgb_best = clf_xgb_gs.best_estimator_.predict_proba(X_test.drop(columns = ['Player']))[:,1]

In [30]:
confusion_matrix(y_test, y_preds_xgb_best)

array([[206,   3],
       [ 13,   6]], dtype=int64)

In [32]:
X_test['preds'] = y_preds_xgb_best
X_test['probs'] = y_probs_xgb_best
X_test[['Player', 'Arm_Length', 'preds', 'probs']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,Player,Arm_Length,preds,probs
54,Antwan Goodley,30.750,0,0.014954
914,Tre'Quan Smith,33.375,0,0.271357
183,Corey Brown,31.750,0,0.014954
78,Bra'lon Cherry,30.875,0,0.013060
799,Robert Meachem,,0,0.395287
724,Nick Williams,34.125,0,0.039347
824,Sammy Watkins,32.000,1,0.837163
138,Chad Hansen,32.125,0,0.087260
187,Corey Smith,30.625,0,0.028930
429,James Washington,32.375,0,0.208047


In [44]:
X_test['preds'].sum()

9

In [45]:
y_test.sum()

19

In [47]:
from sklearn.metrics import roc_auc_score

In [48]:
roc_auc_score(y_test, y_probs_xgb_best)

0.8329136237723495

In [50]:
# find the top 10 features in terms of importance
feature_importance = {'features': list(X_train.drop(columns=['Player']).columns.values), 'importance': clf_xgb_gs.best_estimator_.feature_importances_}
feature_df = pd.DataFrame(feature_importance)
feature_df = feature_df.sort_values('importance', ascending=False)
feature_df[:10]

Unnamed: 0,features,importance
4,Overall_Pick,0.182044
2,Draft_Round,0.120323
20,Best_Yard_Share,0.102675
22,Breakout_Age,0.073921
25,Draft_Age,0.060228
5,Hand_Size,0.05771
14,Broad_Jump,0.057335
54,Career_KO_Att,0.048908
24,Dominator,0.048609
64,Level_Comp,0.046835


### Test on 19/20 players

In [38]:
data_1920 = data_1920.drop(columns = ['Draft_Year', 'Fantasy_Class'])

In [39]:
X_1920 = data_1920.drop(columns = ['3Year_Hit'])
y_1920 = data_1920['3Year_Hit']

In [40]:
# predict classes and probabilities for 19/20 players using the best model
y_preds_xgb_1920 = clf_xgb_gs.best_estimator_.predict(X_1920.drop(columns = ['Player']))
y_probs_xgb_1920 = clf_xgb_gs.best_estimator_.predict_proba(X_1920.drop(columns = ['Player']))[:,1]

In [41]:
X_1920['prob'] = y_probs_xgb_1920
X_1920['pred'] = y_preds_xgb_1920

In [43]:
X_1920[['Player', 'prob', 'pred']].sort_values(by = 'prob', ascending = False).head(50)

Unnamed: 0,Player,prob,pred
199,D.K. Metcalf,0.56678,1
398,J.J. Arcega-Whiteside,0.511133,1
0,A.J. Brown,0.48785,0
713,N'Keal Harry,0.467055,0
625,Laviska Shenault,0.467055,0
541,Justin Jefferson,0.467055,0
416,Jalen Reagor,0.42885,0
145,Chase Claypool,0.382553,0
875,Tee Higgins,0.373587,0
548,K.J. Hamler,0.373587,0


In [55]:
X_1920.to_csv('RookPreds1920.csv')

### Combine all labelled players and test

In [52]:
# predict classes and probabilities for WRs through 2018 using the best model
y_preds_xgb_all = clf_xgb_gs.best_estimator_.predict(X.drop(columns = ['Player']))
y_probs_xgb_all = clf_xgb_gs.best_estimator_.predict_proba(X.drop(columns = ['Player']))[:,1]

In [53]:
X['actual'] = y
X['prob'] = y_probs_xgb_all
X['pred'] = y_preds_xgb_all

In [54]:
X[['Player', 'actual', 'prob', 'pred']].sort_values(by = 'prob', ascending = False).head(25)

Unnamed: 0,Player,actual,prob,pred
379,Hakeem Nicks,1,0.876982,1
29,Amari Cooper,1,0.876982,1
166,Christian Kirk,0,0.845327,1
824,Sammy Watkins,1,0.837163,1
122,Calvin Johnson,1,0.787043,1
622,Larry Fitzgerald,1,0.787043,1
527,JuJu Smith-Schuster,1,0.785428,1
82,Brandin Cooks,1,0.77615,1
27,Alshon Jeffery,1,0.771704,1
467,Jeremy Maclin,1,0.76578,1


In [56]:
X.to_csv('RookPredsall18.csv')