We will need the version numbers of `scikit-learn` and `xgboost` to set up the Hugging Face Space.

In [29]:
import sklearn
import xgboost
import joblib
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from xgboost import XGBRegressor
import numpy as np


In [30]:
# Report the installed version of each modelling library.
for label, module in (("Scikit-learn's version:", sklearn),
                      ("xgboost's version:", xgboost)):
    print(label, module.__version__)

Scikit-learn's version: 1.3.2
xgboost's version: 2.1.1


In [4]:
# Read the data set from the CSV file in the current working directory.
df = pd.read_csv(filepath_or_buffer='qb_index_no_tier.csv')

In [5]:
# Keep only the columns whose name does not begin with 'nfl', then display
# the resulting frame.
is_nfl_column = df.columns.str.startswith('nfl')
df = df.loc[:, ~is_nfl_column]
df

Unnamed: 0,year-drafted,qb-num-picked,rd-picked,num-picked,name,height (in),weight (lbs),coach-tenure,drafted-team-winpr,drafted_team_ppg_rk,...,p-yds,p-ypa,p-adj-ypa,p-td,int,rate,r-att,r-yds,r-avg,r-tds
0,2023,1,1,1,Bryce Young,70,204,0,0.412,19,...,8356,8.8,9.9,80,12,165.0,139,162,1.2,7
1,2023,2,1,2,C.J. Stroud,75,214,0,0.206,29,...,8123,9.8,11.2,85,12,182.4,80,136,1.7,1
2,2023,3,1,4,Anthony Richardson,76,244,0,0.265,29,...,3105,7.9,7.4,24,15,133.6,161,1116,6.9,12
3,2023,4,2,33,Will Levis,75,229,5,0.412,27,...,5876,8.0,7.7,46,25,145.6,312,742,2.4,17
4,2023,5,3,68,Hendon Hooker,76,217,2,0.529,5,...,8974,9.5,10.6,80,12,172.4,518,2083,4.0,25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
287,2000,8,6,202,Todd Husak,75,216,0,0.625,2,...,6564,7.5,7.2,41,24,126.6,99,-136,-1.4,3
288,2000,9,6,205,JaJuan Seider,73,230,1,0.500,25,...,0,0.0,0.0,0,0,0.0,0,0,0.0,0
289,2000,10,7,212,Tim Rattay,72,215,3,0.250,20,...,12746,8.2,8.7,115,35,154.3,153,-103,-0.7,2
290,2000,11,7,214,Jarious Jackson,72,226,5,0.375,17,...,4820,9.0,8.5,34,21,145.7,272,957,3.5,13


In [6]:
# Remove the 'name' column.
df = df.drop('name', axis=1)

In [7]:
# Show the dtype of every remaining column.
df.dtypes

Unnamed: 0,0
year-drafted,int64
qb-num-picked,int64
rd-picked,int64
num-picked,int64
height (in),int64
weight (lbs),int64
coach-tenure,int64
drafted-team-winpr,float64
drafted_team_ppg_rk,int64
college,object


In [8]:
# Strip the unit suffixes from the height/weight column names and rename
# 'int' to 'ints'.
unit_free_names = {
    "height (in)": "height",
    "weight (lbs)": "weight",
    "int": "ints",
}
df = df.rename(columns=unit_free_names)

In [9]:
# Replace the hyphens in these column names with underscores.
hyphen_to_underscore = {
    "year-drafted": "year_drafted",
    "qb-num-picked": "qb_num_picked",
    "rd-picked": "rd_picked",
    "coach-tenure": "coach_tenure",
    "drafted-team-winpr": "drafted_team_winpr",
    "conf-str": "conf_str",
    "p-cmp": "p_cmp",
    "p-att": "p_att",
    "cmp-pct": "cmp_pct",
    "p-yds": "p_yds",
    "p-ypa": "p_ypa",
}
df = df.rename(columns=hyphen_to_underscore)

In [10]:
# Second batch of hyphen -> underscore column renames.
df = df.rename(
    {
        "p-adj-ypa": 'p_adj_ypa',
        "r-att": "r_att",
        "r-avg": "r_avg",
        "r-tds": "r_tds",
        'num-picked': "num_picked",
        "r-yds": "r_yds",
    },
    axis="columns",
)

In [13]:
# Rename 'p-td' to 'p_td', the name the later cells refer to.
df = df.rename({"p-td": "p_td"}, axis="columns")

In [12]:
# Remove the 'conf' and 'college' columns.
df = df.drop(['conf', 'college'], axis='columns')

In [23]:
# Correlation (pandas default method) of every numeric column with 'p_td',
# sorted from the strongest positive correlation downwards.
# numeric_only=True keeps the cell working even if a non-numeric column is
# still present in `df`: without it, pandas >= 2.0 raises on such columns,
# so the cell would only succeed when the drop cells had already been run.
p_td_corr = df.corr(numeric_only=True)['p_td'].sort_values(ascending=False)
p_td_corr

Unnamed: 0,p_td
p_td,1.0
p_yds,0.934423
p_cmp,0.894566
p_att,0.863256
ints,0.600823
cmp_pct,0.58944
rate,0.537133
p_adj_ypa,0.515557
p_ypa,0.48807
r_att,0.369867


In [24]:
# Columns used as model inputs; the regression target is 'p_td'.
train_column = [
    'p_yds', 'p_cmp', 'p_att', 'ints', 'cmp_pct',
    'rate', 'p_adj_ypa', 'p_ypa', 'r_att', 'year_drafted',
]
X_train = df.loc[:, train_column]
y_train = df.loc[:, "p_td"]


In [25]:
# Partition the feature columns by dtype: int64/float64 columns versus
# object/bool columns, then print both column indexes.
numeric_dtypes = ['int64', 'float64']
non_numeric_dtypes = ['object', 'bool']
num_col = X_train.select_dtypes(include=numeric_dtypes).columns
cat_col = X_train.select_dtypes(include=non_numeric_dtypes).columns

for columns in (num_col, cat_col):
    print(columns)

Index(['p_yds', 'p_cmp', 'p_att', 'ints', 'cmp_pct', 'rate', 'p_adj_ypa',
       'p_ypa', 'r_att', 'year_drafted'],
      dtype='object')
Index([], dtype='object')


In [26]:
# Preprocessing: apply StandardScaler to the columns listed in `num_col`.
numeric_scaling = ("scaler", StandardScaler(), num_col)
preprocessor = ColumnTransformer(transformers=[numeric_scaling])

# Full pipeline: preprocessing followed by an XGBRegressor with default
# hyper-parameters.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor()),
]
model = Pipeline(steps=pipeline_steps)

In [27]:

# Fit the whole pipeline (scaler + regressor) on the training features and target.
model.fit(X_train, y=y_train)

In [28]:
# Serialise the fitted pipeline to 'model.joblib' in the working directory.
joblib.dump(value=model, filename='model.joblib')

['model.joblib']