In [31]:
import pandas as pd
import numpy as np
import pybaseball
from pybaseball import batting_stats
from pybaseball import statcast #Going to need for xBA xSLG and xwOBA (All blank of FGs)
from pybaseball import statcast_batter
pybaseball.cache.enable()
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit

In [16]:
batting = batting_stats(2017, 2023, qual = 150)
batting.to_csv("batting.csv")
#statcast_data = statcast(start_dt='2015-01-01', end_dt='2024-01-01')
#statcast_data.to_csv("statcast_data.csv")
batting = pd.read_csv("batting.csv")

In [17]:
#makes it only players with multiple seasons
batting = batting.groupby("IDfg", group_keys=False).filter(lambda x: x.shape[0] > 1)
batting.head()

Unnamed: 0.1,Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,...,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,xBA,xSLG,xwOBA,L-WAR
0,2,15640,2022,Aaron Judge,NYY,30,157,570,696,177,...,118.4,246,0.609,404,0.169,0.287,,,,11.6
1,4,13611,2018,Mookie Betts,BOS,25,136,520,614,180,...,110.6,217,0.5,434,0.22,0.27,,,,10.4
2,6,10155,2018,Mike Trout,LAA,26,140,471,608,147,...,118.0,162,0.46,352,0.201,0.261,,,,9.5
3,18,18401,2023,Ronald Acuna Jr.,ATL,25,159,643,735,217,...,121.2,309,0.55,562,0.137,0.214,,,,8.4
4,15,15640,2017,Aaron Judge,NYY,25,155,542,678,154,...,121.1,186,0.55,338,0.157,0.29,,,,8.7


In [18]:
null_variables = batting.isnull().sum() #find nulls (cant have for ML)
null_variables

Unnamed: 0       0
IDfg             0
Season           0
Name             0
Team             0
              ... 
CSW%             0
xBA           2388
xSLG          2388
xwOBA         2388
L-WAR            0
Length: 321, dtype: int64

In [19]:
full_variables = list(batting.columns[null_variables == 0]) #gets list of all non nulls
batting = batting[full_variables].copy() #updates to only non nulls
batting.head()

Unnamed: 0.1,Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,...,LA,Barrels,Barrel%,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,L-WAR
0,2,15640,2022,Aaron Judge,NYY,30,157,570,696,177,...,14.9,106,0.262,118.4,246,0.609,404,0.169,0.287,11.6
1,4,13611,2018,Mookie Betts,BOS,25,136,520,614,180,...,18.5,57,0.131,110.6,217,0.5,434,0.22,0.27,10.4
2,6,10155,2018,Mike Trout,LAA,26,140,471,608,147,...,18.6,54,0.153,118.0,162,0.46,352,0.201,0.261,9.5
3,18,18401,2023,Ronald Acuna Jr.,ATL,25,159,643,735,217,...,7.4,86,0.153,121.2,309,0.55,562,0.137,0.214,8.4
4,15,15640,2017,Aaron Judge,NYY,25,155,542,678,154,...,15.8,84,0.249,121.1,186,0.55,338,0.157,0.29,8.7


In [20]:
batting.dtypes[batting.dtypes == "object"] #check for object type (cant have for ML) 

Name       object
Team       object
Dol        object
Age Rng    object
dtype: object

In [21]:
#removes all object other than name
batting = batting.drop('Dol', axis=1)
batting = batting.drop('Team', axis=1)
batting = batting.drop('Age Rng', axis=1)
batting.head()

Unnamed: 0.1,Unnamed: 0,IDfg,Season,Name,Age,G,AB,PA,H,1B,...,LA,Barrels,Barrel%,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,L-WAR
0,2,15640,2022,Aaron Judge,30,157,570,696,177,87,...,14.9,106,0.262,118.4,246,0.609,404,0.169,0.287,11.6
1,4,13611,2018,Mookie Betts,25,136,520,614,180,96,...,18.5,57,0.131,110.6,217,0.5,434,0.22,0.27,10.4
2,6,10155,2018,Mike Trout,26,140,471,608,147,80,...,18.6,54,0.153,118.0,162,0.46,352,0.201,0.261,9.5
3,18,18401,2023,Ronald Acuna Jr.,25,159,643,735,217,137,...,7.4,86,0.153,121.2,309,0.55,562,0.137,0.214,8.4
4,15,15640,2017,Aaron Judge,25,155,542,678,154,75,...,15.8,84,0.249,121.1,186,0.55,338,0.157,0.29,8.7


In [22]:
batting.dtypes[batting.dtypes == "object"] #confirms removal

Name    object
dtype: object

In [9]:
#Ridge regression
#Feature Selection
#Time series
#Minmax scaling
#Not Sure on order


In [24]:
def scale_dataset(dataset):
    dataset_scaled = dataset.copy()
    for i in dataset.columns:
        if i != "Name":
            min = dataset_scaled[i].min()
            max = dataset_scaled[i].max()
            if min!=max:
                dataset_scaled[i] = (dataset_scaled[i] - min) / (max - min)
    return dataset_scaled   
batting_scaled= scale_dataset(batting)
batting_scaled.head()

Unnamed: 0.1,Unnamed: 0,IDfg,Season,Name,Age,G,AB,PA,H,1B,...,LA,Barrels,Barrel%,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,L-WAR
0,0.000761,0.51297,0.833333,Aaron Judge,0.478261,0.960317,0.80036,0.905473,0.79798,0.490798,...,0.57958,1.0,0.952727,0.836207,0.786441,0.942408,0.657315,0.449761,0.635593,1.0
1,0.001523,0.444706,0.166667,Mookie Betts,0.26087,0.793651,0.710432,0.769486,0.813131,0.546012,...,0.687688,0.537736,0.476364,0.5,0.688136,0.752182,0.717435,0.69378,0.563559,0.915493
2,0.002284,0.328433,0.166667,Mike Trout,0.304348,0.825397,0.622302,0.759536,0.646465,0.447853,...,0.690691,0.509434,0.556364,0.818966,0.501695,0.682373,0.553106,0.602871,0.525424,0.852113
3,0.006852,0.605861,1.0,Ronald Acuna Jr.,0.26087,0.97619,0.931655,0.970149,1.0,0.797546,...,0.354354,0.811321,0.556364,0.956897,1.0,0.839442,0.973948,0.296651,0.326271,0.774648
4,0.00571,0.51297,0.0,Aaron Judge,0.26087,0.944444,0.75,0.875622,0.681818,0.417178,...,0.606607,0.792453,0.905455,0.952586,0.583051,0.839442,0.52505,0.392344,0.648305,0.795775


In [33]:
def split_data(dataset, training_percent, seed):
    shuffled = dataset.sample(frac=1, random_state=seed)
    
    total_rows = shuffled.shape[0]
    training_rows = int(training_percent * total_rows)

    training = shuffled.iloc[:training_rows, :]
    testing = shuffled.iloc[training_rows:, :]

    # split the training attributes and labels
    training_X = training.drop("Name", axis=1)
    training_y = training["Name"]
    
    # split the testing attributes and labels
    testing_X = testing.drop("Name", axis=1)
    testing_y = testing["Name"]
    
    return training_X, training_y, testing_X, testing_y

training_X, training_y, testing_X, testing_y=split_data(batting_scaled, .75, 12345)

In [37]:
def train_models(training_X, training_y, testing_X):

    linear_model = LinearRegression()
    linear_model.fit(training_X, training_y)

    lasso_model = Lasso()
    lasso_model.fit(training_X, training_y)

    ridge_model = Ridge()
    ridge_model.fit(training_X, training_y)

    svm_model_2 = SVR(kernel = 'poly', degree = 2)
    svm_model_2.fit(training_X, training_y)

    svm_model_3 = SVR(kernel = 'poly', degree = 3)
    svm_model_3.fit(training_X, training_y)

    svm_model_4 = SVR(kernel = 'poly', degree = 4)
    svm_model_4.fit(training_X, training_y)

    svm_model_rbf = SVR(kernel = 'rbf')
    svm_model_rbf.fit(training_X, training_y)


    tree_model = DecisionTreeRegressor()
    tree_model.fit(training_X, training_y)

    return linear_model, lasso_model, ridge_model, svm_model_2, svm_model_3, svm_model_4, svm_model_rbf, tree_model
training_y.head()
#linear_model, lasso_model, ridge_model, svm_model_2, svm_model_3, svm_model_4, svm_model_rbf, tree_model= train_models(training_X, training_y, testing_X)

1758      Alcides Escobar
376           Kolten Wong
1671    Guillermo Heredia
1052           Connor Joe
297           Travis Shaw
Name: Name, dtype: object