In [1]:
#pip install pybaseball

# pybaseball is a very helpful open-source package for baseball data analysis. It saves a lot of
# unnecessary web scraping with Beautiful Soup, and it saves time on data cleaning as well.

In [2]:
import os
import pandas as pd
import numpy as np
from pybaseball import batting_stats

In [3]:
# Define the range of seasons (years) we want to pull data for.

In [4]:
# First and last season of the window requested from batting_stats.
START, END = 2002, 2022

In [5]:
# Fetch batting statistics for the START..END seasons through pybaseball.
# NOTE(review): qual=200 presumably sets the minimum plate appearances a
# player-season needs to be included -- confirm against the pybaseball docs.
batstat = batting_stats(START, END, qual=200)

In [6]:
# Keep only players (IDfg) that appear in more than one row, since a
# single-row player has no following season to predict.
batstat = batstat.groupby("IDfg", group_keys=False).filter(lambda seasons: len(seasons) > 1)

In [7]:
# Inspect the filtered frame; each row is one player (IDfg) in one Season.
batstat

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,xBA,xSLG,xwOBA,L-WAR
0,1109,2002,Barry Bonds,SFG,37,143,403,612,149,70,...,,,,0,0.127,0.191,,,,12.7
1,1109,2004,Barry Bonds,SFG,39,147,373,617,135,60,...,,,,0,0.124,0.164,,,,11.9
8,15640,2022,Aaron Judge,NYY,30,157,570,696,177,87,...,118.4,246.0,0.609,404,0.169,0.287,,,,11.2
15,13611,2018,Mookie Betts,BOS,25,136,520,614,180,96,...,110.6,217.0,0.500,434,0.220,0.270,,,,10.4
2,1109,2003,Barry Bonds,SFG,38,130,390,550,133,65,...,,,,0,0.135,0.223,,,,10.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6885,1698,2010,Gerald Laird,DET,30,89,270,299,56,40,...,,0.0,,0,0.166,0.252,,,,-2.4
7042,9272,2018,Chris Davis,BAL,32,128,470,522,79,51,...,111.8,113.0,0.401,282,0.174,0.316,,,,-3.1
6673,319,2011,Adam Dunn,CHW,31,122,415,496,66,39,...,,0.0,,0,0.169,0.295,,,,-2.9
6988,620,2002,Neifi Perez,KCR,29,145,554,585,131,104,...,,,,0,0.130,0.187,,,,-2.9


In [8]:
# Number of missing entries in every column of the frame.
null_values = batstat.isna().sum()
null_values

IDfg         0
Season       0
Name         0
Team         0
Age          0
          ... 
CSW%         0
xBA       6754
xSLG      6754
xwOBA     6754
L-WAR        0
Length: 320, dtype: int64

In [9]:
# Retain only the columns whose missing-value count is exactly zero.
batstat_nonnulls = batstat.columns[null_values.eq(0)]
batstat = batstat.loc[:, batstat_nonnulls]

In [10]:
# Peek at the first three rows after the columns containing nulls were removed.
batstat.head(3)

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,Pull%+,Cent%+,Oppo%+,Soft%+,Med%+,Hard%+,Events,CStr%,CSW%,L-WAR
0,1109,2002,Barry Bonds,SFG,37,143,403,612,149,70,...,105,106,87,64,83,171,0,0.127,0.191,12.7
1,1109,2004,Barry Bonds,SFG,39,147,373,617,135,60,...,112,96,86,65,76,171,0,0.124,0.164,11.9
8,15640,2022,Aaron Judge,NYY,30,157,570,696,177,87,...,118,91,84,46,83,159,404,0.169,0.287,11.2


In [11]:
def next_season(player):
    """Attach the following season's WAR to each row of a single player.

    Parameters
    ----------
    player : pandas.DataFrame
        Rows belonging to one player (one ``IDfg`` group). Must contain the
        ``Season`` and ``WAR`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows ordered by ``Season`` and an extra
        ``Next_WAR`` column holding the ``WAR`` of the next row. The last
        row gets ``NaN`` because no later season is present. "Next" means
        the next row available for the player, which is not necessarily the
        consecutive calendar year.
    """
    # sort_values without inplace returns a new frame, so the group passed in
    # by groupby.apply is never mutated (pandas does not support mutating it).
    ordered = player.sort_values("Season")
    ordered["Next_WAR"] = ordered["WAR"].shift(-1)
    # No second sort on "IDfg": the key is constant within a player group and
    # re-sorting on it could only disturb the Season order established above.
    return ordered

# Run next_season once per player so Next_WAR never crosses player boundaries.
batstat = (
    batstat
    .groupby("IDfg", group_keys=False)
    .apply(next_season)
)

In [12]:

# Sanity check: each row's Next_WAR should equal the WAR of the player's following row.
batstat.loc[:, ["IDfg", "Name", "Season", "WAR", "Next_WAR"]].head(20)

Unnamed: 0,IDfg,Name,Season,WAR,Next_WAR
5562,1,Alfredo Amezaga,2006,1.1,2.0
5006,1,Alfredo Amezaga,2007,2.0,1.2
5252,1,Alfredo Amezaga,2008,1.2,
1169,2,Garret Anderson,2002,3.7,5.1
864,2,Garret Anderson,2003,5.1,0.8
2569,2,Garret Anderson,2004,0.8,-0.2
4187,2,Garret Anderson,2005,-0.2,0.1
3964,2,Garret Anderson,2006,0.1,1.4
1925,2,Garret Anderson,2007,1.4,1.4
3346,2,Garret Anderson,2008,1.4,-1.1


In [13]:
# List the dtype of every column to spot anything that is not numeric.
batstat.dtypes

IDfg          int64
Season        int64
Name         object
Team         object
Age           int64
             ...   
Events        int64
CStr%       float64
CSW%        float64
L-WAR       float64
Next_WAR    float64
Length: 133, dtype: object

In [14]:
# Show only the columns stored with the generic "object" dtype.
batstat.dtypes.loc[lambda kinds: kinds == "object"]

Name       object
Team       object
Dol        object
Age Rng    object
dtype: object

In [15]:
# Look at the raw values of the object-typed "Dol" column.
batstat["Dol"]

5562      $5.5
5006     $11.2
5252      $7.2
1169     $14.6
864      $22.0
         ...  
6002      $4.8
4881    ($2.6)
3377    ($3.5)
6620      $4.0
4396     $29.3
Name: Dol, Length: 6754, dtype: object

In [16]:
# Look at the raw values of the object-typed "Age Rng" column.
batstat["Age Rng"]

5562    28 - 28
5006    29 - 29
5252    30 - 30
1169    30 - 30
864     31 - 31
         ...   
6002    25 - 25
4881    23 - 23
3377    24 - 24
6620    25 - 25
4396    26 - 26
Name: Age Rng, Length: 6754, dtype: object

In [17]:
# Remove the two string-valued columns inspected above ("Dol" holds values
# such as "$5.5" / "($2.6)", "Age Rng" values such as "28 - 28").
# Reassigning instead of inplace=True, together with errors="ignore", keeps
# this cell safe to re-run: a second execution no longer raises KeyError.
batstat = batstat.drop(columns=["Dol", "Age Rng"], errors="ignore")

In [18]:
# Re-check which columns are still stored with the "object" dtype.
batstat.dtypes.loc[lambda kinds: kinds == "object"]

Name    object
Team    object
dtype: object

In [19]:
# Encode the team abbreviation as an integer category code.
team_as_category = batstat['Team'].astype("category")
batstat["Team Code"] = team_as_category.cat.codes

In [20]:
# Spot-check the team encoding and Next_WAR on a single player's rows.
machado_rows = batstat["Name"] == "Manny Machado"
batstat.loc[machado_rows, ['Name', "Season", "Team", "Team Code", "WAR", "Next_WAR"]]

Unnamed: 0,Name,Season,Team,Team Code,WAR,Next_WAR
4074,Manny Machado,2012,BAL,4,1.3,5.0
3475,Manny Machado,2013,BAL,4,5.0,2.3
3005,Manny Machado,2014,BAL,4,2.3,6.6
905,Manny Machado,2015,BAL,4,6.6,6.2
1063,Manny Machado,2016,BAL,4,6.2,1.7
3311,Manny Machado,2017,BAL,4,1.7,7.0
707,Manny Machado,2018,- - -,0,7.0,2.2
2775,Manny Machado,2019,SDP,26,2.2,2.6
378,Manny Machado,2020,SDP,26,2.6,4.3
1802,Manny Machado,2021,SDP,26,4.3,7.4


In [21]:
# The text "Team" column is no longer needed once "Team Code" exists.
# Reassigning with errors="ignore" (instead of inplace=True) means running
# this cell a second time does not raise KeyError.
batstat = batstat.drop(columns="Team", errors="ignore")

In [22]:
# Keep an untouched snapshot (including rows with missing values) in batstat2,
# then continue with only the rows that have no missing value in any column.
batstat2 = batstat.copy()
batstat = batstat.dropna(axis=0, how="any").copy()

**Principal Component Analysis**

**Scale Data**

In [23]:
# Columns that must not be scaled or used as features: the target plus
# identifier-like columns. The id column is spelled "IDfg" (it was previously
# misspelled "IDgf", so the player id slipped into selected_columns).
# "Team" has already been dropped; listing it is harmless.
removed_columns = ["Next_WAR", "Name", "Team", "IDfg", "Season"]
selected_columns = batstat.columns[~batstat.columns.isin(removed_columns)]

In [25]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

# Standardise the feature columns exactly once. The earlier version called
# sc.transform again on the already-standardised values, which subtracted the
# original means and divided by the original standard deviations a second time.
# Assigning through batstat[...] replaces the columns, so integer columns
# become float without dtype-mismatch warnings.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test rows influence the scaling -- consider fitting on the training set only.
batstat[selected_columns] = sc.fit_transform(batstat[selected_columns])

**Train-Test Split**

In [26]:
# Separate the target from everything else.
# NOTE(review): X keeps every column except Next_WAR, so it still contains
# "Name", "IDfg" and "Season" -- confirm that modelling code restricts X to
# the intended feature columns before fitting.
y = batstat["Next_WAR"]
X = batstat.drop(columns=["Next_WAR"])

In [27]:
from sklearn.model_selection import train_test_split

# Random 75/25 split (0.25 is scikit-learn's default test share); the fixed
# random_state makes the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=100,
)

In [28]:
# Report the dimensions of the four split pieces.
shape_report = f"""
X Train Shape {X_train.shape}
X Test Shape {X_test.shape}

y Train Shape {y_train.shape}
y Test Shape {y_test.shape}
"""
print(shape_report)


X Train Shape (4181, 130)
X Test Shape (1394, 130)

y Train Shape (4181,)
y Test Shape (1394,)



In [36]:
from sklearn.linear_model import LinearRegression
# NOTE(review): SequentialFeatureSelector was added in scikit-learn 0.24. The
# ImportError recorded for this cell suggests the active environment has an
# older release -- confirm the installed version and upgrade scikit-learn.
from sklearn.feature_selection import SequentialFeatureSelector

ImportError: cannot import name 'SequentialFeatureSelector' from 'sklearn.feature_selection' (C:\Users\tflanagan\Anaconda3\envs\learn-env\lib\site-packages\sklearn\feature_selection\__init__.py)