In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import numpy as np

In [2]:
def advancedBattingScraper(year, timeout=30):
  """Scrape one season of team advanced batting data from Baseball Reference.

  Downloads the league-wide advanced batting page for ``year``, collects the
  cell text of the ``#teams_advanced_batting`` table and reshapes it into one
  row per team.

  Args:
    year: Season to fetch. It is interpolated into the page URL and stored,
      as a string, in the ``Year`` column.
    timeout: Seconds to wait for the server before giving up.

  Returns:
    A pandas DataFrame with one column per table heading (``HardH%`` and
    ``EV`` are dropped when present) plus ``Year`` and ``Tm``. Every value is
    the text scraped from the page; nothing is converted to a number.

  Raises:
    requests.HTTPError: The server answered with a 4xx/5xx status.
    requests.Timeout: The server did not answer within ``timeout`` seconds.
    ValueError: 'Tm' or 'League Average' is missing from the scraped cells,
      or the number of data cells does not fit the teams x headings grid.
  """

  #fetching the HTML request object; the timeout keeps a stalled connection from hanging the notebook
  htmlData=requests.get(f'https://www.baseball-reference.com/leagues/majors/{year}-advanced-batting.shtml', timeout=timeout)

  #failing right away on an HTTP error instead of parsing an error page
  #(which would otherwise surface later as a confusing "'Tm' is not in list")
  htmlData.raise_for_status()

  #converting it into a Beautiful Soup object
  #NOTE(review): no parser is named, so bs4 picks whichever one is installed - pin one once the intended parser is confirmed
  soupObject=BeautifulSoup(htmlData.text)

  #getting the HTML components using the nodes from the Selector Gadget Extension!
  hittingTableData=soupObject.select('#teams_advanced_batting tbody .left , #teams_advanced_batting tbody .right , #teams_advanced_batting .poptip')

  #converting the HTML components to just their values and storing them in an array
  #NOTE(review): the slicing below relies on this list holding the team names first
  #(ending with 'League Average'), then the data cells, then the headings - confirm
  #that select() returns them in that order with the installed bs4 version
  advancedBattingDataAr=[element.text for element in hittingTableData]

  #Dynamically getting all the headings of the dataframe
  TmIndex=advancedBattingDataAr.index('Tm')
  headings=advancedBattingDataAr[TmIndex:]

  #removing Teams since the Teams column shows up in column instead of row format (we deal with it later)
  headings.remove('Tm')

  #getting all the teams in one array
  indexToStop=advancedBattingDataAr.index('League Average')
  indices=advancedBattingDataAr[:indexToStop]

  #extracting just the data and reshaping a 1d array into a 2d array
  #the tail that is cut off is the 'League Average' row (len(headings) cells)
  #followed by the headings plus 'Tm' (len(headings)+1 cells)
  advancedBattingData=advancedBattingDataAr[indexToStop+1:-len(headings)*2-1]
  temp_arr=np.array(advancedBattingData).reshape(indexToStop,len(headings))

  #converting the array to a Pandas Dataframe and adding Year and Teams
  advancedBattingDf = pd.DataFrame(temp_arr, columns=headings)
  advancedBattingDf['Year']=np.full(len(indices),f'{year}')
  advancedBattingDf['Tm']=indices

  #Dropping these two columns as they aren't present in all tables from 2001-2021
  #errors='ignore' skips whichever of them this season's table does not have
  return advancedBattingDf.drop(columns=['HardH%','EV'], errors='ignore')





In [3]:
# Scrape every season from 2001 through 2021, then stack the yearly tables with a
# single concat call; concatenating inside the loop re-copied all earlier rows on
# every iteration.
# Each yearly table keeps its own row labels, so the labels repeat across seasons.
yearlyAdvancedBattingDfs=[advancedBattingScraper(year) for year in range(2001,2022)]
TeamAdvancedBattingDf=pd.concat(yearlyAdvancedBattingDfs)

In [4]:
# Show the combined table as this cell's output
TeamAdvancedBattingDf

Unnamed: 0,rOBA,Rbat+,BAbip,ISO,HR%,SO%,BB%,LD%,GB%,FB%,...,Cent%,Oppo%,WPA,cWPA,RE24,RS%,SB%,XBT%,Year,Tm
0,.330,91,.290,.144,2.5%,16.1%,7.9%,25.5%,37.7%,29.5%,...,50.5%,20.1%,-10.0,-4.0%,-93.3,28%,69%,44%,2001,Anaheim Angels
1,.343,91,.294,.175,3.3%,16.6%,9.3%,24.6%,38.9%,28.6%,...,51.8%,19.7%,-2.9,-1.3%,28.4,31%,65%,42%,2001,Arizona Diamondbacks
2,.328,85,.290,.152,2.8%,16.9%,8.0%,22.6%,44.2%,26.2%,...,52.5%,21.2%,-5.9,-6.3%,-38.8,30%,65%,45%,2001,Atlanta Braves
3,.317,88,.278,.131,2.2%,16.1%,8.4%,24.6%,39.2%,28.0%,...,49.7%,19.9%,-10.0,-2.0%,-85.3,30%,72%,47%,2001,Baltimore Orioles
4,.341,100,.300,.173,3.2%,18.1%,8.3%,25.2%,40.2%,27.7%,...,51.6%,20.0%,-2.3,-1.5%,-19.2,30%,57%,34%,2001,Boston Red Sox
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25,.322,99,.287,.168,3.3%,22.4%,8.0%,22.8%,40.5%,28.2%,...,53.8%,16.7%,1.2,-1.3%,3.6,30%,80%,43%,2021,St. Louis Cardinals
26,.328,112,.294,.186,3.6%,24.8%,9.4%,22.9%,42.4%,26.2%,...,52.9%,17.7%,8.0,6.1%,120.0,35%,68%,47%,2021,Tampa Bay Rays
27,.297,82,.280,.143,2.8%,23.2%,7.3%,22.5%,46.3%,23.3%,...,54.4%,18.5%,-16.7,-2.9%,-136.9,29%,79%,41%,2021,Texas Rangers
28,.345,113,.296,.200,4.3%,20.1%,8.2%,24.5%,40.5%,27.1%,...,53.8%,17.6%,3.7,2.9%,103.8,33%,80%,41%,2021,Toronto Blue Jays


In [6]:
# Save the combined table as a CSV file, leaving the row labels out of the file
outputCsvPath='AdvancedBatting.csv'
TeamAdvancedBattingDf.to_csv(outputCsvPath, index=False)
