In [1]:
import numpy as np
import pandas as pd
import altair as alt
from altair import Chart, X, Y, Color, Scale

In [2]:
import requests

In [None]:
#I am interested in examining the relationship between sprint speed and stolen bases
#I want to identify players who do not possess a high sprint speed but are still effective base-stealers
#Sprint speed data is from Baseball Savant - https://baseballsavant.mlb.com/
#Stolen base data is from fangraphs.com
#This statistical exploration is inspired by Foolish Bailey's YouTube video (Reddit discussion thread): https://www.reddit.com/r/baseball/comments/19epvzj/foolish_bailey_freddie_freeman_baserunning_genius/
#and the FanGraphs article: https://blogs.fangraphs.com/lets-talk-about-freddie-freeman-baserunner-extraordinaire/

In [7]:
# Read the sprint-speed export into a DataFrame.
# The path is relative, so it is resolved against the kernel's working directory.
sprint_speed_csv_path = 'sprint_speed.csv'
sprint_speed_df = pd.read_csv(sprint_speed_csv_path)

In [None]:
# Preview the name and sprint-speed columns for the first 20 rows.
# `.head(20)` is positional; `.loc[:20]` slices by label and includes the end
# label, which yields 21 rows on a default 0..n index.
# This cell reads the original 'last_name, first_name' column label, so it has to
# run before that column is renamed.
sprint_speed_df[['last_name, first_name', 'sprint_speed']].head(20)


In [131]:
# Give the combined 'last_name, first_name' column the shorter label 'Name'.
# Reassigning the result (rather than inplace=True) keeps the step explicit;
# rename() ignores labels that are absent, so re-running the cell is harmless.
sprint_speed_df = sprint_speed_df.rename(columns={'last_name, first_name': 'Name'})

# Display the first 20 rows of the DataFrame with the renamed column
sprint_speed_df.head(20)


Unnamed: 0,Name,player_id,team_id,team,position,age,competitive_runs,bolts,hp_to_1b,sprint_speed
0,Elly De La Cruz,682829,113,CIN,SS,21,162,84.0,4.13,30.5
1,Bobby Witt Jr.,677951,118,KC,SS,23,262,149.0,4.12,30.5
2,Dairon Blanco,680118,118,KC,RF,30,55,31.0,4.15,30.3
3,Trea Turner,607208,143,PHI,SS,30,288,115.0,4.14,30.3
4,Bubba Thompson,669352,140,TEX,LF,25,28,19.0,4.23,30.2
5,Tyler Fitzgerald,666149,137,SF,CF,25,12,6.0,,30.1
6,Jorge Mateo,622761,110,BAL,SS,28,125,50.0,4.2,30.1
7,Corbin Carroll,682998,109,AZ,RF,22,298,133.0,4.08,30.1
8,Blake Perkins,663368,158,MIL,RF,26,58,22.0,4.14,30.0
9,Jordan Lawlar,691783,109,AZ,SS,20,13,9.0,4.2,30.0


In [115]:
#Top Sprint Speed - ft/s

# Order every player from fastest to slowest and keep the leading 20 rows
sorted_df = sprint_speed_df.sort_values(by='sprint_speed', ascending=False).head(20)

# Encodings for a horizontal bar chart: bar length and viridis colour both track sprint speed,
# and players are ordered on the y axis by descending x value
speed_x = alt.X('sprint_speed', title="Sprint Speed")
player_y = alt.Y('Name', sort='-x', title="Player")
speed_color = alt.Color('sprint_speed', scale=alt.Scale(scheme='viridis'))

chart = alt.Chart(sorted_df).mark_bar().encode(x=speed_x, y=player_y, color=speed_color)
chart

In [None]:
#Top Sprint Speed Leaders in MLB 2023, no minimum number of plate appearances or at-bats

In [11]:
#Competitive Runs

# The 20 players with the most competitive runs, highest first
competitive_runs = sprint_speed_df.sort_values(by='competitive_runs', ascending=False).head(20)

# Orange horizontal bars, with players ordered by descending competitive runs
runs_bars = alt.Chart(competitive_runs).mark_bar(color='orange')
runs_bars.encode(
    x=alt.X('competitive_runs', title="Competitive Runs"),
    y=alt.Y('Name', sort='-x', title="Player"),
)


In [78]:
#Top Competitive Runs leaders in 2023 with no min. PAs or SBs. 

#Sprint Speed is Statcast’s foot speed metric, defined as “feet per second in a player’s fastest one-second window” on individual plays. 
#For a player’s seasonal average, the following two types of plays currently qualify for inclusion in Sprint Speed. 
#The best of these runs, approximately two-thirds, are averaged for a player’s seasonal average.
#* Runs of two bases or more on non-homers, excluding being a runner on second base when an extra base hit happens
#* Home to first on “topped” or “weakly hit” balls.

#In the context of the graph, Steven Kwan has 334 instances in 2023 meeting these criteria.
#Competitive Runs help determine whether a player is fast in higher-stakes scenarios
#For example, a player having a fast sprint-speed during low-leverage situations (like a flyout) is not as valuable (or even worth considering) 
#as a player who utilizes their speed in high leverage scenarios listed above.

In [111]:

# Sprint speed of the 20th-fastest player overall; used as the cutoff for "top 20 sprint speed"
top_20_sprint_speed = sprint_speed_df['sprint_speed'].nlargest(20).min()

# Blue for players at or above the cutoff, red for everyone else
speed_rank_color = alt.condition(
    alt.datum.sprint_speed >= top_20_sprint_speed,
    alt.value('blue'),
    alt.value('red'),
)

# Scatter of the competitive-runs leaders: sprint speed on x, competitive runs on y.
# Neither axis is forced to start at zero; the canvas is enlarged to 800x600.
chart = (
    alt.Chart(competitive_runs)
    .mark_circle(size=100, opacity=0.7)
    .encode(
        x=alt.X('sprint_speed', scale=alt.Scale(zero=False, padding=1)),
        y=alt.Y('competitive_runs', scale=alt.Scale(zero=False, padding=1)),
        tooltip=['Name', 'sprint_speed', 'competitive_runs'],
        color=speed_rank_color,
    )
    .properties(width=800, height=600)
)

# Player-name labels, offset 7px to the right of each point, in a 12pt font
text = chart.mark_text(align='left', baseline='middle', dx=7, fontSize=12).encode(text='Name')

# Layer the labels over the points
chart + text

In [None]:
#Top 20 Competitive Runs leaders, with Competitive Runs plotted against sprint speed. If a player is blue, they are also in the top 20 for sprint speed
#While the difference in sprint speed is almost negligible for the top players, some players are significantly better in 
#competitive runs than the top sprint-speed leader.
#My question is: Can players be effective base stealers without possessing elite sprint-speed?

In [177]:
#Scrape to get top SB data and then plot SB against sprint speed

# FanGraphs major-league leaderboard page; the query string asks for qual=0 and pageitems=100
FANGRAPHS_LEADERS_URL = 'https://www.fangraphs.com/leaders/major-league?qual=0&pageitems=100&type=0&sortcol=19&sortdir=default&pagenum=1'

# requests has no default timeout; without one the cell can hang indefinitely
result = requests.get(FANGRAPHS_LEADERS_URL, timeout=30)
print(result.status_code)
# Fail here on a 4xx/5xx reply instead of handing an error page to the parser
result.raise_for_status()
sb_text = result.text
                      

200


In [178]:
# Show the first 2000 characters of the response body as a quick check of what was downloaded
sb_text[:2000]

'<!DOCTYPE html><html lang="en"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="description"/><link rel="canonical"/><title>Major League Leaderboards - 2023 - Batting | FanGraphs Baseball</title><meta name="next-head-count" content="5"/><link rel="shortcut icon" href="https://www.fangraphs.com/favicon.ico"/><link rel="search" type="application/opensearchdescription+xml" href="https://cdn.fangraphs.com/opensearch.xml" title="FanGraphs Search"/><link href="https://fonts.googleapis.com/css?family=Lato:400,700&amp;subset=latin-ext" rel="stylesheet"/><meta property="og:locale" content="en_US"/><meta property="og:type" content="website"/><meta property="og:site_name" content="FanGraphs Baseball"/><meta property="og:image" content="https://www.fangraphs.com/blogs/wp-content/uploads/2016/04/flat_fg_green.png"/><meta name="twitter:card" content="summary_large_image"/><link rel="preload" href="/_next/static/css/95d2de09514e87ac.css" as="style"/><link 

In [179]:
from bs4 import BeautifulSoup

# Parse the downloaded page text with the 'html.parser' backend so its tags can be searched
soup = BeautifulSoup(sb_text, 'html.parser')

In [180]:
#Scrape Names, Stolen Bases (SB) and Caught Stealing (CS) cells from the parsed page

def _find_stat_cells(col_id, css_class):
    """Return every <td> in `soup` whose data-col-id and data-stat both equal
    `col_id` and whose class attribute matches `css_class`."""
    return soup.find_all('td', attrs={'data-col-id': col_id, 'data-stat': col_id, 'class': css_class})


name_elements = _find_stat_cells('Name', 'align-left fixed')
sb_elements = _find_stat_cells('SB', 'align-right')
cs_elements = _find_stat_cells('CS', 'align-right')

# An empty result would otherwise only show up later as a KeyError on the
# missing DataFrame columns.
if not name_elements:
    raise ValueError("No 'Name' cells found in the page; the markup may have changed or the table was not in the HTML")

# zip() stops at the shortest input, so unequal lengths would silently drop
# players or pair a name with another player's numbers.
if not (len(name_elements) == len(sb_elements) == len(cs_elements)):
    raise ValueError(
        "Scraped column lengths differ: "
        f"{len(name_elements)} names, {len(sb_elements)} SB, {len(cs_elements)} CS"
    )

# One dict per player; values stay as stripped strings and are converted to
# numbers by the later analysis cells.
player_data = [
    {
        'Name': name_element.text.strip(),
        'Stolen Bases': sb_element.text.strip(),
        'Caught Stealing': cs_element.text.strip(),
    }
    for name_element, sb_element, cs_element in zip(name_elements, sb_elements, cs_elements)
]


In [185]:
# One row per scraped player; columns come from the dict keys
df = pd.DataFrame(data=player_data)

# Show the first 20 rows
df.head(20)

Unnamed: 0,Name,Stolen Bases,Caught Stealing
0,Ronald Acuña Jr.,73,14
1,Esteury Ruiz,67,13
2,Corbin Carroll,54,5
3,Bobby Witt Jr.,49,15
4,CJ Abrams,47,4
5,Nico Hoerner,43,7
6,Ha-Seong Kim,38,9
7,Julio Rodríguez,37,10
8,Elly De La Cruz,35,8
9,Willi Castro,33,5


In [182]:
# The scraped values are strings. Convert both stat columns to numbers
# (unparseable values become NaN) so that the quantitative (:Q) encodings and
# tooltips below receive real numbers for 'Caught Stealing' as well as 'Stolen Bases'.
for stat_col in ('Stolen Bases', 'Caught Stealing'):
    df[stat_col] = pd.to_numeric(df[stat_col], errors='coerce')

# The 20 players with the most stolen bases
top_20_stolen_base_leaders = df.nlargest(20, 'Stolen Bases')

# Colour runs from blue (fewest SB among these 20) to red (most SB)
sb_min = top_20_stolen_base_leaders['Stolen Bases'].min()
sb_max = top_20_stolen_base_leaders['Stolen Bases'].max()
color_scale = alt.Scale(domain=(sb_min, sb_max), range=['blue', 'red'])

# Scatter plot: caught stealing on x, stolen bases on y, pan/zoom enabled
scatter_plot = alt.Chart(top_20_stolen_base_leaders).mark_circle(size=100).encode(
    x=alt.X('Caught Stealing:Q', title="Caught Stealing"),
    y=alt.Y('Stolen Bases:Q', title="Stolen Bases"),
    tooltip=['Name', 'Caught Stealing', 'Stolen Bases'],
    color=alt.Color('Stolen Bases:Q', scale=color_scale),
).properties(
    title='Stolen Bases vs. Caught Stealing'
).interactive()

scatter_plot

In [None]:
#This graph isn't that useful because if a player has more SB, they are likely to have more CS. 

In [183]:

# Convert 'Stolen Bases' and 'Caught Stealing' columns to numeric (unparseable values become NaN)
df['Stolen Bases'] = pd.to_numeric(df['Stolen Bases'], errors='coerce')
df['Caught Stealing'] = pd.to_numeric(df['Caught Stealing'], errors='coerce')

# SB-to-CS ratio for each player, computed column-wise.
# A player who was never caught (CS == 0) has no finite ratio, so the raw SB
# count is used as the stand-in value for those rows.
stolen = df['Stolen Bases']
caught = df['Caught Stealing']
df['SB_CS_Ratio'] = stolen.div(caught).where(caught != 0, stolen)

# Drop exact repeat rows only. De-duplicating on 'SB_CS_Ratio' alone would throw
# away different players who merely share the same ratio and distort the ranking.
df = df.drop_duplicates(subset=['Name', 'Stolen Bases', 'Caught Stealing'])

# Keep the 10 players with the highest ratio.
# NOTE: the variable is called df_top_20 but holds the top 10; the name is kept
# so any other cell that refers to it still works.
df_top_20 = df.nlargest(10, 'SB_CS_Ratio')

# Scatter plot for the top 10 players: caught stealing on x, ratio on y
scatter_plot = alt.Chart(df_top_20).mark_circle(size=100, color='blue').encode(
    x=alt.X('Caught Stealing:Q', title="Caught Stealing"),
    y=alt.Y('SB_CS_Ratio:Q', title="SB to CS Ratio"),
    tooltip=['Name', 'Caught Stealing', 'Stolen Bases', 'SB_CS_Ratio']
)

# Name labels next to each point. The text layer inherits x/y (and their axis
# titles) from scatter_plot; re-encoding them without titles would give the two
# layers different axis titles for the layered chart to reconcile.
text = scatter_plot.mark_text(
    align='left',
    baseline='middle',
    dx=7,  # Horizontal offset of the text labels
    dy=0   # Vertical offset of the text labels
).encode(
    text='Name'
)

# Combine scatter plot and text labels
scatter_plot_with_labels = (scatter_plot + text).properties(
    title='Top 10 SB to CS Ratio for Each Player'
).interactive()

# Display with larger plot dimensions
scatter_plot_with_labels.properties(
    width=600,
    height=400
)


In [None]:
#Converted SB and CS into a ratio SB/CS. Trea Turner is a SB and sprint-speed leader, and had a 100% success rate on his steal attempts
#Freddie Freeman has the second-best SB to CS ratio and is nowhere near the sprint-speed leaders, but is among the Competitive Runs leaders.

In [138]:
#While sprint speed is useful to look at, a more all-encompassing statistic for determining a player's
#baserunning value is FanGraphs' BsR:
#BsR: a baserunning statistic that turns stolen bases, caught stealing, and other baserunning plays (taking extra bases, being thrown out on the bases, etc.) into runs above and below average.
#It is the combination of Weighted Stolen Base Runs (wSB), Weighted Grounded Into Double Play Runs (wGDP), and Ultimate Base Running (UBR), which are all available on the leaderboards and player pages.

Unnamed: 0,Name,Stolen Bases,Caught Stealing,SB_CS_Ratio
0,"Ronald Jr.,",73,14,5.214286
1,"Esteury Ruiz,",67,13,5.153846
2,"Corbin Carroll,",54,5,10.8
3,"Bobby Jr.,",49,15,3.266667
4,"Cj Abrams,",47,4,11.75


In [None]:
#But, from our current scraped data, we don't have this information.
#For future exploration, it would be beneficial to consider these advanced baserunning metrics.
#To conclude, Freddie Freeman is our most out-of-the-ordinary base runner, being nowhere near
#the top 20 sprint speed leaders, yet possessing elite base-stealing abilities.
#It would be useful to find the characteristics of the pitches/pitchers he steals bases on.
#For example: Does he tend to have a high steal success rate when stealing on curveballs?