# Generating Figures

###### Scope: Our project is to uncover data trends among All-Star MLB pitchers from 2016-2021. We will examine relationships between their strikeout % (K%), ERA (earned run average), and walk % (BB%) in their All-Star seasons, as well as some of their physical dimensions and place of birth (state if domestic, country if foreign-born).

In [None]:
import csv
import json
from pprint import pprint

import numpy as np
import pandas as pd
import requests
from matplotlib import pyplot as plt
from scipy.stats import linregress

In [None]:
# Locations of the CSV files generated in 'Cleaning DataFrames.ipynb'
fpath1 = 'Resources/roster_and_stats_merged.csv'
fpath2 = 'Resources/dimensions_dict.csv'

# Plain loads with the default integer index.
# NOTE(review): league_df is read from the dimensions file here, not from a
# league-wide stats file; both names are reassigned in the
# "All Stars vs. League-wide Statistics" section before they are used.
allstar_df = pd.read_csv(fpath1)
league_df = pd.read_csv(fpath2)

# Same two files again, this time using the first CSV column as the index
stats_df = pd.read_csv(fpath1, index_col=0)
players_df = pd.read_csv(fpath2, index_col=0)


In [None]:
# Preview the first rows of the merged roster/stats table
stats_df.head()

In [None]:
# Abbreviate the country name everywhere it appears
# (presumably so it fits as a tick label in the plots below)
players_df = players_df.replace(to_replace='Dominican Republic',
                                value='Dom. Rep.')
players_df.head(n=2)

### Gathering statistics based on All Star birth-location

In [None]:
# Number of players per birth country
country_grp = players_df.groupby(['birth_country'])['birth_country'].count()

# Titles: the year range goes on the axes, the main title on the figure
title = 'All Star Players by Birth Country'
subtitle = '2015-2019'

# Bar chart of the per-country counts
bar_opts = dict(
    title=subtitle,
    figsize=(5, 5),
    xlabel='Country',
    ylabel='Number of Players',
    rot=50,
)
country_grp.plot(kind='bar', **bar_opts)
plt.suptitle(title)
plt.tight_layout()

# Save the figure (comment this line out to avoid regenerating the file)
plt.savefig(f'Images/{title}.png')

In [None]:
# Per-country counts with the USA row left out
country_grp2 = country_grp[country_grp.index != 'USA']

# USA-born players and their per-state counts.
# NOTE(review): neither name is used in this cell; the state plot cell
# recomputes both before using them.
as_us = players_df[players_df['birth_country'] == 'USA']
state_grp = as_us.groupby(["birth_state"])['birth_state'].count()

# Turn the country counts into a dataframe and sort it, largest first
df = pd.DataFrame(country_grp2)
df2 = df.rename(columns={"birth_country": "num"})
df2 = df2.sort_values(by=['num'], axis=0, ascending=False)

# Titles for the non-USA country plot
title = 'All Star Players by (non-USA) Birth Country'
subtitle = '2015-2019'

bar_opts = dict(
    title=subtitle,
    figsize=(5, 5),
    xlabel='Country',
    ylabel='Number of Players',
    rot=50,
)
df2.plot(kind='bar', **bar_opts)
plt.suptitle(title)
plt.tight_layout()
# The single-column legend adds nothing, so take it off
plt.legend().remove()

# Save the figure (comment this line out to avoid regenerating the file)
plt.savefig(f'Images/{title}.png')

In [None]:
# Total number of USA-born and non-USA-born players, taken from the
# per-country counts
country_df = pd.DataFrame(country_grp)
is_usa = country_df.index == 'USA'
non = int(country_df.loc[~is_usa, 'birth_country'].sum())
usa = int(country_df.loc[is_usa, 'birth_country'].sum())

# Labels and titles for the two-bar comparison
labels = ['Non-USA', 'USA']
xvals = range(len(labels))
title = 'Non-US vs. USA-borne All Star Players'
subtitle = '2015-2019'

# One bar per group, each with its own legend entry
fig = plt.figure(figsize=(5, 5))
plt.bar(xvals[0], non, width=0.1, label="Non-USA")
plt.bar(xvals[1], usa, width=0.1, label="USA")
plt.legend()

plt.xticks(xvals, labels=labels)
plt.title(subtitle)
plt.suptitle(title)
plt.ylabel('Number of Players')

# Apply the layout to the figure that holds the bars. No second
# plt.figure() call here: it would open a new, empty figure and the layout
# would be applied to that one instead of the figure being saved.
plt.tight_layout()

# Save the figure (comment this line out to avoid regenerating the file)
fig.savefig(f'Images/{title}.png')

In [None]:
# Keep the USA-born players and count them per birth state
as_us = players_df[players_df['birth_country'] == 'USA']
state_grp = as_us.groupby(["birth_state"])['birth_state'].count()

# Dataframe of the state counts, sorted largest first
df = pd.DataFrame(state_grp)
df2 = df.rename(columns={"birth_state": "num"})
df2 = df2.sort_values(by=['num'], axis=0, ascending=False)

# Titles for the state plot
title = 'All Star Players by Birth State'
subtitle = '2015-2019'

bar_opts = dict(
    title=subtitle,
    color='orange',
    xlabel='State',
    ylabel='Number of Players',
    rot=50,
)
df2.plot(kind='bar', **bar_opts)
plt.suptitle(title)
plt.tight_layout()
# The single-column legend adds nothing, so take it off
plt.legend().remove()

# Save the figure (comment this line out to avoid regenerating the file)
plt.savefig(f'Images/{title}.png')

In [None]:
# Attach each player's season statistics to their dimensions row; the left
# join keeps every row of players_df
by_weight = pd.merge(
    left=players_df,
    right=stats_df,
    how='left',
    left_on='name_display_first_last',
    right_on='Player Name',
)

# Titles for the ERA+ against weight scatter
title = 'All Star Players ERA+ vs. Weight'
subtitle = '2015-2019'

by_weight.plot(x='weight', y='ERA+', kind='scatter',
               title=subtitle, rot=45, xlabel='Weight')
plt.suptitle(title)

# Save the figure (comment this line out to avoid regenerating the file)
plt.savefig(f'Images/{title}.png')


### Analyzing All Stars vs. League-wide Statistics

In [None]:
# Path of the CSV that is loaded as the league-wide dataframe below
fpath3 = 'Resources/league_stats.csv'

In [None]:
# Load the merged roster/stats CSV as the All-Star dataframe
# (default integer index; replaces any earlier allstar_df)
allstar_df = pd.read_csv(fpath1)

# Load the league-wide CSV (replaces any earlier league_df)
league_df = pd.read_csv(fpath3)

In [None]:
# Preview the first two All-Star rows
allstar_df.head(2)

In [None]:
# Preview the first two league-wide rows
league_df.head(2)

In [None]:
# Average every numeric All-Star statistic per season.
# numeric_only=True keeps text columns such as 'Player Name' out of the
# mean; recent pandas versions raise a TypeError on them instead of
# silently dropping them.
allstar_year_group = allstar_df.groupby('Year')
allstar_year_mean = allstar_year_group.mean(numeric_only=True)

# Move 'Year' from the index back into a regular column for plotting
allstar_year_df = allstar_year_mean.reset_index()
allstar_year_df.head(2)

In [None]:
# Average every numeric league-wide statistic per season.
# numeric_only=True makes the mean safe if the table carries any text
# columns; recent pandas versions raise a TypeError on them instead of
# silently dropping them.
league_year_group = league_df.groupby('Year')
league_year_mean = league_year_group.mean(numeric_only=True)

# Move 'Year' from the index back into a regular column for plotting
league_year_df = league_year_mean.reset_index()
league_year_df.head(2)

In [None]:
# Defining stats to include in plot
a = 'Year'
b = 'ERA+'

# Year labels come from the league table; the All-Star means are assumed to
# cover the same seasons in the same order -- TODO confirm
labels_year = league_year_df[a].tolist()
star_ERA = allstar_year_df[b].tolist()
league_ERA = league_year_df[b].tolist()

x = np.arange(len(labels_year))  # the label locations
width = 0.35  # the width of the bars

# Paired bars: All-Stars left of each tick, league right of it
fig, ax = plt.subplots()
rects1 = ax.bar(x - width/2, star_ERA, width, label=f'All Star {b}')
rects2 = ax.bar(x + width/2, league_ERA, width, label=f'League {b}')

plt.legend()
# Tick positions follow the data rather than a hard-coded count of five,
# so the labels line up with the bars for any number of seasons
plt.xticks(x, labels=labels_year)
plt.title(f'All Star vs. League in Average {b} by Year')
plt.xlabel(f'{a}')
plt.ylabel(f'{b}')
plt.ylim((0, 200))

# Save the figure (comment this line out to avoid regenerating the file)
plt.savefig('Images/Allstar_vs_League_ERA+.png', dpi=400)

In [None]:
# Defining stats to include in plot
a = 'Year'
b = 'K%'

# Year labels come from the league table; the All-Star means are assumed to
# cover the same seasons in the same order -- TODO confirm
labels_year = league_year_df[a].tolist()
star_K = allstar_year_df[b].tolist()
league_K = league_year_df[b].tolist()

x = np.arange(len(labels_year))  # the label locations
width = 0.35  # the width of the bars

# Paired bars: All-Stars left of each tick, league right of it
fig, ax = plt.subplots()
rects1 = ax.bar(x - width/2, star_K, width, label=f'All Star {b}')
rects2 = ax.bar(x + width/2, league_K, width, label=f'League {b}')

plt.legend()
# Tick positions follow the data rather than a hard-coded count of five
plt.xticks(x, labels=labels_year)
plt.title(f'All Star vs. League in Average {b} by Year')
plt.xlabel(f'{a}')
plt.ylabel(f'{b}')
plt.ylim((0, 0.4))

# Save before showing: once plt.show() returns the figure may already be
# closed, and a later savefig would then write an empty image.
# (Comment the savefig line out to avoid regenerating the file.)
plt.savefig('Images/Allstar_vs_League_K%.png', dpi=400)

plt.show()

In [None]:
# Defining stats to include in plot
a = 'Year'
b = 'BB%'

# Year labels come from the league table; the All-Star means are assumed to
# cover the same seasons in the same order -- TODO confirm
labels_year = league_year_df[a].tolist()
star_BB = allstar_year_df[b].tolist()
league_BB = league_year_df[b].tolist()

x = np.arange(len(labels_year))  # the label locations
width = 0.35  # the width of the bars

# Paired bars: All-Stars left of each tick, league right of it
fig, ax = plt.subplots()
rects1 = ax.bar(x - width/2, star_BB, width, label=f'All Star {b}')
rects2 = ax.bar(x + width/2, league_BB, width, label=f'League {b}')

plt.legend()
# Tick positions follow the data rather than a hard-coded count of five
plt.xticks(x, labels=labels_year)
plt.title(f'All Star vs. League in Average {b} by Year')
plt.xlabel(f'{a}')
plt.ylabel(f'{b}')
plt.ylim((0, 0.4))

# Save before showing: once plt.show() returns the figure may already be
# closed, and a later savefig would then write an empty image.
# (Comment the savefig line out to avoid regenerating the file.)
plt.savefig('Images/Allstar_vs_League_BB%.png', dpi=400)

plt.show()

In [None]:
# Box plot of All-Star ERA+, one box per season.
# NOTE(review): `fig` holds whatever DataFrame.boxplot returns (presumably
# the matplotlib Axes, since axis setters are called on it), not a Figure.
fig = allstar_df.boxplot(column="ERA+", by="Year")
fig.set_ylabel('ERA+')
fig.set_title('')

# Save the figure (comment this line out to avoid regenerating the file)
plt.savefig('Images/Allstar ERA+ Boxplot.png', dpi=400)

### Starting scatter plot trend analysis

In [None]:
# Trim 'dimensions_dict.csv' down to the columns needed for height/weight.
# Columns are picked by position, so this depends on the CSV column order.
dimensions = pd.read_csv(fpath2)

# First pass: positions within the raw file
first_pass = [1, 2, 3, 4, 5, 6, 7, 11, 26, 12, 14, 16]
dimensions2 = dimensions.iloc[:, first_pass]

# Second pass: positions within dimensions2; the result must contain
# 'height_feet', 'height_inches' and 'name_display_first_last', which the
# next cell reads
second_pass = [4, 2, 10, 6, 1, 3, 11]
dimensions3 = dimensions2.iloc[:, second_pass]
dimensions3.head(2)

In [None]:
# Work on a copy so dimensions3 itself is left untouched
height_df = dimensions3.copy()

# Total height in inches from the separate feet and inches columns
feet_part = height_df['height_feet']
inch_part = height_df['height_inches']
height_df['height'] = feet_part * 12 + inch_part

# Use the same player-name column label as the stats dataframes
height_df = height_df.rename(
    columns={"name_display_first_last": "Player Name"})

# Keep the first two columns plus the new 'height' column (position 7)
keep_positions = [0, 1, 7]
height_df = height_df.iloc[:, keep_positions]
height_df

In [None]:
# Join the All-Star statistics onto the player dimensions by player name,
# then drop exact duplicate rows and any row with a missing value
merge_dim = (
    pd.merge(left=height_df, right=allstar_df, how='left',
             left_on='Player Name', right_on='Player Name')
    .drop_duplicates()
    .dropna()
)
merge_dim.head(2)

In [None]:
from scipy.stats import linregress

# Season and columns to compare
year = 2019
x_axis = 'height'
y_axis = 'ERA+'
title = f'{y_axis} vs. {x_axis} for {year}'

# All-Stars of the selected season
df = merge_dim.loc[merge_dim['Year'] == year, :]
x_values = df[x_axis]
y_values = df[y_axis]

# Least-squares line of y on x
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = x_values * slope + intercept

# Slope is shown with two decimals: round(slope) without a digit count
# returns an integer, so the printed equation would not match the line
line_eq = "y = " + str(round(slope, 2)) + "x + " + str(round(intercept, 2))

plt.scatter(x_values, y_values)
plt.plot(x_values, regress_values, "r-")
# Equation text is placed at fixed data coordinates (height 72, ERA+ 250)
plt.annotate(line_eq, (72, 250), fontsize=15, color="red")
plt.title(title)
plt.xlabel(x_axis)
plt.ylabel(y_axis)

# Save the figure (comment this line out to avoid regenerating the file)
plt.savefig(f'Images/{title}_regression.png', dpi=400)

In [None]:
from scipy.stats import linregress

# Season and columns to compare
year = 2018
x_axis, y_axis = 'height', 'ERA+'
title = f'{y_axis} vs. {x_axis} for {year}'

# All-Stars of the selected season
df = merge_dim.loc[merge_dim['Year'] == year, :]
x_values, y_values = df[x_axis], df[y_axis]

# Least-squares line of y on x, plus the equation text for the plot
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = intercept + slope * x_values
line_eq = f"y = {round(slope, 1)}x + {round(intercept, 2)}"

# Points, fitted line, and the equation at fixed data coordinates
plt.scatter(x_values, y_values)
plt.plot(x_values, regress_values, "r-")
plt.annotate(line_eq, (72, 250), fontsize=15, color="red")
plt.xlabel(x_axis)
plt.ylabel(y_axis)
plt.title(title)

# Save the figure (comment this line out to avoid regenerating the file)
plt.savefig(f'Images/{title}_regression.png', dpi=400)

In [None]:
from scipy.stats import linregress

# Season and columns to compare
year = 2017
x_axis = 'height'
y_axis = 'ERA+'
title = f'{y_axis} vs. {x_axis} for {year}'

# All-Stars of the selected season
df = merge_dim.loc[merge_dim['Year'] == year, :]
x_values = df[x_axis]
y_values = df[y_axis]

# Least-squares line of y on x
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = x_values * slope + intercept

# Slope is shown with two decimals: round(slope) without a digit count
# returns an integer, so the printed equation would not match the line
line_eq = "y = " + str(round(slope, 2)) + "x + " + str(round(intercept, 2))

plt.scatter(x_values, y_values)
plt.plot(x_values, regress_values, "r-")
# Equation text is placed at fixed data coordinates (height 72, ERA+ 250)
plt.annotate(line_eq, (72, 250), fontsize=15, color="red")
plt.title(title)
plt.xlabel(x_axis)
plt.ylabel(y_axis)

# Save the figure (comment this line out to avoid regenerating the file)
plt.savefig(f'Images/{title}_regression.png', dpi=400)

In [None]:
from scipy.stats import linregress

# Season and columns to compare
year = 2016
x_axis, y_axis = 'height', 'ERA+'
title = f'{y_axis} vs. {x_axis} for {year}'

# All-Stars of the selected season
df = merge_dim.loc[merge_dim['Year'] == year, :]
x_values, y_values = df[x_axis], df[y_axis]

# Least-squares line of y on x, plus the equation text for the plot
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = intercept + slope * x_values
line_eq = f"y = {round(slope, 2)}x + {round(intercept, 2)}"

# Points, fitted line, and the equation at fixed data coordinates
plt.scatter(x_values, y_values)
plt.plot(x_values, regress_values, "r-")
plt.annotate(line_eq, (72, 250), fontsize=15, color="red")
plt.xlabel(x_axis)
plt.ylabel(y_axis)
plt.title(title)

# Save the figure (comment this line out to avoid regenerating the file)
plt.savefig(f'Images/{title}_regression.png', dpi=400)

In [None]:
from scipy.stats import linregress

# Season and columns to compare
year = 2015
x_axis = 'height'
y_axis = 'ERA+'
title = f'{y_axis} vs. {x_axis} for {year}'

# All-Stars of the selected season
df = merge_dim.loc[merge_dim['Year'] == year, :]
x_values = df[x_axis]
y_values = df[y_axis]

# Least-squares line of y on x
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = x_values * slope + intercept

# Slope is shown with two decimals: round(slope) without a digit count
# returns an integer, so the printed equation would not match the line
line_eq = "y = " + str(round(slope, 2)) + "x + " + str(round(intercept, 2))

plt.scatter(x_values, y_values)
plt.plot(x_values, regress_values, "r-")
# Equation text is placed at fixed data coordinates (height 72, ERA+ 250)
plt.annotate(line_eq, (72, 250), fontsize=15, color="red")
plt.title(title)
plt.xlabel(x_axis)
plt.ylabel(y_axis)

# Save the figure (comment this line out to avoid regenerating the file)
plt.savefig(f'Images/{title}_regression.png', dpi=400)

In [None]:
# Setting parameters for kernel
year = 2015
x_axis = 'height'
y_axis = 'ERA+'
title = f'{y_axis} vs. {x_axis} for {year} (Outliers removed)'

# Grabbing all stars for selected year
df = merge_dim.loc[merge_dim['Year'] == year, :]
era = df['ERA+']

# Outlier fences: 1.5 * IQR below the first / above the third quartile
quartiles = era.quantile([0.25, 0.5, 0.75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr = upperq - lowerq
lower_bound = lowerq - (1.5 * iqr)
upper_bound = upperq + (1.5 * iqr)
print(upper_bound)

# Keep only rows strictly inside both fences, so unusually low ERA+ values
# are dropped as well as unusually high ones
df2 = df.loc[(era > lower_bound) & (era < upper_bound), :]

# Gathering data for plotting
x_values = df2[x_axis]
y_values = df2[y_axis]

# Least-squares line of y on x
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = x_values * slope + intercept

# Slope is shown with two decimals: round(slope) without a digit count
# returns an integer, so the printed equation would not match the line
line_eq = "y = " + str(round(slope, 2)) + "x + " + str(round(intercept, 2))

plt.scatter(x_values, y_values)
plt.plot(x_values, regress_values, "r-")
plt.annotate(line_eq, (72, 250), fontsize=15, color="red")
plt.title(title)
plt.xlabel(x_axis)
plt.ylabel(y_axis)
# Fixed y-range so the per-season figures share the same scale
plt.ylim(75, 280)

# Save the figure (comment this line out to avoid regenerating the file)
plt.savefig(f'Images/{title}_regression.png', dpi=400)

In [None]:
# Setting parameters for kernel
year = 2016
x_axis = 'height'
y_axis = 'ERA+'
title = f'{y_axis} vs. {x_axis} for {year} (Outliers removed)'

# Grabbing all stars for selected year
df = merge_dim.loc[merge_dim['Year'] == year, :]
era = df['ERA+']

# Outlier fences: 1.5 * IQR below the first / above the third quartile
quartiles = era.quantile([0.25, 0.5, 0.75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr = upperq - lowerq
lower_bound = lowerq - (1.5 * iqr)
upper_bound = upperq + (1.5 * iqr)
print(upper_bound)

# Keep only rows strictly inside both fences, so unusually low ERA+ values
# are dropped as well as unusually high ones
df2 = df.loc[(era > lower_bound) & (era < upper_bound), :]

# Gathering data for plotting
x_values = df2[x_axis]
y_values = df2[y_axis]

# Least-squares line of y on x
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = x_values * slope + intercept

# Slope is shown with two decimals: round(slope) without a digit count
# returns an integer, so the printed equation would not match the line
line_eq = "y = " + str(round(slope, 2)) + "x + " + str(round(intercept, 2))

plt.scatter(x_values, y_values)
plt.plot(x_values, regress_values, "r-")
plt.annotate(line_eq, (72, 250), fontsize=15, color="red")
plt.title(title)
plt.xlabel(x_axis)
plt.ylabel(y_axis)
# Fixed y-range so the per-season figures share the same scale
plt.ylim(75, 280)

# Save the figure (comment this line out to avoid regenerating the file)
plt.savefig(f'Images/{title}_regression.png', dpi=400)

In [None]:
# Setting parameters for kernel
year = 2017
x_axis = 'height'
y_axis = 'ERA+'
title = f'{y_axis} vs. {x_axis} for {year} (Outliers removed)'

# Grabbing all stars for selected year
df = merge_dim.loc[merge_dim['Year'] == year, :]
era = df['ERA+']

# Outlier fences: 1.5 * IQR below the first / above the third quartile
quartiles = era.quantile([0.25, 0.5, 0.75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr = upperq - lowerq
lower_bound = lowerq - (1.5 * iqr)
upper_bound = upperq + (1.5 * iqr)
print(upper_bound)

# Keep only rows strictly inside both fences, so unusually low ERA+ values
# are dropped as well as unusually high ones
df2 = df.loc[(era > lower_bound) & (era < upper_bound), :]

# Gathering data for plotting
x_values = df2[x_axis]
y_values = df2[y_axis]

# Least-squares line of y on x
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = x_values * slope + intercept

# Slope is shown with two decimals: round(slope) without a digit count
# returns an integer, so the printed equation would not match the line
line_eq = "y = " + str(round(slope, 2)) + "x + " + str(round(intercept, 2))

plt.scatter(x_values, y_values)
plt.plot(x_values, regress_values, "r-")
plt.annotate(line_eq, (72, 250), fontsize=15, color="red")
plt.title(title)
plt.xlabel(x_axis)
plt.ylabel(y_axis)
# Fixed y-range so the per-season figures share the same scale
plt.ylim(75, 280)

# Save the figure (comment this line out to avoid regenerating the file)
plt.savefig(f'Images/{title}_regression.png', dpi=400)

In [None]:
# Setting parameters for kernel
year = 2018
x_axis = 'height'
y_axis = 'ERA+'
title = f'{y_axis} vs. {x_axis} for {year} (Outliers removed)'

# Grabbing all stars for selected year
df = merge_dim.loc[merge_dim['Year'] == year, :]
era = df['ERA+']

# Outlier fences: 1.5 * IQR below the first / above the third quartile
quartiles = era.quantile([0.25, 0.5, 0.75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr = upperq - lowerq
lower_bound = lowerq - (1.5 * iqr)
upper_bound = upperq + (1.5 * iqr)
print(upper_bound)

# Keep only rows strictly inside both fences, so unusually low ERA+ values
# are dropped as well as unusually high ones
df2 = df.loc[(era > lower_bound) & (era < upper_bound), :]

# Gathering data for plotting
x_values = df2[x_axis]
y_values = df2[y_axis]

# Least-squares line of y on x; this season's slope is shown with three
# decimals
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = x_values * slope + intercept
line_eq = "y = " + str(round(slope, 3)) + "x + " + str(round(intercept, 2))

plt.scatter(x_values, y_values)
plt.plot(x_values, regress_values, "r-")
plt.annotate(line_eq, (72, 250), fontsize=15, color="red")
plt.title(title)
plt.xlabel(x_axis)
plt.ylabel(y_axis)
# Fixed y-range so the per-season figures share the same scale
plt.ylim(75, 280)

# Save the figure (comment this line out to avoid regenerating the file)
plt.savefig(f'Images/{title}_regression.png', dpi=400)

In [None]:
# Setting parameters for kernel
year = 2019
x_axis = 'height'
y_axis = 'ERA+'
title = f'{y_axis} vs. {x_axis} for {year} (Outliers removed)'

# Grabbing all stars for selected year
df = merge_dim.loc[merge_dim['Year'] == year, :]
era = df['ERA+']

# Outlier fences: 1.5 * IQR below the first / above the third quartile
quartiles = era.quantile([0.25, 0.5, 0.75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr = upperq - lowerq
lower_bound = lowerq - (1.5 * iqr)
upper_bound = upperq + (1.5 * iqr)
print(upper_bound)

# Keep only rows strictly inside both fences, so unusually low ERA+ values
# are dropped as well as unusually high ones
df2 = df.loc[(era > lower_bound) & (era < upper_bound), :]

# Gathering data for plotting
x_values = df2[x_axis]
y_values = df2[y_axis]

# Least-squares line of y on x
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = x_values * slope + intercept
line_eq = "y = " + str(round(slope, 2)) + "x + " + str(round(intercept, 2))

plt.scatter(x_values, y_values)
plt.plot(x_values, regress_values, "r-")
plt.annotate(line_eq, (72, 250), fontsize=15, color="red")
plt.title(title)
plt.xlabel(x_axis)
plt.ylabel(y_axis)
# Fixed y-range so the per-season figures share the same scale
plt.ylim(75, 280)

# Save the figure (comment this line out to avoid regenerating the file)
plt.savefig(f'Images/{title}_regression.png', dpi=400)

In [None]:
# Alias the All-Star dataframe under the name the following cells use
# (this part of the analysis was written by a different student).
# No copy is made: `file` and `allstar_df` refer to the same object.
file = allstar_df

### Drawing comparisons of All Star performance using differing statistics 

In [None]:
# Columns for this plot; K% is cast to float before plotting
K_stat = file['K%'].astype(float)
ERA = file['ERA+']

# Main title goes on the figure, the year range on the axes
title = 'Strikeout (K%) vs. ERA+ for All Stars'
subtitle = '2015-2019'

# ERA+ on the x-axis, strikeout rate on the y-axis
plt.scatter(ERA, K_stat)
plt.suptitle(title)
plt.title(subtitle)
plt.xlabel('ERA+')
plt.ylabel('% Strikeouts')
plt.tight_layout()

# Write the figure to disk
plt.savefig(f'Images/{title}.png', dpi=400)

In [None]:
# Columns for this plot; BB% is cast to float before plotting
BB_stat = file['BB%'].astype(float)
ERA = file['ERA+']

# Main title goes on the figure, the year range on the axes
title = 'Base on Balls (BB%) vs. ERA+ for All Stars'
subtitle = '2015-2019'

# ERA+ on the x-axis, walk rate on the y-axis
plt.scatter(ERA, BB_stat)
plt.suptitle(title)
plt.title(subtitle)
plt.xlabel('ERA+')
plt.ylabel('BB%')
plt.tight_layout()

# Write the figure to disk
plt.savefig(f'Images/{title}.png', dpi=400)

In [None]:
# Reuses K_stat and BB_stat as computed by the two scatter cells above

# Main title goes on the figure, the year range on the axes
title = 'BB% vs. K% for All Stars'
subtitle = '2015-2019'

# Strikeout rate on the x-axis, walk rate on the y-axis
plt.scatter(K_stat, BB_stat)
plt.suptitle(title)
plt.title(subtitle)
plt.xlabel('K%')
plt.ylabel('BB%')
plt.tight_layout()

# Write the figure to disk
plt.savefig(f'Images/{title}.png', dpi=400)

In [None]:
# Reuses ERA (the 'ERA+' column) from the scatter cells above; age is read
# straight from the All-Star dataframe

# Set title and plot parameters
title = 'ERA+ vs. Age for All Stars'
subtitle = '2015-2019'

plt.scatter(file['Age'], ERA)
plt.title(subtitle)
plt.suptitle(title)
plt.xlabel('Age')
# The plotted column is ERA+, so the axis label says so (it matches the title)
plt.ylabel('ERA+')
plt.tight_layout()

# Write the figure to disk
plt.savefig(f'Images/{title}.png', dpi=400)

In [None]:
# Alias the league-wide dataframe (statistics for all pitchers in MLB)
# under the name the following cells use.
# NOTE(review): despite its name, `fpath` holds a DataFrame, not a file path.
fpath = league_df

In [None]:
# League-wide mean ERA+
average_era = fpath['ERA+'].mean()

# League-wide mean K%, with the column cast to float first
average_K = fpath['K%'].astype(float)
mean_k = average_K.mean()

# Main title goes on the figure, the year range on the axes
title = 'All Stars ERA-K% vs the Average MLB Pitcher'
subtitle = '2015-2019'

# All-Star points, plus a single red marker at the league averages
plt.scatter(ERA, K_stat, label='All Stars')
plt.plot(average_era, mean_k, 'ro', label='Average')
plt.legend(loc='best')
plt.xlabel('ERA+')
plt.ylabel('K%')
plt.title(subtitle)
plt.suptitle(title)
plt.tight_layout()

# Write the figure to disk
plt.savefig(f'Images/{title}.png', dpi=400)

In [None]:
# League-wide mean BB%, with the column cast to float first
average_BB = fpath['BB%'].astype(float)
mean_BB = average_BB.mean()

# All-Star columns for this plot
BB_stat = file['BB%'].astype(float)
ERA = file['ERA+']

# Main title goes on the figure, the year range on the axes
title = 'All Stars ERA-BB% vs the Average MLB Pitcher'
subtitle = '2015-2019'

# All-Star points, plus a single red marker at the league averages
# (average_era comes from the ERA-K% cell above)
plt.scatter(ERA, BB_stat, label='All Stars')
plt.plot(average_era, mean_BB, 'ro', label='Average')
plt.legend(loc='best')
plt.xlabel('ERA+')
plt.ylabel('BB%')
plt.title(subtitle)
plt.suptitle(title)
plt.tight_layout()

# Write the figure to disk
plt.savefig(f'Images/{title}.png', dpi=400)