In [2]:
%matplotlib notebook

# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from scipy import stats

# Read the team statistics CSV into a DataFrame.
# The path is relative, so the notebook must be run from the directory that
# contains the "Data" folder.

batting_csv_path = "Data/team.csv"

complete_df = pd.read_csv(filepath_or_buffer=batting_csv_path)


In [3]:
# Display the column labels of the loaded frame to see which fields are available.
complete_df.columns

Index(['year', 'league_id', 'team_id', 'franchise_id', 'div_id', 'rank', 'g',
       'ghome', 'w', 'l', 'div_win', 'wc_win', 'lg_win', 'ws_win', 'r', 'ab',
       'h', 'double', 'triple', 'hr', 'bb', 'so', 'sb', 'cs', 'hbp', 'sf',
       'ra', 'er', 'era', 'cg', 'sho', 'sv', 'ipouts', 'ha', 'hra', 'bba',
       'soa', 'e', 'dp', 'fp', 'name', 'park', 'attendance', 'bpf', 'ppf',
       'team_id_br', 'team_id_lahman45', 'team_id_retro'],
      dtype='object')

In [4]:
# CLEANING THE DATA
# Keep only the columns this analysis works with; .copy() makes the result an
# independent frame rather than a selection tied to complete_df.

dh_df = complete_df[
    [
        "year", "league_id", "franchise_id", "rank", "g", "ghome", "w", "l",
        "r", "ab", "h", "double", "triple", "hr", "bb", "so",
        "ra", "er", "era", "sho", "ha", "hra", "bba", "soa",
        "e", "dp", "fp", "attendance", "bpf", "ppf",
    ]
].copy()

In [5]:
# Give the selected columns readable names.
# 'ghome' has no entry in the mapping, so it keeps its original label.
# NOTE(review): 'bpf' is mapped to "DPF" while 'ppf' maps to "PPF"; "BPF" may
# have been intended. Left unchanged in case other code relies on the name.
rename_df = dh_df.rename(
    columns={
        "year": "Year",
        "league_id": "League",
        "franchise_id": "Franchise",
        "rank": "Rank",
        "g": "Games",
        "w": "Wins",
        "l": "Loses",
        "r": "Runs",
        "ab": "At_Bats",
        "h": "Hits",
        "double": "Doubles",
        "triple": "Triples",
        "hr": "HR",
        "bb": "BB",
        "so": "SO",
        "ra": "RA",
        "er": "ER",
        "era": "ERA",
        "sho": "SHO",
        "ha": "HA",
        "hra": "HRA",
        "bba": "BBA",
        "soa": "SOA",
        "e": "Errors",
        "dp": "DP",
        "fp": "FP",
        "attendance": "Attendance",
        "bpf": "DPF",
        "ppf": "PPF",
    }
)

# Show the dtype of Year (the only value this cell displays) to confirm it is
# numeric before it is compared against a year threshold.
rename_df["Year"].dtype

dtype('int64')

In [6]:
# KEEP ONLY SEASONS AFTER 1950
# NOTE(review): the comparison is strictly "> 1950", so the 1950 season itself
# is excluded; confirm whether the range was meant to start at 1950 or 1951.

Modern_df = rename_df.loc[rename_df["Year"] > 1950]

# BREAKING THE DATA DOWN BY LEAGUE
# Each mask is built from Modern_df itself, so the mask and the frame being
# filtered always share the same index (no reliance on implicit alignment with
# the larger rename_df). Year becomes the index and is also kept as a column.

NL_df = Modern_df.loc[Modern_df["League"] == "NL"].set_index("Year", drop=False)

AL_df = Modern_df.loc[Modern_df["League"] == "AL"].set_index("Year", drop=False)

In [7]:
# RUNS SCORED PER YEAR BETWEEN LEAGUES

# Year, runs and games for every row of each league's frame.
AL_Runs = AL_df.loc[:, ["Year", "Runs", "Games"]].copy()
NL_Runs = NL_df.loc[:, ["Year", "Runs", "Games"]].copy()

# The same rows with the Year column left out; only Runs and Games remain as columns.
AL_Runs1 = AL_Runs.loc[:, ["Runs", "Games"]].copy()
NL_Runs1 = NL_Runs.loc[:, ["Runs", "Games"]].copy()

# Preview the first ten NL rows.
NL_Runs1.head(10)

Unnamed: 0_level_0,Runs,Games
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
1951,855,158
1951,723,155
1951,614,155
1951,559,155
1951,781,157
1951,648,154
1951,689,155
1951,683,155
1952,775,155
1952,569,155


In [8]:
# GROUP EACH LEAGUE'S ROWS BY YEAR

grouped_Runs_AL = AL_Runs1.groupby(by="Year")
grouped_Runs_NL = NL_Runs1.groupby(by="Year")

# TOTAL RUNS SCORED IN EACH LEAGUE, PER YEAR

AL_Sums = grouped_Runs_AL["Runs"].sum()
NL_Sums = grouped_Runs_NL["Runs"].sum()

# RUNS PER GAME IN EACH LEAGUE, PER YEAR (yearly run total / yearly games total).
# The quotient of two differently named Series has no name, so the single
# column of each resulting frame is labelled 0.

AL_RPG = (AL_Sums / grouped_Runs_AL["Games"].sum()).to_frame()
NL_RPG = (NL_Sums / grouped_Runs_NL["Games"].sum()).to_frame()

# AL RUNS PER GAME MINUS NL RUNS PER GAME, YEAR BY YEAR

Difference = AL_RPG.sub(NL_RPG)

# Same values with a descriptive column label instead of 0.
Diff3 = Difference.rename(columns={0: "Run_Difference"})

# Year moved out of the index into an ordinary column.
Diff_reset = Diff3.reset_index()

In [9]:
# PLOTTING THE RUNS PER GAME FOR EACH LEAGUE

# Use a dedicated figure/axes so the lines cannot land on a figure left over
# from another cell.
fig_rpg, ax_rpg = plt.subplots()

AL_RPG[0].plot(kind="line", color="b", label="AL", ax=ax_rpg)
NL_RPG[0].plot(kind="line", color="r", label="NL", ax=ax_rpg)

# The year range shown in the axis label is read from the data, so it cannot
# disagree with whatever year filter was applied upstream.
x_axis5 = Diff_reset["Year"]

ax_rpg.set_title("Average RPG for an AL team vs. NL team ", size=16)
ax_rpg.set_xlabel("Years:   {} - {}".format(x_axis5.min(), x_axis5.max()), size=14)
ax_rpg.set_ylabel("Average Runs per game", size=14)
ax_rpg.set_ylim(3, 6)

ax_rpg.grid()
# Legend entries come from the label= given to each line above.
ax_rpg.legend()

plt.show()

<IPython.core.display.Javascript object>

In [10]:
# LINEAR TRENDLINE OF THE AL-NL RUN DIFFERENCE OVER THE YEARS

x_axis = Diff_reset["Year"]
y_axis = Diff3["Run_Difference"]

# linregress yields five values; they are unpacked here as slope, intercept,
# r, p and standard error.
(vc_slope2, vc_int2, vc_r2, vc_p2, vc_std_err2) = stats.linregress(x=x_axis, y=y_axis)

# Value of the fitted line at every year.
vc_fit2 = vc_int2 + vc_slope2 * x_axis

In [13]:
# PLOTTING THE AL-NL RUNS-PER-GAME DIFFERENCE COMPUTED ABOVE

# Start a new figure so this chart is not drawn onto whichever figure happens
# to be current when the cell runs.
fig_diff, ax_diff = plt.subplots()

ax_diff.plot(x_axis, y_axis, color="b", marker="o", label="Run Difference")

# Colour and dash style are both given by the format string; passing a separate
# color= as well would conflict with it.
ax_diff.plot(x_axis, vc_fit2, "r--", label="Trendline")

# Horizontal reference at zero: points above it are years where the AL value
# exceeded the NL value.
ax_diff.axhline(0, color="orange", label="0")

ax_diff.set_title("AL vs. NL RPG Difference", size=16)
# Year range taken from the plotted data rather than hard-coded.
ax_diff.set_xlabel("Years:   {} - {}".format(x_axis.min(), x_axis.max()), size=14)
ax_diff.set_ylabel("Runs Per Game RPG", size=14)
ax_diff.grid(alpha=.25)
ax_diff.legend()

plt.show()

# Mean yearly difference, rounded for reporting.
Average_Run_Dif = Diff3["Run_Difference"].mean()
ARD = round(Average_Run_Dif, 3)

# NOTE(review): the mean covers every year present in Diff3; confirm that this
# matches the "since the adoption of the DH" wording of the message.
print("Since the Adoption of the DH, the AL has scored {} more runs per game than the NL, each year".format(ARD))


<IPython.core.display.Javascript object>

Since the Adoption of the DH, the AL has scored 0.179 more runs per game than the NL, each year


In [14]:
# SAME ANALYSIS FOR HOME-RUNS

# Year, home runs and games for every row of each league's frame.
AL_Home_Runs = AL_df.loc[:, ["Year", "HR", "Games"]].copy()
NL_Home_Runs = NL_df.loc[:, ["Year", "HR", "Games"]].copy()

# The same rows with the Year column left out; only HR and Games remain as columns.
AL_Home_Runs1 = AL_Home_Runs.loc[:, ["HR", "Games"]].copy()

NL_Home_Runs1 = NL_Home_Runs.loc[:, ["HR", "Games"]].copy()

In [15]:
# HOME RUNS PER GAME IN EACH LEAGUE, PER YEAR

grouped_Home_Runs_AL = AL_Home_Runs1.groupby("Year")
grouped_Home_Runs_NL = NL_Home_Runs1.groupby("Year")

# Total home runs hit in each league, per year.
AL_Sums_HR = pd.DataFrame(grouped_Home_Runs_AL["HR"].sum())
NL_Sums_HR = pd.DataFrame(grouped_Home_Runs_NL["HR"].sum())

# Yearly home-run total divided by the yearly games total. Both come from this
# cell's own grouping, so the cell does not depend on the groupby objects built
# for the runs analysis.
AL_HRPG = pd.DataFrame(grouped_Home_Runs_AL["HR"].sum() / grouped_Home_Runs_AL["Games"].sum())
NL_HRPG = pd.DataFrame(grouped_Home_Runs_NL["HR"].sum() / grouped_Home_Runs_NL["Games"].sum())

# Difference in home runs per game (AL minus NL). Per-game rates are compared
# to factor out the uneven number of teams and games played; no scaling to a
# full season is applied here.

HR_Difference = AL_HRPG - NL_HRPG

# Year moved out of the index into an ordinary column.
HR_Diff2 = HR_Difference.reset_index()

# The unnamed ratio column is labelled 0; give it a descriptive name.
HR_Diff3 = HR_Diff2.rename(columns={0: "HR_Difference"})


In [16]:
# LINEAR TRENDLINE OF THE AL-NL HOME-RUN DIFFERENCE OVER THE YEARS

x_axis1 = HR_Diff3["Year"]
y_axis1 = HR_Diff3["HR_Difference"]

# linregress yields five values; they are unpacked here as slope, intercept,
# r, p and standard error.
(vc_slope, vc_int, vc_r, vc_p, vc_std_err) = stats.linregress(x=x_axis1, y=y_axis1)

# Value of the fitted line at every year.
vc_fit = vc_int + vc_slope * x_axis1

In [17]:
# PLOTTING THE AL-NL HOME-RUNS-PER-GAME DIFFERENCE

# One figure, with each series drawn exactly once and labelled where it is drawn.
fig_hr, ax_hr = plt.subplots()

ax_hr.plot(x_axis1, y_axis1, color="b", marker="o", label="Home Run Difference")

# Colour and dash style are both given by the format string; passing a separate
# color= as well would conflict with it.
ax_hr.plot(x_axis1, vc_fit, "r--", label="Trendline")

# Horizontal reference at zero; it needs no x values, so it cannot get out of
# step with the length of x_axis1.
ax_hr.axhline(0, color="orange", label="0")

ax_hr.set_title("AL vs. NL Home Runs per Game Difference", size=16)
# Year range taken from the plotted data rather than hard-coded.
ax_hr.set_xlabel("Years:   {} - {}".format(x_axis1.min(), x_axis1.max()), size=14)
ax_hr.set_ylabel("AL HR - NL HR", size=14)
ax_hr.grid(alpha=.25)
ax_hr.legend()

plt.show()

# Mean yearly difference in home runs per game, rounded for reporting.
Average_HR_Run_Dif = HR_Diff3["HR_Difference"].mean()

AHRD = round(Average_HR_Run_Dif, 3)

# NOTE(review): the mean covers every year present in HR_Diff3; confirm that
# this matches the "since the adoption of the DH" wording of the message.
print("Since the Adoption of the DH, a team in the AL has hit {} more home runs per game than a team in the NL, each year".format(AHRD))


<IPython.core.display.Javascript object>

Since the Adoption of the DH, A team in the AL has hit 0.06 more home runs than a tean in the NL for each year


In [18]:
# Has the Addition of the DH added more shutouts for the AL vs. the NL

# Shutouts and games for every row of each league's frame. Each name is
# assigned once, directly in its final two-column shape.
AL_SHO = AL_df[["SHO", "Games"]].copy()

NL_SHO = NL_df[["SHO", "Games"]].copy()

In [19]:
# SHUTOUTS IN EACH LEAGUE, GROUPED BY YEAR

grouped_SHO_AL = AL_SHO.groupby(by="Year")
grouped_SHO_NL = NL_SHO.groupby(by="Year")

# Total shutouts in each league, per year.
AL_Sums3 = grouped_SHO_AL["SHO"].sum()
NL_Sums3 = grouped_SHO_NL["SHO"].sum()

# Shutouts per game (yearly shutout total / yearly games total), scaled by 162
# to express the rate per 162 games. The result is unnamed, so the single
# column of each frame is labelled 0.
AL_SHOPG = (AL_Sums3 / grouped_SHO_AL["Games"].sum()).mul(162).to_frame()
NL_SHOPG = (NL_Sums3 / grouped_SHO_NL["Games"].sum()).mul(162).to_frame()

# AL rate minus NL rate, year by year.
Difference_SHO = AL_SHOPG.sub(NL_SHOPG)

# Year moved out of the index, then the 0 column given a descriptive name.
Diff5 = Difference_SHO.reset_index()
Diff6 = Diff5.rename(columns={0: "SHO_Difference"})

In [20]:
# LINEAR TRENDLINE OF THE AL-NL SHUTOUT DIFFERENCE OVER THE YEARS

x_axis3 = Diff6["Year"]
y_axis3 = Diff6["SHO_Difference"]

# linregress yields five values; they are unpacked here as slope, intercept,
# r, p and standard error.
(vc_slope3, vc_int3, vc_r3, vc_p3, vc_std_err3) = stats.linregress(x=x_axis3, y=y_axis3)

# Value of the fitted line at every year.
vc_fit3 = vc_int3 + vc_slope3 * x_axis3

In [21]:
# PLOTTING THE AL-NL SHUTOUT DIFFERENCE (RATE PER 162 GAMES)

# One figure, with each series drawn exactly once and labelled where it is
# drawn. plt.subplots() also makes this the current pyplot figure.
fig_sho, ax_sho = plt.subplots()

ax_sho.plot(x_axis3, y_axis3, color="b", marker="o", label="Shutout Difference")

# Colour and dash style are both given by the format string; passing a separate
# color= as well would conflict with it.
ax_sho.plot(x_axis3, vc_fit3, "r--", label="Trendline")

# Horizontal reference at zero.
ax_sho.axhline(0, color="orange", label="0")

ax_sho.set_title("Projected Season Shutout Difference AL vs. NL", size=14)
# Year range taken from the plotted data rather than hard-coded.
ax_sho.set_xlabel("Years:   {} - {}".format(x_axis3.min(), x_axis3.max()), size=14)
ax_sho.set_ylabel("AL SHO - NL SHO", size=14)
ax_sho.grid(alpha=.7)
ax_sho.legend()

plt.show()

# Mean yearly difference in shutouts per 162 games, rounded for reporting.
Average_SHO_Dif = Diff6["SHO_Difference"].mean()

ASD = round(Average_SHO_Dif, 3)

# NOTE(review): the mean covers every year present in Diff6; confirm that this
# matches the "since the adoption of the DH" wording of the message.
print("Since the Adoption of the DH, a team in the AL has had {} more Shut Outs per 162 games than a team in the NL, each year".format(ASD))


<IPython.core.display.Javascript object>

Since the Adoption of the DH, the AL has had -0.487 more Shut Outs than the NL for each game, each year


In [None]:
# Write the current pyplot figure to SHO_PS.png in the working directory.
# NOTE(review): this runs in a separate cell from the plotting code, so it only
# captures the intended chart if that figure is still the current one when this
# cell executes -- confirm for the matplotlib backend in use.
plt.savefig('SHO_PS.png')