In [1]:
import re
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
%matplotlib inline

In [3]:
# Load the five raw CSV tables used by the rest of the notebook.
# Paths are relative to the notebook's working directory.
master, batting, salaries, positions, pitching = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/Master.csv",
        "data/Batting.csv",
        "data/Salaries.csv",
        "data/Appearances.csv",  # bound to the name `positions`
        "data/Pitching.csv",
    )
)

In [4]:
# Trim each raw table down to the columns the analysis actually uses.

# Player names, keyed by playerID.
part_master = master[["playerID", "nameFirst", "nameLast"]]

# Batting line; H, BB, HBP, AB and SF feed the on-base-percentage formula.
part_batting = batting[
    [
        "playerID", "yearID", "teamID", "lgID",
        "G", "AB", "H", "BB", "HBP", "SF",
    ]
]

# Salary per player / year / team.
part_salaries = salaries[["playerID", "yearID", "teamID", "salary"]]

# Appearance counts by position; G_p is later used to pick out pitchers.
part_positions = positions[
    [
        "playerID", "yearID", "teamID", "GS",
        "G_p", "G_c", "G_1b", "G_2b", "G_3b", "G_ss",
        "G_lf", "G_cf", "G_rf", "G_of", "G_dh",
    ]
]

# Pitching record: wins, losses, games started and ERA.
part_pitching = pitching[
    ["playerID", "yearID", "teamID", "W", "L", "GS", "ERA"]
]

In [5]:
# Restrict every per-season table to rows from the 2004 season.
batting_2004 = part_batting.loc[part_batting["yearID"].eq(2004)]
salaries_2004 = part_salaries.loc[part_salaries["yearID"].eq(2004)]
positions_2004 = part_positions.loc[part_positions["yearID"].eq(2004)]
pitching_2004 = part_pitching.loc[part_pitching["yearID"].eq(2004)]

In [69]:
# Show the five 2004 pitchers with the most wins.
# DataFrame.sort() no longer exists in pandas; sort_values() is the
# supported way to order rows by a column.
pitching_2004.sort_values("W", ascending=False).head()

Unnamed: 0,playerID,yearID,teamID,W,L,GS,ERA
36106,schilcu01,2004,BOS,21,6,32,3.26
36104,santajo01,2004,MIN,20,6,34,2.61
36019,oswalro01,2004,HOU,20,10,35,3.49
36107,schmija01,2004,SFN,18,7,32,3.2
35676,clemero02,2004,HOU,18,4,33,2.98


In [6]:
# Combine the 2004 batting and pitching rows for each player and team.
# An outer join keeps players who appear in only one of the two tables.
# yearID is part of the join key so the merged frame keeps a single year
# column instead of splitting it into yearID_x / yearID_y.
battpitch_2004 = pd.merge(
    batting_2004,
    pitching_2004,
    how="outer",
    on=["playerID", "yearID", "teamID"],
)

In [7]:
# Attach the 2004 appearance counts (games by position) to the combined
# batting/pitching rows. Outer join: rows missing from either side are kept.
battpitchpos_2004 = battpitch_2004.merge(
    positions_2004,
    on=["playerID", "teamID"],
    how="outer",
)

In [8]:
# Attach 2004 salaries to the combined stats (outer join on player and team).
# Only the key columns and `salary` are taken from salaries_2004: every row
# there is already from 2004, and merging its yearID column as well would
# collide with the year columns the left frame carries from the earlier
# merges, producing duplicate suffixed column names that recent pandas
# versions reject with a MergeError.
battpitchpossal_2004 = pd.merge(
    battpitchpos_2004,
    salaries_2004[["playerID", "teamID", "salary"]],
    how="outer",
    on=["playerID", "teamID"],
)

In [10]:
# Add first and last names. The inner join keeps only rows whose playerID
# is present in both the stats frame and the master table.
mlb_2004 = battpitchpossal_2004.merge(part_master, on="playerID", how="inner")

In [14]:
# On-base percentage: (H + BB + HBP) / (AB + BB + HBP + SF).
reached_base = mlb_2004["H"] + mlb_2004["BB"] + mlb_2004["HBP"]
obp_chances = mlb_2004["AB"] + mlb_2004["BB"] + mlb_2004["HBP"] + mlb_2004["SF"]
mlb_2004["OBP"] = reached_base / obp_chances

In [16]:
# Shorthand for the OBP column, used to compute its summary statistics.
OBP_2004 = mlb_2004["OBP"]

In [18]:
# Mean OBP over all rows that have one (missing values are skipped,
# which is the pandas default spelled out explicitly).
avg_obp_2004 = OBP_2004.mean(skipna=True)

In [21]:
# Sample standard deviation of OBP (ddof=1 and skipna=True are the pandas
# defaults, written out for clarity).
stdev_obp_2004 = OBP_2004.std(skipna=True, ddof=1)

In [24]:
# Standardize OBP: number of standard deviations above/below the mean.
mlb_2004["OBP_std"] = mlb_2004["OBP"].sub(avg_obp_2004).div(stdev_obp_2004)

In [45]:
# Mean and sample standard deviation of salary; rows without a salary are
# skipped (pandas defaults, written out explicitly).
avg_sal_2004 = mlb_2004["salary"].mean(skipna=True)
stdev_sal_2004 = mlb_2004["salary"].std(skipna=True, ddof=1)

In [70]:
# Display the mean of mlb_2004.salary computed in the previous cell.
avg_sal_2004

2489998.6775000002

In [46]:
# Standardize salary: number of standard deviations above/below the mean.
mlb_2004["salary_std"] = mlb_2004["salary"].sub(avg_sal_2004).div(stdev_sal_2004)

In [47]:
# Value score: standardized OBP minus standardized salary.
mlb_2004["OBP_Sal"] = mlb_2004["OBP_std"].sub(mlb_2004["salary_std"])

In [48]:
# Top 20 players by OBP_Sal (standardized OBP minus standardized salary).
# DataFrame.sort() no longer exists in pandas; sort_values() replaces it.
mlb_2004.sort_values("OBP_Sal", ascending=False).head(20)

Unnamed: 0,playerID,yearID_x,teamID,lgID,G,AB,H,BB,HBP,SF,...,G_of,G_dh,yearID_y,salary,nameFirst,nameLast,OBP,OBP_std,salary_std,OBP_Sal
258,cottsne01,2004,CHA,AL,56,1,1,0,0,0,...,0,0,2004,301000,Neal,Cotts,1.0,5.033774,-0.619846,5.65362
146,broweji01,2004,SFN,NL,89,2,1,1,0,0,...,0,0,2004,662500,Jim,Brower,0.666667,2.806724,-0.517482,3.324206
352,eischjo01,2004,MON,NL,22,3,2,0,0,0,...,0,0,2004,1300000,Joey,Eischen,0.666667,2.806724,-0.336965,3.143689
1207,torcato01,2004,SFN,NL,13,9,5,1,1,1,...,0,0,2004,302000,Tony,Torcato,0.583333,2.249961,-0.619563,2.869524
97,bentzch01,2004,MON,NL,36,2,1,0,0,0,...,0,0,2004,300000,Chad,Bentz,0.5,1.693199,-0.620129,2.313328
675,knottga01,2004,DET,AL,36,3,1,1,0,0,...,0,0,2004,316000,Gary,Knotts,0.5,1.693199,-0.615599,2.308797
34,ankieri01,2004,SLN,NL,5,1,0,1,0,0,...,0,0,2004,320000,Rick,Ankiel,0.5,1.693199,-0.614466,2.307665
922,olmedra01,2004,CIN,NL,8,1,0,1,0,0,...,0,0,2004,322000,Ray,Olmedo,0.5,1.693199,-0.6139,2.307098
457,gipsoch01,2004,TBA,AL,5,4,2,0,0,0,...,2,0,2004,350000,Charles,Gipson,0.5,1.693199,-0.605971,2.29917
1131,simonja01,2004,SLN,NL,15,2,1,0,0,0,...,1,0,2004,350000,Jason,Simontacchi,0.5,1.693199,-0.605971,2.29917


At this point, I have a nice dataframe called mlb_2004 that has batting, pitching, appearance, and salary stats from the 2004 MLB season. I've also added in: OBP (on base percentage), OBP_std (how many standard deviations above or below the mean OBP that player is), salary_std (how many standard deviations above or below the mean salary that player is), OBP_Sal (OBP_std minus salary_std).

This last number helps compare players on their OBP relative to the average together with their salary relative to the average: the higher a player's OBP_Sal, the more on-base production he delivers relative to what he is paid.

In [50]:
# Pitchers: anyone with at least one game at pitcher (G_p > 0).
mlb_2004_pitchers = mlb_2004.loc[mlb_2004["G_p"].gt(0)]

In [77]:
# Discard pitchers whose OBP is missing.
mlb_2004_pitchers = mlb_2004_pitchers.dropna(subset=["OBP"])

In [89]:
# Top 20 National League pitchers by OBP_Sal.
# The frame is mlb_2004_pitchers (no 2014 frame is defined in this notebook),
# and sort_values() replaces the removed DataFrame.sort().
(
    mlb_2004_pitchers[mlb_2004_pitchers["lgID"] == "NL"]
    .sort_values("OBP_Sal", ascending=False)
    .head(20)
)


Unnamed: 0,playerID,yearID_x,teamID,lgID,G,AB,H,BB,HBP,SF,...,G_of,G_dh,yearID_y,salary,nameFirst,nameLast,OBP,OBP_std,salary_std,OBP_Sal
146,broweji01,2004,SFN,NL,89,2,1,1,0,0,...,0,0,2004,662500,Jim,Brower,0.666667,2.806724,-0.517482,3.324206
352,eischjo01,2004,MON,NL,22,3,2,0,0,0,...,0,0,2004,1300000,Joey,Eischen,0.666667,2.806724,-0.336965,3.143689
97,bentzch01,2004,MON,NL,36,2,1,0,0,0,...,0,0,2004,300000,Chad,Bentz,0.5,1.693199,-0.620129,2.313328
34,ankieri01,2004,SLN,NL,5,1,0,1,0,0,...,0,0,2004,320000,Rick,Ankiel,0.5,1.693199,-0.614466,2.307665
1131,simonja01,2004,SLN,NL,15,2,1,0,0,0,...,1,0,2004,350000,Jason,Simontacchi,0.5,1.693199,-0.605971,2.29917
1033,reedst01,2004,COL,NL,65,2,1,0,0,0,...,0,0,2004,600000,Steve,Reed,0.5,1.693199,-0.53518,2.228379
834,micelda01,2004,HOU,NL,74,2,1,0,0,0,...,0,0,2004,600000,Dan,Miceli,0.5,1.693199,-0.53518,2.228379
1209,torresa01,2004,PIT,NL,84,2,1,0,0,0,...,0,0,2004,775000,Salomon,Torres,0.5,1.693199,-0.485626,2.178825
1024,randost01,2004,ARI,NL,47,12,5,0,0,0,...,0,0,2004,322500,Steve,Randolph,0.416667,1.136436,-0.613758,1.750194
911,obermwe01,2004,MIL,NL,26,39,15,1,0,0,...,0,0,2004,314000,Wes,Obermueller,0.4,1.025084,-0.616165,1.641249


Johan Santana of Minnesota is a good candidate for starting pitcher. His record in 2004 was 20-6 with a 2.61 ERA, so he's an excellent pitcher, and he's also a decent hitter. His OBP that year was .375, which is not quite one full standard deviation above the mean (0.858, to be exact), and his salary was just below the mean (1.6 million against an average of 2.5 million). So his OBP_Sal is 1.11, which is much better than that of any of the other pitchers ranked near him by total wins. Unfortunately, he's an AL pitcher, so he had only 8 ABs, which means his OBP might be an anomaly. So we'll restrict this to NL pitchers, for the sake of accuracy on OBP. With that restriction, the best pitcher is either Carlos Zambrano of the Cubs (16-8, 2.75 ERA, .257 OBP, salary 450,000, 70 ABs, .646 OBP_Sal) or Jason Marquis of the Cardinals (15-7, 3.71 ERA, .297 OBP, salary 525,000, 72 ABs, .895 OBP_Sal). Since this homework is supposed to focus on OBP, and since I'm a Cardinals fan, I'll go with Marquis on this one.