In [205]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [206]:
# Load the pitching table and the player-awards table from the working directory.
df, aw = map(pd.read_csv, ("Pitching.csv", "AwardsPlayers.csv"))

In [207]:
# (rows, columns) of the pitching table as loaded, before any filtering.
df.shape

(44139, 30)

In [208]:
# List the column labels available in the pitching table.
df.columns

Index(['playerID', 'yearID', 'stint', 'teamID', 'lgID', 'W', 'L', 'G', 'GS',
       'CG', 'SHO', 'SV', 'IPouts', 'H', 'ER', 'HR', 'BB', 'SO', 'BAOpp',
       'ERA', 'IBB', 'WP', 'HBP', 'BK', 'BFP', 'GF', 'R', 'SH', 'SF', 'GIDP'],
      dtype='object')

In [209]:
# Count missing values per column in the pitching table.
df.isna().sum()

playerID        0
yearID          0
stint           0
teamID          0
lgID          131
W               0
L               0
G               0
GS              0
CG              0
SHO             0
SV              0
IPouts          1
H               0
ER              0
HR              0
BB              0
SO              0
BAOpp        1525
ERA            90
IBB         14575
WP            133
HBP           559
BK              0
BFP           239
GF            133
R               0
SH          32900
SF          32900
GIDP        43394
dtype: int64

In [210]:
# Preview the first rows of the pitching table.
df.head()

Unnamed: 0,playerID,yearID,stint,teamID,lgID,W,L,G,GS,CG,...,IBB,WP,HBP,BK,BFP,GF,R,SH,SF,GIDP
0,bechtge01,1871,1,PH1,,1,2,3,3,2,...,,,,0,,,42,,,
1,brainas01,1871,1,WS3,,12,15,30,30,30,...,,,,0,,,292,,,
2,fergubo01,1871,1,NY2,,0,0,1,0,0,...,,,,0,,,9,,,
3,fishech01,1871,1,RC1,,4,16,24,24,22,...,,,,0,,,257,,,
4,fleetfr01,1871,1,NY2,,0,1,1,1,1,...,,,,0,,,21,,,


In [211]:
# Keep only rows whose yearID is 2000 or later, then preview the result.
df = df.loc[df["yearID"].ge(2000)]
df.head()

Unnamed: 0,playerID,yearID,stint,teamID,lgID,W,L,G,GS,CG,...,IBB,WP,HBP,BK,BFP,GF,R,SH,SF,GIDP
32900,abbotpa01,2000,1,SEA,AL,9,7,35,27,0,...,4.0,3.0,5.0,0,766.0,2.0,89,1.0,4.0,
32901,aceveju01,2000,1,MIL,NL,3,7,62,0,0,...,9.0,3.0,1.0,2,347.0,18.0,38,1.0,1.0,
32902,adamste01,2000,1,LAN,NL,6,9,66,0,0,...,0.0,5.0,0.0,0,369.0,18.0,42,3.0,0.0,
32903,aguilri01,2000,1,CHN,NL,1,2,54,0,0,...,2.0,1.0,4.0,0,210.0,44.0,28,1.0,0.0,
32904,aldresc01,2000,1,PHI,NL,1,3,23,0,0,...,0.0,1.0,1.0,0,95.0,5.0,14,1.0,2.0,


In [212]:
# Narrow the awards table to "Cy Young Award" rows with yearID of 2000 or later.
recent_season = aw["yearID"] >= 2000
cy_young = aw["awardID"] == "Cy Young Award"
aw = aw[recent_season & cy_young]

In [213]:
# Preview the first rows of the filtered awards table.
aw.head()

Unnamed: 0,playerID,awardID,yearID,lgID,tie,notes
4961,martipe02,Cy Young Award,2000,AL,,
4962,johnsra05,Cy Young Award,2000,NL,,
5042,clemero02,Cy Young Award,2001,AL,,
5043,johnsra05,Cy Young Award,2001,NL,,
5124,zitoba01,Cy Young Award,2002,AL,,


In [214]:
# (rows, columns) of the filtered awards table.
aw.shape

(32, 6)

In [215]:
# Recode awardID as a binary label: every row still present in `aw` becomes 1.
# assign() builds a new frame instead of writing a column into `aw` in place;
# `aw` was produced by boolean filtering, and in-place column writes on such a
# frame are what trigger pandas' SettingWithCopyWarning. Re-running this cell
# is also harmless because it never depends on the previous column contents.
aw = aw.assign(awardID=1)

In [216]:
# Attach the award flag to the pitching rows.
# The join keys are (playerID, yearID) only, so the right-hand side must hold
# at most one row per key: a repeated key would silently duplicate pitching
# rows in the left join and inflate the positive class. De-duplicate first.
award_flags = aw[["playerID", "yearID", "awardID"]].drop_duplicates(
    subset=["playerID", "yearID"]
)
# how="left" keeps every pitching row; rows with no matching award get NaN in
# awardID.
# NOTE(review): df has a `stint` column, so a pitcher presumably can have
# several rows in one year; each of those rows would receive the award flag.
# Confirm whether stints should be aggregated per (playerID, yearID) first.
bb = df.merge(award_flags, on=["playerID", "yearID"], how="left")

In [217]:
# Rows without an award came out of the left merge as NaN: turn them into 0
# and store the label as an integer column.
bb = bb.assign(awardID=lambda frame: frame["awardID"].fillna(0).astype(int))

In [218]:
# Preview the merged frame (pitching columns plus the awardID label).
bb.head()

Unnamed: 0,playerID,yearID,stint,teamID,lgID,W,L,G,GS,CG,...,WP,HBP,BK,BFP,GF,R,SH,SF,GIDP,awardID
0,abbotpa01,2000,1,SEA,AL,9,7,35,27,0,...,3.0,5.0,0,766.0,2.0,89,1.0,4.0,,0
1,aceveju01,2000,1,MIL,NL,3,7,62,0,0,...,3.0,1.0,2,347.0,18.0,38,1.0,1.0,,0
2,adamste01,2000,1,LAN,NL,6,9,66,0,0,...,5.0,0.0,0,369.0,18.0,42,3.0,0.0,,0
3,aguilri01,2000,1,CHN,NL,1,2,54,0,0,...,1.0,4.0,0,210.0,44.0,28,1.0,0.0,,0
4,aldresc01,2000,1,PHI,NL,1,3,23,0,0,...,1.0,1.0,0,95.0,5.0,14,1.0,2.0,,0


In [219]:
# List the columns of the merged frame to confirm awardID was appended.
bb.columns

Index(['playerID', 'yearID', 'stint', 'teamID', 'lgID', 'W', 'L', 'G', 'GS',
       'CG', 'SHO', 'SV', 'IPouts', 'H', 'ER', 'HR', 'BB', 'SO', 'BAOpp',
       'ERA', 'IBB', 'WP', 'HBP', 'BK', 'BFP', 'GF', 'R', 'SH', 'SF', 'GIDP',
       'awardID'],
      dtype='object')

In [220]:
# Show the rows labelled as award winners (awardID == 1).
bb.loc[bb["awardID"].eq(1)]

Unnamed: 0,playerID,yearID,stint,teamID,lgID,W,L,G,GS,CG,...,WP,HBP,BK,BFP,GF,R,SH,SF,GIDP,awardID
263,johnsra05,2000,1,ARI,NL,19,7,35,35,8,...,5.0,6.0,2,1001.0,0.0,89,14.0,5.0,,1
330,martipe02,2000,1,BOS,AL,18,6,29,29,7,...,1.0,14.0,0,817.0,0.0,44,2.0,1.0,,1
774,clemero02,2001,1,NYA,AL,20,3,33,33,0,...,14.0,5.0,0,918.0,0.0,94,4.0,4.0,,1
937,johnsra05,2001,1,ARI,NL,21,6,35,34,3,...,8.0,18.0,1,994.0,1.0,74,10.0,5.0,,1
1597,johnsra05,2002,1,ARI,NL,24,5,35,35,8,...,3.0,13.0,2,1035.0,0.0,78,4.0,2.0,,1
1987,zitoba01,2002,1,OAK,AL,23,5,35,35,1,...,2.0,9.0,1,939.0,0.0,79,9.0,7.0,,1
2186,gagneer01,2003,1,LAN,NL,2,3,77,0,0,...,2.0,3.0,0,306.0,67.0,12,4.0,0.0,,1
2219,hallaro01,2003,1,TOR,AL,22,7,36,36,9,...,6.0,9.0,1,1071.0,0.0,111,3.0,2.0,,1
2776,clemero02,2004,1,HOU,NL,18,4,33,33,0,...,5.0,6.0,0,878.0,0.0,76,8.0,7.0,,1
3204,santajo01,2004,1,MIN,AL,20,6,34,34,1,...,7.0,9.0,0,881.0,0.0,70,3.0,3.0,,1


In [221]:
# Count missing values per column in the merged frame before cleaning.
bb.isna().sum()

playerID        0
yearID          0
stint           0
teamID          0
lgID            0
W               0
L               0
G               0
GS              0
CG              0
SHO             0
SV              0
IPouts          1
H               0
ER              0
HR              0
BB              0
SO              0
BAOpp           6
ERA            14
IBB             0
WP              0
HBP             1
BK              0
BFP             0
GF              0
R               0
SH              0
SF              0
GIDP        10494
awardID         0
dtype: int64

In [222]:
# Discard rows that have no ERA value; every other row is kept unchanged.
has_era = bb["ERA"].notna()
bb = bb[has_era]

In [223]:
# Replace the remaining missing values with 0, column by column, in one call.
# Only the four listed columns are touched.
zero_fill = {"HBP": 0, "GIDP": 0, "IPouts": 0, "BAOpp": 0}
bb = bb.fillna(zero_fill)

In [224]:
# Re-count missing values per column to confirm the cleaning left none behind.
bb.isna().sum()

playerID    0
yearID      0
stint       0
teamID      0
lgID        0
W           0
L           0
G           0
GS          0
CG          0
SHO         0
SV          0
IPouts      0
H           0
ER          0
HR          0
BB          0
SO          0
BAOpp       0
ERA         0
IBB         0
WP          0
HBP         0
BK          0
BFP         0
GF          0
R           0
SH          0
SF          0
GIDP        0
awardID     0
dtype: int64

In [225]:
# Inspect column dtypes before selecting model inputs.
bb.dtypes

playerID     object
yearID        int64
stint         int64
teamID       object
lgID         object
W             int64
L             int64
G             int64
GS            int64
CG            int64
SHO           int64
SV            int64
IPouts      float64
H             int64
ER            int64
HR            int64
BB            int64
SO            int64
BAOpp       float64
ERA         float64
IBB         float64
WP          float64
HBP         float64
BK            int64
BFP         float64
GF          float64
R             int64
SH          float64
SF          float64
GIDP        float64
awardID       int64
dtype: object

In [226]:
# Columns of `bb` used as model inputs.
features = [
    "W",
    "ERA",
    "SO",
    "BB",
    "CG",
    "SHO",
    "ER",
    "IPouts",
]
# Target is the binary award flag; inputs are the selected columns.
y = bb["awardID"]
X = bb.loc[:, features]

In [227]:
# Hold out 20% of the rows for evaluation. The split is stratified on y and
# uses a fixed seed so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=24,
    stratify=y,
)

In [228]:
# Random forest with 10 trees, balanced class weights and a fixed seed,
# fitted on the training split.
forest_params = {
    "n_estimators": 10,
    "class_weight": "balanced",
    "random_state": 24,
}
model = RandomForestClassifier(**forest_params)
model.fit(X_train, y_train)

In [229]:
# Predict on the held-out split, then report overall accuracy followed by the
# per-class precision/recall/F1 table.
y_predict = model.predict(X_test)
accuracy = accuracy_score(y_test, y_predict)
print("Accuracy = ", accuracy)
report = classification_report(y_test, y_predict)
print(report)

Accuracy =  0.9977728285077951
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2239
           1       0.67      0.33      0.44         6

    accuracy                           1.00      2245
   macro avg       0.83      0.67      0.72      2245
weighted avg       1.00      1.00      1.00      2245



In [230]:
# Confusion matrix of true labels versus predictions on the held-out split.
cm = confusion_matrix(y_test, y_predict)
print(cm)

[[2238    1]
 [   4    2]]
