In [1]:
#Import the various tools that we will need as we plot our different types of graphs
import pandas as pd
import numpy as np
from seaborn import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, recall_score, precision_score, fbeta_score, classification_report

In [2]:
def createCategoricalDummies(dataFrame, categoryList):
    return pd.get_dummies(dataFrame[categoryList], prefix_sep = "::", drop_first = True)

def printMetrics(test, predictions):
    print("Confusion Matrix:")
    print(confusion_matrix(test, predictions))
    print("------------------")
    print(f"Accuracy: {accuracy_score(test, predictions):.2f}")
    print(f"Recall: {recall_score(test, predictions):.2f}")
    print(f"Prediction: {precision_score(test, predictions):.2f}")
    print(f"f-measure: {fbeta_score(test, predictions, beta=1):.2f}")
    print("------------------")
    print(classification_report(test, predictions))

In [3]:
baseballstats = pd.read_csv("https://raw.githubusercontent.com/zoberender/Python-Group-Project/master/Batting.csv")
HallofFame = pd.read_csv("https://raw.githubusercontent.com/zoberender/Python-Group-Project/master/HallOfFame.csv") #brought in the hall of fame data

In [4]:
HallofFame = HallofFame.drop(columns=['yearID','votedBy','ballots','needed','votes','category','needed_note']) #drop the columns not needed
HallofFame

Unnamed: 0,playerID,inducted
0,cobbty01,Y
1,ruthba01,Y
2,wagneho01,Y
3,mathech01,Y
4,johnswa01,Y
...,...,...
4186,lidgebr01,N
4187,millwke01,N
4188,zambrca01,N
4189,morrija02,Y


In [5]:
#defining the columns
columns = ['playerID','G', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'SB', 'CS', 'BB', 'SO', 'IBB', 'SH', 'SF', 'GIDP']

In [6]:
#getting columns needed
baseballstats = baseballstats[columns]
baseballstats

Unnamed: 0,playerID,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,SH,SF,GIDP
0,abercda01,1,4,0,0,0,0,0,0.0,0.0,0.0,0,0.0,,,,0.0
1,addybo01,25,118,30,32,6,0,0,13.0,8.0,1.0,4,0.0,,,,0.0
2,allisar01,29,137,28,40,4,5,0,19.0,3.0,1.0,2,5.0,,,,1.0
3,allisdo01,27,133,28,44,10,2,2,27.0,1.0,1.0,0,2.0,,,,0.0
4,ansonca01,25,120,29,39,11,3,0,16.0,6.0,2.0,2,1.0,,,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107273,zimmejo02,23,2,0,0,0,0,0,0.0,0.0,0.0,0,2.0,0.0,0.0,0.0,0.0
107274,zimmeky01,15,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0
107275,zimmery01,52,171,20,44,9,0,6,27.0,0.0,0.0,17,39.0,0.0,0.0,2.0,4.0
107276,zobribe01,47,150,24,39,5,0,1,17.0,0.0,0.0,23,24.0,0.0,0.0,2.0,6.0


In [7]:
#filling NAN with a 0
baseballstats.fillna(0)

Unnamed: 0,playerID,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,SH,SF,GIDP
0,abercda01,1,4,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0
1,addybo01,25,118,30,32,6,0,0,13.0,8.0,1.0,4,0.0,0.0,0.0,0.0,0.0
2,allisar01,29,137,28,40,4,5,0,19.0,3.0,1.0,2,5.0,0.0,0.0,0.0,1.0
3,allisdo01,27,133,28,44,10,2,2,27.0,1.0,1.0,0,2.0,0.0,0.0,0.0,0.0
4,ansonca01,25,120,29,39,11,3,0,16.0,6.0,2.0,2,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107273,zimmejo02,23,2,0,0,0,0,0,0.0,0.0,0.0,0,2.0,0.0,0.0,0.0,0.0
107274,zimmeky01,15,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0
107275,zimmery01,52,171,20,44,9,0,6,27.0,0.0,0.0,17,39.0,0.0,0.0,2.0,4.0
107276,zobribe01,47,150,24,39,5,0,1,17.0,0.0,0.0,23,24.0,0.0,0.0,2.0,6.0


In [8]:
#grouping by player ID
baseballstats = baseballstats.groupby('playerID').sum().reset_index()

In [9]:
#getting Alex Rodriguez stats
Arod = baseballstats.loc[baseballstats['playerID'] == 'rodrial01']
Arod

Unnamed: 0,playerID,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,SH,SF,GIDP
15013,rodrial01,2784,10566,2021,3115,548,31,696,2086.0,329.0,76.0,1338,2287.0,97.0,16.0,111.0,261.0


In [10]:
#merging the two tables
baseballstats=baseballstats.merge(HallofFame, left_on='playerID', right_on='playerID')

In [11]:
#replace inducted values with a 1 and 0
baseballstats=baseballstats.replace({'inducted': 'Y'}, 1)
baseballstats=baseballstats.replace({'inducted': 'N'}, 0)
baseballstats

Unnamed: 0,playerID,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,SH,SF,GIDP,inducted
0,aaronha01,3298,12364,2174,3771,624,98,755,2297.0,240.0,73.0,1402,1383.0,293.0,21.0,121.0,328.0,1
1,abbotji01,263,21,0,2,0,0,0,3.0,0.0,0.0,0,10.0,0.0,3.0,0.0,0.0,0
2,adamsba01,482,1019,79,216,31,15,3,75.0,1.0,1.0,53,194.0,0.0,35.0,0.0,0.0,0
3,adamsba01,482,1019,79,216,31,15,3,75.0,1.0,1.0,53,194.0,0.0,35.0,0.0,0.0,0
4,adamsba01,482,1019,79,216,31,15,3,75.0,1.0,1.0,53,194.0,0.0,35.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4115,zahnge01,304,43,3,6,0,0,0,1.0,0.0,0.0,0,9.0,0.0,10.0,0.0,1.0,0
4116,zambrca01,384,693,75,165,26,3,24,71.0,1.0,0.0,10,240.0,0.0,37.0,4.0,14.0,0
4117,zeileto01,2158,7573,986,2004,397,23,253,1110.0,53.0,51.0,945,1279.0,47.0,8.0,81.0,223.0,0
4118,zimmech01,1280,4546,617,1225,222,76,26,625.0,151.0,0.0,390,511.0,0.0,51.0,0.0,0.0,0


In [12]:
#remove player ID
baseballstats = baseballstats.drop(columns=['playerID'])
baseballstats

Unnamed: 0,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,SH,SF,GIDP,inducted
0,3298,12364,2174,3771,624,98,755,2297.0,240.0,73.0,1402,1383.0,293.0,21.0,121.0,328.0,1
1,263,21,0,2,0,0,0,3.0,0.0,0.0,0,10.0,0.0,3.0,0.0,0.0,0
2,482,1019,79,216,31,15,3,75.0,1.0,1.0,53,194.0,0.0,35.0,0.0,0.0,0
3,482,1019,79,216,31,15,3,75.0,1.0,1.0,53,194.0,0.0,35.0,0.0,0.0,0
4,482,1019,79,216,31,15,3,75.0,1.0,1.0,53,194.0,0.0,35.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4115,304,43,3,6,0,0,0,1.0,0.0,0.0,0,9.0,0.0,10.0,0.0,1.0,0
4116,384,693,75,165,26,3,24,71.0,1.0,0.0,10,240.0,0.0,37.0,4.0,14.0,0
4117,2158,7573,986,2004,397,23,253,1110.0,53.0,51.0,945,1279.0,47.0,8.0,81.0,223.0,0
4118,1280,4546,617,1225,222,76,26,625.0,151.0,0.0,390,511.0,0.0,51.0,0.0,0.0,0


In [13]:
#preparing the dataset
features = list(baseballstats.columns)
features.remove("inducted")
target = "inducted"
print(f"Feature categories: {features}")
print(f"Target feature: {target}")

Feature categories: ['G', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'SB', 'CS', 'BB', 'SO', 'IBB', 'SH', 'SF', 'GIDP']
Target feature: inducted


In [14]:
X = baseballstats[features]
X

Unnamed: 0,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,SH,SF,GIDP
0,3298,12364,2174,3771,624,98,755,2297.0,240.0,73.0,1402,1383.0,293.0,21.0,121.0,328.0
1,263,21,0,2,0,0,0,3.0,0.0,0.0,0,10.0,0.0,3.0,0.0,0.0
2,482,1019,79,216,31,15,3,75.0,1.0,1.0,53,194.0,0.0,35.0,0.0,0.0
3,482,1019,79,216,31,15,3,75.0,1.0,1.0,53,194.0,0.0,35.0,0.0,0.0
4,482,1019,79,216,31,15,3,75.0,1.0,1.0,53,194.0,0.0,35.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4115,304,43,3,6,0,0,0,1.0,0.0,0.0,0,9.0,0.0,10.0,0.0,1.0
4116,384,693,75,165,26,3,24,71.0,1.0,0.0,10,240.0,0.0,37.0,4.0,14.0
4117,2158,7573,986,2004,397,23,253,1110.0,53.0,51.0,945,1279.0,47.0,8.0,81.0,223.0
4118,1280,4546,617,1225,222,76,26,625.0,151.0,0.0,390,511.0,0.0,51.0,0.0,0.0


In [15]:
y = baseballstats[target]
y

0       1
1       0
2       0
3       0
4       0
       ..
4115    0
4116    0
4117    0
4118    0
4119    0
Name: inducted, Length: 4120, dtype: int64

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

print(f"Length of X_train (feature training set): {len(X_train)}")
print(f"Length of X_test (feature test set): {len(X_test)}")
print(f"Length of y_train (target training set): {len(y_train)}")
print(f"Length of y_test (target training set): {len(y_test)}")

Length of X_train (feature training set): 3090
Length of X_test (feature test set): 1030
Length of y_train (target training set): 3090
Length of y_test (target training set): 1030


## K-nearest neighbor

In [17]:
#K-nearest neighbor classifier, 2 groups
knn = KNeighborsClassifier(n_neighbors=2)

In [18]:
knn.fit(X_train, y_train)    # Remember, X = features, y = target

KNeighborsClassifier(n_neighbors=2)

In [19]:
knn.score(X_train, y_train)  # What's our score with the training data set?

0.9459546925566343

In [20]:
knn.score(X_test, y_test)    # What's our score with the test data set?

0.9281553398058252

In [21]:
#sklearn confusion matrix
predictions = knn.predict(X_test)
printMetrics(y_test, predictions)

Confusion Matrix:
[[955   5]
 [ 69   1]]
------------------
Accuracy: 0.93
Recall: 0.01
Prediction: 0.17
f-measure: 0.03
------------------
              precision    recall  f1-score   support

           0       0.93      0.99      0.96       960
           1       0.17      0.01      0.03        70

    accuracy                           0.93      1030
   macro avg       0.55      0.50      0.49      1030
weighted avg       0.88      0.93      0.90      1030



## Logistic Regression

In [22]:
lr = LogisticRegression(solver="liblinear")

In [23]:
lr.fit(X_train, y_train)

LogisticRegression(solver='liblinear')

In [24]:
lr.score(X_train, y_train)

0.9394822006472492

In [25]:
lr.score(X_test, y_test)

0.9320388349514563

In [26]:
predictions = lr.predict(X_test)
printMetrics(y_test, predictions)

Confusion Matrix:
[[960   0]
 [ 70   0]]
------------------
Accuracy: 0.93
Recall: 0.00
Prediction: 0.00
f-measure: 0.00
------------------
              precision    recall  f1-score   support

           0       0.93      1.00      0.96       960
           1       0.00      0.00      0.00        70

    accuracy                           0.93      1030
   macro avg       0.47      0.50      0.48      1030
weighted avg       0.87      0.93      0.90      1030



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [27]:
#remove player ID from Arods table.
Arod = Arod.drop(columns=['playerID'])
Arod

Unnamed: 0,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,SH,SF,GIDP
15013,2784,10566,2021,3115,548,31,696,2086.0,329.0,76.0,1338,2287.0,97.0,16.0,111.0,261.0


In [28]:
#Logistic regression prediction
predictions = lr.predict(Arod)
print(predictions)
#Interpretation, the model predicts that Arod's career is not a hall of fame career.

[0]


In [29]:
#K nearest neighbor prediction
predictions = knn.predict(Arod)
print(predictions)
#Interpretation, the model predicts that Arod's career is a hall of fame career.

[1]
