# Group 4 Homework 7
### Our data set is looking at batting statistics for baseball from the 1800's through current.  

In [1]:
#Import the various tools that we will need as we plot our different types of graphs
import pandas as pd
import numpy as np
from seaborn import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, recall_score, precision_score, fbeta_score, classification_report

In [2]:
# Helper methods
def createCategoricalDummies(dataFrame, categoryList):
    return pd.get_dummies(dataFrame[categoryList], prefix_sep = "::", drop_first = True)

def printMetrics(test, predictions):
    print("Confusion Matrix:")
    print(confusion_matrix(test, predictions))
    print("------------------")
    print(f"Accuracy: {accuracy_score(test, predictions):.2f}")
    print(f"Recall: {recall_score(test, predictions):.2f}")
    print(f"Prediction: {precision_score(test, predictions):.2f}")
    print(f"f-measure: {fbeta_score(test, predictions, beta=1):.2f}")
    print("------------------")
    print(classification_report(test, predictions))

## Import the data from GitHub

In [3]:
baseballstats = pd.read_csv("https://raw.githubusercontent.com/zoberender/Python-Group-Project/master/Batting.csv")
#It is important to convert NaN to 0 to get calculations to function correctly
baseballstats['IBB'] = baseballstats['IBB'].fillna(0)
baseballstats['HBP'] = baseballstats['HBP'].fillna(0)
baseballstats['SH'] = baseballstats['SH'].fillna(0)
baseballstats['SF'] = baseballstats['SF'].fillna(0)
# Create new column to calculate average
baseballstats["AVG"] = baseballstats["H"]/baseballstats["AB"]
# Create new column to calculate total bases
baseballstats["TB"] = baseballstats["H"] - baseballstats["2B"] - baseballstats["3B"] - baseballstats["HR"] + 2*baseballstats["2B"] + 3*baseballstats["3B"]+4*baseballstats["HR"]
# Slugging is Total Bases/At bats
baseballstats["SLG"] = baseballstats["TB"]/baseballstats["AB"]
#Plate Appearances to calculate OBP
baseballstats["PA"] = baseballstats["AB"] + baseballstats["BB"] + baseballstats["HBP"] + baseballstats["SF"]
#On Base Percentage = Times on base/Plat Appearances
baseballstats["OBP"] = (baseballstats["H"]+baseballstats["BB"]+baseballstats["HBP"])/baseballstats["PA"]
#OBP + SLUGGING IS A GREAT WAY TO MEASURE OVERALL PLAYER VALUE
baseballstats["OPS"] = baseballstats["OBP"] + baseballstats["SLG"]
#ISOLATED POWER IS A WAY TO MEASURE A PLAYER'S OVERALL POWER WHY TAKING OUT THE IMPACT OF BATTING AVERAGE
baseballstats['ISP'] = baseballstats['SLG'] - baseballstats['AVG']
#Add 1B as its own column.  1B = HR - (2B + 3B + HR)
baseballstats["1B"] = baseballstats["H"] - baseballstats["2B"] - baseballstats["3B"] - baseballstats["HR"]
#Add ERA to the data fram using by binning
cut_label = ['19th Century','Dead Ball','Live Ball','Integration','Divisional','Steroid','Current']
cut_bins = [0,1899,1919,1946,1968,1993,2007,max(baseballstats['yearID'])]
baseballstats['ERA'] = pd.cut(baseballstats['yearID'], bins=cut_bins, labels = cut_label)

baseballstats

Unnamed: 0,playerID,yearID,Stint,teamID,lgID,G,AB,R,H,2B,...,TEAM NAME,AVG,TB,SLG,PA,OBP,OPS,ISP,1B,ERA
0,abercda01,1871,1,TRO,,1,4,0,0,0,...,TRO-Troy Haymakers,0.000000,0,0.000000,4.0,0.000000,0.000000,0.000000,0,19th Century
1,addybo01,1871,1,RC1,,25,118,30,32,6,...,ROK-Rockford Forest Citys,0.271186,38,0.322034,122.0,0.295082,0.617116,0.050847,26,19th Century
2,allisar01,1871,1,CL1,,29,137,28,40,4,...,CFC-Cleveland Forest Citys,0.291971,54,0.394161,139.0,0.302158,0.696319,0.102190,31,19th Century
3,allisdo01,1871,1,WS3,,27,133,28,44,10,...,OLY-Washington Olympics,0.330827,64,0.481203,133.0,0.330827,0.812030,0.150376,30,19th Century
4,ansonca01,1871,1,RC1,,25,120,29,39,11,...,ROK-Rockford Forest Citys,0.325000,56,0.466667,122.0,0.336066,0.802732,0.141667,25,19th Century
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107273,zimmejo02,2019,1,DET,AL,23,2,0,0,0,...,DET-Detroit Tigers,0.000000,0,0.000000,2.0,0.000000,0.000000,0.000000,0,Current
107274,zimmeky01,2019,1,KCA,AL,15,0,0,0,0,...,KCR-Kansas City Royals,,0,,0.0,,,,0,Current
107275,zimmery01,2019,1,WAS,NL,52,171,20,44,9,...,WAS-Washington Senators,0.257310,71,0.415205,190.0,0.321053,0.736257,0.157895,29,Current
107276,zobribe01,2019,1,CHN,NL,47,150,24,39,5,...,CHC-Chicago Cubs,0.260000,47,0.313333,176.0,0.357955,0.671288,0.053333,33,Current


In [4]:
baseballstats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 107278 entries, 0 to 107277
Data columns (total 34 columns):
playerID     107278 non-null object
yearID       107278 non-null int64
Stint        107278 non-null int64
teamID       107278 non-null object
lgID         106597 non-null object
G            107278 non-null int64
AB           107278 non-null int64
R            107278 non-null int64
H            107278 non-null int64
2B           107278 non-null int64
3B           107278 non-null int64
HR           107278 non-null int64
RBI          106562 non-null float64
SB           104972 non-null float64
CS           83831 non-null float64
BB           107278 non-null int64
SO           105233 non-null float64
IBB          107278 non-null float64
HBP          107278 non-null float64
SH           107278 non-null float64
SF           107278 non-null float64
GIDP         81931 non-null float64
AGE          107278 non-null int64
FULL NAME    107278 non-null object
TEAM NAME    107278 non-null 

# Beginning of Zac's code
I left this in here as an idea of what I have been working on.  Trying to predict a "good" batting average based on games, at bats and age.  

We each need our own notebook for part 1 of the homework, so you can certainly delete all my stuff out.  

In [5]:
# prepare the data
# get specific columns of interest in order to predict batting average
columns = ["G",	"AB", "AGE", "AVG"]   # specify columns of interest
playeravg = baseballstats[columns]    # filter data set to specific columns
playeravg.info()                      # get column types and check for null values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 107278 entries, 0 to 107277
Data columns (total 4 columns):
G      107278 non-null int64
AB     107278 non-null int64
AGE    107278 non-null int64
AVG    90433 non-null float64
dtypes: float64(1), int64(3)
memory usage: 3.3 MB


In [6]:
playeravg.dropna(inplace=True) # remove na values
playeravg.info()               # get column types and verify null values are removed

<class 'pandas.core.frame.DataFrame'>
Int64Index: 90433 entries, 0 to 107277
Data columns (total 4 columns):
G      90433 non-null int64
AB     90433 non-null int64
AGE    90433 non-null int64
AVG    90433 non-null float64
dtypes: float64(1), int64(3)
memory usage: 3.4 MB


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [7]:
playeravg["goodBA"] = np.where(playeravg["AVG"] >= 0.275, 1, 0)  #create categorical column for a good batting average
columns2 = ["G",	"AB", "AGE", "goodBA"]                       #remove original AVG column
playeravg = playeravg[columns2]
playeravg.info()                                                 #verify columns

<class 'pandas.core.frame.DataFrame'>
Int64Index: 90433 entries, 0 to 107277
Data columns (total 4 columns):
G         90433 non-null int64
AB        90433 non-null int64
AGE       90433 non-null int64
goodBA    90433 non-null int32
dtypes: int32(1), int64(3)
memory usage: 3.1 MB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [8]:
#columns = ["G", "AB", "AGE", "goodBA"]
#categories = ["goodBA"]
#playeravg = pd.concat(
#    [playeravg.drop(categories, axis=1), createCategoricalDummies(playeravg, categories)], axis = 1)

features = list(playeravg.columns)    #create feature list
features.remove("goodBA")             #remove target variable from feature list
target = "goodBA"                     #specify target variable

In [9]:
X = playeravg[features]               
y = playeravg[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [10]:
knn = KNeighborsClassifier(n_neighbors=3)
knn

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [11]:
knn.fit(X_train, y_train) 

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [12]:
knn.score(X_train, y_train)

0.8375796178343949

In [19]:
predictionsKNN = knn.predict(X_test)
printMetrics(y_test, predictionsKNN)

Confusion Matrix:
[[14814  2268]
 [ 3743  1784]]
------------------
Accuracy: 0.73
Recall: 0.32
Prediction: 0.44
f-measure: 0.37
------------------
              precision    recall  f1-score   support

           0       0.80      0.87      0.83     17082
           1       0.44      0.32      0.37      5527

    accuracy                           0.73     22609
   macro avg       0.62      0.60      0.60     22609
weighted avg       0.71      0.73      0.72     22609



In [14]:
lr = LogisticRegression(solver = "liblinear")
lr

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [15]:
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [16]:
lr.score(X_train, y_train)

0.7850613352205709

In [17]:
lr.score(X_test, y_test)

0.7830067672165951

In [20]:
predictionsLR = lr.predict(X_test)
printMetrics(y_test, predictionsLR)

Confusion Matrix:
[[16208   874]
 [ 4032  1495]]
------------------
Accuracy: 0.78
Recall: 0.27
Prediction: 0.63
f-measure: 0.38
------------------
              precision    recall  f1-score   support

           0       0.80      0.95      0.87     17082
           1       0.63      0.27      0.38      5527

    accuracy                           0.78     22609
   macro avg       0.72      0.61      0.62     22609
weighted avg       0.76      0.78      0.75     22609

