In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Import and preprocess data

In [2]:
# Source: 
# Read in csv with data from 2008-2018
savant_data = pd.read_csv('benchmark_savant_data_2008-18.csv')

# Reformat

savant_data.columns

Index(['pitches', 'player_id', 'player_name', 'year', 'total_pitches',
       'pitch_percent', 'ba', 'iso', 'babip', 'slg', 'woba', 'xwoba', 'xba',
       'hits', 'abs', 'launch_speed', 'launch_angle', 'spin_rate', 'velocity',
       'effective_speed', 'whiffs', 'swings', 'takes', 'eff_min_vel',
       'release_extension', 'pos3_int_start_distance',
       'pos4_int_start_distance', 'pos5_int_start_distance',
       'pos6_int_start_distance', 'pos7_int_start_distance',
       'pos8_int_start_distance', 'pos9_int_start_distance'],
      dtype='object')

In [3]:
# Read in batter stats
# Reference: https://www.baseball-reference.com/bullpen/Baseball_statistics
bat_stats = pd.read_csv('benchmark_savant_batter_stats.csv')

# Strip white space from column names and names
bat_stats.columns = bat_stats.columns.str.strip()
bat_stats['first_name'] = bat_stats['first_name'].str.strip()
bat_stats['last_name'] = bat_stats['last_name'].str.strip()

bat_stats.head()

Unnamed: 0,last_name,first_name,player_id,year,player_age,b_total_pa,b_single,b_double,b_triple,b_home_run,b_walk,b_k_percent,batting_avg,slg_percent,on_base_percent,on_base_plus_slg,b_rbi,r_total_stolen_base,Unnamed: 18
0,Hunter,Torii,,2013,38,652,125,37,5,17,26,17.3,0.304,0.465,0.333,0.798,84,3,
1,Konerko,Paul,,2013,37,520,86,16,0,12,45,14.2,0.244,0.355,0.313,0.669,54,0,
2,Ortiz,David,,2013,38,600,90,38,2,30,76,14.7,0.309,0.564,0.395,0.959,103,4,
3,Beltre,Adrian,,2013,34,690,137,32,0,30,50,11.3,0.315,0.509,0.371,0.88,92,1,
4,Beltran,Carlos,,2013,36,600,107,30,3,24,38,15.0,0.296,0.491,0.338,0.829,84,2,


In [4]:
# https://www.baseball-reference.com/awards/mvp.shtml
mvp_winners = pd.read_csv('benchmark_mvp_winners.csv', header=1)
mvp_winners = mvp_winners.loc[~mvp_winners['Year'].isna()]

# Reformat columns
mvp_winners['Year'] = mvp_winners['Year'].astype('int')

# Remove extraneous columns
mvp_winners = mvp_winners[['Year', 'Lg', 'Name', 'Tm', 'WAR', 'BA', 'OBP', 'SLG', 'HR', 'RBI', 'SB']]

# Reformat player name as separate first and last name
mvp_winners['Name'] = mvp_winners['Name'].map(lambda name: name[:name.rfind('\\')]) # remove trailing ID
mvp_winners['FirstName'] = mvp_winners['Name'].map(lambda name: name[:name.rfind(' ')])
mvp_winners['LastName'] = mvp_winners['Name'].map(lambda name: name[(name.rfind(' ') + 1):])

# Add label
mvp_winners['Label'] = True

# Rename columns for consistency with stats
mvp_winners.columns = ['year', 'league', 'name', 'team', 'WAR', 'BA', 'OBP', 'SLG', 'HR', 'RBI', 'SB',
                      'first_name', 'last_name', 'label']

mvp_winners

Unnamed: 0,year,league,name,team,WAR,BA,OBP,SLG,HR,RBI,SB,first_name,last_name,label
0,2021,AL,Shohei Ohtani,LAA,9.1,0.257,0.372,0.592,46.0,100.0,26.0,Shohei,Ohtani,True
1,2021,NL,Bryce Harper,PHI,5.9,0.309,0.429,0.615,35.0,84.0,13.0,Bryce,Harper,True
3,2020,AL,José Abreu,CHW,3.0,0.317,0.37,0.617,19.0,60.0,0.0,José,Abreu,True
4,2020,NL,Freddie Freeman,ATL,3.3,0.341,0.462,0.64,13.0,53.0,2.0,Freddie,Freeman,True
6,2019,AL,Mike Trout,LAA,7.9,0.291,0.438,0.645,45.0,104.0,11.0,Mike,Trout,True
7,2019,NL,Cody Bellinger,LAD,8.6,0.305,0.406,0.629,47.0,115.0,15.0,Cody,Bellinger,True
9,2018,AL,Mookie Betts,BOS,10.7,0.346,0.438,0.64,32.0,80.0,30.0,Mookie,Betts,True
10,2018,NL,Christian Yelich,MIL,7.3,0.326,0.402,0.598,36.0,110.0,22.0,Christian,Yelich,True
12,2017,AL,Jose Altuve,HOU,7.7,0.346,0.41,0.547,24.0,81.0,32.0,Jose,Altuve,True
13,2017,NL,Giancarlo Stanton,MIA,7.9,0.281,0.376,0.631,59.0,132.0,2.0,Giancarlo,Stanton,True


In [5]:
# https://baseball.vote/results/mvp
# has more detailed voting results (not just winner of each year)

In [6]:
bat_stats.columns

# # Identify names of columns denoting key statistics [hard-coded from observation]
id_columns = ['last_name', 'first_name', 'year']
key_stats = ['b_home_run', 'on_base_percent', 'slg_percent', 'b_rbi', 'r_total_stolen_base']

all_data = bat_stats[id_columns + key_stats]

In [7]:
# Add labels to bat stats about whether they won MVP in that year
def add_mvp_labels(data, mvp):
    data_copy = data.copy()
    data_copy['label'] = data_copy.apply(lambda row: 
                                             ["{} {}".format(row['first_name'], row['last_name']) in mvp['name'].tolist()][0]
                                             and any(row['year'] == mvp.loc[mvp['name'] == "{} {}".format(row['first_name'], row['last_name'])]['year'])
                                         ,
                                         axis = 1
                                        )
    return data_copy

all_labeled = add_mvp_labels(all_data, mvp_winners)
all_labeled.loc[all_labeled['label']].sort_values(by=['year', 'last_name'])

Unnamed: 0,last_name,first_name,year,b_home_run,on_base_percent,slg_percent,b_rbi,r_total_stolen_base,label
1566,Pedroia,Dustin,2008,17,0.372,0.493,83,20,True
1487,Pujols,Albert,2008,37,0.462,0.653,116,7,True
1042,Mauer,Joe,2009,28,0.444,0.587,96,4,True
1034,Pujols,Albert,2009,47,0.443,0.658,135,16,True
1164,Hamilton,Josh,2010,32,0.41,0.633,100,8,True
1266,Votto,Joey,2010,37,0.424,0.6,113,16,True
833,Braun,Ryan,2011,33,0.397,0.597,111,33,True
301,Cabrera,Miguel,2012,44,0.393,0.606,139,4,True
21,Cabrera,Miguel,2013,44,0.442,0.636,137,3,True
78,McCutchen,Andrew,2013,21,0.404,0.508,84,27,True


## Split and balance data

In [8]:
# Split data
# Train on 2008-2017; keep 2018 for "testing"
train_data = all_labeled.loc[all_labeled.year != 2018]
val_data = all_labeled.loc[all_labeled.year == 2018]

train_data.shape, val_data.shape

((1455, 9), (123, 9))

In [9]:
# Balance data
from sklearn.utils import resample

# Separate by label
train_data_mvp = train_data.loc[train_data['label']]
train_data_not = train_data.loc[~train_data['label']]

# Upsample mvp data to match number of non-mvp points
train_data_mvp =  resample(train_data_mvp, 
                           replace=True,
                           n_samples=len(train_data_not),
                           random_state=1  ## for reproducibility
                          )

# Merge all data into single train_data dataframe
train_data = pd.concat([train_data_mvp, train_data_not])

# Check that lengths are as expected
len(train_data_mvp), len(train_data_not), len(train_data)

(1438, 1438, 2876)

## Train logistic regression model: predict MVP winners from current-year stats

In [2]:
# Wrap all model training, evaluating in series of functions
# Code largely from Assignment 8
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix


# Return X, y subsets of data
def get_x_y(data, features, label):
    return data[features], data[label]


# Predict with specific threshold
def predict_at_threshold(model, X_data, threshold):
    # Get probabilities
    prob_pred = model.predict_proba(X_data)[:, 1]
    return np.where(prob_pred >= threshold, 1, 0)


# Evaluate training accuracy
def get_accuracy(model, X_data, y_data, threshold, data_type='Training'):
    # Get predictions
    y_pred = predict_at_threshold(model, X_data, threshold)
    
    # Get accuracy from training data
    y_true = y_data.copy()
    acc = accuracy_score(y_true, y_pred)
    
    # Display and return results
    print('{} Accuracy: {:.3f}'.format(data_type.capitalize(), acc))
    return acc


# Print TP, FP, TN, FN for given data
def display_confusion_matrix(y_true, y_pred):
    # Get confusion matrix
    conf_matrix = confusion_matrix(y_true, y_pred)
    
    # Print results
    # Based on documentation, row i ~ true label and col j ~ predicted label
    print("True positives: {:>5}".format(conf_matrix[1, 1]))
    print("True negatives: {:>5}".format(conf_matrix[0, 0]))
    print("False positives:{:>5}".format(conf_matrix[0, 1]))
    print("False negatives:{:>5}".format(conf_matrix[1, 0]))


# Train and evaluate Logistic Regression model
def train_and_eval(train, val, feature_names, label_name='label', threshold=0.5):
    # Get X, y data from training, validation datasets
    X_train, y_train = get_x_y(train, feature_names, label_name)
    X_val, y_val = get_x_y(val, feature_names, label_name)
    
    # Fit model to training data
    model = LogisticRegression(fit_intercept=True)  # use default L2 penalty
    model.fit(X_train, y_train)
    
    # Evaluate training accuracy
    train_acc = get_accuracy(model, X_train, y_train, threshold, 'training')
    display_confusion_matrix(y_train, predict_at_threshold(model, X_train, threshold))
    print()
    
    # Evaluate validation accuracy
    val_acc = get_accuracy(model, X_val, y_val, threshold, 'validation')
    display_confusion_matrix(y_val, predict_at_threshold(model, X_val, threshold))
    print()
    
    # Display confusion matrix for ALL data
    print("For all data:")
    all_data = pd.concat([train, val]).drop_duplicates()
    X_all, y_all = get_x_y(all_data, feature_names, label_name)
    all_pred = predict_at_threshold(model, X_all, threshold)
    display_confusion_matrix(y_all, all_pred)

SyntaxError: invalid syntax (1897237354.py, line 35)

In [11]:
train_and_eval(train_data, val_data, key_stats, threshold=0.5)

Training Accuracy: 0.919
True positives:  1368
True negatives:  1276
False positives:  162
False negatives:   70

Validation Accuracy: 0.919
True positives:     2
True negatives:   111
False positives:   10
False negatives:    0

For all data:
True positives:    18
True negatives:  1387
False positives:  172
False negatives:    1


## Train regression model based on ranks

In [12]:
# Given a data frame with numeric features, returns feature ranks, grouped by given feature
def rank_features(data, rank_features, group_feature, method='average'):
    # Assume all non-rank and non-group features are identifying features
    output = data[[col for col in data.columns if (not col in rank_features)]]
    
    # Loop through all rank features
    for rank_ft in rank_features:
        # Group and rank; save in new column with suffix '_rank'
        output.loc[:, f'{rank_ft}_rank'] = data.groupby(group_feature)[rank_ft].rank(method='average')
    
    return output

In [19]:
# Rank train and val data
method = 'average'
train_ranked = rank_features(train_data, key_stats, 'year', method=method)
val_ranked = rank_features(val_data, key_stats, 'year', method=method)

# Run logistic regression using ranked features
ranked_stats = ['{}_rank'.format(feat) for feat in key_stats]
train_and_eval(train_ranked, val_ranked, ranked_stats, threshold=0.5)

Training Accuracy: 0.946
True positives:  1368
True negatives:  1352
False positives:   86
False negatives:   70

Validation Accuracy: 0.984
True positives:     0
True negatives:   121
False positives:    0
False negatives:    2

For all data:
True positives:    16
True negatives:  1473
False positives:   86
False negatives:    3


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
