In [None]:
!pip install pybaseball
!pip install unidecode

Near Future Plans: Create rankings for defense and pitching. Then, make rankings that encompass everything. Experiment with certain stats having different weights.

# Data Cleaning

In [3]:
from unidecode import unidecode

In [4]:
def clean_string_encoding(input_string):
  """Repair text whose non-English letters came back improperly encoded.

  The repair has two steps: literal backslash escape sequences in the text
  are interpreted, and the resulting characters are then re-read as UTF-8
  bytes.

  Args:
    input_string: The text to repair.

  Returns:
    The repaired text. The input is returned unchanged when it is not a
    string (for example a missing value) or when it cannot be round-tripped,
    such as text ending in a backslash or an escape that names a code point
    above U+00FF.
  """
  if not isinstance(input_string, str):
    return input_string
  try:
    decoded_string = bytes(input_string, 'utf-8').decode('unicode_escape')
    # Bytes outside an escape sequence are decoded as Latin-1, so encoding
    # back to Latin-1 recovers the raw bytes, which are then read as UTF-8.
    return decoded_string.encode('latin-1').decode('utf-8')
  except UnicodeError:
    # Best effort: one unrepairable value should not abort a whole column.
    return input_string

In [5]:
def clean_names_column(column):
  """Normalize a column of player names so they line up across datasets.

  Each name is passed through clean_string_encoding and unidecode, has every
  occurrence of the markers '*', '#', 'Jr.', 'III' and 'II' removed, and is
  stripped of surrounding whitespace.
  """
  # 'III' comes before 'II' so the longer marker is removed whole.
  unwanted = ('*', '#', 'Jr.', 'III', 'II')

  def _normalize(raw_name):
    name = unidecode(clean_string_encoding(raw_name))
    for token in unwanted:
      name = name.replace(token, '')
    return name.strip()

  return column.apply(_normalize)

# Data Queries

In [6]:
import pybaseball as pyb

### MLB Draft Data

Columns in amateur_draft:

```python
['Tm', 'Signed', 'Bonus', 'Name', 'Pos', 'WAR', 'G', 'AB', 'HR', 'BA', 'OPS', 'G.1', 'W', 'L', 'ERA', 'WHIP', 'SV', 'Type', 'Drafted Out of', 'MLB']
```

In [7]:
def get_draft(year, round):
  """Query one round of the amateur draft for a given year.

  The result is indexed by its 'OvPck' column and its 'Name' column is
  normalized with clean_names_column.
  """
  picks = pyb.amateur_draft(year, round).set_index('OvPck')
  return picks.assign(Name=clean_names_column(picks['Name']))

### Offense Data

Columns in batting_stats (fangraphs):

```python
['Season', 'Team', 'Age', 'G', 'AB', 'PA', 'H', '1B', '2B', '3B', 'HR', 'R', 'RBI', 'BB', 'IBB', 'SO', 'HBP', 'SF', 'SH', 'GDP', 'SB', 'CS', 'AVG', 'GB', 'FB', 'LD', 'IFFB', 'Pitches', 'Balls', 'Strikes', 'IFH', 'BU', 'BUH', 'BB%', 'K%', 'BB/K', 'OBP', 'SLG', 'OPS', 'ISO', 'BABIP', 'GB/FB', 'LD%', 'GB%', 'FB%', 'IFFB%', 'HR/FB', 'IFH%', 'BUH%', 'wOBA', 'wRAA', 'wRC', 'Bat', 'Fld', 'Rep', 'Pos', 'RAR', 'WAR', 'Dol', 'Spd', 'wRC+', 'WPA', '-WPA', '+WPA', 'RE24', 'REW', 'pLI', 'phLI', 'PH', 'WPA/LI', 'Clutch', 'FB% (Pitch)', 'FBv', 'SL%', 'SLv', 'CT%', 'CTv', 'CB%', 'CBv', 'CH%', 'CHv', 'SF%', 'SFv', 'KN%', 'KNv', 'XX%', 'PO%', 'wFB', 'wSL', 'wCT', 'wCB', 'wCH', 'wSF', 'wKN', 'wFB/C', 'wSL/C', 'wCT/C', 'wCB/C', 'wCH/C', 'wSF/C', 'wKN/C', 'O-Swing%', 'Z-Swing%', 'Swing%', 'O-Contact%', 'Z-Contact%', 'Contact%', 'Zone%', 'F-Strike%', 'SwStr%', 'BsR', 'FA% (sc)', 'FT% (sc)', 'FC% (sc)', 'FS% (sc)', 'FO% (sc)', 'SI% (sc)', 'SL% (sc)', 'CU% (sc)', 'KC% (sc)', 'EP% (sc)', 'CH% (sc)', 'SC% (sc)', 'KN% (sc)', 'UN% (sc)', 'vFA (sc)', 'vFT (sc)', 'vFC (sc)', 'vFS (sc)', 'vFO (sc)', 'vSI (sc)', 'vSL (sc)', 'vCU (sc)', 'vKC (sc)', 'vEP (sc)', 'vCH (sc)', 'vSC (sc)', 'vKN (sc)', 'FA-X (sc)', 'FT-X (sc)', 'FC-X (sc)', 'FS-X (sc)', 'FO-X (sc)', 'SI-X (sc)', 'SL-X (sc)', 'CU-X (sc)', 'KC-X (sc)', 'EP-X (sc)', 'CH-X (sc)', 'SC-X (sc)', 'KN-X (sc)', 'FA-Z (sc)', 'FT-Z (sc)', 'FC-Z (sc)', 'FS-Z (sc)', 'FO-Z (sc)', 'SI-Z (sc)', 'SL-Z (sc)', 'CU-Z (sc)', 'KC-Z (sc)', 'EP-Z (sc)', 'CH-Z (sc)', 'SC-Z (sc)', 'KN-Z (sc)', 'wFA (sc)', 'wFT (sc)', 'wFC (sc)', 'wFS (sc)', 'wFO (sc)', 'wSI (sc)', 'wSL (sc)', 'wCU (sc)', 'wKC (sc)', 'wEP (sc)', 'wCH (sc)', 'wSC (sc)', 'wKN (sc)', 'wFA/C (sc)', 'wFT/C (sc)', 'wFC/C (sc)', 'wFS/C (sc)', 'wFO/C (sc)', 'wSI/C (sc)', 'wSL/C (sc)', 'wCU/C (sc)', 'wKC/C (sc)', 'wEP/C (sc)', 'wCH/C (sc)', 'wSC/C (sc)', 'wKN/C (sc)', 'O-Swing% (sc)', 'Z-Swing% (sc)', 'Swing% (sc)', 'O-Contact% (sc)', 'Z-Contact% (sc)', 'Contact% (sc)', 'Zone% (sc)', 'Pace', 'Def', 'wSB', 
'UBR', 'Age Rng', 'Off', 'Lg', 'wGDP', 'Pull%', 'Cent%', 'Oppo%', 'Soft%', 'Med%', 'Hard%', 'TTO%', 'CH% (pi)', 'CS% (pi)', 'CU% (pi)', 'FA% (pi)', 'FC% (pi)', 'FS% (pi)', 'KN% (pi)', 'SB% (pi)', 'SI% (pi)', 'SL% (pi)', 'XX% (pi)', 'vCH (pi)', 'vCS (pi)', 'vCU (pi)', 'vFA (pi)', 'vFC (pi)', 'vFS (pi)', 'vKN (pi)', 'vSB (pi)', 'vSI (pi)', 'vSL (pi)', 'vXX (pi)', 'CH-X (pi)', 'CS-X (pi)', 'CU-X (pi)', 'FA-X (pi)', 'FC-X (pi)', 'FS-X (pi)', 'KN-X (pi)', 'SB-X (pi)', 'SI-X (pi)', 'SL-X (pi)', 'XX-X (pi)', 'CH-Z (pi)', 'CS-Z (pi)', 'CU-Z (pi)', 'FA-Z (pi)', 'FC-Z (pi)', 'FS-Z (pi)', 'KN-Z (pi)', 'SB-Z (pi)', 'SI-Z (pi)', 'SL-Z (pi)', 'XX-Z (pi)', 'wCH (pi)', 'wCS (pi)', 'wCU (pi)', 'wFA (pi)', 'wFC (pi)', 'wFS (pi)', 'wKN (pi)', 'wSB (pi)', 'wSI (pi)', 'wSL (pi)', 'wXX (pi)', 'wCH/C (pi)', 'wCS/C (pi)', 'wCU/C (pi)', 'wFA/C (pi)', 'wFC/C (pi)', 'wFS/C (pi)', 'wKN/C (pi)', 'wSB/C (pi)', 'wSI/C (pi)', 'wSL/C (pi)', 'wXX/C (pi)', 'O-Swing% (pi)', 'Z-Swing% (pi)', 'Swing% (pi)', 'O-Contact% (pi)', 'Z-Contact% (pi)', 'Contact% (pi)', 'Zone% (pi)', 'Pace (pi)', 'FRM', 'AVG+', 'BB%+', 'K%+', 'OBP+', 'SLG+', 'ISO+', 'BABIP+', 'LD+%', 'GB%+', 'FB%+', 'HR/FB%+', 'Pull%+', 'Cent%+', 'Oppo%+', 'Soft%+', 'Med%+', 'Hard%+', 'EV', 'LA', 'Barrels', 'Barrel%', 'maxEV', 'HardHit', 'HardHit%', 'Events', 'CStr%', 'CSW%', 'xBA', 'xSLG', 'xwOBA', 'L-WAR']

 ```

In [4]:
def get_offense(year):
  """Query batting stats for a season, indexed and sorted by player name.

  The 'IDfg' column is dropped from the result.
  """
  batting = pyb.batting_stats(year)
  by_player = batting.set_index('Name').drop(columns='IDfg')
  return by_player.sort_index()

# Data Analysis

In [19]:
from math import trunc

In [11]:
from scipy.stats import zscore

### MLB Draft Data

##### Functions

In [11]:
def made_mlb(draft):
  """Flag each drafted player by whether the draft data has a 'G' value.

  Every player is kept; nobody is filtered out. A non-null 'G' (presumably
  MLB games played - confirm against the data source) is treated as MLB
  experience.

  Args:
    draft: Draft DataFrame with 'Name', 'Tm' and 'G' columns. It is not
      modified.

  Returns:
    A new DataFrame with the columns 'Name', 'Tm' and 'MLB', where 'MLB' is
    'Y' when 'G' is present and 'N' when it is missing.
  """
  # Work on a copy so the caller's frame does not silently gain a column.
  result = draft[['Name', 'Tm']].copy()
  result['MLB'] = ['Y' if has_games else 'N' for has_games in draft['G'].notnull()]
  return result

##### Application

In [None]:
# Example: round 4 of the 2003 draft, with each player flagged Y/N in the 'MLB' column
data = made_mlb(get_draft(2003, 4))
data

### Offense Data

##### Functions

In [1]:
def inv_k_pct(df):
  """Add an 'invK%' column to df, in place, holding 1 - K%.

  Use this when K% should be a feature: after the inversion, a higher value
  is the better one. Nothing is returned.
  """
  strikeout_rate = df['K%']
  df['invK%'] = strikeout_rate.rsub(1)

In [47]:
def apply_zscore(df, features: list):
  """Return the selected feature columns with numeric values as z-scores.

  Args:
    df: DataFrame containing the feature columns. It is not modified.
    features: Names of the columns to keep and standardize.

  Returns:
    A new DataFrame holding only ``features``. Numeric columns are replaced
    by their column-wise z-scores; non-numeric columns are kept as they are.
    Missing values stay missing and are left out of the column's mean and
    standard deviation.
  """
  from pandas.api.types import is_numeric_dtype

  # Select first, then copy, so only the requested columns are duplicated.
  df_z = df[features].copy()

  for col in df_z.columns:
    if is_numeric_dtype(df_z[col]):
      # nan_policy='omit' keeps one missing value from turning the entire
      # column into NaN, which would silently remove the feature from the
      # per-row average.
      df_z[col] = list(zscore(df_z[col], nan_policy='omit'))
  return df_z

def assign_zscores(df, features: list):
  """Map each row label of df to its average z-score across features.

  Args:
    df: DataFrame whose index identifies the players.
    features: Names of the columns to standardize and average.

  Returns:
    A dict of index label to mean z-score. Missing values are skipped when
    averaging. Rows that share an index label collapse into a single entry.
  """
  feature_zscores = apply_zscore(df, features)
  # numeric_only=True: apply_zscore leaves object-dtype columns
  # un-standardized, and averaging those raises TypeError on pandas 2.x.
  return feature_zscores.mean(axis=1, numeric_only=True).to_dict()

def make_rankings(zscores: dict, features):
  """Print players ranked from highest to lowest average z-score.

  The printed score is the average z-score times 100, truncated toward zero.
  Players whose score is NaN cannot be ordered or truncated, so they are
  listed after the ranked players with 'N/A' in place of a score.

  Args:
    zscores: Mapping of player name to average z-score.
    features: Feature names the scores were built from; only echoed in the
      header line.
  """
  from math import isnan

  ranked = sorted(
      ((player, score) for player, score in zscores.items() if not isnan(score)),
      key=lambda entry: entry[1],
      reverse=True,
  )
  unscored = [player for player, score in zscores.items() if isnan(score)]

  print(f'Players are rated on where they stand compared to the averages of the following: {features}')
  for position, (player, score) in enumerate(ranked, start=1):
    print(f'{position}: {player} - {trunc(score * 100)}')
  for position, player in enumerate(unscored, start=len(ranked) + 1):
    print(f'{position}: {player} - N/A')

# Currently the final scores are a player's average zscore across different features times 100

##### Application

In [25]:
# Batting stats for the 2023 season, indexed by player name
offense_data = get_offense(2023)

In [48]:
# Columns that are averaged (as z-scores) into each player's score
features = ['Clutch', 'BABIP', 'OPS'] # This is an arbitrary selection
# Next step: create a GUI to select features (there are so many!). For now, all available features are listed in the Data Queries section.

In [51]:
# Dict of player name -> average z-score across the selected features
player_scores = assign_zscores(offense_data, features)
# Uncomment the next line to inspect the raw scores:
# player_scores

In [50]:
# Print the ranking; each score shown is the average z-score times 100, truncated
make_rankings(player_scores, features)

Players are rated on where they stand compared to the averages of the following: ['Clutch', 'BABIP', 'OPS']
1: Bryce Harper - 192
2: Ronald Acuna Jr. - 183
3: Yandy Diaz - 164
4: Corey Seager - 159
5: Corbin Carroll - 121
6: Freddie Freeman - 112
7: Shohei Ohtani - 96
8: T.J. Friedl - 88
9: James Outman - 82
10: Mookie Betts - 82
11: Luis Arraez - 81
12: Ezequiel Tovar - 80
13: Christian Yelich - 77
14: Bo Bichette - 74
15: Matt Olson - 72
16: Anthony Santander - 71
17: J.P. Crawford - 67
18: Kyle Tucker - 64
19: Elias Diaz - 63
20: Luis Robert - 59
21: Michael Harris II - 59
22: Seiya Suzuki - 57
23: Joey Meneses - 56
24: Jorge Soler - 55
25: Justin Turner - 52
26: Lane Thomas - 51
27: Austin Riley - 51
28: William Contreras - 49
29: Ozzie Albies - 49
30: Nick Castellanos - 46
31: Jeimer Candelario - 44
32: Jose Ramirez - 44
33: Cody Bellinger - 43
34: Bryson Stott - 42
35: Adley Rutschman - 42
36: Ian Happ - 39
37: Mark Canha - 36
38: Eddie Rosario - 36
39: Nathaniel Lowe - 32
40: Br