# Calculate Z-scores for players

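This takes the list of players that we're trying to calculate scores for and creates z-scores for every one of them. The league mean and standard deviation behind each z-score are computed only from players above an arbitrarily chosen minimum of innings pitched (for pitchers) or plate appearances (for hitters).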

Notes for this year's sheet:

- Z-score avg/stdev calculated using only players above the minimum IP (70) for pitching or PA (350) for batting.
- Reliever value was artificially decremented as usual, since relievers are normally over-inflated value-wise. Each pitcher's total score is multiplied by (projected innings) / (maximum projected innings among all pitchers), so the multiplier never exceeds 1.0.

In [1]:
import pandas as pd
import numpy as np
import os
import sqlalchemy
import psycopg2

# arbitrarily high max view columns, so wide dataframes are not truncated
# when they are displayed
pd.options.display.max_columns = 150


## SqlAlchemy Connection Information

These are used to get and return a connection to the postgres DB so that we can query for player stats and write them back to the table.

In [2]:
# postgres connection settings; the credentials come from the environment
# (each is None when its variable is not set), the rest is fixed
POSTGRES_USER = os.getenv("POSTGRES_USER")
POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD")
POSTGRES_IP, POSTGRES_PORT = "192.168.0.118", 5432
POSTGRES_DB = "postgres"

def get_sqlalchemy_engine():
    """
    Create and return a SQLAlchemy engine for the postgres database.

    The connection URL is assembled from the module-level POSTGRES_USER,
    POSTGRES_PASSWORD, POSTGRES_IP, POSTGRES_PORT and POSTGRES_DB settings.

    Returns:
        The engine object produced by ``sqlalchemy.create_engine``.
    """
    # "postgresql" is the dialect name SQLAlchemy documents; the legacy
    # "postgres" alias is rejected by SQLAlchemy 1.4 and later, while
    # "postgresql" is accepted by older releases as well.
    return sqlalchemy.create_engine(
        "postgresql://{user}:{password}@{host}:{port}/{db}".format(
            user=POSTGRES_USER,
            password=POSTGRES_PASSWORD,
            host=POSTGRES_IP,
            port=POSTGRES_PORT,
            db=POSTGRES_DB,
        )
    )

In [3]:
# build the engine and open the connection used by the queries and writes below

engine = get_sqlalchemy_engine()
conn = engine.connect()

## Calculate Z-Scores

For each of the players, we calculate a z-score based on how they compare to the rest of the league. There are several columns that we want to do this for.

- For each z-score, the league mean and standard deviation are computed only from players above an arbitrary minimum, so that part-time players do not skew the league average.
- After each stat is scored, the weighted z-scores are summed and the sum is z-scored again to make a single number. This is done so that we can compare pitchers and batters in the same chart.

In [4]:
# load the pitcher and batter depth-chart projection tables from postgres
dfp = pd.read_sql(
    sql="select * from fantasy.pitchers_projections_depth_charts",
    con=conn,
)
dfb = pd.read_sql(
    sql="select * from fantasy.batters_projections_depth_charts",
    con=conn,
)

In [6]:
# derived batter columns that are scored later:
#   k_pct = so / ab
#   rc    = ab * obp * slg
dfb['k_pct'] = dfb['so'].div(dfb['ab'])
dfb['rc'] = dfb['ab'].mul(dfb['obp']).mul(dfb['slg'])

In [47]:
# qualification thresholds and the boolean masks of players strictly above them
MINIMUM_PLATE_APPEARANCES = 350
MINIMUM_INNINGS_PITCHED = 70
min_pa = dfb['pa'].gt(MINIMUM_PLATE_APPEARANCES)
min_ip = dfp['ip'].gt(MINIMUM_INNINGS_PITCHED)

# per-stat scoring configuration, keyed by dataframe column name
#   dir:    +1 when a higher value is better, -1 when a lower value is better
#   weight: multiplier applied to that stat's z-score
dfb_score_cols = {
    "pa": dict(dir=1, weight=1.3),
    "k_pct": dict(dir=-1, weight=1.0),
    "hr": dict(dir=1, weight=0.9),
    "rc": dict(dir=1, weight=1.4),
    "woba": dict(dir=1, weight=1.3),
    "slg": dict(dir=1, weight=1.0),
}
dfp_score_cols = {
    "ip": dict(dir=1, weight=1.3),
    "era": dict(dir=-1, weight=1.0),
    "hr": dict(dir=-1, weight=0.9),
    "so": dict(dir=1, weight=1.0),
    "whip": dict(dir=-1, weight=1.5),
    "k-9": dict(dir=1, weight=1.3),
}

# multipliers applied to the final combined z-score of each group
dfb_weight = 1.0
dfp_weight = 0.8

# names of the "<stat>_score" columns, in the same order as the configs above
dfb_score_col_names = [f"{stat}_score" for stat in dfb_score_cols]
dfp_score_col_names = [f"{stat}_score" for stat in dfp_score_cols]

In [48]:
# The league mean and (population) standard deviation come from qualified
# players only, but every player's own projection is scored against them.
# Batters are processed first, then pitchers.
for frame, qualified, spec in (
    (dfb, min_pa, dfb_score_cols),
    (dfp, min_ip, dfp_score_cols),
):
    for stat, cfg in spec.items():
        baseline = frame.loc[qualified, stat]
        z_score = (frame[stat] - baseline.mean()) / baseline.std(ddof=0)
        # flip the sign for "lower is better" stats, then apply the weight
        frame[f"{stat}_score"] = z_score * cfg["dir"] * cfg["weight"]

In [50]:
# batters: add up the weighted stat scores, standardise the sum against the
# qualified batters, apply the batter weight and rank (1 = best)
dfb['total_score'] = dfb[dfb_score_col_names].sum(axis=1)
batter_baseline = dfb.loc[min_pa, 'total_score']
dfb['total_z_score'] = (
    (dfb['total_score'] - batter_baseline.mean()) / batter_baseline.std(ddof=0)
)
dfb['total_z_score'] = dfb['total_z_score'] * dfb_weight
dfb['total_z_score_rank'] = dfb['total_z_score'].rank(ascending=False)

# pitchers: same steps, except the summed score is first scaled by each
# pitcher's innings relative to the largest innings value in the frame
dfp['total_score'] = dfp[dfp_score_col_names].sum(axis=1)
dfp['reliever_decrement'] = dfp['ip'].div(dfp['ip'].max())
dfp['total_score'] = dfp['total_score'].mul(dfp['reliever_decrement'])
pitcher_baseline = dfp.loc[min_ip, 'total_score']
dfp['total_z_score'] = (
    (dfp['total_score'] - pitcher_baseline.mean()) / pitcher_baseline.std(ddof=0)
)
dfp['total_z_score'] = dfp['total_z_score'] * dfp_weight
dfp['total_z_score_rank'] = dfp['total_z_score'].rank(ascending=False)

# order both frames in place, best rank first
for frame in (dfb, dfp):
    frame.sort_values(by='total_z_score_rank', inplace=True)


In [46]:
# preview the first 100 rows of the pitchers frame
dfp.head(100)

Unnamed: 0,index,name,w,l,sv,hld,era,gs,g,ip,h,er,hr,so,bb,whip,k-9,bb-9,fip,war,ip_score,era_score,hr_score,so_score,whip_score,k-9_score,total_score,reliever_decrement,total_z_score,total_z_score_rank
14,14,Max Scherzer,16,8,0,0,3.06,32,32,210.0,162,71,25,271,54,1.03,11.61,2.32,3.04,5.7,3.797510,2.149044,-1.156693,3.237561,5.524147,4.895322,18.359465,0.995261,3.120061,1.0
5,5,Jacob deGrom,14,9,0,0,2.84,32,32,211.0,174,67,20,249,50,1.06,10.59,2.14,2.82,6.0,3.851170,2.544001,-0.230757,2.743113,4.969895,3.457306,17.334728,1.000000,2.939544,2.0
1,1,Chris Sale,15,6,0,0,2.70,29,29,183.0,141,55,18,238,38,0.97,11.67,1.85,2.63,6.0,2.348686,2.795336,0.139618,2.495889,6.632653,4.979911,16.818735,0.867299,2.848647,3.0
27,27,Justin Verlander,16,8,0,0,3.20,32,32,204.0,164,72,26,243,49,1.04,10.73,2.18,3.27,5.2,3.475549,1.897708,-1.341881,2.608264,5.339397,3.654681,15.115064,0.966825,2.548529,4.0
22,22,Corey Kluber,16,9,0,0,3.27,32,32,211.0,187,77,23,220,43,1.09,9.37,1.82,3.22,5.3,3.851170,1.772040,-0.786319,2.091341,4.415642,1.737326,13.081201,1.000000,2.190245,5.0
25,25,Gerrit Cole,14,8,0,0,3.35,32,32,198.0,167,73,22,231,57,1.14,10.51,2.61,3.25,4.8,3.153588,1.628420,-0.601131,2.338565,3.491888,3.344521,12.532978,0.938389,2.093670,6.0
34,34,Blake Snell,15,10,0,0,3.15,32,32,190.0,153,66,18,224,75,1.20,10.63,3.55,3.30,4.0,2.724307,1.987471,0.139618,2.181241,2.383383,3.513699,11.642874,0.900474,1.936870,7.0
40,40,Carlos Carrasco,15,9,0,0,3.41,32,32,194.0,175,73,23,212,45,1.13,9.83,2.08,3.33,4.5,2.938947,1.520704,-0.786319,1.911542,3.676639,2.385843,10.708945,0.919431,1.772349,8.0
55,55,Trevor Bauer,14,9,0,0,3.47,32,32,191.0,166,74,20,215,67,1.22,10.09,3.15,3.42,4.3,2.777967,1.412989,-0.230757,1.978967,2.013881,2.752396,9.690709,0.905213,1.592977,9.0
46,46,Aaron Nola,13,9,0,0,3.39,31,31,189.0,170,71,20,200,51,1.17,9.49,2.44,3.36,4.2,2.670647,1.556609,-0.230757,1.641844,2.937635,1.906504,9.389522,0.895735,1.539920,10.0


## Write to the Database

Prior to writing the Excel files, write the scores back to the database for safekeeping.

In [41]:
# Store both score frames in the fantasy schema (replacing any existing table
# of the same name) and grant read access on each table to public.
# The grants are wrapped in sqlalchemy.text(): passing a raw string to
# Connection.execute is not accepted by SQLAlchemy 2.0, while text() works on
# older and newer releases and matches how the draft-sheet query is issued.
dfb.to_sql("batters_scores", conn, schema="fantasy", if_exists="replace")
conn.execute(sqlalchemy.text("grant select on fantasy.batters_scores to public"))
dfp.to_sql("pitchers_scores", conn, schema="fantasy", if_exists="replace")
conn.execute(sqlalchemy.text("grant select on fantasy.pitchers_scores to public"))

<sqlalchemy.engine.result.ResultProxy at 0x11663f438>

## Create Draft Sheet

This sheet includes a number of important pieces of information for drafting specifically, so that players can look up by eligibility as well as important stats.

- Name
- Eligibility
- Positions
- Combined scores

In [42]:
# Draft sheet query: every row of fantasy.players, left-joined to the combined
# batter ('b') and pitcher ('p') score tables. A score row matches when the
# names are equal and its category agrees with the player's position
# ('p' when the position contains "P", otherwise 'b'). Players without a
# matching score row are kept with NULL catg/score because of the left join.
draft_sheet_query = """
select
	p.fullname,
	p.averagedraftposition as espn_avg_draft_pos,
	p.eligibility,
	p.position,
	p.injurystatus,
	sc.catg,
	sc.score
from fantasy.players p
	left join (
		select name, 'b' as catg, total_z_score as score
		from fantasy.batters_scores
		union all
		select name, 'p' as catg, total_z_score as score
		from fantasy.pitchers_scores
	) sc
		on p.fullname = sc.name
		and case when position like '%P%' then 'p' else 'b' end = sc.catg
"""
# run the query on the open connection and load the result into a dataframe
dfd = pd.read_sql(sqlalchemy.text(draft_sheet_query), conn)

In [43]:
# Order the draft sheet best score first, then assign a unique rank:
# method='first' breaks ties by row order (hence the sort beforehand) and
# na_option='bottom' ranks players without a score last.
# The former mid-cell `dfd.head(100)` was dropped: its result was discarded,
# so it neither displayed nor changed anything.
dfd.sort_values(by='score', inplace=True, ascending=False)
dfd['rank'] = dfd['score'].rank(ascending=False, method='first', na_option='bottom')

In [44]:
# preview the first rows of the draft sheet
dfd.head()

Unnamed: 0,fullname,espn_avg_draft_pos,eligibility,position,injurystatus,catg,score,rank
540,Max Scherzer,5.21,P|SP,SP,ACTIVE,p,3.120061,1.0
0,Mike Trout,1.5,OF|UTIL,OF,ACTIVE,b,3.039583,2.0
541,Jacob deGrom,11.97,P|SP,SP,ACTIVE,p,2.939544,3.0
542,Chris Sale,9.21,P|SP,SP,ACTIVE,p,2.848647,4.0
543,Justin Verlander,18.85,P|SP,SP,ACTIVE,p,2.548529,5.0


## Write Excel files for distribution

In [17]:
# write one workbook with a sheet per dataframe, in this order
sheets = (
    ("batters", dfb),
    ("pitchers", dfp),
    ("draft_sheet", dfd),
)
with pd.ExcelWriter("fantasy2019.xlsx") as workbook:
    for sheet_name, frame in sheets:
        frame.to_excel(workbook, sheet_name=sheet_name)