In [1]:
import load

# Seasons requested from the project loader for the "strikes" dataset.
SEASONS = [15, 16, 17]
df = load.data("strikes", season=SEASONS)

In [2]:
# Keyword arguments shared by both player-attribute lookups; only `vibes` differs
# between the pitcher and the batter call.
attribute_adjustments = dict(mods=True, items=True, broken_items=True)

# Pitcher ruthlessness (with vibes) and batter musclitude (without vibes).
df["ruth"] = load.player_attribute(
    df, "pitcher", "ruthlessness", vibes=True, **attribute_adjustments
)
df["musc"] = load.player_attribute(
    df, "batter", "musclitude", vibes=False, **attribute_adjustments
)
# Stadium forwardness, looked up per row by the project helper.
df["fwd"] = load.stadium_attribute(df, "forwardness")

In [3]:
# Mods to exclude from the fit. A row is dropped when the mod name appears in the
# batter's, pitcher's, batting team's or pitching team's mod string.
# Uncomment an entry to exclude it; with everything commented out no rows are dropped.
excluded_mods = [
    # "ON_FIRE",
    # "OVERPERFORMING",
    # "UNDERPERFORMING",
    # "TRAVELING",
    # "GROWTH",
    # "HIGH_PRESSURE",
    # "SINKING_SHIP",
    # "MINIMALIST",
    # "MAXIMALIST",
    # "SLOW_BUILD",
    # "SHELLED",
    # "SMOOTH",
    # "CHUNKY",
    # "AFFINITY_FOR_CROWS",
    # "PSYCHIC",
]
# Columns searched for each excluded mod, in the order they are filtered.
mod_columns = ["batter_mods", "pitcher_mods", "batting_team_mods", "pitching_team_mods"]

# Work on a copy so the loaded frame `df` stays untouched.
dfc = df.copy()
for exclude_mod in excluded_mods:
    for mod_column in mod_columns:
        carries_mod = dfc[mod_column].astype(str).str.contains(exclude_mod)
        dfc = dfc[~carries_mod]

# Flag rows where the batter's mods mention FLINCH and the strike count is zero,
# then drop them (the all-False "flinch" column remains on dfc afterwards).
batter_has_flinch = dfc["batter_mods"].astype(str).str.contains("FLINCH")
zero_strikes = dfc["strike_count"] == 0
dfc["flinch"] = batter_has_flinch & zero_strikes
dfc = dfc[~dfc["flinch"]]

# Keep only rolls strictly below 0.86.
# NOTE(review): the reason for the 0.86 cutoff is not visible in this notebook — confirm.
dfc = dfc[dfc["roll"] < 0.86]


In [37]:
# Pull the roll, the outcome label and the three attribute columns out of dfc.
X = dfc[["roll", "passed", "ruth", "musc", "fwd"]]

# "passed" is the classification target; everything else is a candidate feature.
y = X["passed"]
X = X.drop(columns="passed")

# Intercept held fixed during the fit (set to None to let the SVM fit it).
# pin_intercept = 0.18
pin_intercept = 0.3

# (coefficient, column) pairs held fixed: each pinned term is subtracted from the
# roll and its column is removed, so the SVM only fits what is left.
pins = [
    (0.285, "ruth"),
    (0.2, "fwd"),
    (0.1, "musc"),
]
for pinned_coef, pinned_col in pins:
    adjusted_roll = X["roll"] - pinned_coef * X[pinned_col]
    X = X.assign(roll=adjusted_roll).drop(columns=pinned_col)

In [39]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn import svm
import numpy as np

# Scale every feature to unit variance WITHOUT centering (with_mean=False), so the
# transform is a pure per-column division by sc.scale_.
sc = StandardScaler(with_mean=False)
X2 = sc.fit_transform(X)
if pin_intercept is not None:
    # Fold the pinned intercept into the roll (column 0) in scaled units; the SVM is
    # then fitted with fit_intercept=False so it cannot move it.
    X2[:, 0] -= pin_intercept / sc.scale_[0]

# Very large C and tiny tolerance: effectively a hard-margin linear separator.
trainedsvm = svm.LinearSVC(
    dual=False,
    max_iter=10000,
    C=1e10,
    tol=1e-10,
    fit_intercept=pin_intercept is None,
).fit(X2, y)
predictionsvm = trainedsvm.predict(X2)
print(confusion_matrix(y, predictionsvm))

# Undo the scaling to get coefficients in original units, then normalise so the
# roll coefficient is exactly 1.
coef = np.true_divide(trainedsvm.coef_, sc.scale_)
coef_scaled = coef / coef[0, 0]
coef_list = coef_scaled.tolist()[0]

# Because the scaler did not centre the data, the fitted intercept is already in
# original units. Subtracting coef . sc.mean_ (which StandardScaler still populates
# when with_mean=False) would shift the threshold incorrectly, so it is not applied.
# np.atleast_1d covers the fit_intercept=False case where intercept_ is a scalar.
intercept = np.atleast_1d(trainedsvm.intercept_)
intercept_scaled = -(intercept / coef[0, 0])[0] if pin_intercept is None else pin_intercept

# Build the per-row threshold: intercept plus every pinned and fitted term except
# the roll itself.
dfc['threshold'] = intercept_scaled
print(intercept_scaled)
for weight, column in pins + list(zip(-np.array(coef_list), X.columns)):
    if "roll" not in column:
        print((weight, column))
        dfc['threshold'] += weight * dfc[column]

[[ 64522     14]
 [     5 105693]]
0.3
(0.285, 'ruth')
(0.2, 'fwd')
(0.1, 'musc')


In [40]:
# Signed distance of each roll from its predicted threshold.
dfc["offset"] = dfc["roll"] - dfc["threshold"]

# Rows that disagree with the fitted threshold: passed with a roll at or above it,
# or not passed with a roll at or below it.
passed_at_or_above = dfc["passed"] & (dfc["offset"] >= 0)
failed_at_or_below = ~dfc["passed"] & (dfc["offset"] <= 0)
outliers = dfc[passed_at_or_above | failed_at_or_below]

outlier_columns = [
    "roll",
    "weather",
    "event_type",
    "season",
    "day",
    "game_id",
    "play_count",
    "threshold",
    "offset",
    "batter_name",
    "pitcher_name",
    "batter_mods",
    "batting_team_mods",
    "pitcher_mods",
    "pitching_team_mods",
]
outliers[outlier_columns]

Unnamed: 0,roll,weather,event_type,season,day,game_id,play_count,threshold,offset,batter_name,pitcher_name,batter_mods,batting_team_mods,pitcher_mods,pitching_team_mods
202677,0.744364,12,Ball,17,15,18e00e0f-9e9b-421b-930c-f1e017f22f79,30,0.744403,-3.9e-05,Swamuel Mora,Alexandria Rosales,,,,PSYCHIC
202690,0.704755,12,Ball,17,15,18e00e0f-9e9b-421b-930c-f1e017f22f79,33,0.744403,-0.039647,Swamuel Mora,Alexandria Rosales,,,,PSYCHIC
202798,0.629534,12,Ball,17,15,18e00e0f-9e9b-421b-930c-f1e017f22f79,63,0.654249,-0.024715,Socks Maybe,Alexandria Rosales,ALTERNATE;CAREFUL,,,PSYCHIC
203008,0.729086,12,Ball,17,15,18e00e0f-9e9b-421b-930c-f1e017f22f79,122,0.744403,-0.015317,Swamuel Mora,Alexandria Rosales,,,,PSYCHIC
203060,0.61517,12,Ball,17,15,18e00e0f-9e9b-421b-930c-f1e017f22f79,136,0.649454,-0.034283,Justice Spoon,Alexandria Rosales,FIRE_EATER;ALTERNATE,,,PSYCHIC
203472,0.647473,12,Ball,17,15,18e00e0f-9e9b-421b-930c-f1e017f22f79,243,0.654249,-0.006775,Socks Maybe,Alexandria Rosales,ALTERNATE;CAREFUL,,,PSYCHIC
203762,0.644804,12,Ball,17,15,18e00e0f-9e9b-421b-930c-f1e017f22f79,390,0.649454,-0.00465,Justice Spoon,Alexandria Rosales,FIRE_EATER;ALTERNATE,,,PSYCHIC
206346,0.731939,13,StrikeLooking,17,18,338639ad-2a8c-4edb-815c-95d960df87b5,138,0.716197,0.015743,Kichiro Guerra,Alexandria Rosales,PRO_SKATER;MAXIMALIST;ALTERNATE,HOME_FIELD;PSYCHIC;BLACKHOLE_PAYOUTS;BOTTOM_DW...,,PSYCHIC
206840,0.644471,13,StrikeLooking,17,18,338639ad-2a8c-4edb-815c-95d960df87b5,285,0.643538,0.000933,Igneus Delacruz,Alexandria Rosales,PARASITE;CHUNKY;ALTERNATE,HOME_FIELD;PSYCHIC;BLACKHOLE_PAYOUTS;BOTTOM_DW...,,PSYCHIC
208232,0.756243,12,StrikeLooking,17,20,712df228-c8f9-4f5f-936d-d178d147581a,45,0.733354,0.022888,Igneus Delacruz,Bennett Bluesky,PARASITE;CHUNKY;ALTERNATE,HOME_FIELD;BOTTOM_DWELLER;BLACKHOLE_PAYOUTS,,PSYCHIC


In [34]:
# Add the stadium nickname to the outlier rows. `outliers` is a boolean-mask slice
# of dfc, so assigning a column onto it in place raises SettingWithCopyWarning;
# assign() returns a new frame instead and is safe to re-run.
outliers = outliers.assign(
    stadium_name=outliers["stadium_object"].apply(lambda stadium: stadium.nickname)
)

outliers[["batter_name", "pitcher_name", "batting_team_name", "season", "day", "roll", "threshold", "offset", "stadium_name"]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  outliers["stadium_name"] = outliers["stadium_object"].apply(lambda x: x.nickname)


Unnamed: 0,batter_name,pitcher_name,batting_team_name,season,day,roll,threshold,offset,stadium_name
202677,Swamuel Mora,Alexandria Rosales,Firefighters,17,15,0.744364,0.744403,-3.9e-05,The Fire House
202690,Swamuel Mora,Alexandria Rosales,Firefighters,17,15,0.704755,0.744403,-0.039647,The Fire House
202798,Socks Maybe,Alexandria Rosales,Firefighters,17,15,0.629534,0.654249,-0.024715,The Fire House
203008,Swamuel Mora,Alexandria Rosales,Firefighters,17,15,0.729086,0.744403,-0.015317,The Fire House
203060,Justice Spoon,Alexandria Rosales,Firefighters,17,15,0.61517,0.649454,-0.034283,The Fire House
203472,Socks Maybe,Alexandria Rosales,Firefighters,17,15,0.647473,0.654249,-0.006775,The Fire House
203762,Justice Spoon,Alexandria Rosales,Firefighters,17,15,0.644804,0.649454,-0.00465,The Fire House
206346,Kichiro Guerra,Alexandria Rosales,Worms,17,18,0.731939,0.716197,0.015743,The Wormhole
206840,Igneus Delacruz,Alexandria Rosales,Worms,17,18,0.644471,0.643538,0.000933,The Wormhole
208232,Igneus Delacruz,Bennett Bluesky,Worms,17,20,0.756243,0.733354,0.022888,The Wormhole


Index(['event_type', 'event_time', 'roll', 'passed', 'batting_team_hype',
       'pitching_team_hype', 'game_id', 'play_count', 'ball_count',
       'strike_count', 'out_count', 'home_score', 'away_score', 'inning',
       'baserunner_count', 'baserunners', 'baserunners_next', 'is_strike',
       'strike_roll', 'strike_threshold', 'fielder_roll',
       'batter_consecutive_hits', 'weather', 'season', 'day', 'runner_count',
       'top_of_inning', 'is_maximum_blaseball', 'batter_at_bats',
       'batter_file', 'batting_team_file', 'pitcher_file',
       'pitching_team_file', 'stadium_file', 'fielder_file',
       'relevant_runner_file', 'runner_on_first_file', 'runner_on_second_file',
       'runner_on_third_file', 'runner_on_third_hh_file', 'attacked_team_file',
       'stat_relevant_data', 'pitcher_object', 'batter_object',
       'batting_team_object', 'pitching_team_object', 'stadium_object',
       'pitcher_vibes', 'pitcher_mods', 'pitcher_name', 'batter_vibes',
       'batter_mods