In [2]:
# Imports

import joblib
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity

model = joblib.load('model.joblib')

# Preparation

df_players = pd.read_csv("mlb_top_players_id.csv")
df_players_seasons = pd.read_csv("mlb_top_players_season_team.csv")
top_players_plays = pd.read_csv('mlb_top_players_plays.csv')
all_player_stats = pd.read_csv('all_player_stats.csv')

# Prepare df_players with the top 10 per group and all their seasons

df_players_top10 = (
    df_players_seasons
    .loc[df_players_seasons['season'].isin(range(1870, 2025, 10)) | (df_players_seasons['season'] == 2024)]
    .filter(['season','player_id','team_id'])
    .merge(df_players, on='player_id', how='left')
    .sort_values(['top_source','top_idx'])
)

grouped = df_players_top10.groupby('top_source')

filtered_groups = []

for name, group in grouped:
    unique_top_idx = group['top_idx'].unique()[:10]
    filtered_group = group[group['top_idx'].isin(unique_top_idx)]
    filtered_groups.append(filtered_group)

df_players_top10 = pd.concat(filtered_groups)

# IDs from the top 10

batters_id = (
    df_players_top10
    .pipe(lambda df:df[df['top_source'].isin(['bhist','b2024'])])
    ['player_id']
    .unique()
    .tolist()
)

pitchers_id = (
    df_players_top10
    .pipe(lambda df:df[df['top_source'].isin(['phist','p2024'])])
    ['player_id']
    .unique()
    .tolist()
)

# Get all possible predictions

batter_stats = all_player_stats.copy().add_prefix("b_")
pitcher_stats = all_player_stats.copy().add_prefix("p_")
batter_stats = batter_stats.rename(columns={'b_player_id': 'batter_id', 'b_season': 'season'})
pitcher_stats = pitcher_stats.rename(columns={'p_player_id': 'pitcher_id', 'p_season': 'season'})

df_players_predict = (
    top_players_plays
    .filter(['batter_id','pitcher_id','season', 'event_type', 'play_events', 'vs_RHB','vs_LHB','vs_SHB','vs_RHP','vs_LHP'])
    .pipe(lambda df:df[(df['pitcher_id'].isin(pitchers_id)) | (df['batter_id'].isin(batters_id))])
    .drop_duplicates()
    .merge(batter_stats, on=['season', 'batter_id'], how='left')
    .merge(pitcher_stats, on=['season', 'pitcher_id'], how='left')
)

full_columns = [
    'season', 'batter_id', 'pitcher_id', 'event_type', 'play_events', 'vs_RHB', 'vs_LHB',
    'vs_SHB', 'vs_RHP', 'vs_LHP', 'b_airOuts', 'b_atBats', 'b_baseOnBalls',
    'b_catchersInterference', 'b_caughtStealing', 'b_doubles', 'b_gamesPlayed',
    'b_groundIntoDoublePlay', 'b_groundOuts', 'b_hitByPitch', 'b_hits', 'b_homeRuns',
    'b_intentionalWalks', 'b_leftOnBase', 'b_numberOfPitches', 'b_plateAppearances',
    'b_rbi', 'b_runs', 'b_sacBunts', 'b_sacFlies', 'b_stolenBases', 'b_strikeOuts',
    'b_totalBases', 'b_triples', 'b_singles', 'b_atBatsPerHomeRun', 'b_avg',
    'b_babip', 'b_groundOutsToAirouts', 'b_obp', 'b_slg', 'b_stolenBasePercentage',
    'b_strikeoutWalkRatio', 'b_ops', 'p_airOuts', 'p_atBats', 'p_balks', 'p_baseOnBalls',
    'p_battersFaced', 'p_blownSaves', 'p_catchersInterference', 'p_caughtStealing',
    'p_completeGames', 'p_doubles', 'p_earnedRuns', 'p_gamesFinished', 'p_gamesPitched',
    'p_gamesPlayed', 'p_gamesStarted', 'p_groundIntoDoublePlay', 'p_groundOuts', 'p_hitBatsmen',
    'p_hitByPitch', 'p_hits', 'p_holds', 'p_homeRuns', 'p_inheritedRunners',
    'p_inheritedRunnersScored', 'p_inningsPitched', 'p_intentionalWalks', 'p_losses',
    'p_numberOfPitches', 'p_outs', 'p_pickoffs', 'p_runs', 'p_sacBunts', 'p_sacFlies',
    'p_saveOpportunities', 'p_saves', 'p_shutouts', 'p_stolenBases', 'p_strikeOuts',
    'p_strikes', 'p_totalBases', 'p_triples', 'p_wildPitches', 'p_wins', 'p_singles',
    'p_atBatsPerHomeRun', 'p_avg', 'p_babip', 'p_era', 'p_groundOutsToAirouts', 'p_hitsPer9Inn',
    'p_homeRunsPer9', 'p_obp', 'p_pitchesPerInning', 'p_runsScoredPer9', 'p_slg',
    'p_stolenBasePercentage', 'p_strikePercentage', 'p_strikeoutWalkRatio', 'p_strikeoutsPer9Inn',
    'p_walksPer9Inn', 'p_whip', 'p_winPercentage', 'p_ops'
    ]

df_players_predict = (
    df_players_predict
    .filter(full_columns)
)

X_input = df_players_predict.iloc[:, 5:]
predictions = model.predict(X_input)
df_players_predict['prediction'] = predictions

# Get all possible combinations from players

batters = (
    df_players_top10
    .loc[df_players_top10['top_source'].isin(['bhist', 'b2024'])]
)

pitchers = (
    df_players_top10
    .loc[df_players_top10['top_source'].isin(['phist', 'p2024'])]
)

batters['key'] = 1
pitchers['key'] = 1

df_combination = (
    batters
    .merge(pitchers, on='key').drop('key', axis=1)
    .rename(columns={
        'player_id_x': 'batter_id',
        'player_id_y': 'pitcher_id',
        'fullName_x': 'batter_fullName',
        'fullName_y': 'pitcher_fullName',
        'season_x': 'batter_season',
        'season_y': 'pitcher_season',
        'team_id_x': 'batter_team_id',
        'team_id_y': 'pitcher_team_id'
    })
    .drop(['top_idx_x', 'top_idx_y', 'top_source_x', 'top_source_y'], axis=1)
)

# Put the most similar result for each combination
# Through cosine similarity

batter_cols = [
    'season', 'batter_id', 'vs_RHP', 'vs_LHP', 'b_airOuts', 'b_atBats', 'b_baseOnBalls',
    'b_catchersInterference', 'b_caughtStealing', 'b_doubles', 'b_gamesPlayed', 'b_groundIntoDoublePlay',
    'b_groundOuts', 'b_hitByPitch', 'b_hits', 'b_homeRuns', 'b_intentionalWalks', 'b_leftOnBase',
    'b_numberOfPitches', 'b_plateAppearances', 'b_rbi', 'b_runs', 'b_sacBunts', 'b_sacFlies', 'b_stolenBases',
    'b_strikeOuts', 'b_totalBases', 'b_triples', 'b_singles', 'b_atBatsPerHomeRun', 'b_avg', 'b_babip',
    'b_groundOutsToAirouts', 'b_obp', 'b_slg', 'b_stolenBasePercentage', 'b_strikeoutWalkRatio', 'b_ops'
]

pitcher_cols = [
    'season','pitcher_id', 'vs_RHB', 'vs_LHB', 'vs_SHB', 'p_airOuts', 'p_atBats', 'p_balks', 'p_baseOnBalls',
    'p_battersFaced', 'p_blownSaves', 'p_catchersInterference', 'p_caughtStealing', 'p_completeGames',
    'p_doubles', 'p_earnedRuns', 'p_gamesFinished', 'p_gamesPitched', 'p_gamesPlayed', 'p_gamesStarted',
    'p_groundIntoDoublePlay', 'p_groundOuts', 'p_hitBatsmen', 'p_hitByPitch', 'p_hits', 'p_holds', 'p_homeRuns',
    'p_inheritedRunners', 'p_inheritedRunnersScored', 'p_inningsPitched', 'p_intentionalWalks', 'p_losses',
    'p_numberOfPitches', 'p_outs', 'p_pickoffs', 'p_runs', 'p_sacBunts', 'p_sacFlies', 'p_saveOpportunities',
    'p_saves', 'p_shutouts', 'p_stolenBases', 'p_strikeOuts', 'p_strikes', 'p_totalBases', 'p_triples',
    'p_wildPitches', 'p_wins', 'p_singles', 'p_atBatsPerHomeRun', 'p_avg', 'p_babip', 'p_era',
    'p_groundOutsToAirouts', 'p_hitsPer9Inn', 'p_homeRunsPer9', 'p_obp', 'p_pitchesPerInning',
    'p_runsScoredPer9', 'p_slg', 'p_stolenBasePercentage', 'p_strikePercentage', 'p_strikeoutWalkRatio',
    'p_strikeoutsPer9Inn', 'p_walksPer9Inn', 'p_whip', 'p_winPercentage', 'p_ops'
]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batters['key'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pitchers['key'] = 1


In [26]:
df_players_predict.iloc[0,4]

"[{'details': {'call': {'code': 'E', 'description': 'In play, run(s)'}, 'description': 'In play, run(s)', 'code': 'E', 'ballColor': 'rgba(26, 86, 190, 1.0)', 'isInPlay': True, 'isStrike': False, 'isBall': False, 'isOut': True, 'hasReview': False}, 'count': {'balls': 0, 'strikes': 0, 'outs': 0}, 'pitchData': {'strikeZoneTop': 3.49, 'strikeZoneBottom': 1.601, 'coordinates': {}, 'breaks': {}}, 'hitData': {'trajectory': '', 'hardness': 'medium', 'location': '4', 'coordinates': {}}, 'index': 0, 'playId': '01303436-0076-0013-000c-f08cd117d70a', 'pitchNumber': 1, 'isPitch': True, 'type': 'pitch'}]"

In [27]:
import random

event_type_list = []
play_events_list = []

for _, row in df_combination.iterrows():
    batter_season = row['batter_season']
    batter_id = row['batter_id']
    pitcher_season = row['pitcher_season']
    pitcher_id = row['pitcher_id']

    df_predict_batter = df_players_predict[(df_players_predict['batter_id']==batter_id) &
                                           (df_players_predict['season']==batter_season)]
    df_predict_pitcher = df_players_predict[(df_players_predict['pitcher_id']==pitcher_id) &
                                            (df_players_predict['season']==pitcher_season)]
    
    df_temp = pd.concat([
        df_predict_batter.filter(['event_type','play_events']),
        df_predict_pitcher.filter(['event_type','play_events'])
    ])

    length = df_temp.shape[0]

    # A true logic should be using the best match vectors with similarity.
    # For showing purposes
    try:
        idx = random.randint(0, length-1)
        event_type_list.append(df_temp.iloc[idx,0])
        play_events_list.append(df_temp.iloc[idx,1])
    except:
        event_type_list.append('field_out')
        play_events_list.append("[{'details': {'call': {'code': 'E', 'description': 'In play, run(s)'}, 'description': 'In play, run(s)', 'code': 'E', 'ballColor': 'rgba(26, 86, 190, 1.0)', 'isInPlay': True, 'isStrike': False, 'isBall': False, 'isOut': True, 'hasReview': False}, 'count': {'balls': 0, 'strikes': 0, 'outs': 0}, 'pitchData': {'strikeZoneTop': 3.49, 'strikeZoneBottom': 1.601, 'coordinates': {}, 'breaks': {}}, 'hitData': {'trajectory': '', 'hardness': 'medium', 'location': '4', 'coordinates': {}}, 'index': 0, 'playId': '01303436-0076-0013-000c-f08cd117d70a', 'pitchNumber': 1, 'isPitch': True, 'type': 'pitch'}]")

    

df_combination['event_type'] = event_type_list
df_combination['play_events'] = play_events_list

In [1]:
df_combination.to_csv('batch_prediction.csv', index=False)

NameError: name 'df_combination' is not defined