In [32]:
# Dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tqdm
import os

In [33]:
def preprocess_eva_file(file_path):
    with open(file_path, 'r') as file:
        raw_data = file.readlines()

    processed_data = []
    current_game_id = None
    home_team = None
    away_team = None

    for line in raw_data:
        if line.startswith('id'):
            current_game_id = line.strip().split(',')[1]
        elif line.startswith('info') and 'visteam' in line:
            away_team = line.strip().split(',')[2]
        elif line.startswith('info') and 'hometeam' in line:
            home_team = line.strip().split(',')[2]
        elif line.startswith('play'):
            play_data = line.strip().split(',')
            play_data.insert(1, current_game_id)
            play_data.insert(2, home_team)
            play_data.insert(3, away_team)
            processed_data.append(play_data)

    return processed_data


In [36]:
def read_eva_to_dataframe(file_path):
    processed_data = preprocess_eva_file(file_path)

    df = pd.DataFrame(processed_data, columns=['event', 'game_id', 'home_team', 'away_team', 'inning', 'home_away', 'player_id', 'count', 'pitches', 'event_description'])

    return df

In [38]:
file_path = 'data/2021eve/2021ANA.EVA'
df = read_eva_to_dataframe(file_path)
df.head()

Unnamed: 0,event,game_id,home_team,away_team,inning,home_away,player_id,count,pitches,event_description
0,play,ANA202104010,ANA,CHA,1,0,andet001,22,FCBFBS,K
1,play,ANA202104010,ANA,CHA,1,0,eatoa002,1,CX,43/G4
2,play,ANA202104010,ANA,CHA,1,0,abrej003,0,X,S9/G34+
3,play,ANA202104010,ANA,CHA,1,0,moncy001,32,BCCFFBB>S,K
4,play,ANA202104010,ANA,CHA,1,1,fletd002,22,CCBBS,K


In [6]:
def classify_outcome(event):
    if event.startswith('S'):
        return 'single'
    elif event.startswith('D'):
        return 'double'
    elif event.startswith('T'):
        return 'triple'
    elif event.startswith('H'):
        return 'home_run'
    elif event.startswith('W'):
        return 'walk'
    else:
        return 'out'

# Apply the classification function to the 'event' column to create a new 'outcome' column
df['outcome'] = df['event'].apply(classify_outcome)


In [8]:
def read_ros_to_dataframe(file_path):
    # Define the column headers for the roster DataFrame
    headers = ['player_id', 'last_name', 'first_name',
               'batting_hand', 'throwing_hand', 'team', 'position']

    # Read the ROS file into a pandas DataFrame
    df = pd.read_csv(file_path, header=None, names=headers)

    # Combine the first and last names into a single column
    df['name'] = df['first_name'] + ' ' + df['last_name']

    # Drop the unnecessary columns
    df = df.drop(columns=['batting_hand', 'throwing_hand', 'first_name', 'last_name'])

    return df

In [40]:
# Make sure you have the read_eva_to_dataframe and read_ros_to_dataframe functions defined as shown in previous responses

data = []
teams = ['ANA', 'ARI', 'ATL', 'BAL', 'BOS', 'CHN', 'CHA', 'CIN', 'CLE', 'COL',
         'DET', 'HOU', 'KCA', 'LAN', 'MIA', 'MIL', 'MIN', 'NYN', 'NYA', 'OAK', 'PHI',
         'PIT', 'SDN', 'SFN', 'SEA', 'SLN', 'TBA', 'TEX', 'TOR', 'WAS']

for team in tqdm.tqdm(teams):
    try:
        # Read the EVA file into a DataFrame
        file_path = f'data/2021eve/2021{team}.EVA'
        team_data = read_eva_to_dataframe(file_path)
    except FileNotFoundError:
        # Read the EVN file into a DataFrame
        file_path = f'data/2021eve/2021{team}.EVN'
        team_data = read_eva_to_dataframe(file_path)

    # Apply the classification function to the 'event' column to create a new 'outcome' column
    team_data['outcome'] = team_data['event'].apply(classify_outcome)

    # Define a variable for the home team abbreviation
    home_team = team_data['home_team'].unique()[0]

    # Define a variable for the away team abbreviation
    away_team = team_data['away_team'].unique()[0]

    # Read the home team ROS file into a DataFrame
    file_path = f'data/2021eve/{home_team}2021.ROS'
    home_team_roster = read_ros_to_dataframe(file_path)

    # Read the away team ROS file into a DataFrame
    file_path = f'data/2021eve/{away_team}2021.ROS'
    away_team_roster = read_ros_to_dataframe(file_path)

    # Concatenate the home and away team rosters into a single DataFrame
    team_roster = pd.concat([home_team_roster, away_team_roster], ignore_index=True)

    # Merge the team roster DataFrame with the team data DataFrame
    team_data = pd.merge(team_data, team_roster, on='player_id')

    # Concatenate each game's DataFrame to the data list
    data.append(team_data)

# Combine all the DataFrames in the data list into a single DataFrame
df = pd.concat(data, ignore_index=True)


100%|██████████| 30/30 [00:01<00:00, 21.13it/s]


In [41]:
df

Unnamed: 0,event,game_id,home_team,away_team,inning,home_away,player_id,count,pitches,event_description,outcome,team,position,name
0,play,ANA202104010,ANA,CHA,1,0,andet001,22,FCBFBS,K,out,CHA,SS,Tim Anderson
1,play,ANA202104010,ANA,CHA,3,0,andet001,01,FX,53/G5,out,CHA,SS,Tim Anderson
2,play,ANA202104010,ANA,CHA,5,0,andet001,11,SBX,S8/G6M+,out,CHA,SS,Tim Anderson
3,play,ANA202104010,ANA,CHA,7,0,andet001,01,S>S,CS2(24),out,CHA,SS,Tim Anderson
4,play,ANA202104010,ANA,CHA,7,0,andet001,02,S>S.S,K,out,CHA,SS,Tim Anderson
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118048,play,WAS202109150,WAS,MIA,5,1,rogej003,00,,NP,out,WAS,P,Josh Rogers
118049,play,WAS202110010,WAS,BOS,3,1,rogej003,12,CSFBX,13/G4MS,out,WAS,P,Josh Rogers
118050,play,WAS202110010,WAS,BOS,5,1,rogej003,01,CX,13/BG15-,out,WAS,P,Josh Rogers
118051,play,WAS202110030,WAS,BOS,2,1,adonj001,12,BCSS,K,out,WAS,P,Joan Adon


In [42]:
# count the nan values in each column
df.isna().sum()

event                0
game_id              0
home_team            0
away_team            0
inning               0
home_away            0
player_id            0
count                0
pitches              0
event_description    0
outcome              0
team                 0
position             0
name                 0
dtype: int64