## Imports, Glossary

In [2]:
import pandas as pd
import numpy as np
import os
import glob
from collections import defaultdict


## Team Info CSV

Could get the rosters for each home team in both seasons (see promotions, drops/transfers, demotions)

In [3]:
team_info = pd.read_csv('team_info.csv')
'''home_team: Home teams are teams in the same farm system at different levels of minor league (4A is the highest, 1A is the lowest)'''
'''player_id: a unique 3 digit identifier for farm system players, which carry over across leagues. 203 unique ids among 4 home teams and 2 seasons '''
'''team_year: not actually 1883 and 1884, but 2 consecutive years (1883 about half a season, 1884 a full season)'''
'''away_team: non-unique ids, that are assigned for a series (3-4 games) and reanonymized for each series'''
team_info

Unnamed: 0,home_team,player_id,team_year,away_team
0,Home4A,383,1883,Vis4AB
1,Home4A,392,1883,Vis4AB
2,Home4A,431,1883,Vis4AB
3,Home4A,461,1883,Vis4AB
4,Home4A,463,1883,Vis4AB
...,...,...,...,...
2093,Home1A,838,1884,Vis1BC
2094,Home1A,877,1884,Vis1BC
2095,Home1A,892,1884,Vis1BC
2096,Home1A,953,1884,Vis1BC


## Game Events CSV

### Season 1883 

In [4]:
# game str; the year and sequential day in the season, if the day ends in .5 it means it was the second in a double header. Year_Day_AwayTeam_HomeTeam
#play id is the identifier for a play (defined as a situation where the ball is live), sequentially listed. generally consist of multiple events (e.g the pitch, what happens to the ball, and end of play)
#at bat corresponds to a batter at the plate, some plays such as pick off throws occur duirng an at bat but are not associated with that at bat 
#play per game: play id for a game 
#tiemstamp is the time in ms, starting from the beginning of a game (first play will be t <60s)
#player position is the position corresponding to glossary (check above)
#game events: id, check summary
root_game_event_path = r'game_events/Season_1883/Home1A/'
vis_paths = [f'Vis1A{i}' for i in ['B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']]

dfs = [] 
l = 0 
    
for vis_path in vis_paths:
    dir_path = os.path.join(root_game_event_path, vis_path)
    for dir in os.listdir(dir_path):
        full_path = os.path.join(dir_path, dir)
        if os.path.isdir(full_path):  # Check if it's a directory
            l += 1
            file_path = os.path.join(full_path, 'game_events.csv')
            print(f'Going to read in: {file_path}, file #: {l}')
            df = pd.read_csv(file_path)
            dfs.append(df)

game_events1883_home1a = pd.concat(dfs, ignore_index=True)
print(f'done reading in, total length of game_events1883_home1a: {len(game_events1883_home1a)}')

Going to read in: game_events/Season_1883/Home1A/Vis1AB/day_002/game_events.csv, file #: 1
Going to read in: game_events/Season_1883/Home1A/Vis1AB/day_003/game_events.csv, file #: 2
Going to read in: game_events/Season_1883/Home1A/Vis1AB/day_001/game_events.csv, file #: 3
Going to read in: game_events/Season_1883/Home1A/Vis1AC/day_005/game_events.csv, file #: 4
Going to read in: game_events/Season_1883/Home1A/Vis1AC/day_004/game_events.csv, file #: 5
Going to read in: game_events/Season_1883/Home1A/Vis1AC/day_006/game_events.csv, file #: 6
Going to read in: game_events/Season_1883/Home1A/Vis1AC/day_007/game_events.csv, file #: 7
Going to read in: game_events/Season_1883/Home1A/Vis1AD/day_010/game_events.csv, file #: 8
Going to read in: game_events/Season_1883/Home1A/Vis1AD/day_011/game_events.csv, file #: 9
Going to read in: game_events/Season_1883/Home1A/Vis1AD/day_012/game_events.csv, file #: 10
Going to read in: game_events/Season_1883/Home1A/Vis1AE/day_013/game_events.csv, file #: 

In [5]:
root_game_event_path = r'game_events/Season_1883/Home2A/'
vis_paths = [f'Vis2A{i}' for i in ['B', 'C', 'D', 'E']]

dfs = [] 
l = 0 
    
for vis_path in vis_paths:
    dir_path = os.path.join(root_game_event_path, vis_path)
    for dir in os.listdir(dir_path):
        full_path = os.path.join(dir_path, dir)
        if os.path.isdir(full_path):  # Check if it's a directory
            l += 1
            file_path = os.path.join(full_path, 'game_events.csv')
            print(f'Going to read in: {file_path}, file #: {l}')
            df = pd.read_csv(file_path)
            dfs.append(df)

game_events1883_home2a = pd.concat(dfs, ignore_index=True)
print(f'done reading in, total length of game_events1883_home2a: {len(game_events1883_home2a)}')

Going to read in: game_events/Season_1883/Home2A/Vis2AB/day_026/game_events.csv, file #: 1
Going to read in: game_events/Season_1883/Home2A/Vis2AB/day_025/game_events.csv, file #: 2
Going to read in: game_events/Season_1883/Home2A/Vis2AB/day_024/game_events.csv, file #: 3
Going to read in: game_events/Season_1883/Home2A/Vis2AC/day_035/game_events.csv, file #: 4
Going to read in: game_events/Season_1883/Home2A/Vis2AC/day_037/game_events.csv, file #: 5
Going to read in: game_events/Season_1883/Home2A/Vis2AC/day_036/game_events.csv, file #: 6
Going to read in: game_events/Season_1883/Home2A/Vis2AD/day_041.5/game_events.csv, file #: 7
Going to read in: game_events/Season_1883/Home2A/Vis2AD/day_041/game_events.csv, file #: 8
Going to read in: game_events/Season_1883/Home2A/Vis2AD/day_039/game_events.csv, file #: 9
Going to read in: game_events/Season_1883/Home2A/Vis2AE/day_047.5/game_events.csv, file #: 10
Going to read in: game_events/Season_1883/Home2A/Vis2AE/day_047/game_events.csv, file

In [6]:
root_game_event_path = r'game_events/Season_1883/Home3A/'
vis_paths = [f'Vis3A{i}' for i in ['B', 'C', 'D', 'E','F']]
dfs = [] 
l = 0 
    
for vis_path in vis_paths:
    dir_path = os.path.join(root_game_event_path, vis_path)
    for dir in os.listdir(dir_path):
        l += 1
        file_path = os.path.join(dir_path, dir, 'game_events.csv')
        print(f'Going to read in: {file_path}, file #: {l}')
        df = pd.read_csv(file_path)
        dfs.append(df)

game_events1883_home3a = pd.concat(dfs, ignore_index=True)
print(f'done reading in,total length of game_events1883_home3a: {len(game_events1883_home3a)}')

Going to read in: game_events/Season_1883/Home3A/Vis3AB/day_010/game_events.csv, file #: 1
Going to read in: game_events/Season_1883/Home3A/Vis3AB/day_011/game_events.csv, file #: 2
Going to read in: game_events/Season_1883/Home3A/Vis3AB/day_013/game_events.csv, file #: 3
Going to read in: game_events/Season_1883/Home3A/Vis3AB/day_012/game_events.csv, file #: 4
Going to read in: game_events/Season_1883/Home3A/Vis3AC/day_017/game_events.csv, file #: 5
Going to read in: game_events/Season_1883/Home3A/Vis3AC/day_018/game_events.csv, file #: 6
Going to read in: game_events/Season_1883/Home3A/Vis3AC/day_016/game_events.csv, file #: 7
Going to read in: game_events/Season_1883/Home3A/Vis3AD/day_028.5/game_events.csv, file #: 8
Going to read in: game_events/Season_1883/Home3A/Vis3AD/day_028/game_events.csv, file #: 9
Going to read in: game_events/Season_1883/Home3A/Vis3AD/day_029/game_events.csv, file #: 10
Going to read in: game_events/Season_1883/Home3A/Vis3AD/day_030/game_events.csv, file #

In [7]:
root_game_event_path = r'game_events/Season_1883/Home4A/'
vis_paths = [f'Vis4A{i}' for i in ['B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']]
dfs = [] 
l = 0 
    
for vis_path in vis_paths:
    dir_path = os.path.join(root_game_event_path, vis_path)
    for dir in os.listdir(dir_path):
        l += 1
        file_path = os.path.join(dir_path, dir, 'game_events.csv')
        print(f'Going to read in: {file_path}, file #: {l}')
        df = pd.read_csv(file_path)
        dfs.append(df)

game_events1883_home4a = pd.concat(dfs, ignore_index=True)
print(f'done reading in, total length of game_events1883_home4a: {len(game_events1883_home4a)}')

Going to read in: game_events/Season_1883/Home4A/Vis4AB/day_002/game_events.csv, file #: 1
Going to read in: game_events/Season_1883/Home4A/Vis4AB/day_003/game_events.csv, file #: 2
Going to read in: game_events/Season_1883/Home4A/Vis4AB/day_001/game_events.csv, file #: 3
Going to read in: game_events/Season_1883/Home4A/Vis4AC/day_005/game_events.csv, file #: 4
Going to read in: game_events/Season_1883/Home4A/Vis4AC/day_004/game_events.csv, file #: 5
Going to read in: game_events/Season_1883/Home4A/Vis4AC/day_006/game_events.csv, file #: 6
Going to read in: game_events/Season_1883/Home4A/Vis4AD/day_010/game_events.csv, file #: 7
Going to read in: game_events/Season_1883/Home4A/Vis4AD/day_008/game_events.csv, file #: 8
Going to read in: game_events/Season_1883/Home4A/Vis4AD/day_009/game_events.csv, file #: 9
Going to read in: game_events/Season_1883/Home4A/Vis4AE/day_011/game_events.csv, file #: 10
Going to read in: game_events/Season_1883/Home4A/Vis4AE/day_013/game_events.csv, file #: 

In [8]:
game_events_season1883 = pd.concat([game_events1883_home1a, game_events1883_home2a, game_events1883_home3a, game_events1883_home4a], ignore_index=True)
game_events_season1883

Unnamed: 0,game_str,play_id,at_bat,play_per_game,timestamp,player_position,event_code
0,1883_002_Vis1AB_Home1A,1,1.0,1,22112,1,1
1,1883_002_Vis1AB_Home1A,1,1.0,1,22562,0,5
2,1883_002_Vis1AB_Home1A,1,1.0,1,22562,2,2
3,1883_002_Vis1AB_Home1A,2,1.0,2,37312,1,1
4,1883_002_Vis1AB_Home1A,2,1.0,2,37812,0,5
...,...,...,...,...,...,...,...
99388,1883_042_Vis4AJ_Home4A,363,88.0,363,11590840,255,16
99389,1883_042_Vis4AJ_Home4A,363,88.0,363,11590940,5,2
99390,1883_042_Vis4AJ_Home4A,363,88.0,363,11592140,5,3
99391,1883_042_Vis4AJ_Home4A,363,88.0,363,11592890,4,2


### Season 1884

In [9]:
dfs = []

root_path = 'game_events/Season_1884/'


for home_dir in ['Home1A', 'Home2A', 'Home3A', 'Home4A']:
    home_path = os.path.join(root_path, home_dir)
    for vis_dir in os.listdir(home_path):
        vis_path = os.path.join(home_path, vis_dir)
        if os.path.isdir(vis_path):  # Check if vis_path is a directory
            for day_dir in os.listdir(vis_path):
                day_path = os.path.join(vis_path, day_dir)
                if os.path.isdir(day_path):  # Check if day_path is a directory
                    csv_path = os.path.join(day_path, 'game_events.csv')
                    if os.path.isfile(csv_path):
                        df = pd.read_csv(csv_path)
                        dfs.append(df)

game_events_season1884 = pd.concat(dfs, ignore_index=True)
game_events_season1884

Unnamed: 0,game_str,play_id,at_bat,play_per_game,timestamp,player_position,event_code
0,1884_137_Vis1BC_Home1A,1,1.0,1,46010,1,1
1,1884_137_Vis1BC_Home1A,1,1.0,1,46439,2,2
2,1884_137_Vis1BC_Home1A,1,1.0,1,46472,0,5
3,1884_137_Vis1BC_Home1A,2,1.0,2,57098,1,1
4,1884_137_Vis1BC_Home1A,2,1.0,2,57494,10,4
...,...,...,...,...,...,...,...
263639,1884_007_Vis4AL_Home4A,298,72.0,298,9916259,5,2
263640,1884_007_Vis4AL_Home4A,298,72.0,298,9917909,5,3
263641,1884_007_Vis4AL_Home4A,298,72.0,298,9919009,255,16
263642,1884_007_Vis4AL_Home4A,298,72.0,298,9919109,3,2


Ending CSVs: game_events_season1884, game_events_season1883

## Ball Pos CSV

### Season 1883

In [10]:
dfs = []

root_path = 'ball_pos/Season_1883/'


for home_dir in ['Home1A', 'Home2A', 'Home3A', 'Home4A']:
    home_path = os.path.join(root_path, home_dir)
    if os.path.isdir(home_path):  # Check if home_path is a directory
        for vis_dir in os.listdir(home_path):
            vis_path = os.path.join(home_path, vis_dir)
            if os.path.isdir(vis_path):  # Check if vis_path is a directory
                for day_dir in os.listdir(vis_path):
                    day_path = os.path.join(vis_path, day_dir)
                    if os.path.isdir(day_path):  # Check if day_path is a directory
                        csv_path = os.path.join(day_path, 'ball_pos.csv')
                        if os.path.isfile(csv_path):
                            df = pd.read_csv(csv_path)
                            dfs.append(df)


ball_pos_season1883 = pd.concat(dfs, ignore_index=True)
ball_pos_season1883

Unnamed: 0,game_str,play_id,timestamp,ball_position_x,ball_position_y,ball_position_z
0,1883_002_Vis1AB_Home1A,1,22112,0.891828,54.12030,5.553450
1,1883_002_Vis1AB_Home1A,1,22162,0.940503,47.50860,5.285460
2,1883_002_Vis1AB_Home1A,1,22212,0.957876,40.96890,4.959120
3,1883_002_Vis1AB_Home1A,1,22262,0.943944,34.50120,4.574400
4,1883_002_Vis1AB_Home1A,1,22312,0.898710,28.10607,4.131360
...,...,...,...,...,...,...
743356,1883_001_Vis4AB_Home4A,263,9453652,47.478000,82.21890,2.794281
743357,1883_001_Vis4AB_Home4A,263,9453702,50.974800,78.79110,2.123898
743358,1883_001_Vis4AB_Home4A,263,9453752,54.427200,75.38160,1.385064
743359,1883_001_Vis4AB_Home4A,263,9453802,57.835200,71.99040,0.577776


### Season 1884

In [11]:
dfs = []

root_path = 'ball_pos/Season_1884/'


for home_dir in ['Home1A', 'Home2A', 'Home3A', 'Home4A']:
    home_path = os.path.join(root_path, home_dir)
    if os.path.isdir(home_path):  # Check if home_path is a directory
        for vis_dir in os.listdir(home_path):
            vis_path = os.path.join(home_path, vis_dir)
            if os.path.isdir(vis_path):  # Check if vis_path is a directory
                for day_dir in os.listdir(vis_path):
                    day_path = os.path.join(vis_path, day_dir)
                    if os.path.isdir(day_path):  # Check if day_path is a directory
                        csv_path = os.path.join(day_path, 'ball_pos.csv')
                        if os.path.isfile(csv_path):
                            df = pd.read_csv(csv_path)
                            dfs.append(df)

ball_pos_season1884 = pd.concat(dfs, ignore_index=True)
ball_pos_season1884

Unnamed: 0,game_str,play_id,timestamp,ball_position_x,ball_position_y,ball_position_z
0,1884_137_Vis1BC_Home1A,1,46010,-1.059618,50.3259,5.110950
1,1884_137_Vis1BC_Home1A,1,46043,-0.831393,45.9324,4.880490
2,1884_137_Vis1BC_Home1A,1,46076,-0.612414,41.5746,4.632090
3,1884_137_Vis1BC_Home1A,1,46109,-0.402681,37.2525,4.365780
4,1884_137_Vis1BC_Home1A,1,46142,-0.202194,32.9658,4.081500
...,...,...,...,...,...,...
2080959,1884_007_Vis4AL_Home4A,298,9918909,45.647100,68.1357,1.246203
2080960,1884_007_Vis4AL_Home4A,298,9918959,49.756200,67.3356,0.339675
2080961,1884_007_Vis4AL_Home4A,298,9919009,53.819700,66.5178,-0.636369
2080962,1884_007_Vis4AL_Home4A,298,9919059,57.504900,65.6523,-0.099150


## Player Pos CSV

### Season 1883


In [12]:
dfs = []
root_path = 'player_pos/Season_1883/'

for home_dir in ['Home1A', 'Home2A', 'Home3A', 'Home4A']:
    home_path = os.path.join(root_path, home_dir)
    for vis_dir in os.listdir(home_path):
        if vis_dir == '.DS_Store':
            continue
        vis_path = os.path.join(home_path, vis_dir)
        for day_dir in os.listdir(vis_path):
            if day_dir == '.DS_Store':
                continue
            day_path = os.path.join(vis_path, day_dir)
            csv_path = os.path.join(day_path, 'player_pos.csv')
            if os.path.isfile(csv_path):
                df = pd.read_csv(csv_path)
                dfs.append(df)

player_pos_season1883 = pd.concat(dfs, ignore_index=True)
player_pos_season1883

Unnamed: 0,game_str,play_id,timestamp,player_position,field_x,field_y
0,1883_002_Vis1AB_Home1A,1,22112,1,-0.5799,55.5510
1,1883_002_Vis1AB_Home1A,1,22112,2,0.0000,-4.5000
2,1883_002_Vis1AB_Home1A,1,22112,3,46.4301,76.6185
3,1883_002_Vis1AB_Home1A,1,22112,4,23.1222,147.9099
4,1883_002_Vis1AB_Home1A,1,22112,5,-61.9377,76.1523
...,...,...,...,...,...,...
12095641,1883_001_Vis4AB_Home4A,263,9454252,9,105.7641,224.4330
12095642,1883_001_Vis4AB_Home4A,263,9454252,10,67.6869,69.0168
12095643,1883_001_Vis4AB_Home4A,263,9454252,11,3.3825,127.4442
12095644,1883_001_Vis4AB_Home4A,263,9454252,16,6.5193,99.1725


### Season 1884

In [13]:
dfs = []

root_path = 'player_pos/Season_1884/'


for home_dir in ['Home1A', 'Home2A', 'Home3A', 'Home4A']:
    home_path = os.path.join(root_path, home_dir)
    for vis_dir in os.listdir(home_path):
        vis_path = os.path.join(home_path, vis_dir)
        for day_dir in os.listdir(vis_path):
            day_path = os.path.join(vis_path, day_dir)
            csv_path = os.path.join(day_path, 'player_pos.csv')
            if os.path.isfile(csv_path):
                df = pd.read_csv(csv_path)
                dfs.append(df)

player_pos_season1884 = pd.concat(dfs, ignore_index=True)
player_pos_season1884

Unnamed: 0,game_str,play_id,timestamp,player_position,field_x,field_y
0,1884_137_Vis1BC_Home1A,1,46010,1,0.3525,57.0498
1,1884_137_Vis1BC_Home1A,1,46010,2,0.0765,-5.3778
2,1884_137_Vis1BC_Home1A,1,46010,3,77.7816,90.5169
3,1884_137_Vis1BC_Home1A,1,46010,4,50.7480,137.8917
4,1884_137_Vis1BC_Home1A,1,46010,5,-39.7566,80.4018
...,...,...,...,...,...,...
87941203,1884_007_Vis4AL_Home4A,298,9919609,9,108.3837,193.4163
87941204,1884_007_Vis4AL_Home4A,298,9919609,10,56.1000,60.0066
87941205,1884_007_Vis4AL_Home4A,298,9919609,17,-45.0933,106.8552
87941206,1884_007_Vis4AL_Home4A,298,9919609,18,65.4312,87.9636


## Game Info CSV

### Season 1883

In [14]:
dfs = []

root_path = 'game_info/Season_1883/'


for home_dir in ['Home1A', 'Home2A', 'Home3A', 'Home4A']:
    home_path = os.path.join(root_path, home_dir)
    for vis_dir in os.listdir(home_path):
        if vis_dir == '.DS_Store':
            continue
        vis_path = os.path.join(home_path, vis_dir)
        for day_dir in os.listdir(vis_path):
            if day_dir == '.DS_Store':
                continue
            day_path = os.path.join(vis_path, day_dir)
            csv_path = os.path.join(day_path, 'game_info.csv')
            if os.path.isfile(csv_path):
                df = pd.read_csv(csv_path)
                dfs.append(df)

game_info_season1883 = pd.concat(dfs, ignore_index=True)
game_info_season1883

Unnamed: 0,game_str,home_team,away_team,at_bat,play_per_game,inning,top_bottom,pitcher,catcher,first_base,second_base,third_base,shortstop,left_field,center_field,right_field,batter,first_baserunner,second_baserunner,third_baserunner
0,1883_002_Vis1AB_Home1A,Home1A,Vis1AB,1.0,1,1,top,,,,,,,,,,,,,
1,1883_002_Vis1AB_Home1A,Home1A,Vis1AB,1.0,2,1,top,,,,,,,,,,,,,
2,1883_002_Vis1AB_Home1A,Home1A,Vis1AB,1.0,3,1,top,,,,,,,,,,,,,
3,1883_002_Vis1AB_Home1A,Home1A,Vis1AB,1.0,4,1,top,,,,,,,,,,,,,
4,1883_002_Vis1AB_Home1A,Home1A,Vis1AB,1.0,5,1,top,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26182,1883_001_Vis4AB_Home4A,Home4A,Vis4AB,63.0,260,9,top,,,,,,,,,,,,,
26183,1883_001_Vis4AB_Home4A,Home4A,Vis4AB,64.0,261,9,top,,,,,,,,,,,,,
26184,1883_001_Vis4AB_Home4A,Home4A,Vis4AB,64.0,262,9,top,,,,,,,,,,,,,
26185,1883_001_Vis4AB_Home4A,Home4A,Vis4AB,64.0,263,9,top,,,,,,,,,,,,,


### Season 1884

In [15]:
dfs = []

root_path = 'game_info/Season_1884/'


for home_dir in ['Home1A', 'Home2A', 'Home3A', 'Home4A']:
    home_path = os.path.join(root_path, home_dir)
    if os.path.isdir(home_path):  # Ensure home_path is a directory
        for vis_dir in os.listdir(home_path):
            vis_path = os.path.join(home_path, vis_dir)
            if os.path.isdir(vis_path):  # Ensure vis_path is a directory
                for day_dir in os.listdir(vis_path):
                    day_path = os.path.join(vis_path, day_dir)
                    if os.path.isdir(day_path):  # Ensure day_path is a directory
                        csv_path = os.path.join(day_path, 'game_info.csv')
                        if os.path.isfile(csv_path):  # Ensure csv_path is a file
                            df = pd.read_csv(csv_path)
                            dfs.append(df)

game_info_season1884 = pd.concat(dfs, ignore_index=True)
game_info_season1884

Unnamed: 0,game_str,home_team,away_team,at_bat,play_per_game,inning,top_bottom,pitcher,catcher,first_base,second_base,third_base,shortstop,left_field,center_field,right_field,batter,first_baserunner,second_baserunner,third_baserunner
0,1884_137_Vis1BC_Home1A,Home1A,Vis1BC,1.0,1,1,top,523.0,953.0,423.0,959.0,838.0,630.0,892.0,435.0,787.0,4716.0,,,
1,1884_137_Vis1BC_Home1A,Home1A,Vis1BC,1.0,2,1,top,523.0,953.0,423.0,959.0,838.0,630.0,892.0,435.0,787.0,4716.0,,,
2,1884_137_Vis1BC_Home1A,Home1A,Vis1BC,2.0,3,1,top,523.0,953.0,423.0,959.0,838.0,630.0,892.0,435.0,787.0,4394.0,,,
3,1884_137_Vis1BC_Home1A,Home1A,Vis1BC,3.0,4,1,top,523.0,953.0,423.0,959.0,838.0,630.0,892.0,435.0,787.0,9761.0,,,
4,1884_137_Vis1BC_Home1A,Home1A,Vis1BC,3.0,5,1,top,523.0,953.0,423.0,959.0,838.0,630.0,892.0,435.0,787.0,9761.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69220,1884_007_Vis4AL_Home4A,Home4A,Vis4AL,71.0,294,9,top,792.0,520.0,467.0,586.0,835.0,636.0,427.0,776.0,638.0,2746.0,5631.0,,
69221,1884_007_Vis4AL_Home4A,Home4A,Vis4AL,71.0,295,9,top,792.0,520.0,467.0,586.0,835.0,636.0,427.0,776.0,638.0,2746.0,5631.0,,
69222,1884_007_Vis4AL_Home4A,Home4A,Vis4AL,71.0,296,9,top,792.0,520.0,467.0,586.0,835.0,636.0,427.0,776.0,638.0,2746.0,5631.0,,
69223,1884_007_Vis4AL_Home4A,Home4A,Vis4AL,71.0,297,9,top,792.0,520.0,467.0,586.0,835.0,636.0,427.0,776.0,638.0,2746.0,5631.0,,


# SMT Catch Zone Model

In [16]:
# Try remake the rosters - Jason already got the fielders for each team so just use his CSV 

rosters = team_info.groupby(['home_team', 'team_year'])['player_id'].apply(list).reset_index() #make lists of player ids, grouped by home team and year 
rosters['player_id'] = rosters['player_id'].apply(set)
rosters['player_id'] = rosters['player_id'].apply(list)

In [17]:
unique_player_ids = team_info['player_id'].nunique()
print(f"Number of unique player IDs: {unique_player_ids}")

Number of unique player IDs: 203


In [18]:
# from jason
center_fielders = pd.read_csv("cents2.csv")
right_fielders = pd.read_csv("rights2.csv")
left_fielders = pd.read_csv("lefts2.csv")
team_info_updated = pd.read_csv("team_info_updated.csv")
fielders = pd.concat([center_fielders, right_fielders, left_fielders], ignore_index=True)

In [19]:
center_fielders

Unnamed: 0,home_team,center_field
0,Home4A,336
1,Home3A,337
2,Home1A,423
3,Home4A,427
4,Home1A,435
5,Home3A,461
6,Home3A,475
7,Home2A,492
8,Home1A,495
9,Home2A,495


In [20]:
# Concatenate all the game_infos from both seasons
game_info = pd.concat([game_info_season1883, game_info_season1884], ignore_index=True)
# Convert NaNs to 0s, and floats to ints
game_info = game_info.fillna(0)
# Convert the floats in the columns of the players to ints
cols = ['pitcher', 'catcher', 'first_base', 'second_base', 'third_base', 'shortstop', 'left_field', 'center_field', 'right_field', 'batter', 'first_baserunner','second_baserunner', 'third_baserunner']
game_info[cols] = game_info[cols].astype(int)
game_info

Unnamed: 0,game_str,home_team,away_team,at_bat,play_per_game,inning,top_bottom,pitcher,catcher,first_base,second_base,third_base,shortstop,left_field,center_field,right_field,batter,first_baserunner,second_baserunner,third_baserunner
0,1883_002_Vis1AB_Home1A,Home1A,Vis1AB,1.0,1,1,top,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1883_002_Vis1AB_Home1A,Home1A,Vis1AB,1.0,2,1,top,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1883_002_Vis1AB_Home1A,Home1A,Vis1AB,1.0,3,1,top,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1883_002_Vis1AB_Home1A,Home1A,Vis1AB,1.0,4,1,top,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1883_002_Vis1AB_Home1A,Home1A,Vis1AB,1.0,5,1,top,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95407,1884_007_Vis4AL_Home4A,Home4A,Vis4AL,71.0,294,9,top,792,520,467,586,835,636,427,776,638,2746,5631,0,0
95408,1884_007_Vis4AL_Home4A,Home4A,Vis4AL,71.0,295,9,top,792,520,467,586,835,636,427,776,638,2746,5631,0,0
95409,1884_007_Vis4AL_Home4A,Home4A,Vis4AL,71.0,296,9,top,792,520,467,586,835,636,427,776,638,2746,5631,0,0
95410,1884_007_Vis4AL_Home4A,Home4A,Vis4AL,71.0,297,9,top,792,520,467,586,835,636,427,776,638,2746,5631,0,0


In [21]:
# Now want to get all the game_str, player_per_game, home team + away team where any of the player IDs in the fielders list appears 
center_fielders_ids = center_fielders['center_field'].astype(int).tolist()
left_fielders_ids = left_fielders['left_field'].astype(int).tolist()
right_fielders_ids = right_fielders['right_field'].astype(int).tolist()
fielders_ids = list(set(center_fielders_ids + left_fielders_ids + right_fielders_ids)) #gets rid of duplicate ids 

#Filter the rows in game_info where any of the player IDs in the fielders list appears
game_info_fielders = game_info[
    (game_info['left_field'].isin(fielders_ids)) | 
    (game_info['center_field'].isin(fielders_ids)) |
    (game_info['right_field'].isin(fielders_ids))] # make a new df of only the rows where a fielder is in the game (from the home team player ids)

In [22]:
# Concatenate the game_events dataframes
game_events = pd.concat([game_events_season1883, game_events_season1884], ignore_index=True) 
game_events.loc[game_events['game_str'] == '1884_143_Vis4BE_Home4A', 'timestamp'] -= 500 # Mistake pointed out in the Slack, timestamps for this game are systematically off by 500 ms
for i in range(len(game_events) - 1): # Any situation where two consecutive rows are 1, ....., 5, 2 should be swapped to 1, ...., 2, 5
    if game_events.iloc[i]['event_code'] == 5 and game_events.iloc[i + 1]['event_code'] == 2:
        temp = game_events.iloc[i].copy() #make a copy of the row to switch out
        game_events.iloc[i] = game_events.iloc[i + 1]
        game_events.iloc[i + 1] = temp

game_events[game_events['game_str'] == '1884_086_Vis3AP_Home3A'].iloc[73:80] #sanity check 

Unnamed: 0,game_str,play_id,at_bat,play_per_game,timestamp,player_position,event_code
261404,1884_086_Vis3AP_Home3A,17,4.0,17,451188,0,5
261405,1884_086_Vis3AP_Home3A,18,5.0,18,484688,1,1
261406,1884_086_Vis3AP_Home3A,18,5.0,18,485188,2,2
261407,1884_086_Vis3AP_Home3A,18,5.0,18,485188,0,5
261408,1884_086_Vis3AP_Home3A,19,5.0,19,519038,1,1
261409,1884_086_Vis3AP_Home3A,19,5.0,19,519538,2,2
261410,1884_086_Vis3AP_Home3A,19,5.0,19,519538,0,5


In [23]:
game_events

Unnamed: 0,game_str,play_id,at_bat,play_per_game,timestamp,player_position,event_code
0,1883_002_Vis1AB_Home1A,1,1.0,1,22112,1,1
1,1883_002_Vis1AB_Home1A,1,1.0,1,22562,2,2
2,1883_002_Vis1AB_Home1A,1,1.0,1,22562,0,5
3,1883_002_Vis1AB_Home1A,2,1.0,2,37312,1,1
4,1883_002_Vis1AB_Home1A,2,1.0,2,37812,2,2
...,...,...,...,...,...,...,...
363032,1884_007_Vis4AL_Home4A,298,72.0,298,9916259,5,2
363033,1884_007_Vis4AL_Home4A,298,72.0,298,9917909,5,3
363034,1884_007_Vis4AL_Home4A,298,72.0,298,9919009,255,16
363035,1884_007_Vis4AL_Home4A,298,72.0,298,9919109,3,2


In [24]:
game_dict = {}
for index, row in game_info_fielders.iterrows():
    if row['game_str'] not in game_dict:
        game_dict[row['game_str']] = [] #initialize if not in the dictionary
    game_dict[row['game_str']].append(row['play_per_game']) #if already in the dictionmary just append it to the value list

game_dict['1883_001_Vis1AB_Home1A']

#end up with a dictionary of the key game_Str and the plays where we have a fielder in the game 
len(game_dict) 

334

In [25]:
game_events_fielders = pd.DataFrame()
for game_str, plays in game_dict.items():
    game_rows = game_events[game_events['game_str'] == game_str] #rows where the game_str is in the dictionary from game_events

    for play in plays: 
        play_rows = game_rows[game_rows['play_per_game'] == play] #for a given game_str, get the rows where the play_per_game is in the dictionary (i.e a fielder is in the game)
        game_events_fielders = pd.concat([game_events_fielders, play_rows], ignore_index=True)
game_events_fielders # we end up with a data frame of the game events where a fielder is playing 

Unnamed: 0,game_str,play_id,at_bat,play_per_game,timestamp,player_position,event_code
0,1883_003_Vis1AB_Home1A,1,1.0,1,37948,1,1
1,1883_003_Vis1AB_Home1A,1,1.0,1,38348,10,4
2,1883_003_Vis1AB_Home1A,1,1.0,1,38398,0,5
3,1883_003_Vis1AB_Home1A,2,1.0,2,56898,1,1
4,1883_003_Vis1AB_Home1A,2,1.0,2,57348,10,4
...,...,...,...,...,...,...,...
147269,1884_007_Vis4AL_Home4A,298,72.0,298,9916259,5,2
147270,1884_007_Vis4AL_Home4A,298,72.0,298,9917909,5,3
147271,1884_007_Vis4AL_Home4A,298,72.0,298,9919009,255,16
147272,1884_007_Vis4AL_Home4A,298,72.0,298,9919109,3,2


In [26]:
# Okay so we have all the plays where a fielder is in the game
# Now to filter for plays that the fielder has an oppportunity to make a play, we will look at plays that have a game_event code of 4 (ball hit into play) all the way until 5 (end of play). So basically extracting plays from 1 - 5 if there is a 4 somewhere in there

game_events_fielders2 = pd.DataFrame()
seq_start = None
hit_ball = False
for i in range(len(game_events_fielders)):
    if game_events_fielders.iloc[i]['event_code'] == 1: #if a play has started (1)
        #say that a sequence has started
        seq_start = i 
    elif seq_start is not None and game_events_fielders.iloc[i]['event_code'] == 4:
        hit_ball = True #a ball has been hit if we find a 4
    elif seq_start is not None and hit_ball and game_events_fielders.iloc[i]['event_code'] == 5:
        seq_end = i
        #if a sequence has started, we have identified the ball was hit, and the sequence has ended, we want to identify the end of the play and add the rows between the 1 and 5 to thew new df
        game_events_fielders2 = pd.concat([game_events_fielders2, game_events_fielders.iloc[seq_start:seq_end + 1]], ignore_index=True)
        #reset the sequence indicators
        seq_start = None
        hit_ball = False


In [27]:
print(f"num of events with fielders where the ball is hit vs just 1-2-5 sequences: {len(game_events_fielders2),  len(game_events_fielders)} " )
game_events_fielders2.tail(15) #okay so now we have all the plays (and their unique play id for each game) where a hit actually happens, and therefore a fielder will be attentive b

num of events with fielders where the ball is hit vs just 1-2-5 sequences: (70113, 147274) 


Unnamed: 0,game_str,play_id,at_bat,play_per_game,timestamp,player_position,event_code
70098,1884_007_Vis4AL_Home4A,297,71.0,297,9873259,10,4
70099,1884_007_Vis4AL_Home4A,297,71.0,297,9875209,255,16
70100,1884_007_Vis4AL_Home4A,297,71.0,297,9875259,6,2
70101,1884_007_Vis4AL_Home4A,297,71.0,297,9876709,6,3
70102,1884_007_Vis4AL_Home4A,297,71.0,297,9877709,3,2
70103,1884_007_Vis4AL_Home4A,297,71.0,297,9877759,0,5
70104,1884_007_Vis4AL_Home4A,298,72.0,298,9914259,1,1
70105,1884_007_Vis4AL_Home4A,298,72.0,298,9914659,10,4
70106,1884_007_Vis4AL_Home4A,298,72.0,298,9914709,255,16
70107,1884_007_Vis4AL_Home4A,298,72.0,298,9916209,255,16


In [28]:
# okay now apply to the ball_pos and player_pos dfs--> we have all the plays of interest for each game, now we want to get the ball position and player position for each of these plays

#set up the original dfs 
ball_pos = pd.concat([ball_pos_season1883, ball_pos_season1884], ignore_index=True)
player_pos = pd.concat([player_pos_season1883, player_pos_season1884], ignore_index=True)

In [29]:
# ball_pos.apply(concat_gamestr_playid, axis=1, vectorize=True)
# player_pos.apply(concat_gamestr_playid, axis=1,vectorize=True)
game_events_fielders2['game_str_play_id_timestamp'] = game_events_fielders2['game_str'].astype(str) + '_' + game_events_fielders2['play_per_game'].astype(str) + game_events_fielders2['timestamp'].astype(str)
game_events_fielders2 = game_events_fielders2[['game_str_play_id_timestamp'] + [col for col in game_events_fielders2.columns if col != 'game_str_play_id_timestamp']] #move to front, rest of the columns in the same order

ball_pos['game_str_play_id_timestamp'] = ball_pos['game_str'].astype(str) + '_' + ball_pos['play_id'].astype(str) + ball_pos['timestamp'].astype(str)
ball_pos = ball_pos[['game_str_play_id_timestamp'] + [col for col in ball_pos.columns if col != 'game_str_play_id_timestamp']]  

player_pos['game_str_play_id_timestamp'] = player_pos['game_str'].astype(str) + '_' + player_pos['play_id'].astype(str)  + player_pos['timestamp'].astype(str)
player_pos = player_pos[['game_str_play_id_timestamp'] + [col for col in player_pos.columns if col != 'game_str_play_id_timestamp']]  


move the ball_pos x, y, z into a tuple that you can access this code is slow use cell below
```
def ball_pos_tuple(row):
    return (row['ball_position_x'], row['ball_position_y'], row['ball_position_z'])


ball_pos['ball_pos'] = ball_pos.apply(ball_pos_tuple, axis=1) 
ball_pos
```

In [30]:
game_events_fielders2

Unnamed: 0,game_str_play_id_timestamp,game_str,play_id,at_bat,play_per_game,timestamp,player_position,event_code
0,1883_003_Vis1AB_Home1A_137948,1883_003_Vis1AB_Home1A,1,1.0,1,37948,1,1
1,1883_003_Vis1AB_Home1A_138348,1883_003_Vis1AB_Home1A,1,1.0,1,38348,10,4
2,1883_003_Vis1AB_Home1A_138398,1883_003_Vis1AB_Home1A,1,1.0,1,38398,0,5
3,1883_003_Vis1AB_Home1A_256898,1883_003_Vis1AB_Home1A,2,1.0,2,56898,1,1
4,1883_003_Vis1AB_Home1A_257348,1883_003_Vis1AB_Home1A,2,1.0,2,57348,10,4
...,...,...,...,...,...,...,...,...
70108,1884_007_Vis4AL_Home4A_2989916259,1884_007_Vis4AL_Home4A,298,72.0,298,9916259,5,2
70109,1884_007_Vis4AL_Home4A_2989917909,1884_007_Vis4AL_Home4A,298,72.0,298,9917909,5,3
70110,1884_007_Vis4AL_Home4A_2989919009,1884_007_Vis4AL_Home4A,298,72.0,298,9919009,255,16
70111,1884_007_Vis4AL_Home4A_2989919109,1884_007_Vis4AL_Home4A,298,72.0,298,9919109,3,2


In [31]:
ball_pos['ball_pos'] = tuple(zip(ball_pos['ball_position_x'], 
                                ball_pos['ball_position_y'], 
                                ball_pos['ball_position_z']))
ball_pos.drop(columns = ['ball_position_x', 'ball_position_y', 'ball_position_z'], inplace=True)
ball_pos

Unnamed: 0,game_str_play_id_timestamp,game_str,play_id,timestamp,ball_pos
0,1883_002_Vis1AB_Home1A_122112,1883_002_Vis1AB_Home1A,1,22112,"(0.891828, 54.1203, 5.55345)"
1,1883_002_Vis1AB_Home1A_122162,1883_002_Vis1AB_Home1A,1,22162,"(0.940503, 47.5086, 5.28546)"
2,1883_002_Vis1AB_Home1A_122212,1883_002_Vis1AB_Home1A,1,22212,"(0.957876, 40.9689, 4.95912)"
3,1883_002_Vis1AB_Home1A_122262,1883_002_Vis1AB_Home1A,1,22262,"(0.943944, 34.5012, 4.5744)"
4,1883_002_Vis1AB_Home1A_122312,1883_002_Vis1AB_Home1A,1,22312,"(0.89871, 28.10607, 4.13136)"
...,...,...,...,...,...
2824320,1884_007_Vis4AL_Home4A_2989918909,1884_007_Vis4AL_Home4A,298,9918909,"(45.6471, 68.1357, 1.246203)"
2824321,1884_007_Vis4AL_Home4A_2989918959,1884_007_Vis4AL_Home4A,298,9918959,"(49.7562, 67.3356, 0.339675)"
2824322,1884_007_Vis4AL_Home4A_2989919009,1884_007_Vis4AL_Home4A,298,9919009,"(53.8197, 66.5178, -0.636369)"
2824323,1884_007_Vis4AL_Home4A_2989919059,1884_007_Vis4AL_Home4A,298,9919059,"(57.5049, 65.6523, -0.09915)"


In [32]:
player_pos_fielders = player_pos[player_pos['player_position'].isin([7,8,9,255])]
player_pos_fielders

Unnamed: 0,game_str_play_id_timestamp,game_str,play_id,timestamp,player_position,field_x,field_y
6,1883_002_Vis1AB_Home1A_122112,1883_002_Vis1AB_Home1A,1,22112,7,-125.0007,253.8183
7,1883_002_Vis1AB_Home1A_122112,1883_002_Vis1AB_Home1A,1,22112,8,7.4436,301.8720
8,1883_002_Vis1AB_Home1A_122112,1883_002_Vis1AB_Home1A,1,22112,9,127.7214,248.6190
18,1883_002_Vis1AB_Home1A_122162,1883_002_Vis1AB_Home1A,1,22162,7,-125.0643,253.8678
19,1883_002_Vis1AB_Home1A_122162,1883_002_Vis1AB_Home1A,1,22162,8,7.4370,301.8990
...,...,...,...,...,...,...,...
100036835,1884_007_Vis4AL_Home4A_2989919559,1884_007_Vis4AL_Home4A,298,9919559,8,20.1444,300.8991
100036836,1884_007_Vis4AL_Home4A_2989919559,1884_007_Vis4AL_Home4A,298,9919559,9,108.6270,193.9260
100036847,1884_007_Vis4AL_Home4A_2989919609,1884_007_Vis4AL_Home4A,298,9919609,7,-88.7160,242.2737
100036848,1884_007_Vis4AL_Home4A_2989919609,1884_007_Vis4AL_Home4A,298,9919609,8,20.0151,300.3975


In [33]:
player_pos_fielders

Unnamed: 0,game_str_play_id_timestamp,game_str,play_id,timestamp,player_position,field_x,field_y
6,1883_002_Vis1AB_Home1A_122112,1883_002_Vis1AB_Home1A,1,22112,7,-125.0007,253.8183
7,1883_002_Vis1AB_Home1A_122112,1883_002_Vis1AB_Home1A,1,22112,8,7.4436,301.8720
8,1883_002_Vis1AB_Home1A_122112,1883_002_Vis1AB_Home1A,1,22112,9,127.7214,248.6190
18,1883_002_Vis1AB_Home1A_122162,1883_002_Vis1AB_Home1A,1,22162,7,-125.0643,253.8678
19,1883_002_Vis1AB_Home1A_122162,1883_002_Vis1AB_Home1A,1,22162,8,7.4370,301.8990
...,...,...,...,...,...,...,...
100036835,1884_007_Vis4AL_Home4A_2989919559,1884_007_Vis4AL_Home4A,298,9919559,8,20.1444,300.8991
100036836,1884_007_Vis4AL_Home4A_2989919559,1884_007_Vis4AL_Home4A,298,9919559,9,108.6270,193.9260
100036847,1884_007_Vis4AL_Home4A_2989919609,1884_007_Vis4AL_Home4A,298,9919609,7,-88.7160,242.2737
100036848,1884_007_Vis4AL_Home4A_2989919609,1884_007_Vis4AL_Home4A,298,9919609,8,20.0151,300.3975


In [34]:

combined_keys = player_pos_fielders['game_str_play_id_timestamp'].values
positions = player_pos_fielders['player_position'].values
x_coords = player_pos_fielders['field_x'].values
y_coords = player_pos_fielders['field_y'].values

positions_dict = defaultdict(list)
for i in range(len(combined_keys)):
    positions_dict[combined_keys[i]].append((positions[i], x_coords[i], y_coords[i]))

player_pos_fielders = player_pos_fielders.groupby(['game_str', 'play_id', 'timestamp'])['game_str_play_id_timestamp'].first().reset_index() #create a new df with the first instance of the game_str_play_id_timestamp for each row 

player_pos_fielders['player_positions'] = player_pos_fielders['game_str_play_id_timestamp'].map(
    lambda key: [{pos: (x, y)} for pos, x, y in positions_dict[key]]
) #map the dictionary of the player positions to the game_str_play_id_timestamp


In [37]:
# Filter out the balls that go near the walls, or the corners --> Set up a boundary because we know it will mess with the data
# Eliminate the walls


Unnamed: 0,game_str_play_id_timestamp,game_str,play_id,timestamp,player_position,field_x,field_y
0,1883_002_Vis1AB_Home1A_122112,1883_002_Vis1AB_Home1A,1,22112,1,-0.5799,55.5510
1,1883_002_Vis1AB_Home1A_122112,1883_002_Vis1AB_Home1A,1,22112,2,0.0000,-4.5000
2,1883_002_Vis1AB_Home1A_122112,1883_002_Vis1AB_Home1A,1,22112,3,46.4301,76.6185
3,1883_002_Vis1AB_Home1A_122112,1883_002_Vis1AB_Home1A,1,22112,4,23.1222,147.9099
4,1883_002_Vis1AB_Home1A_122112,1883_002_Vis1AB_Home1A,1,22112,5,-61.9377,76.1523
...,...,...,...,...,...,...,...
100036849,1884_007_Vis4AL_Home4A_2989919609,1884_007_Vis4AL_Home4A,298,9919609,9,108.3837,193.4163
100036850,1884_007_Vis4AL_Home4A_2989919609,1884_007_Vis4AL_Home4A,298,9919609,10,56.1000,60.0066
100036851,1884_007_Vis4AL_Home4A_2989919609,1884_007_Vis4AL_Home4A,298,9919609,17,-45.0933,106.8552
100036852,1884_007_Vis4AL_Home4A_2989919609,1884_007_Vis4AL_Home4A,298,9919609,18,65.4312,87.9636
