In [2]:
import re
import polars as pl
from glob import glob

# Using a `dict` to store tables by name.

On occasion, we will need to combine more than 2 files using some combination of `UNION` and `JOIN`.  In this lecture, we will show a clean approach to scaling up these operations up to any number of files.  In the process, we will

1. Use `list` comprehensions to process and `UNION` many similar files.
2. Use `dict` comprehensions to store and access many tables by name.

## Store in `dict` or `list`?

* Natural sequence/order? $\rightarrow$ `list`
    *  Example: Lakes data and years are a natural sequence
* Easier to refer by name? $\rightarrow$ `dict`
    * Baseball files have no order and easier to refer to by name

## The Basics of working with many files.

* Use `glob.glob` to find all files that match a pattern
* Convert all files to `pd.DataFrames`
* Store the `df` in a list or dictionary

### Extracting information from a `glob` search result.

**Options.**
1. Use string methods such as `split`, or
2. Use a regular expression.

In [3]:
(all_baseball_csv 
 := glob('./data/baseballdatabank*/**/*.csv', recursive=True)
)

['./data/baseballdatabank-2023.1/core/Managers.csv',
 './data/baseballdatabank-2023.1/core/Fielding.csv',
 './data/baseballdatabank-2023.1/core/Parks.csv',
 './data/baseballdatabank-2023.1/core/People.csv',
 './data/baseballdatabank-2023.1/core/PitchingPost.csv',
 './data/baseballdatabank-2023.1/core/Teams.csv',
 './data/baseballdatabank-2023.1/core/Appearances.csv',
 './data/baseballdatabank-2023.1/core/TeamsFranchises.csv',
 './data/baseballdatabank-2023.1/core/Batting.csv',
 './data/baseballdatabank-2023.1/core/ManagersHalf.csv',
 './data/baseballdatabank-2023.1/core/FieldingOF.csv',
 './data/baseballdatabank-2023.1/core/Pitching.csv',
 './data/baseballdatabank-2023.1/core/HomeGames.csv',
 './data/baseballdatabank-2023.1/core/BattingPost.csv',
 './data/baseballdatabank-2023.1/core/TeamsHalf.csv',
 './data/baseballdatabank-2023.1/core/SeriesPost.csv',
 './data/baseballdatabank-2023.1/core/FieldingPost.csv',
 './data/baseballdatabank-2023.1/core/AllstarFull.csv',
 './data/baseballdata

#### Example 1 - Using the `str.split` method to extract a file name.

**Note.** The following cells are meant to show how the solution evolves.  In practice, this would all be proto-typed in a single cell.

In [4]:
# 1. Split on forward slash
[p.split('/') for p in all_baseball_csv]

[['.', 'data', 'baseballdatabank-2023.1', 'core', 'Managers.csv'],
 ['.', 'data', 'baseballdatabank-2023.1', 'core', 'Fielding.csv'],
 ['.', 'data', 'baseballdatabank-2023.1', 'core', 'Parks.csv'],
 ['.', 'data', 'baseballdatabank-2023.1', 'core', 'People.csv'],
 ['.', 'data', 'baseballdatabank-2023.1', 'core', 'PitchingPost.csv'],
 ['.', 'data', 'baseballdatabank-2023.1', 'core', 'Teams.csv'],
 ['.', 'data', 'baseballdatabank-2023.1', 'core', 'Appearances.csv'],
 ['.', 'data', 'baseballdatabank-2023.1', 'core', 'TeamsFranchises.csv'],
 ['.', 'data', 'baseballdatabank-2023.1', 'core', 'Batting.csv'],
 ['.', 'data', 'baseballdatabank-2023.1', 'core', 'ManagersHalf.csv'],
 ['.', 'data', 'baseballdatabank-2023.1', 'core', 'FieldingOF.csv'],
 ['.', 'data', 'baseballdatabank-2023.1', 'core', 'Pitching.csv'],
 ['.', 'data', 'baseballdatabank-2023.1', 'core', 'HomeGames.csv'],
 ['.', 'data', 'baseballdatabank-2023.1', 'core', 'BattingPost.csv'],
 ['.', 'data', 'baseballdatabank-2023.1', 'core

In [5]:
# 1. Split on forward slash
# 2. Get the last element 
[p.split('/')[-1] for p in all_baseball_csv]

['Managers.csv',
 'Fielding.csv',
 'Parks.csv',
 'People.csv',
 'PitchingPost.csv',
 'Teams.csv',
 'Appearances.csv',
 'TeamsFranchises.csv',
 'Batting.csv',
 'ManagersHalf.csv',
 'FieldingOF.csv',
 'Pitching.csv',
 'HomeGames.csv',
 'BattingPost.csv',
 'TeamsHalf.csv',
 'SeriesPost.csv',
 'FieldingPost.csv',
 'AllstarFull.csv',
 'FieldingOFsplit.csv',
 'AwardsManagers.csv',
 'AwardsPlayers.csv',
 'Salaries.csv',
 'Schools.csv',
 'AwardsSharePlayers.csv',
 'CollegePlaying.csv',
 'HallOfFame.csv',
 'AwardsShareManagers.csv',
 'Teams.csv']

In [6]:
# 1. Split on forward slash
# 2. Get the last element 
# 3. Split off the file type 
[p.split('/')[-1].split('.') for p in all_baseball_csv]


[['Managers', 'csv'],
 ['Fielding', 'csv'],
 ['Parks', 'csv'],
 ['People', 'csv'],
 ['PitchingPost', 'csv'],
 ['Teams', 'csv'],
 ['Appearances', 'csv'],
 ['TeamsFranchises', 'csv'],
 ['Batting', 'csv'],
 ['ManagersHalf', 'csv'],
 ['FieldingOF', 'csv'],
 ['Pitching', 'csv'],
 ['HomeGames', 'csv'],
 ['BattingPost', 'csv'],
 ['TeamsHalf', 'csv'],
 ['SeriesPost', 'csv'],
 ['FieldingPost', 'csv'],
 ['AllstarFull', 'csv'],
 ['FieldingOFsplit', 'csv'],
 ['AwardsManagers', 'csv'],
 ['AwardsPlayers', 'csv'],
 ['Salaries', 'csv'],
 ['Schools', 'csv'],
 ['AwardsSharePlayers', 'csv'],
 ['CollegePlaying', 'csv'],
 ['HallOfFame', 'csv'],
 ['AwardsShareManagers', 'csv'],
 ['Teams', 'csv']]

In [7]:
# 1. Split on forward slash
# 2. Get the last element 
# 3. Split off the file type 
# 4. Get the first enter (e.g. file name)
[p.split('/')[-1].split('.')[0] for p in all_baseball_csv]

['Managers',
 'Fielding',
 'Parks',
 'People',
 'PitchingPost',
 'Teams',
 'Appearances',
 'TeamsFranchises',
 'Batting',
 'ManagersHalf',
 'FieldingOF',
 'Pitching',
 'HomeGames',
 'BattingPost',
 'TeamsHalf',
 'SeriesPost',
 'FieldingPost',
 'AllstarFull',
 'FieldingOFsplit',
 'AwardsManagers',
 'AwardsPlayers',
 'Salaries',
 'Schools',
 'AwardsSharePlayers',
 'CollegePlaying',
 'HallOfFame',
 'AwardsShareManagers',
 'Teams']

In [8]:
# 1. Split on forward slash
# 2. Get the last element 
# 3. Split off the file type 
# 4. Get the first enter (e.g. file name)
# 5. Refactor
get_file_name = lambda p: p.split('/')[-1].split('.')[0]

[get_file_name(p) for p in all_baseball_csv]


['Managers',
 'Fielding',
 'Parks',
 'People',
 'PitchingPost',
 'Teams',
 'Appearances',
 'TeamsFranchises',
 'Batting',
 'ManagersHalf',
 'FieldingOF',
 'Pitching',
 'HomeGames',
 'BattingPost',
 'TeamsHalf',
 'SeriesPost',
 'FieldingPost',
 'AllstarFull',
 'FieldingOFsplit',
 'AwardsManagers',
 'AwardsPlayers',
 'Salaries',
 'Schools',
 'AwardsSharePlayers',
 'CollegePlaying',
 'HallOfFame',
 'AwardsShareManagers',
 'Teams']

#### Example 2 - Using a regular expression to capture the file name.

In [9]:
# 1. Start with an example path
file_name = re.compile(r'./data/baseballdatabank-2023.1/core/Managers.csv')

[_match for p in all_baseball_csv if (_match := file_name.match(p))]

[<re.Match object; span=(0, 48), match='./data/baseballdatabank-2023.1/core/Managers.csv'>]

In [10]:
# 1. Start with an example path
# 2. Make it match any file name

file_name = re.compile(r'./data/baseballdatabank-2023.1/core/[a-zA-Z]+.csv')

[_match for p in all_baseball_csv if (_match := file_name.match(p))]

[<re.Match object; span=(0, 48), match='./data/baseballdatabank-2023.1/core/Managers.csv'>,
 <re.Match object; span=(0, 48), match='./data/baseballdatabank-2023.1/core/Fielding.csv'>,
 <re.Match object; span=(0, 45), match='./data/baseballdatabank-2023.1/core/Parks.csv'>,
 <re.Match object; span=(0, 46), match='./data/baseballdatabank-2023.1/core/People.csv'>,
 <re.Match object; span=(0, 52), match='./data/baseballdatabank-2023.1/core/PitchingPost.>,
 <re.Match object; span=(0, 45), match='./data/baseballdatabank-2023.1/core/Teams.csv'>,
 <re.Match object; span=(0, 51), match='./data/baseballdatabank-2023.1/core/Appearances.c>,
 <re.Match object; span=(0, 55), match='./data/baseballdatabank-2023.1/core/TeamsFranchis>,
 <re.Match object; span=(0, 47), match='./data/baseballdatabank-2023.1/core/Batting.csv'>,
 <re.Match object; span=(0, 52), match='./data/baseballdatabank-2023.1/core/ManagersHalf.>,
 <re.Match object; span=(0, 50), match='./data/baseballdatabank-2023.1/core/FieldingOF.cs

In [11]:
# 1. Start with an example path
# 2. Make it match any file name
# 3. Make it match any subfolder

file_name = re.compile(r'./data/baseballdatabank-2023.1/[a-z]+/[a-zA-Z]+.csv')

[_match for p in all_baseball_csv if (_match := file_name.match(p))]

[<re.Match object; span=(0, 48), match='./data/baseballdatabank-2023.1/core/Managers.csv'>,
 <re.Match object; span=(0, 48), match='./data/baseballdatabank-2023.1/core/Fielding.csv'>,
 <re.Match object; span=(0, 45), match='./data/baseballdatabank-2023.1/core/Parks.csv'>,
 <re.Match object; span=(0, 46), match='./data/baseballdatabank-2023.1/core/People.csv'>,
 <re.Match object; span=(0, 52), match='./data/baseballdatabank-2023.1/core/PitchingPost.>,
 <re.Match object; span=(0, 45), match='./data/baseballdatabank-2023.1/core/Teams.csv'>,
 <re.Match object; span=(0, 51), match='./data/baseballdatabank-2023.1/core/Appearances.c>,
 <re.Match object; span=(0, 55), match='./data/baseballdatabank-2023.1/core/TeamsFranchis>,
 <re.Match object; span=(0, 47), match='./data/baseballdatabank-2023.1/core/Batting.csv'>,
 <re.Match object; span=(0, 52), match='./data/baseballdatabank-2023.1/core/ManagersHalf.>,
 <re.Match object; span=(0, 50), match='./data/baseballdatabank-2023.1/core/FieldingOF.cs

In [12]:
# 1. Start with an example path
# 2. Make it match any file name
# 3. Make it match any subfolder
# 4. Add a capture group and extract the data

file_name = re.compile(r'./data/baseballdatabank-2023.1/[a-z]+/([a-zA-Z]+).csv')

[_match.group(1) for p in all_baseball_csv if (_match := file_name.match(p))]

['Managers',
 'Fielding',
 'Parks',
 'People',
 'PitchingPost',
 'Teams',
 'Appearances',
 'TeamsFranchises',
 'Batting',
 'ManagersHalf',
 'FieldingOF',
 'Pitching',
 'HomeGames',
 'BattingPost',
 'TeamsHalf',
 'SeriesPost',
 'FieldingPost',
 'AllstarFull',
 'FieldingOFsplit',
 'AwardsManagers',
 'AwardsPlayers',
 'Salaries',
 'Schools',
 'AwardsSharePlayers',
 'CollegePlaying',
 'HallOfFame',
 'AwardsShareManagers',
 'Teams']

In [13]:
# 1. Start with an example path
# 2. Make it match any file name
# 3. Make it match any subfolder
# 4. Add a capture group and extract the data
# 5. Refactor

file_name = re.compile(r'./data/baseballdatabank-2023.1/[a-z]+/([a-zA-Z]+).csv')

get_file_names = lambda paths: [_match.group(1) for p in paths if (_match := file_name.match(p))]

get_file_names(all_baseball_csv)

['Managers',
 'Fielding',
 'Parks',
 'People',
 'PitchingPost',
 'Teams',
 'Appearances',
 'TeamsFranchises',
 'Batting',
 'ManagersHalf',
 'FieldingOF',
 'Pitching',
 'HomeGames',
 'BattingPost',
 'TeamsHalf',
 'SeriesPost',
 'FieldingPost',
 'AllstarFull',
 'FieldingOFsplit',
 'AwardsManagers',
 'AwardsPlayers',
 'Salaries',
 'Schools',
 'AwardsSharePlayers',
 'CollegePlaying',
 'HallOfFame',
 'AwardsShareManagers',
 'Teams']

### Dealing with Windows paths

* Windows uses backslash `\` instead of forward slash `/` to separate folders/files.
* Even on Windows, `glob` understands unix style paths that use `/` to separate files/folders.
* Since `\` is the escape character in Python, Windows paths will contain an escaped/literal backslash `\\`.
* The wild card parts of the pattern will return Windows style paths with `\\`

In [14]:
# Non-recursive search run on Windows
baseball_core_csv = glob('./data/baseballdatabank*/core/*csv', recursive=True)

baseball_core_csv

['./data/baseballdatabank-2023.1/core/Managers.csv',
 './data/baseballdatabank-2023.1/core/Fielding.csv',
 './data/baseballdatabank-2023.1/core/Parks.csv',
 './data/baseballdatabank-2023.1/core/People.csv',
 './data/baseballdatabank-2023.1/core/PitchingPost.csv',
 './data/baseballdatabank-2023.1/core/Teams.csv',
 './data/baseballdatabank-2023.1/core/Appearances.csv',
 './data/baseballdatabank-2023.1/core/TeamsFranchises.csv',
 './data/baseballdatabank-2023.1/core/Batting.csv',
 './data/baseballdatabank-2023.1/core/ManagersHalf.csv',
 './data/baseballdatabank-2023.1/core/FieldingOF.csv',
 './data/baseballdatabank-2023.1/core/Pitching.csv',
 './data/baseballdatabank-2023.1/core/HomeGames.csv',
 './data/baseballdatabank-2023.1/core/BattingPost.csv',
 './data/baseballdatabank-2023.1/core/TeamsHalf.csv',
 './data/baseballdatabank-2023.1/core/SeriesPost.csv',
 './data/baseballdatabank-2023.1/core/FieldingPost.csv',
 './data/baseballdatabank-2023.1/core/AllstarFull.csv',
 './data/baseballdata

In [15]:
# Recursive search run on Windows
all_baseball_csv = glob('./data/baseballdatabank*/**/*.csv', recursive=True)

all_baseball_csv

['./data/baseballdatabank-2023.1/core/Managers.csv',
 './data/baseballdatabank-2023.1/core/Fielding.csv',
 './data/baseballdatabank-2023.1/core/Parks.csv',
 './data/baseballdatabank-2023.1/core/People.csv',
 './data/baseballdatabank-2023.1/core/PitchingPost.csv',
 './data/baseballdatabank-2023.1/core/Teams.csv',
 './data/baseballdatabank-2023.1/core/Appearances.csv',
 './data/baseballdatabank-2023.1/core/TeamsFranchises.csv',
 './data/baseballdatabank-2023.1/core/Batting.csv',
 './data/baseballdatabank-2023.1/core/ManagersHalf.csv',
 './data/baseballdatabank-2023.1/core/FieldingOF.csv',
 './data/baseballdatabank-2023.1/core/Pitching.csv',
 './data/baseballdatabank-2023.1/core/HomeGames.csv',
 './data/baseballdatabank-2023.1/core/BattingPost.csv',
 './data/baseballdatabank-2023.1/core/TeamsHalf.csv',
 './data/baseballdatabank-2023.1/core/SeriesPost.csv',
 './data/baseballdatabank-2023.1/core/FieldingPost.csv',
 './data/baseballdatabank-2023.1/core/AllstarFull.csv',
 './data/baseballdata

### Extracting information from Windows paths

On Windows, we need to switch `/` to `\\`

#### Example 1 - Split and get the file name in Windows

In [16]:
get_file_name = lambda p: p.split('\\')[-1].split('.')[0]

[get_file_name(p) for p in all_baseball_csv]


['',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '']

#### Example 2 - Using a regular expression to extract file name in Windows

In [17]:
file_name = re.compile(r'./data\\baseballdatabank-2023.1\\[a-z]+\\([a-zA-Z]+).csv')

get_file_names = lambda paths: [_match.group(1) for p in paths if (_match := file_name.match(p))]

get_file_names(all_baseball_csv)

[]

## Storing tables by name in a `dict`

### Example 1 - Read all baseball database using `dict`

**Task:** Create a `dict` of tables for all tables in the Lahman database

#### Step 1 - Use `glob` to find paths for all CSV files

In [18]:
from glob import glob

(files := 
 glob('./data/baseball/core/*.csv')
)

[]

#### Step 2 - Make a function to extract the table name

In [19]:
import re
FILE_NAME_RE = re.compile(r'^\./data/baseball/core/([a-zA-Z_]*)\.csv$')
file_name = lambda p: FILE_NAME_RE.match(p).group(1) if FILE_NAME_RE.match(p) else None

[file_name(p) for p in files]

[]

#### 4 - Read in the tables.

In [20]:
(baseball_db := 
 {file_name(p):pl.read_csv(p, infer_schema_length=10000) for p in files}
)

{}

### We can now access all the tables by name.

In [21]:
# Biggish output
baseball_db

{}

In [22]:
baseball_db['Teams']

KeyError: 'Teams'

In [None]:
baseball_db['Batting']

## Example 2 - Reading and joining the baseball database using `dict`

**Task:** Collect the number of total hits for each batters in the 2010 season join on their first and last name.

In the second example, we will store the data frames in a `dict`, which will make it easier to join the files by ne

#### Step 1 - Get the files names

* Only need the `Batting.csv` and `People.csv`.  
* Narrow with a RegEx

In [None]:
from glob import glob

(files := 
 glob('./data/baseball/core/*.csv')
)

#### Step 2 - Make helper functions to get the name from path

In [None]:
import re
FILE_NAME_RE = re.compile(r'^\./data/baseball/core/(Batting|People)\.csv$')
is_batting_or_people = lambda p: FILE_NAME_RE.match(p)
file_name = lambda p: FILE_NAME_RE.match(p).group(1) 

[file_name(p) for p in files if is_batting_or_people(p)]

#### Step 3 - Use a comprehension to read in all files

**Note:** The data is small (< 10mb total) so it is safe to read all at once.

In [None]:
dfs = {file_name(p):pl.read_csv(p) for p in files if is_batting_or_people(p)}
dfs['Batting'].head()

In [None]:
dfs['People'].head()

#### Step 4 - Preprocess each file.

In [None]:
# Filter, select, and aggregate hits for 2010.
(hits_in_2010_raw := 
 dfs['Batting']
.select(['yearID', 'playerID', 'H'])
.filter(pl.col('yearID') == 2010)
.group_by('playerID')
.agg(pl.col('H').mean().alias('Total Hits'))
).head(2)

In [None]:
# Grab the first and last names from People.

(player_names := 
 dfs['People']
 .select(['playerID', 'nameFirst', 'nameLast'])
).head(2)

#### Step 4 -- Join the tables

In [None]:
(hits_in_2010 := 
 hits_in_2010_raw 
 .join(player_names, on='playerID', how='left')
 .drop('playerID')
).head()

## <font color="red"> Exercise 3.2 </font>

We want to get the total hits allowed for all pitchers during the 2000-2010 seasons.  Use `glob` and a `dict` to collect this information into a table that includes the players first and last names.

In [29]:
from glob import glob
from pathlib import Path
import polars as pl

files = glob("./data/baseballdatabank-2023.1/core/*.csv")

baseball_db = {
    Path(p).stem.lower(): pl.read_csv(p, infer_schema_length=10000)
    for p in files
}

pitching = baseball_db["pitching"]
people   = baseball_db["people"]

pitching_2000_2010 = pitching.filter(
    (pl.col("yearID") >= 2000) & (pl.col("yearID") <= 2010)
)

hits_allowed = (
    pitching_2000_2010
    .group_by("playerID")
    .agg(pl.sum("H").alias("total_hits_allowed"))
)

result = (
    hits_allowed.join(
        people.select(["playerID", "nameFirst", "nameLast"]),
        on="playerID",
        how="left"
    )
    .select(["playerID", "nameFirst", "nameLast", "total_hits_allowed"])
    .sort("total_hits_allowed", descending=True)
)

result.head()


playerID,nameFirst,nameLast,total_hits_allowed
str,str,str,i64
"""hernali01""","""Livan""","""Hernandez""",2666
"""buehrma01""","""Mark""","""Buehrle""",2389
"""suppaje01""","""Jeff""","""Suppan""",2298
"""vazquja01""","""Javier""","""Vazquez""",2256
"""millwke01""","""Kevin""","""Millwood""",2146
