# SF Giants pitching logs by season, 1958-2023
> This notebook visusalizes current and past game-by-game and cumulative totals for strikeouts, walks, ERA, etc., using data from [Baseball Reference](https://www.baseball-reference.com/teams/tgl.cgi?team=SFG&t=p&year=2024).

---

In [8]:
#!/usr/bin/env python
# coding: utf-8

import os
import requests
import datetime
import pandas as pd
from io import BytesIO
import boto3
import logging
from time import sleep
from random import randint

In [2]:
# Set up basic configuration for logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Determine if running in a GitHub Actions environment
is_github_actions = os.getenv('GITHUB_ACTIONS') == 'true'

# AWS credentials and session initialization
aws_key_id = os.environ.get("AWS_ACCESS_KEY_ID")
aws_secret_key = os.environ.get("AWS_SECRET_ACCESS_KEY")
aws_region = "us-east-1"

# Conditional AWS session creation based on the environment
if is_github_actions:
    # In GitHub Actions, use environment variables directly
    session = boto3.Session(
        aws_access_key_id=aws_key_id,
        aws_secret_access_key=aws_secret_key,
        region_name=aws_region
    )
else:
    # Locally, use a specific profile
    session = boto3.Session(profile_name="mattwilkens", region_name=aws_region)

s3_resource = session.resource("s3")

# Base directory settings
base_dir = os.getcwd()
data_dir = os.path.join(base_dir, 'data', 'pitching')
# os.makedirs(data_dir, exist_ok=True)

profile_name = os.environ.get("AWS_PERSONAL_PROFILE")
today = datetime.date.today()
year = today.year

2024-06-02 14:43:14,281 - INFO - Found credentials in shared credentials file: ~/.aws/credentials


In [3]:
# Headers to mimic a browser request
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
}

In [4]:
# Fetch archive game logs
#archive_url = "https://stilesdata.com/dodgers/data/pitching/archive/dodgers_historic_pitching_gamelogs_1958_2023.parquet"
#archive_df = pd.read_parquet(archive_url)

In [9]:
dfs = []
for year in range(1958,2024):
    # Fetch Current game logs
    current_url = f"https://www.baseball-reference.com/teams/tgl.cgi?team=SFG&t=p&year={year}"
    current_src = pd.read_html(current_url)[1].assign(year=year).query('SO != "SO"')
    current_src.columns = current_src.columns.str.lower()
    
    # Process current game logs
    current_src["game_date"] = pd.to_datetime(
        current_src["date"] + " " + current_src["year"].astype(str),
        format="%b %d %Y",
        errors="coerce"
    ).dt.strftime("%Y-%m-%d")
    print(current_src.columns)
    # Just the columns we need
    keep_cols = ['gtm', 'year', 'game_date', 'h', 'hr', 'er', 'so', 'era']
    current_df = current_src[keep_cols].copy()
    
    # Define value columns
    int_cols = ["gtm", 'h', 'hr', 'er', 'so']
    
    # Convert value columns to numbers
    current_df[int_cols] = current_df[int_cols].astype(int)
    current_df['era'] = current_df['era'].astype(float)
    current_df['era_cum'] = current_df['era']
    
    # Calculate cumulative columns
    for col in ['h', 'hr', 'er', 'so']:
        current_df[f"{col}_cum"] = current_df.groupby("year")[col].cumsum()
    dfs.append(current_df)
    sleep(randint(3, 6))

In [10]:
"""
MERGE
"""

'\nMERGE\n'

In [10]:
# Combine current and archive data
df = (
    pd.concat(dfs)
    .sort_values(["year", "gtm"], ascending=[False, True])
    .reset_index(drop=True)
    .drop_duplicates()
)

In [11]:
df

Unnamed: 0,gtm,year,game_date,h,hr,er,so,era,era_cum,h_cum,hr_cum,er_cum,so_cum
0,1,2023,2023-03-30,8,2,5,16,5.62,5.62,8,2,5,16
1,2,2023,2023-04-01,10,2,4,9,4.76,4.76,18,4,9,25
2,3,2023,2023-04-02,7,3,6,6,5.40,5.40,25,7,15,31
3,4,2023,2023-04-03,5,1,3,5,4.76,4.76,30,8,18,36
4,5,2023,2023-04-05,13,0,4,7,4.71,4.71,43,8,22,43
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10434,150,1958,1958-09-23,10,1,3,7,3.94,3.94,1353,161,592,757
10435,151,1958,1958-09-24,14,1,9,2,3.97,3.97,1367,162,601,759
10436,152,1958,1958-09-26,6,1,2,8,3.96,3.96,1373,163,603,767
10437,153,1958,1958-09-27,16,1,9,4,3.99,3.99,1389,164,612,771


In [12]:
"""
OUTPUT
"""

'\nOUTPUT\n'

In [12]:
# Optimize DataFrame for output
optimized_df = df[['gtm', 'year', 'game_date', 'era_cum','h_cum', 'hr_cum', 'er_cum', 'so_cum']].copy()

In [13]:
def save_dataframe(df, path_without_extension, formats):
    """
    Save a DataFrame in multiple formats.
    """
    for file_format in formats:
        if file_format == "csv":
            df.to_csv(f"{path_without_extension}.{file_format}", index=False)
        elif file_format == "json":
            df.to_json(
                f"{path_without_extension}.{file_format}", indent=4, orient="records"
            )
        elif file_format == "parquet":
            df.to_parquet(f"{path_without_extension}.{file_format}", index=False)
        else:
            print(f"Unsupported format: {file_format}")

In [14]:
formats = ['csv', 'json', 'parquet']
save_dataframe(
    optimized_df,
    "../data/pitching/archive/giants_historic_pitching_gamelogs_1958_2023",
    formats,
)

In [17]:
# Function to save DataFrame to S3
def save_to_s3(df, base_path, s3_bucket, formats):
    for fmt in formats:
        try:
            buffer = BytesIO()
            if fmt == "csv":
                df.to_csv(buffer, index=False)
                content_type = "text/csv"
            elif fmt == "json":
                df.to_json(buffer, indent=4, orient="records", lines=False)
                content_type = "application/json"
            elif fmt == "parquet":
                df.to_parquet(buffer, index=False)
                content_type = "application/octet-stream"
            buffer.seek(0)
            s3_resource.Bucket(s3_bucket).put_object(Key=f"{base_path}.{fmt}", Body=buffer, ContentType=content_type)
            logging.info(f"Uploaded {fmt} to {s3_bucket}/{base_path}.{fmt}")
        except Exception as e:
            logging.error(f"Failed to upload {fmt} to S3: {e}")

# Saving files locally and to S3
file_path = os.path.join(data_dir, 'dodgers_historic_pitching_gamelogs_1958-present')
formats = ["csv", "json", "parquet"]
# save_dataframe(optimized_df, file_path, formats)
save_to_s3(optimized_df, "dodgers/data/pitching/dodgers_historic_pitching_gamelogs_1958-present", "stilesdata.com", formats)

2024-05-17 08:18:57,122 - INFO - Uploaded csv to stilesdata.com/dodgers/data/pitching/dodgers_historic_pitching_gamelogs_1958-present.csv
2024-05-17 08:18:58,202 - INFO - Uploaded json to stilesdata.com/dodgers/data/pitching/dodgers_historic_pitching_gamelogs_1958-present.json
2024-05-17 08:18:58,382 - INFO - Uploaded parquet to stilesdata.com/dodgers/data/pitching/dodgers_historic_pitching_gamelogs_1958-present.parquet


In [16]:
# Save a copy of notebook as a python script
# !jupyter nbconvert --to script --no-prompt --output ../scripts/11_fetch_process_historic_pitching_gamelogs 13_fetch_process_historic_pitching_gamelogs.ipynb