In [0]:
%pyspark
#!/usr/bin/env python3

import os
from datetime import datetime, timedelta
from urllib.parse import quote_plus

import pandas as pd
import pybaseball
from pybaseball import statcast
from sqlalchemy import create_engine, text

# Start the pipeline timer; this timestamp becomes the log row's start_time

start_time = datetime.now()
print('Pipeline Initiated')

# Empty frame that fixes the column layout of a pipeline-log row
dflog = pd.DataFrame(columns=['pipeline', 'database', 'table', 'start_time', 'end_time', 'username'])

print('Connecting to Database')

# MySQL Database Connection Details
server_name = '10.139.0.31'
database_name = 'baseball'
username = 'sqltravis'
# The password is deliberately kept out of this file: it is read from the
# BASEBALL_DB_PASSWORD environment variable of the interpreter process.
password = os.environ.get('BASEBALL_DB_PASSWORD')
if password is None:
    raise RuntimeError(
        'Database password not configured: set the BASEBALL_DB_PASSWORD '
        'environment variable before running this pipeline.'
    )

# Create SQLAlchemy Engine for MySQL
# quote_plus escapes characters such as '@', ':' and '/' in the password that
# would otherwise be parsed as URL delimiters.
connection_string = (
    f'mysql+pymysql://{username}:{quote_plus(password)}@{server_name}/{database_name}'
)
engine = create_engine(connection_string)

# create_engine() does not connect by itself, so run a trivial query to
# surface bad credentials or an unreachable host before reporting success.
with engine.connect() as probe:
    probe.execute(text('SELECT 1'))

print('Connection Successful')

# Destination table for the Statcast rows
table_name = 'PitchData'

# Ask the database for the newest game_date it already holds
max_date_query = text(f"SELECT MAX(game_date) FROM {table_name}")
with engine.connect() as connection:
    last_loaded_date = connection.execute(max_date_query).scalar()

# Resume on the day after the newest stored game; when the query yields no
# value (nothing loaded yet) fall back to a fixed start date
start_date = (
    '2025-03-01'
    if last_loaded_date is None
    else (last_loaded_date + timedelta(days=1)).strftime('%Y-%m-%d')
)

# The window always ends yesterday (one day before the current local date)
yesterday = datetime.today() - timedelta(days=1)
end_date = yesterday.strftime('%Y-%m-%d')

print(f'Fetching data from {start_date} to {end_date}')

# Get Data From StatCast
# When the table is already up to date, start_date (the day after the newest
# stored game) falls after end_date (yesterday). Skip the request entirely in
# that case instead of handing statcast() a reversed range.
# NOTE(review): pybaseball appears to swap reversed bounds rather than reject
# them, which would re-download and re-append rows that are already stored --
# confirm against the installed version.
# Both dates are zero-padded 'YYYY-MM-DD' strings, so comparing them as
# strings orders them chronologically.
if start_date > end_date:
    baseball_data = pd.DataFrame()
else:
    baseball_data = statcast(start_dt=start_date, end_dt=end_date)

# If no new data, exit
if baseball_data.empty:
    print("No new data to load. Exiting.")
else:
    # Stamp every row with the load time (added as the last column)
    baseball_data['insert_time'] = datetime.now()

    # Append data to the existing table
    baseball_data.to_sql(table_name, engine, if_exists='append', index=False)

    # Capture end time
    end_time = datetime.now()

    # One log row describing this run; keys match the dflog columns
    new_row_data = {
        'pipeline': 'savant-to-baseball',
        'database': database_name,
        'table': table_name,
        'start_time': start_time,
        'end_time': end_time,
        'username': username,
    }

    # Append new row to the log DataFrame
    dflog.loc[len(dflog)] = new_row_data

    # Define log table name
    log_table_name = 'PipelineLog'

    # Insert log data into MySQL table (written only after a successful load)
    dflog.to_sql(log_table_name, engine, if_exists='append', index=False)

    print(f'New data from {start_date} to {end_date} inserted into {table_name} successfully.')
