In [25]:
import glob
import os
from datetime import date

import numpy as np
import openpyxl
import pandas as pd
import psycopg2 as ps
from psycopg2 import sql
from sqlalchemy import create_engine, text
from sqlalchemy.engine import URL

# Set display option to show all columns
pd.set_option('display.max_columns', None)

# Directory holding the registration CSV exports. Every later relative path
# (the '*.csv' glob in particular) is resolved against it. Set the
# PEACHTREE_DATA_DIR environment variable to run the notebook on another
# machine; without it the original location is used.
DATA_DIR = os.environ.get(
    'PEACHTREE_DATA_DIR',
    'C:\\Users\\cantr\\OneDrive\\Coding\\GiordanoDB\\Peachtree\\registration_data',
)

# Change the current working directory
os.chdir(DATA_DIR)

In [48]:
# Today's date as an MM/DD/YYYY string (compared against 'registration_date' in the last cell)
today = f"{date.today():%m/%d/%Y}"

In [26]:
# Sanity check: show the working directory that the relative '*.csv' glob will search
current_directory = os.getcwd()
print(current_directory)

C:\Users\cantr\OneDrive\Coding\GiordanoDB\Peachtree\registration_data


In [27]:
def parse_and_localize(date_str):
    """Parse a registration timestamp string into a timezone-naive Timestamp.

    The last whitespace-separated token (the timezone abbreviation) is
    discarded and the remainder is parsed with ``%m/%d/%Y %I:%M%p``.
    Despite the name, the result is NOT localized to any timezone.

    Parameters
    ----------
    date_str : str or missing value
        Text such as ``'12/01/2025 10:30AM EST'``. ``None``/``NaN`` is accepted.

    Returns
    -------
    pandas.Timestamp
        The parsed, timezone-naive timestamp, or ``pandas.NaT`` when
        ``date_str`` is missing.

    Raises
    ------
    ValueError
        If a non-missing value does not match the expected format.
    """
    # Empty CSV cells arrive as NaN (a float), which has no .rsplit();
    # propagate them as NaT instead of failing the whole column.
    if pd.isna(date_str):
        return pd.NaT

    # Strip first: with trailing whitespace, rsplit would remove an empty
    # token and leave the timezone abbreviation in place.
    date_str_no_tz = date_str.strip().rsplit(' ', 1)[0]

    # Parse the datetime without timezone
    return pd.to_datetime(date_str_no_tz, format='%m/%d/%Y %I:%M%p')

# Row-wise helper: zero or split the order-level money columns
def divide_or_zero(row):
    """Return ``row`` with its ``columns_to_divide`` values zeroed or split.

    Rows whose 'division' is 8, 13 or 14 get 0 in every column listed in
    ``columns_to_divide``; every other row has those columns divided by
    ``order_counts[row.name]``.

    Both ``columns_to_divide`` and ``order_counts`` are notebook-level names
    that must be assigned before this function is applied (intended for
    ``DataFrame.apply(..., axis=1)``, where ``row.name`` is the row's index label).
    """
    row[columns_to_divide] = (
        0
        if row['division'] in [8, 13, 14]
        else row[columns_to_divide] / order_counts[row.name]
    )
    return row

def sanitize_columns(col_index: pd.Index, desired_columns: list = None) -> pd.Index:
    """Normalize raw CSV header names.

    Applied in order: drop literal backslash-uFEFF text, drop leading BOM
    characters, trim surrounding whitespace, lowercase, and turn spaces into
    underscores. ``desired_columns`` is accepted but not used.
    """
    return (
        col_index
        .str.replace(r'\\uFEFF', '', regex=True)   # literal "\uFEFF" text
        .str.lstrip('\ufeff')                      # real BOM character(s) at the start
        .str.strip()                               # surrounding whitespace
        .str.lower()
        .str.replace(' ', '_')
    )

# def summarize_dataframe(df):
#     """Summarize a dataframe, and report missing values."""
#     missing_values = pd.concat([
#         pd.DataFrame(df.columns, columns=['Variable Name']),
#         pd.DataFrame(df.dtypes.values.reshape([-1,1]), columns=['Data Type']),
#         pd.DataFrame(df.isnull().sum().values, columns=['Missing Values']),
#         pd.DataFrame([df[name].nunique() for name in df.columns], columns=['Unique Values'])
#     ], axis=1).set_index('Variable Name')

#     with pd.option_context("display.max_rows", 1000):
#         summary = pd.concat([missing_values, df.describe(include='all').transpose()], axis=1).infer_objects(copy=False).fillna("")
#         display(summary)


In [28]:
# 1) Header cleanup is done by sanitize_columns(), defined in an earlier cell.

# 2) Season selector: 'full' loads every CSV in the working directory; any
#    other value loads one file whose name contains the season with its
#    spaces replaced by underscores.
season = 'spring 2026'
full_or_override = 'full' if season == 'full' else season.replace(' ', '_')

# 3) Gather CSVs
all_files = glob.glob('*.csv')

if full_or_override != 'full':
    # First file (in glob order) whose name contains the season token
    match = next((f for f in all_files if full_or_override in f), None)
    if not match:
        raise FileNotFoundError(f"No CSV file found matching '{full_or_override}'")
    csv_files_to_process = [match]
else:
    csv_files_to_process = all_files

# 4) Column order of the combined frame, written as the export's display names
desired_columns = [
    'Entry Number', 'SportsEngine ID', 'Registration Date', 'First Name',
    'Last Name', 'Preferred Name', 'Birthdate', 'Gender', 'Shirt Size',
    'Previous Seasons?', 'Previous Spring Team', 'Address 1', 'City',
    'State', 'Zip', 'Country', 'School', 'Grade', 'Sport',
    'division', 'Division (Softball)', 'Division (SB)', 'player_age', 'SB Age',
    'Coach Request', 'Player Request', 'All Girl Tball team?',
    'All Girl Coach Pitch Team', 'Sibling?', 'CANNOT Practice Days',
    'Guardian First Name', 'Guardian Last Name', 'play_up_acknowledgment',
    'Parent/Guardian Cell Phone', 'Guardian Home Phone', 'Guardian Email',
    'Coaching Interest?', 'Head vs Assistant Coach',
    'First Name of Potential Coach', 'Last Name of Potential Coach',
    'Shirt Size_Coach', 'Sponsor Interest?', 'Order Number',
    'Account Email', 'Entry Status', 'Order Status', 'Gross', 'Net',
    'Service Fee', 'Discount Amount', 'Discount Names', 'Subtotal',
    'Refunds', 'Donate', 'Season Team Name', 'Season Team Division Name'
]

# Same names in sanitized form (lowercase, underscores) for use with reindex
desired_cols_lower = [col.lower().replace(' ', '_') for col in desired_columns]

# Sanitized header variants -> the standard name used downstream
column_mapping = {
    'sport_': 'sport',
    'division_(baseball_and_t-ball)': 'division',
    'division_(bb/tb)': 'division',
    'division_(sb)': 'division_(softball)',
    'i_understand_the_criteria_for_evaluating_to_play_up_in_an_older_division_as_written_in_the_peachtree_bylaws_and_summarized_above.': 'play_up_acknowledgment',
    'bb_age': 'player_age'
}

# 5) Read each file, standardize its columns, and tag it with its season
dfs = []
for file in csv_files_to_process:
    df = pd.read_csv(file, encoding='utf-8-sig')
    df.columns = sanitize_columns(df.columns)

    if full_or_override == 'full':
        # Season tag = 2nd and 3rd underscore-separated parts of the file name
        base = os.path.splitext(os.path.basename(file))[0]
        file_identifier = '_'.join(base.split('_')[1:3])
    else:
        file_identifier = full_or_override

    # Columns absent from this file are created as NaN by reindex
    df = (
        df.rename(columns=column_mapping)
          .reindex(columns=desired_cols_lower)
          .assign(season=file_identifier)
    )
    dfs.append(df)

# 6) Combine everything
df_peachtree_orig = pd.concat(dfs, ignore_index=True)

In [29]:
# df_peachtree_orig[df_peachtree_orig['last_name']=='Giordano']
# df_peachtree_orig.head(5)
# df_peachtree_orig.columns
# df_peachtree_orig['division'].value_counts().reset_index(name='count')

In [30]:
# Read-only lookups against the peachtree reference tables. Each result is
# loaded into its own DataFrame for use by the cells that follow.
sql_query_leagues = """
    select league_id
            , sub_division
    from peachtree.league_hierarchy
"""

sql_query_gender = """
    select gender_id
            , gender
    from peachtree.gender
"""

sql_query_all_girl = """
    select all_girl_preference_id
            , all_girl_coach_pitch_team
            , preference_standardized
    from peachtree.all_girls_team_preference
"""

sql_query_donations = """
    select donation_id
            , donation_sub_category
            , amount
    from peachtree.donation_category
"""

sql_query_player_info = """
    select peachtree_id
            , sportsengine_id
    from peachtree.player_info
"""

sql_query_schools = """
        select school_original
                        , school_normalized_id
        from peachtree.vw_school_reference
"""


def _fetch_dataframe(cursor, query):
    """Run ``query`` on ``cursor`` and return all rows as a DataFrame.

    Column names are taken from ``cursor.description``.
    """
    cursor.execute(query)
    rows = cursor.fetchall()
    columns = [desc[0] for desc in cursor.description]
    return pd.DataFrame(rows, columns=columns)


# The password may be supplied through the PGPASSWORD environment variable;
# the literal is only the fallback used when that variable is not set.
conn = ps.connect(
    database="1264bra",
    user="postgres",
    password=os.environ.get("PGPASSWORD", "password"),
    host="localhost",
    port="5432",
)

try:
    cur = conn.cursor()
    try:
        df_peachtree_hierarchy = _fetch_dataframe(cur, sql_query_leagues)
        df_peachtree_gender = _fetch_dataframe(cur, sql_query_gender)
        df_peachtree_all_girl_cp = _fetch_dataframe(cur, sql_query_all_girl)
        df_peachtree_donations = _fetch_dataframe(cur, sql_query_donations)
        df_peachtree_player_info = _fetch_dataframe(cur, sql_query_player_info)
        df_peachtree_schools = _fetch_dataframe(cur, sql_query_schools)
    finally:
        cur.close()
finally:
    # Always release the connection, even when one of the queries fails
    conn.close()

In [31]:
def _lookup(frame, key_column, value_column):
    """Return a dict mapping each ``key_column`` value of ``frame`` to its ``value_column`` value."""
    return frame.set_index(key_column)[value_column].to_dict()


# Raw registration label -> database id, one dictionary per reference table
df_peachtree_dict = _lookup(df_peachtree_hierarchy, 'sub_division', 'league_id')

df_peachtree_donation_dict = _lookup(df_peachtree_donations, 'donation_sub_category', 'donation_id')

df_peachtree_gender_dict = _lookup(df_peachtree_gender, 'gender', 'gender_id')

df_peachtree_all_girl_dict = _lookup(df_peachtree_all_girl_cp, 'all_girl_coach_pitch_team', 'all_girl_preference_id')

df_peachtree_player_dict = _lookup(df_peachtree_player_info, 'sportsengine_id', 'peachtree_id')

df_peachtree_schools_dict = _lookup(df_peachtree_schools, 'school_original', 'school_normalized_id')

In [32]:
# Work on an independent copy so the combined raw frame stays untouched
df_peachtree = df_peachtree_orig.copy(deep=True)

In [33]:
# df_peachtree_orig.head()

In [34]:
# Drop literal commas from the raw 'registration_date' text before it is parsed
df_peachtree['registration_date'] = df_peachtree['registration_date'].str.replace(',', '', regex=False)

In [35]:
# Parse every registration timestamp string with parse_and_localize()
parsed_dates = df_peachtree['registration_date'].apply(parse_and_localize)

# Run the parsed values through to_datetime and store them back in the column
df_peachtree['registration_date'] = pd.to_datetime(parsed_dates, format='%m/%d/%Y %I:%M%p')

In [36]:
# Truncate the parsed timestamps to calendar dates ...
df_peachtree['registration_date'] = df_peachtree['registration_date'].dt.date

# ... and convert them back so the column is datetime-typed again
df_peachtree['registration_date'] = pd.to_datetime(df_peachtree['registration_date'])

# Strip leading/trailing whitespace from every string cell in the frame
df_peachtree = df_peachtree.map(lambda x: x.strip() if isinstance(x, str) else x)

# Extract the 4-digit year from the season label
df_peachtree['season_year'] = df_peachtree['season'].str.extract(r'(\d{4})').astype(float)

# Mask: any season in 2026 or later
mask = df_peachtree['season_year'] >= 2026

# NOTE(review): on pandas 2.x astype(str) also turns a missing sport into the
# text 'nan' - confirm that is acceptable for the uploaded table.
df_peachtree['sport'] = df_peachtree['sport'].astype(str)

# For seasons from 2026 on, collapse the three sport labels into 'Baseball'
df_peachtree.loc[
    mask & df_peachtree['sport'].isin(['T-Ball','Coach Pitch','Baseball']),
    'sport'
] = 'Baseball'

# Replace underscores with spaces in the 'season' label
df_peachtree['season'] = df_peachtree['season'].str.replace('_', ' ')

# Replace the mis-decoded apostrophe sequence in school names with a plain apostrophe
df_peachtree['school'] = df_peachtree['school'].str.replace("â€™", "'")

# Removing '?' from any column headers
df_peachtree.columns = df_peachtree.columns.str.replace('?', '')

# Applying title case to 'first_name' and 'last_name' columns
df_peachtree['first_name'] = df_peachtree['first_name'].str.title()
df_peachtree['last_name'] = df_peachtree['last_name'].str.title()

# Where 'division' / 'player_age' are missing, fall back to the softball-specific columns
df_peachtree.loc[df_peachtree['division'].isnull(), 'division'] = df_peachtree.loc[df_peachtree['division'].isnull(), 'division_(softball)']
df_peachtree.loc[df_peachtree['player_age'].isnull(), 'player_age'] = df_peachtree.loc[df_peachtree['player_age'].isnull(), 'sb_age']

# Translate raw labels to database ids; a label with no dictionary entry becomes NaN
df_peachtree['division'] = df_peachtree['division'].map(df_peachtree_dict)

df_peachtree['donate'] = df_peachtree['donate'].map(df_peachtree_donation_dict)

df_peachtree['gender'] = df_peachtree['gender'].map(df_peachtree_gender_dict)

# Convert the dictionary keys to lowercase; a None key is stored as 'unknown'
df_peachtree_schools_dict = {k.lower() if k is not None else 'unknown': v for k, v in df_peachtree_schools_dict.items()}

# Replace missing values and the string 'None' in the 'school' column with 'Unknown'
df_peachtree['school'] = df_peachtree['school'].fillna('Unknown').replace('None', 'Unknown')

# Map the 'school' column in a case-insensitive manner (unmatched names become NaN)
df_peachtree['school'] = df_peachtree['school'].str.strip().str.lower().map(df_peachtree_schools_dict)

# Dictionary lookup that treats a missing value as the key None
def map_values(val, mapping_dict):
    """Translate ``val`` through ``mapping_dict``.

    A missing ``val`` (per ``pd.isna``) returns the dictionary's entry for
    ``None`` (``None`` when there is no such entry). Any other value returns
    its entry, or the value itself when the dictionary has no entry for it.
    """
    if not pd.isna(val):
        return mapping_dict.get(val, val)
    return mapping_dict.get(None)

# Translate the all-girl coach-pitch answers through df_peachtree_all_girl_dict:
# missing answers take the dictionary's None entry, answers without an entry stay unchanged
df_peachtree['all_girl_coach_pitch_team'] = df_peachtree['all_girl_coach_pitch_team'].apply(map_values, args=(df_peachtree_all_girl_dict,))

# Look up peachtree_id from the SportsEngine id (NaN when the id has no entry)
df_peachtree['peachtree_id'] = df_peachtree['sportsengine_id'].map(df_peachtree_player_dict)

# Define the columns and their target data types
columns_to_convert = {
    'gross': float,
    'net': float,
    'service_fee': float,
    'discount_amount': float,
    'subtotal': float,
    'refunds': float,
    'entry_number': object,
    'donate': object,
    'peachtree_id':object,
    'school': object, 
    'division': object
}

# Remove dollar signs and commas from the float columns, then cast every listed column
# NOTE(review): columns_to_convert holds only float and object, so the 'int64' branch is never taken
for column, dtype in columns_to_convert.items():
    if dtype == float:
        df_peachtree[column] = df_peachtree[column].replace(r'[\$,]', '', regex=True).astype(dtype)
    elif dtype == 'int64':
        df_peachtree[column] = df_peachtree[column].fillna(0).astype(dtype)
    else:
        df_peachtree[column] = df_peachtree[column].astype(dtype)

# Any row with a positive 'refunds' amount is moved to division 8; other rows keep their division
df_peachtree['division'] = df_peachtree.apply(lambda row: 8 if row['refunds'] > 0 else row['division'], axis=1)


# Number of rows per order_number, counted over rows whose division is not 8, 13 or 14.
# The result keeps the index labels of those rows, which divide_or_zero looks up via row.name.
order_counts = df_peachtree[df_peachtree['division'].isin([8, 13, 14]) == False].groupby('order_number')['order_number'].transform('count')

# Columns to be divided
columns_to_divide = ['gross', 'net', 'service_fee', 'subtotal']

# # Divide the selected columns by the count, excluding divisions 13 and 14
# df_peachtree[columns_to_divide] = df_peachtree.apply(lambda row: row[columns_to_divide] / order_counts[row.name] if row['division'] not in [13, 14] else row[columns_to_divide], axis=1)

# Row by row: zero the money columns for divisions 8/13/14, otherwise divide them by the order's row count
df_peachtree = df_peachtree.apply(divide_or_zero, axis=1)

# Rename the mapped columns to the id names: donate -> donation_id, gender -> gender_id,
# all_girl_coach_pitch_team -> all_girl_coach_pitch_id
df_peachtree.rename(columns={'donate': 'donation_id'
                             ,'gender':'gender_id'
                             , 'all_girl_coach_pitch_team': 'all_girl_coach_pitch_id'
                             }, inplace=True)

# Fill null donation_id with 0.00
df_peachtree['donation_id'] = df_peachtree['donation_id'].fillna(0.00)

# Drop the softball helper columns; their values were already copied into 'division' / 'player_age' where those were missing
df_peachtree = df_peachtree.drop(columns=['division_(softball)'
                                          , 'sb_age'
                                          , 'division_(sb)'
                                        #   , 'last_name'
                                        #   , 'preferred_name'
                                        #   , 'birthdate'
                                        #   , 'address_1'
                                        #   , 'city'
                                        #   , 'state'
                                        #   , 'zip'
                                        #   , 'sportsengine_id'
                                          ])

In [37]:
# df_peachtree['division'].value_counts().reset_index()
# df_peachtree[df_peachtree['division']==8]

In [38]:
def _subtotal_less_donation(row):
    """Return the row's subtotal minus the summed 'amount' of its donation category.

    Rows whose donation_id equals 0 keep their subtotal unchanged. For any
    other donation_id, the amounts of all matching rows in
    df_peachtree_donations are summed (0 when nothing matches) and subtracted.
    """
    if row['donation_id'] == 0:
        return row['subtotal']
    matching_amounts = df_peachtree_donations.loc[
        df_peachtree_donations['donation_id'] == row['donation_id'], 'amount'
    ]
    return row['subtotal'] - matching_amounts.sum()


# subtotal_true = subtotal with the donation amount removed
df_peachtree['subtotal_true'] = df_peachtree.apply(_subtotal_less_donation, axis=1)


In [39]:
# df_peachtree[(df_peachtree['season']== 'spring 2025') & (df_peachtree['last_name']=="Giordano")]

In [40]:
# summarize_dataframe(df_peachtree)

In [41]:
# df_peachtree[(df_peachtree['school']==58) & (df_peachtree['season']== 'spring 2024')]
df_peachtree.loc[df_peachtree['school'].isna() & df_peachtree['season'].eq('spring 2026')]
# df_peachtree[(df_peachtree['division'].isna()) & (df_peachtree['season']== 'fall 2025')]
# df_peachtree[df_peachtree['registration_date'] == '2023-11-30']
# df_peachtree['division'].value_counts()
# df_peachtree[df_peachtree['division']==6]
# df_peachtree.value_counts('gender')
# df_peachtree[df_peachtree['last_name']=='Teel']
# df_peachtree

Unnamed: 0,entry_number,sportsengine_id,registration_date,first_name,last_name,preferred_name,birthdate,gender_id,shirt_size,previous_seasons,previous_spring_team,address_1,city,state,zip,country,school,grade,sport,division,player_age,coach_request,player_request,all_girl_tball_team,all_girl_coach_pitch_id,sibling,cannot_practice_days,guardian_first_name,guardian_last_name,play_up_acknowledgment,parent/guardian_cell_phone,guardian_home_phone,guardian_email,coaching_interest,head_vs_assistant_coach,first_name_of_potential_coach,last_name_of_potential_coach,shirt_size_coach,sponsor_interest,order_number,account_email,entry_status,order_status,gross,net,service_fee,discount_amount,discount_names,subtotal,refunds,donation_id,season_team_name,season_team_division_name,season,season_year,peachtree_id,subtotal_true
15,156795885,SN-PBN708742,2025-12-01,Donald,Gheen,,12/19/2021,2,Y-S,0,,3355 Ridge Road,Charlottesville,VA,22901,United States,,Pre-School or Pre-K,"T-Ball, Coach Pitch and Baseball",7,4.0,,,,3,"Jocelyn Gheen, 8U Softball",,DONALD,GHEEN,,7032977123,7032977123,don.gheen@gmail.com,No,,,,,No,ALXI20271,don.gheen@gmail.com,Active,Paid,123.333333,118.826667,4.506667,0.0,,123.333333,0.0,0.0,,,spring 2026,2026.0,747,123.333333
28,156838528,SN-PMY788990,2025-12-02,Thomas,Rogers,Clark,10/28/2020,2,Y-XS,0,,8148 West End Drive,Crozet,VA,22932,United States,,Pre-School or Pre-K,"T-Ball, Coach Pitch and Baseball",7,5.0,,,,3,,,Thomas,Rogers,,4345470151,4345470151,thomascrogers2014@gmail.com,No,,,,,No,AIGR23654,thomascrogers2014@gmail.com,Active,Paid,105.0,100.09,4.91,0.0,,105.0,0.0,0.0,,,spring 2026,2026.0,750,105.0
60,156918921,SN-PET699304,2025-12-05,Caleb,Rubin,,05/12/2015,2,Y-L,3,Minors - Mariachis,1832 Wickham Place,Charlottesville,VA,22901,United States,,5,"T-Ball, Coach Pitch and Baseball",23,10.0,,,,3,,Wednesday,Jake,Rubin,I understand,4342495724,4342495724,rubin.jake@gmail.com,No,,,,,No,LUEY18135,rubin.jake@gmail.com,Active,Paid,120.0,114.6,5.4,0.0,,120.0,0.0,0.0,,,spring 2026,2026.0,473,120.0
70,156929551,SN-PLG792675,2025-12-05,William,Sewell,,01/24/2021,2,Y-S,0,,1045 Autumn Hill Ct,Crozet,VA,22932,United States,,Pre-School or Pre-K,"T-Ball, Coach Pitch and Baseball",7,5.0,,,,3,No,,Jesse,Sewell,,4349897191,4349897191,jwsewell1@gmail.com,No,,,,,No,LXOK55997,jwsewell1@gmail.com,Active,Paid,105.0,100.09,4.91,0.0,,105.0,0.0,0.0,,,spring 2026,2026.0,758,105.0
79,156963722,SN-PLV858088,2025-12-06,Miles,Wargo,,08/28/2021,2,Y-XS,0,,5044 Clearfields Court,Crozet,VA,22932,United States,,Pre-School or Pre-K,"T-Ball, Coach Pitch and Baseball",7,4.0,,,,3,,,Elaine,Wargo,,8146026258,8146026258,empiet90@gmail.com,No,,,,,No,TIEZ69923,elainemwargo@gmail.com,Active,Paid,130.0,124.28,5.72,0.0,,130.0,0.0,1.0,,,spring 2026,2026.0,760,105.0


In [42]:
# Apply season filter only if a specific override is set.
# The filter is applied to df_peachtree, the frame that is uploaded. Its
# 'season' labels have had underscores replaced by spaces, so they are
# comparable with `season`. The raw df_peachtree_orig still carries the
# underscored file-name form (e.g. 'spring_2026'), so filtering it against
# `season` would match nothing.
if full_or_override != 'full':
    # .copy() so later in-place edits act on an independent frame, not a slice
    df_peachtree = df_peachtree[df_peachtree['season'] == season].copy()

In [43]:
# # Get the total sum of 'gross' grouped by 'season'
# gross_totals_by_season = df_peachtree.groupby('season')['gross'].sum()

# # Display the result
# print(gross_totals_by_season)

In [44]:
df_peachtree.shape[0]

107

In [45]:
# Determine write mode based on override: a 'full' load replaces the table,
# a single-season load appends to it
ReplaceOrAppend = 'replace' if full_or_override == 'full' else 'append'

# Database connection. The URL is assembled from its parts so that a password
# containing URL-special characters is escaped correctly. The password may be
# supplied through the PGPASSWORD environment variable; the literal is only
# the fallback used when that variable is not set.
engine = create_engine(
    URL.create(
        drivername='postgresql',
        username='postgres',
        password=os.environ.get('PGPASSWORD', 'password'),
        host='localhost',
        port=5432,
        database='1264bra',
    )
)

# Target schema and table
destinationschema = 'peachtree'
destinationtable = 'player_registrations'

In [46]:
# 'season_year' was only a helper for the cleaning step; drop it if it is still present
df_peachtree = df_peachtree.drop(columns=['season_year'], errors='ignore')

In [47]:
# Number of rows about to be written, reported after the load
upload_count = df_peachtree.shape[0]
is_single_season_load = full_or_override != 'full'

# One transaction for both steps: the delete and the insert commit together,
# or roll back together if either raises
with engine.begin() as conn:
    if is_single_season_load:
        # Clear the season's existing rows so the append below does not duplicate them
        delete_query = text(f"""
            DELETE FROM {destinationschema}.{destinationtable}
            WHERE season = :season
        """)
        result = conn.execute(delete_query, {"season": season})
        print(f"Deleted {result.rowcount} rows for season '{season}'")

    df_peachtree.to_sql(
        destinationtable,
        conn,
        schema=destinationschema,
        if_exists=ReplaceOrAppend,
        index=False,
    )

print(f"Uploaded {upload_count} rows for '{season}' load")

Deleted 105 rows for season 'spring 2026'
Uploaded 107 rows for 'spring 2026' load


In [49]:
df_peachtree.loc[df_peachtree['registration_date'].eq(today)]

Unnamed: 0,entry_number,sportsengine_id,registration_date,first_name,last_name,preferred_name,birthdate,gender_id,shirt_size,previous_seasons,previous_spring_team,address_1,city,state,zip,country,school,grade,sport,division,player_age,coach_request,player_request,all_girl_tball_team,all_girl_coach_pitch_id,sibling,cannot_practice_days,guardian_first_name,guardian_last_name,play_up_acknowledgment,parent/guardian_cell_phone,guardian_home_phone,guardian_email,coaching_interest,head_vs_assistant_coach,first_name_of_potential_coach,last_name_of_potential_coach,shirt_size_coach,sponsor_interest,order_number,account_email,entry_status,order_status,gross,net,service_fee,discount_amount,discount_names,subtotal,refunds,donation_id,season_team_name,season_team_division_name,season,peachtree_id,subtotal_true
104,157125691,SN-PSB134158,2025-12-12,Bennett,Finn,Bennie,11/04/2011,1,A-XL,1,,2560 Wind River Rd,Charlottesville,VA,22901,United States,22.0,8,Softball,12,13.0,,,,3,,Monday nights,William,Finn,,9176853844,9176853844,werbal25@yahoo.com,No,,,,,No,QESD43141,werbal25@yahoo.com,Active,Paid,120.0,114.6,5.4,0.0,,120.0,0.0,0.0,,,spring 2026,662,120.0
105,157139150,SN-PCW409807,2025-12-12,Graham,Ford,,03/18/2015,2,A-S,5,Minors-Mudcats,4815 grassy knl,Charlottesville,VA,22901,United States,11.0,5,"T-Ball, Coach Pitch and Baseball",2,11.0,,,,3,,,Gregory,Ford,,3018737877,3018737877,gford79@gmail.com,No,,,,,No,WOSK39550,gford79@gmail.com,Active,Paid,120.0,114.6,5.4,0.0,,120.0,0.0,0.0,,,spring 2026,179,120.0
106,157140425,SN-PRZ645375,2025-12-12,Kamillia,Gibbs,,06/25/2018,1,Y-M,2,Penguins,1374 Lanetown rd,Crozet,VA,22932,United States,11.0,2,Softball,9,7.0,,,,3,No,Thursday,Diane,Gibbs,,8049373354,8049373354,diane.p.gibbs@gmail.com,Yes,Head Coach,Diane,Gibbs,,No,ZSXM18616,diane.p.gibbs@gmail.com,Active,Paid,120.0,114.6,5.4,0.0,,120.0,0.0,0.0,,,spring 2026,210,120.0


    pg_dump -U postgres -h localhost 1264bra > E:\backups\1264bra_%DATE:~10,4%_%DATE:~4,2%.sql