In [1]:
import pandas as pd
import numpy as np
import psycopg2 as ps
from psycopg2 import sql
from sqlalchemy import create_engine
import os
import openpyxl
from datetime import date
import glob

# Never truncate the column list when a DataFrame is rendered.
pd.options.display.max_columns = None

# Work from the folder holding the registration exports so the relative
# "*.csv" glob used when loading the data resolves against it.
os.chdir('C:\\Users\\cantr\\OneDrive\\Coding\\GiordanoDB\\Peachtree\\registration_data')

In [13]:
# 1) Columns kept from every registration export, in output order.
#    Names are written in their clean form (no BOM, no padding).
desired_columns = [
    'SportsEngine ID',
    'First Name',
    'Last Name',
    'Birthdate',
    'Address 1',
    'City',
    'State',
    'Zip',
]

def sanitize_columns(col_index):
    """Return a cleaned copy of a string column index.

    Header names exported with a byte-order mark (BOM) can carry either the
    real U+FEFF character or the escape text ``\\uFEFF`` typed out literally.
    Both forms are removed wherever they occur, then surrounding whitespace
    is trimmed, so the result can be matched against plain column names.

    Parameters
    ----------
    col_index : pandas.Index
        Index of string labels (anything exposing the ``.str`` accessor).

    Returns
    -------
    pandas.Index
        New index with the same length and order; the input is not modified.
    """
    # Literal backslash-u-FEFF text; case-insensitive so the lower-case
    # spelling of the escape is caught as well.
    col_index = col_index.str.replace(r'\\uFEFF', '', case=False, regex=True)
    # Real BOM characters. Removing them everywhere (not only at the very
    # start) also handles a BOM hidden behind leading whitespace or trailing
    # at the end of a name, which a plain lstrip would leave in place.
    col_index = col_index.str.replace('\ufeff', '', regex=False)
    # Trim whitespace last, after the BOM removal may have exposed more of it.
    return col_index.str.strip()

# 2) Read every registration export, clean its header row, and keep only the
#    desired columns. Files are processed in sorted name order: glob.glob()
#    returns matches in arbitrary order, and row order matters later because
#    de-duplication keeps the first occurrence of each ID.
csv_files = sorted(glob.glob("*.csv"))
if not csv_files:
    # Fail with an explicit message instead of pd.concat's generic
    # "No objects to concatenate" error.
    raise FileNotFoundError(f"No *.csv files found in {os.getcwd()}")

# NOTE(review): column dtypes are inferred by read_csv, so numeric-looking
# values such as ZIP codes may lose leading zeros — confirm whether
# dtype=str is wanted here.
dfs = []
for file in csv_files:
    # utf-8-sig drops a leading BOM while decoding
    df = pd.read_csv(file, encoding='utf-8-sig')
    # Sanitize the headers
    df.columns = sanitize_columns(df.columns)
    print(f"{file} ->", df.columns.tolist())  # confirm cleanup
    # reindex() silently fills absent columns with NaN, so report them.
    missing = [col for col in desired_columns if col not in df.columns]
    if missing:
        print(f"WARNING: {file} is missing columns: {missing}")
    # Reindex to the known-good list (drops extras, fixes the order)
    df = df.reindex(columns=desired_columns)
    dfs.append(df)

# 3) Concatenate once, with a fresh 0..n-1 index across all files
df_peachtree_orig = pd.concat(dfs, ignore_index=True)

dashboard_fall_2024_report.csv -> ['First Name', 'Last Name', 'Preferred Name', 'Birthdate', 'Gender', 'Shirt Size', 'Previous Seasons?', 'Previous Spring Team', 'Address 1', 'City', 'State', 'Zip', 'Country', 'School', 'Grade', 'Sport', 'Division (Baseball and T-Ball)', 'Division (Softball)', 'Coach Request', 'Player Request', 'All Girl Tball team?', 'All Girl Coach Pitch Team', 'Sibling?', 'CANNOT Practice Days', 'Any Comments?', 'Guardian First Name', 'Guardian Last Name', 'Parent/Guardian Cell Phone', 'Guardian Home Phone', 'Guardian Email', 'Coaching Interest?', 'Head vs Assistant Coach', 'First Name of Potential Coach', 'Last Name of Potential Coach', 'Shirt Size_Coach', 'Sponsor Interest?', 'Entry Number', 'Registration Date', 'Order Number', 'Account Email', 'Entry Status', 'SportsEngine ID', 'Order Status', 'Gross', 'Net', 'Service Fee', 'Discount Amount', 'Discount Names', 'Subtotal', 'Refunds', 'Season Team Name', 'Season Team Division Name', 'Donate']
dashboard_fall_2025_re

In [14]:
# Start from a copy so the concatenated raw frame stays untouched.
df_peachtree = df_peachtree_orig.copy()

# Trim surrounding whitespace from every string cell; other values pass through.
df_peachtree = df_peachtree.map(
    lambda cell: cell.strip() if isinstance(cell, str) else cell
)

# snake_case the headers: lower-case, spaces replaced by underscores.
df_peachtree.columns = [
    name.lower().replace(' ', '_') for name in df_peachtree.columns
]

# Disabled: title-case normalization of the name columns.
# df_peachtree['first_name'] = df_peachtree['first_name'].str.title()
# df_peachtree['last_name'] = df_peachtree['last_name'].str.title()

In [15]:
# Sanity check: list any registrations that have no first name.
df_peachtree.loc[df_peachtree['first_name'].isna()]

Unnamed: 0,sportsengine_id,first_name,last_name,birthdate,address_1,city,state,zip


In [16]:
# Load the (sportsengine_id, peachtree_id) pairs already stored in
# peachtree.player_info into a DataFrame.
# NOTE(review): the connection settings, including the password, are
# hard-coded here; consider reading them from the environment instead.
sql_query_players = """
    select sportsengine_id
            , peachtree_id
    from peachtree.player_info
"""

conn = ps.connect(database="1264bra", user="postgres", password="password", host="localhost", port="5432")
try:
    # The cursor context manager closes the cursor on exit, even on error.
    with conn.cursor() as cur:
        cur.execute(sql_query_players)
        results = cur.fetchall()
        # cursor.description holds one entry per result column; item 0 is its name.
        column_names = [desc[0] for desc in cur.description]
finally:
    # Always release the connection, including when the query raises.
    conn.close()

df_existing_peachtree_ids = pd.DataFrame(results, columns=column_names)

In [17]:
# Set of SportsEngine IDs already present in the database, for membership tests.
existing_sportsengine_ids = set(df_existing_peachtree_ids.loc[:, 'sportsengine_id'])

In [18]:
# Largest peachtree_id already stored; 0 when the table returned no rows.
highest_peachtree_id = (
    0
    if df_existing_peachtree_ids.empty
    else df_existing_peachtree_ids['peachtree_id'].max()
)

In [19]:
# Inspect the value that newly assigned peachtree_id numbers will continue from.
highest_peachtree_id

np.int64(764)

In [20]:
# Keep only registrations whose sportsengine_id is not already in the database.
# NOTE(review): rows with a missing sportsengine_id can also pass this filter —
# confirm that is intended.
df_new_entries_orig = df_peachtree.loc[
    ~df_peachtree['sportsengine_id'].isin(existing_sportsengine_ids)
]

In [21]:
# Independent working copy, so adding columns below does not touch the filtered frame.
df_new_entries = df_new_entries_orig.copy(deep=True)

In [22]:
# Quick size check: (rows, columns) of the candidate new-entry frame.
df_new_entries.shape

(0, 8)

In [23]:
# People (same last name, first name and birthdate) that appear under more
# than one distinct sportsengine_id among the new entries.
different_sportsengine_id = (
    df_new_entries
    .groupby(['last_name', 'first_name', 'birthdate'])
    .filter(lambda person_rows: person_rows['sportsengine_id'].nunique() > 1)
)

In [24]:
# Show the flagged rows ordered by name, with a fresh 0..n-1 index.
(
    different_sportsengine_id
    .sort_values(by=['last_name', 'first_name'])
    .reset_index(drop=True)
)

Unnamed: 0,sportsengine_id,first_name,last_name,birthdate,address_1,city,state,zip


In [25]:
# Assign one peachtree_id per person, where a person is identified by
# (last_name, first_name, birthdate). Numbering continues right after the
# highest peachtree_id already stored in the database.
# dropna=False matters: with the default, a row that has a missing key value
# belongs to no group, and the pandas documentation states ngroup() labels
# such rows NaN — which would give those players a null ID and turn the
# whole column into floats.
df_new_entries['peachtree_id'] = (
    df_new_entries
    .groupby(['last_name', 'first_name', 'birthdate'], dropna=False)
    .ngroup()
    + highest_peachtree_id
    + 1
)

# Move the 'peachtree_id' column to the front
columns = ['peachtree_id'] + [col for col in df_new_entries.columns if col != 'peachtree_id']
df_new_entries = df_new_entries[columns]

In [26]:
# Display the new entries together with their assigned peachtree_id values.
df_new_entries

Unnamed: 0,peachtree_id,sportsengine_id,first_name,last_name,birthdate,address_1,city,state,zip


In [27]:
# Step 3: Keep the first row seen for each sportsengine_id.
# NOTE(review): this de-duplicates on sportsengine_id, not peachtree_id, so a
# person with several SportsEngine IDs keeps one row per ID — confirm intended.
df_unique_new_entries = df_new_entries.drop_duplicates(subset=['sportsengine_id'])

In [148]:
# Spot check: new entries with the surname 'Giordano'.
df_unique_new_entries.loc[df_unique_new_entries['last_name'].eq('Giordano')]

Unnamed: 0,peachtree_id,sportsengine_id,first_name,last_name,birthdate,address_1,city,state,zip


In [149]:
# Order by surname then first name and renumber the index from 0.
df_unique_new_entries = df_unique_new_entries.sort_values(
    by=['last_name', 'first_name'],
    ignore_index=True,
)

In [150]:
# Final review of the rows that the upload cell appends to the database.
# NOTE(review): this output shows personal data (names, birthdates, addresses);
# clear the cell output before sharing or committing the notebook.
df_unique_new_entries

Unnamed: 0,peachtree_id,sportsengine_id,first_name,last_name,birthdate,address_1,city,state,zip
0,763,SN-PXJ724651,Leo,Circh,01/31/2022,5316 Raven Stone Rd,Crozet,VA,22932
1,764,SN-PKP777841,Braxton,Simmons,05/27/2014,7122 Bradbury Court,Crozet,VA,22932


In [151]:
# Upload target: table player_info in schema peachtree; rows are appended.
destinationschema = 'peachtree'
destinationtable = 'player_info'
ReplaceOrAppend = 'append'

# NOTE(review): the database password is embedded in this URL; consider
# loading it from the environment instead.
engine = create_engine('postgresql://postgres:password@localhost:5432/1264bra')


In [152]:
# Write the new player rows to the target table; the DataFrame index is not stored.
with engine.connect() as conn:
    df_unique_new_entries.to_sql(
        name=destinationtable,
        con=conn,
        schema=destinationschema,
        if_exists=ReplaceOrAppend,
        index=False,
    )