In [374]:
import glob
import os
from datetime import date
from urllib.parse import quote_plus

import numpy as np
import openpyxl
import pandas as pd
import psycopg2 as ps
from psycopg2 import sql
from sqlalchemy import create_engine

# Render every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

# Folder holding the registration CSV exports. The notebook switches into it
# so that the relative '*.csv' glob further down resolves against this folder.
REGISTRATION_DATA_DIR = 'C:\\Users\\cantr\\OneDrive\\Coding\\GiordanoDB\\Peachtree\\registration_data'
os.chdir(REGISTRATION_DATA_DIR)

In [375]:
# Read back the process working directory (set by os.chdir in the setup cell).
current_directory = os.getcwd()

In [376]:
# Echo the working directory as a sanity check before globbing for CSV files.
print(current_directory)

C:\Users\cantr\OneDrive\Coding\GiordanoDB\Peachtree\registration_data


In [377]:
def parse_and_localize(date_str):
    """Parse a registration timestamp string into a naive ``pd.Timestamp``.

    The input is expected to look like ``'07/15/2024 10:30AM EDT'``: a
    ``%m/%d/%Y %I:%M%p`` timestamp followed by a space and a timezone token.
    The trailing token is discarded and the remainder is parsed.

    Despite the name, the result is NOT timezone-localized; it is a naive
    timestamp carrying the wall-clock time found in the string.

    Parameters
    ----------
    date_str : str or missing value
        Timestamp text, or ``None`` / ``NaN`` / ``NaT`` for a missing entry.

    Returns
    -------
    pd.Timestamp
        The parsed naive timestamp, or ``pd.NaT`` when the input is missing.

    Raises
    ------
    ValueError
        If the text does not match the ``%m/%d/%Y %I:%M%p`` format.
    """
    # Missing cells arrive as float NaN / None, which have no .rsplit();
    # map them to NaT instead of failing the whole column.
    if pd.isna(date_str):
        return pd.NaT
    # Everything after the last space is the timezone token; drop it.
    date_str_no_tz = date_str.rsplit(' ', 1)[0]
    return pd.to_datetime(date_str_no_tz, format='%m/%d/%Y %I:%M%p')

In [467]:
# Collect every CSV export in the working directory. glob returns paths in
# arbitrary, OS-dependent order, so sort them to keep the row order of the
# combined frame reproducible from run to run.
csv_files = sorted(glob.glob('*.csv'))
if not csv_files:
    # Without this guard pd.concat fails later with an unhelpful
    # "No objects to concatenate" message.
    raise FileNotFoundError(f'No CSV files found in {os.getcwd()}')

# Column layout every export is coerced to (adjust when the export changes).
# Headers are matched exactly, including the trailing space in 'Sport '.
desired_columns = ['Entry Number', 'SportsEngine ID', 'Registration Date', 'First Name',
       'Last Name', 'Preferred Name', 'Birthdate', 'Gender', 'Shirt Size',
       'Previous Seasons?', 'Previous Spring Team', 'Address 1', 'City',
       'State', 'Zip', 'Country', 'School', 'Grade', 'Sport ',
       'Division (Baseball and T-Ball)', 'Division (Softball)',
       'Coach Request', 'Player Request', 'All Girl Tball team?',
       'All Girl Coach Pitch Team', 'Sibling?', 'CANNOT Practice Days',
       'Guardian First Name', 'Guardian Last Name',
       'Parent/Guardian Cell Phone', 'Guardian Home Phone', 'Guardian Email',
       'Coaching Interest?', 'Head vs Assistant Coach',
       'First Name of Potential Coach', 'Last Name of Potential Coach',
       'Shirt Size_Coach', 'Sponsor Interest?', 'Order Number',
       'Account Email', 'Entry Status', 'Order Status', 'Gross', 'Net',
       'Service Fee', 'Discount Amount', 'Discount Names', 'Subtotal',
       'Refunds', 'Donate', 'Season Team Name', 'Season Team Division Name']

# One dataframe per file, concatenated once at the end.
dfs = []

for file in csv_files:
    df = pd.read_csv(file)
    # reindex puts the columns in the desired order; a listed column that is
    # absent from the file is added as all-NaN, and any column not listed is
    # dropped.
    df = df.reindex(columns=desired_columns)
    # Season tag = 2nd and 3rd underscore-separated tokens of the file stem
    # (e.g. fall_2024).
    file_identifier = '_'.join(os.path.splitext(os.path.basename(file))[0].split('_')[1:3])
    df['season'] = file_identifier
    dfs.append(df)

# Stack all seasons into a single frame with a fresh 0..n-1 index.
df_peachtree_orig = pd.concat(dfs, ignore_index=True)

In [468]:
# Open the Postgres connection used by this cell and the next one (which also
# closes it). The password may be supplied through the PGPASSWORD environment
# variable so it does not have to live in the notebook; the literal fallback
# keeps the previous behaviour when the variable is not set.
conn = ps.connect(
    database="1264bra",
    user="postgres",
    password=os.environ.get("PGPASSWORD", "password"),
    host="localhost",
    port="5432",
)

cur = conn.cursor()

# Lookup table: division label (sub_division) -> league_id.
sql_query_leagues = """
    select league_id
            , sub_division
    from peachtree.league_hierarchy
"""

cur.execute(sql_query_leagues)

# Build the frame from the fetched rows, taking the column names from the
# cursor metadata (first item of each description entry).
results = cur.fetchall()
column_names = [desc[0] for desc in cur.description]
df_peachtree_hierarchy = pd.DataFrame(results, columns=column_names)

In [469]:
# Lookup table: donation label (donation_sub_category) -> donation_id.
sql_query_donations = """
    select donation_id
            , donation_sub_category
    from peachtree.donation_category
"""

try:
    # Clear any aborted transaction left on the shared connection so the
    # query below is not rejected.
    conn.rollback()
    cur.execute(sql_query_donations)

    results = cur.fetchall()
    column_names = [desc[0] for desc in cur.description]
finally:
    # Always release the cursor and connection, even if the query failed.
    cur.close()
    conn.close()

df_peachtree_donations = pd.DataFrame(results, columns=column_names)

In [470]:
def _column_lookup(frame, key_column, value_column):
    """Return a ``{key_column value: value_column value}`` dict built from ``frame``."""
    return frame.set_index(key_column)[value_column].to_dict()


# sub_division label -> league_id
df_peachtree_dict = _column_lookup(df_peachtree_hierarchy, 'sub_division', 'league_id')

# donation_sub_category label -> donation_id
df_peachtree_donation_dict = _column_lookup(df_peachtree_donations, 'donation_sub_category', 'donation_id')

In [523]:
# Work on a copy so the concatenated raw frame (df_peachtree_orig) is left
# untouched; re-running from this cell restarts the cleaning steps from it.
df_peachtree = df_peachtree_orig.copy()

In [524]:
# Number of columns in the working frame: the entries of desired_columns plus
# the 'season' column added while loading.
df_peachtree.shape[1]

53

In [525]:
# snake_case-ish headers: lower-case everything and turn spaces into
# underscores (so the 'Sport ' header becomes 'sport_').
df_peachtree = df_peachtree.rename(columns=lambda header: header.lower().replace(' ', '_'))

In [526]:
# Strip the commas out of the raw registration timestamp text (literal match,
# no regex) so it fits the format string used when parsing it.
registration_text = df_peachtree['registration_date']
df_peachtree['registration_date'] = registration_text.str.replace(',', '', regex=False)

In [527]:
# Parse the registration timestamps in one vectorised pass: drop the trailing
# timezone token (everything after the last space), then parse what is left.
# This yields the same naive timestamps as applying parse_and_localize row by
# row, but missing values become NaT instead of raising, and the result is
# already datetime64, so no second pd.to_datetime call is needed.
registration_no_tz = df_peachtree['registration_date'].str.rsplit(' ', n=1).str[0]
df_peachtree['registration_date'] = pd.to_datetime(registration_no_tz, format='%m/%d/%Y %I:%M%p')

In [528]:
# NOTE: this cell drops columns it also reads, so it cannot be re-run on its
# own; rebuild df_peachtree from df_peachtree_orig first.

# Keep only the calendar date: normalize() zeroes the time-of-day while the
# column stays datetime64 (same result as .dt.date followed by pd.to_datetime,
# without the round-trip through Python date objects).
df_peachtree['registration_date'] = df_peachtree['registration_date'].dt.normalize()

# Use the baseball/t-ball division column as the single 'division' column and
# drop the trailing underscore left over from the 'Sport ' header.
df_peachtree = df_peachtree.rename(columns={
    'division_(baseball_and_t-ball)': 'division',
    'sport_': 'sport',
})

# Season tag: 'fall_2024' -> 'fall 2024'.
df_peachtree['season'] = df_peachtree['season'].str.replace('_', ' ', regex=False)

# Softball rows with no value in 'division' take it from the softball-specific
# column instead.
softball_missing = (df_peachtree['sport'] == 'Softball') & df_peachtree['division'].isnull()
df_peachtree.loc[softball_missing, 'division'] = df_peachtree.loc[softball_missing, 'division_(softball)']

df_peachtree = df_peachtree.drop(columns=['division_(softball)', 'sport'])

# Replace the text labels with database ids; labels missing from a lookup
# dict become NaN.
df_peachtree['division'] = df_peachtree['division'].map(df_peachtree_dict)
df_peachtree['donate'] = df_peachtree['donate'].map(df_peachtree_donation_dict)

# Define the columns and their target data types
columns_to_convert = {
    'gross': float,
    'net': float,
    'service_fee': float,
    'discount_amount': float,
    'subtotal': float,
    'refunds': float,
    'entry_number': object,
    'donate': 'int64'
}

for column, dtype in columns_to_convert.items():
    if dtype == float:
        # Money columns: strip '$' and thousands separators before casting.
        df_peachtree[column] = df_peachtree[column].replace(r'[\$,]', '', regex=True).astype(dtype)
    elif dtype == 'int64':
        # Unmapped/missing ids become 0 so the column can be a plain integer.
        df_peachtree[column] = df_peachtree[column].fillna(0).astype(dtype)
    else:
        df_peachtree[column] = df_peachtree[column].astype(dtype)

# Rows with a positive refund amount are moved to the 'Refunded' league.
# NOTE(review): any refund > 0, including a partial one, triggers this -
# confirm that is intended. The id 8 is presumably the league_id of the
# 'Refunded' entry in peachtree.league_hierarchy; verify against the table.
REFUNDED_LEAGUE_ID = 8
df_peachtree['division'] = df_peachtree['division'].mask(df_peachtree['refunds'] > 0, REFUNDED_LEAGUE_ID)

# Number of rows sharing each order_number, aligned row-by-row with the frame.
order_counts = df_peachtree.groupby('order_number')['order_number'].transform('count')

# Split these amounts evenly across the rows that share an order_number.
columns_to_divide = ['gross', 'net', 'service_fee', 'subtotal']
df_peachtree[columns_to_divide] = df_peachtree[columns_to_divide].div(order_counts, axis=0)

In [529]:
# Spot check: show the spring 2025 rows for one family (several rows sharing a
# single order_number) to inspect the per-row money columns after the transform.
df_peachtree[(df_peachtree['season']== 'spring 2025') & (df_peachtree['last_name']=="Kirchner")]

Unnamed: 0,entry_number,sportsengine_id,registration_date,first_name,last_name,preferred_name,birthdate,gender,shirt_size,previous_seasons?,previous_spring_team,address_1,city,state,zip,country,school,grade,division,coach_request,player_request,all_girl_tball_team?,all_girl_coach_pitch_team,sibling?,cannot_practice_days,guardian_first_name,guardian_last_name,parent/guardian_cell_phone,guardian_home_phone,guardian_email,coaching_interest?,head_vs_assistant_coach,first_name_of_potential_coach,last_name_of_potential_coach,shirt_size_coach,sponsor_interest?,order_number,account_email,entry_status,order_status,gross,net,service_fee,discount_amount,discount_names,subtotal,refunds,donate,season_team_name,season_team_division_name,season
834,141344842,SN-PXJ233349,2024-12-16,Andrew,Kirchner,,01/22/2016,Male,Y-M,4,Dodgers - Rookie,123 Walk Around Lane,Roseland,VA,22967,United States,Rockfish River Elementary,3rd,3,,,,,,,Meagan,Kirchner,4349071167,4349071167,mkidwel2@gmail.com,Yes,Assistant Coach,Charles,Kirchner,A-M,Yes,UJKA34137,mkidwel2@gmail.com,Active,Paid,345.0,332.29,12.71,0.0,,345.0,0.0,0,,,spring 2025
835,141344941,SN-PZE698951,2024-12-16,Audrey,Kirchner,,10/10/2017,Female,Y-S,2,Tball,123 Walk Around Lane,Roseland,VA,22967,United States,Rockfish River Elementary,1st,9,,,,,,,Meagan,Kirchner,4349071167,4349071167,mkidwel2@gmail.com,No,,,,,No,UJKA34137,mkidwel2@gmail.com,Active,Paid,345.0,332.29,12.71,0.0,,345.0,0.0,0,,,spring 2025
836,141345100,SN-PZP046548,2024-12-16,Arthur,Kirchner,,06/03/2019,Male,Y-XS,1,tball,123 Walk Around Lane,Roseland,VA,22967,United States,Rockfish River Elementary,K,6,,,,,,depends on practice location and sibling teams...,Meagan,Kirchner,4349071167,4349071167,mkidwel2@gmail.com,No,,,,,No,UJKA34137,mkidwel2@gmail.com,Active,Paid,345.0,332.29,12.71,0.0,,345.0,0.0,0,,,spring 2025


In [514]:
# def summarize_dataframe(df):
#     """Summarize a dataframe, and report missing values."""
#     missing_values = pd.concat([
#         pd.DataFrame(df.columns, columns=['Variable Name']),
#         pd.DataFrame(df.dtypes.values.reshape([-1,1]), columns=['Data Type']),
#         pd.DataFrame(df.isnull().sum().values, columns=['Missing Values']),
#         pd.DataFrame([df[name].nunique() for name in df.columns], columns=['Unique Values'])
#     ], axis=1).set_index('Variable Name')

#     with pd.option_context("display.max_rows", 1000):
#         summary = pd.concat([missing_values, df.describe(include='all').transpose()], axis=1).infer_objects(copy=False).fillna("")
#         display(summary)


In [515]:
# summarize_dataframe(df_peachtree)

In [516]:
# df_peachtree[df_peachtree['last_name']=='Giordano']
# df_peachtree[df_peachtree['registration_date'] == '2023-11-30']
# df_peachtree['division'].value_counts()
# df_peachtree[df_peachtree['division']==6]
# df_peachtree.value_counts('gender')

In [517]:
# Get the total sum of 'gross' grouped by 'season'
gross_totals_by_season = df_peachtree.groupby('season')['gross'].sum()

# Display the result
print(gross_totals_by_season)

season
fall 2024      26181.0
spring 2024    51677.0
spring 2025    19170.0
Name: gross, dtype: float64


In [518]:
# Display the transformed frame for a final visual check before it is written
# to the database.
df_peachtree

Unnamed: 0,entry_number,sportsengine_id,registration_date,first_name,last_name,preferred_name,birthdate,gender,shirt_size,previous_seasons?,previous_spring_team,address_1,city,state,zip,country,school,grade,division,coach_request,player_request,all_girl_tball_team?,all_girl_coach_pitch_team,sibling?,cannot_practice_days,guardian_first_name,guardian_last_name,parent/guardian_cell_phone,guardian_home_phone,guardian_email,coaching_interest?,head_vs_assistant_coach,first_name_of_potential_coach,last_name_of_potential_coach,shirt_size_coach,sponsor_interest?,order_number,account_email,entry_status,order_status,gross,net,service_fee,discount_amount,discount_names,subtotal,refunds,donate,season_team_name,season_team_division_name,season
0,134707507,SN-PLJ375188,2024-07-15,Ainsley,Dority,,10/07/2016,Female,Y-M,3,8U Pumas,PO BOX 411,Batesville,VA,22924,United States,Brownsville Elementary,2nd,8,,,,,,,Mandy,Dority,434 2577092,4342577092,mandydority@icloud.com,Yes,Assistant Coach,Mandy,Dority,,,EIVT62700,mandydority@icloud.com,Active,Paid,120.0,115.32,4.68,0.0,,120.0,22.0,0,8U Penguins,8U Softball,fall 2024
1,134708124,SN-PHX152990,2024-07-15,Addison,Pietro,,09/08/2016,Female,Y-M,0,,2939 Rambling Brook Lane,Crozet,VA,22932,United States,Brownsville Elementary School,3rd,8,,,,,No,No,Justin,Pietro,4349872578,4349872578,jpietro@gmail.com,No,,,,,No,LDRS69789,gemmaapietro@gmail.com,Active,Paid,120.0,115.32,4.68,0.0,,120.0,22.0,0,8U Penguins,8U Softball,fall 2024
2,134708295,SN-PZT444902,2024-07-15,William,OBrien,James,05/01/2018,Male,Y-S,0,,1402 Stillhouse Ridge Ln,Charlottesville,VA,22903,United States,Brownsville Elementary,1st,8,,Request James play with other children from Br...,,,,,John,OBrien,434-906-7884,434-906-7884,jmobrien88@gmail.com,No,,,,,No,IACM98013,jmobrien88@gmail.com,Active,Paid,105.0,100.32,4.68,0.0,,105.0,7.0,0,CP Yankees,Coach Pitch,fall 2024
3,134708910,SN-PNE750392,2024-07-15,Genevieve,Fosdick,,09/04/2020,Female,Y-XS,0,,1554 Wickham Pond Drive,Charlottesville,VA,22901,United States,Bright Beginnings Preschool,K,8,,,Yes,,No,,Jessica,Fosdick,5406886345,5406886345,jefosdick@gmail.com,No,,,,,No,JTXW58280,jefosdick@gmail.com,Active,Paid,105.0,100.32,4.68,0.0,,105.0,7.0,0,Tball Athletics,Tball,fall 2024
4,134861403,SN-PDR648385,2024-07-17,James,Buckett,,08/30/2017,Male,Y-M,3,Twins Tball,1042 Rolling Meadow Ln,Crozet,VA,22932,United States,Crozet Elementary,1st,4,,,,,No,Monday and Wednesday,Jennifer,Buckett,434-326-6315,4343266315,jenbuckett@gmail.com,No,,,,,Yes,YPDC95639,jenbuckett@gmail.com,Active,Paid,123.0,117.50,5.50,0.0,,123.0,0.0,1,Rookie Yankees,Rookie,fall 2024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
899,141627690,SN-PAC089220,2024-12-30,Dean,O'Brien,,08/07/2018,Male,Y-S,4,Coach pitch,2519 White Hall Rd,Crozet,VA,22932,United States,Crozet elementary,K,6,Brian Donovan,"Brooks and Barrett Thompson, Bryn Donovan",,,,,Joseph,O'Brien,4349063875,4349063875,joseph.francis.obrien@gmail.com,Yes,Assistant Coach,Joe,O'Brien,A-L,No,LEBP82669,karenerobrien@gmail.com,Active,Paid,105.0,100.84,4.16,0.0,,105.0,0.0,0,,,spring 2025
900,141627727,SN-PQE005106,2024-12-30,Sean,O'Brien,,12/08/2020,Male,Y-XS,0,,2519 White Hall Rd,Crozet,VA,22932,United States,Daylily,K,7,Brian Donovan,Wyatt Donovan and Gray Thompson,,,Dean is playing coach pitch again,,Joseph,O'Brien,4349063875,4349063875,joseph.francis.obrien@gmail.com,Yes,Assistant Coach,Joe,O'Brien,A-L,No,LEBP82669,karenerobrien@gmail.com,Active,Paid,105.0,100.84,4.16,0.0,,105.0,0.0,0,,,spring 2025
901,141628036,SN-PMX806345,2024-12-30,Maeve,Sowers,,06/02/2017,Female,Y-S,4,Sloths 8u,367 Newtown Road,Greenwood,VA,22943,United States,brownsville,2nd,9,,,,,,,Julia,Sowers,609-216-4168,609-216-4168,julia.sowers@gmail.com,No,,,,,No,EFUP36767,julia.sowers@gmail.com,Active,Paid,120.0,114.60,5.40,0.0,,120.0,0.0,0,,,spring 2025
902,141628193,SN-PSN002552,2024-12-30,Shayn,Steppe,,05/03/2014,Male,A-M,4,Emeralds,1640 Wickham Way,Charlottesville,VA,22901,United States,Crozet Elementary,5th,2,,,,,No,Tuesday and Thursdays are very difficult,Wilbert,Steppe,614-893-2608,434-812-2258,madic4me@gmail.com,No,,,,,No,ZUEP48000,eosteppe@gmail.com,Active,Open,0.0,0.00,0.00,0.0,,120.0,0.0,0,,,spring 2025


In [519]:
# SQLAlchemy engine for the write-back to the local '1264bra' database.
# The password may be supplied through the PGPASSWORD environment variable so
# it does not have to live in the notebook; the literal fallback keeps the
# previous behaviour when the variable is not set. quote_plus escapes
# characters such as '@' or '/' that would otherwise break the URL.
db_password = os.environ.get('PGPASSWORD', 'password')
engine = create_engine(f'postgresql://postgres:{quote_plus(db_password)}@localhost:5432/1264bra')

In [520]:
# Destination of the cleaned registrations. ReplaceOrAppend is passed to
# DataFrame.to_sql as if_exists: 'replace' drops and recreates the table on
# every run, 'append' adds the rows to the existing table.
ReplaceOrAppend = 'replace'
destinationschema = 'peachtree'
destinationtable = 'player_registrations'

In [521]:
# Write the cleaned frame to <destinationschema>.<destinationtable>.
# engine.begin() opens a connection inside an explicit transaction that is
# committed when the block exits normally and rolled back if to_sql raises,
# so the write does not depend on implicit commit behaviour.
with engine.begin() as conn:
    df_peachtree.to_sql(
        destinationtable,
        con=conn,
        schema=destinationschema,
        if_exists=ReplaceOrAppend,
        index=False,
    )