In [1]:
import pandas as pd
import numpy as np
import psycopg2 as ps
from psycopg2 import sql
from sqlalchemy import create_engine
import os
import openpyxl
from datetime import date
import glob

# Show every column when a DataFrame is rendered (wide frames are not truncated).
pd.set_option('display.max_columns', None)

# Directory that holds the registration CSV exports. It can be overridden with the
# PEACHTREE_DATA_DIR environment variable so the notebook is runnable on other
# machines; when the variable is unset the original location is used.
data_dir = os.environ.get(
    'PEACHTREE_DATA_DIR',
    'C:\\Users\\cantr\\OneDrive\\Coding\\GiordanoDB\\Peachtree\\registration_data',
)

# Later cells locate the CSV files with a relative glob, so make the data
# directory the current working directory.
os.chdir(data_dir)

In [2]:
# Capture the process working directory (changed with os.chdir in the first cell)
# so it can be inspected in the next cell.
current_directory: str = os.getcwd()

In [3]:
# Show the directory captured above; the relative '*.csv' glob used later searches
# the working directory.
print(current_directory)

C:\Users\cantr\OneDrive\Coding\GiordanoDB\Peachtree\registration_data


In [4]:
def parse_and_localize(date_str):
    """Parse a registration timestamp string into a timezone-naive Timestamp.

    Everything after the last space (presumably the timezone abbreviation,
    e.g. ``'EST'``) is discarded and the remainder is parsed with the format
    ``'%m/%d/%Y %I:%M%p'``. Despite the function name the result is NOT
    localized: timezone localization is deliberately left disabled, so the
    returned value carries no timezone information.

    Parameters
    ----------
    date_str : str or missing value
        Timestamp text such as ``'07/15/2024 10:30AM EST'`` (commas already
        removed by the caller). ``None``/``NaN`` are accepted.

    Returns
    -------
    pandas.Timestamp or pandas.NaT
        The parsed timestamp, or ``NaT`` when ``date_str`` is missing.

    Raises
    ------
    ValueError
        If a non-missing string does not match the expected format.
    """
    # Missing cells arrive as float NaN (or None) when this is used with
    # Series.apply; they have no .rsplit, so map them to NaT explicitly.
    if pd.isna(date_str):
        return pd.NaT
    # Drop the trailing token so the fixed format below matches.
    date_str_no_tz = date_str.rsplit(' ', 1)[0]
    return pd.to_datetime(date_str_no_tz, format='%m/%d/%Y %I:%M%p')

In [271]:
# Collect every CSV export in the current working directory. The matches are
# sorted because glob does not guarantee any ordering, and the row order (and
# therefore the index) of the combined frame should be reproducible.
csv_files = sorted(glob.glob('*.csv'))
if not csv_files:
    # pd.concat would fail later with an opaque "No objects to concatenate"
    # ValueError; raise the same exception type with a message that says
    # where the files were expected.
    raise ValueError(f'No CSV files found in {os.getcwd()}')

# Columns to keep, in output order. Names must match the CSV headers exactly
# (note the trailing space in 'Sport ').
desired_columns = ['Entry Number', 'SportsEngine ID', 'Registration Date', 'First Name',
       'Last Name', 'Preferred Name', 'Birthdate', 'Gender', 'Shirt Size',
       'Previous Seasons?', 'Previous Spring Team', 'Address 1', 'City',
       'State', 'Zip', 'Country', 'School', 'Grade', 'Sport ',
       'Division (Baseball and T-Ball)', 'Division (Softball)',
       'Coach Request', 'Player Request', 'All Girl Tball team?',
       'All Girl Coach Pitch Team', 'Sibling?', 'CANNOT Practice Days',
       'Guardian First Name', 'Guardian Last Name',
       'Parent/Guardian Cell Phone', 'Guardian Home Phone', 'Guardian Email',
       'Coaching Interest?', 'Head vs Assistant Coach',
       'First Name of Potential Coach', 'Last Name of Potential Coach',
       'Shirt Size_Coach', 'Sponsor Interest?', 'Order Number',
       'Account Email', 'Entry Status', 'Order Status', 'Gross', 'Net',
       'Service Fee', 'Discount Amount', 'Discount Names', 'Subtotal',
       'Refunds', 'Donate', 'Season Team Name', 'Season Team Division Name']

# One dataframe per file; concatenated once after the loop.
dfs = []

for file in csv_files:
    df = pd.read_csv(file)
    # Select and order the desired columns. reindex fills columns that are
    # absent from this file with NaN and drops columns that are not listed.
    df = df.reindex(columns=desired_columns)
    # Season label: the 2nd and 3rd underscore-separated tokens of the file
    # name without its extension, joined by '_' (e.g. fall_2024).
    file_identifier = '_'.join(os.path.splitext(os.path.basename(file))[0].split('_')[1:3])
    df['season'] = file_identifier
    dfs.append(df)

# Stack all seasons into a single frame with a fresh 0..n-1 index.
df_peachtree_orig = pd.concat(dfs, ignore_index=True)

In [242]:
def _read_query(cursor, query):
    """Execute ``query`` on ``cursor`` and return all rows as a DataFrame.

    Column names are taken from ``cursor.description``.
    """
    cursor.execute(query)
    rows = cursor.fetchall()
    columns = [desc[0] for desc in cursor.description]
    return pd.DataFrame(rows, columns=columns)


sql_query_leagues = """
    select league_id
            , sub_division
    from peachtree.league_hierarchy
"""

sql_query_gender = """
    select gender_id
            , gender
    from peachtree.gender
"""

sql_query_all_girl_cp = """
    select all_girl_coach_pitch_id
            , all_girl_coach_pitch_team
            , preference_standardized
    from peachtree.all_girls_coach_pitch_team
"""

sql_query_donations = """
    select donation_id
            , donation_sub_category
            , amount
    from peachtree.donation_category
"""

# NOTE(review): the connection parameters (including the password) are hard-coded;
# consider reading them from environment variables or a secrets store before
# sharing this notebook.
conn = ps.connect(database="1264bra", user="postgres", password="password", host="localhost", port="5432")
try:
    cur = conn.cursor()
    try:
        # Each lookup table is read in full. The rollback between reads ends the
        # transaction that the previous SELECT implicitly opened.
        df_peachtree_hierarchy = _read_query(cur, sql_query_leagues)

        conn.rollback()
        df_peachtree_gender = _read_query(cur, sql_query_gender)

        conn.rollback()
        df_peachtree_all_girl_cp = _read_query(cur, sql_query_all_girl_cp)

        conn.rollback()
        df_peachtree_donations = _read_query(cur, sql_query_donations)
    finally:
        # Release the cursor even when one of the queries raises.
        cur.close()
finally:
    # Always close the connection so a failed query does not leak it.
    conn.close()

In [243]:
def _id_lookup(frame, key_column, id_column):
    """Return a ``{key_column value: id_column value}`` dict built from ``frame``."""
    return frame.set_index(key_column)[id_column].to_dict()


# Label -> id dictionaries used to translate registration text into database ids.
df_peachtree_dict = _id_lookup(df_peachtree_hierarchy, 'sub_division', 'league_id')

df_peachtree_donation_dict = _id_lookup(df_peachtree_donations, 'donation_sub_category', 'donation_id')

df_peachtree_gender_dict = _id_lookup(df_peachtree_gender, 'gender', 'gender_id')

df_peachtree_all_girl_dict = _id_lookup(df_peachtree_all_girl_cp, 'all_girl_coach_pitch_team', 'all_girl_coach_pitch_id')

In [273]:
# Inspect the all-girl coach-pitch lookup. The displayed dict contains a None key
# (presumably a NULL label in the source table), which map_values relies on for
# missing answers.
df_peachtree_all_girl_dict

{'No': 2, None: 3, 'Yes': 1}

In [274]:
# Work on a copy so the loaded frame (df_peachtree_orig) is left unmodified by the
# cleaning cells that follow.
df_peachtree = df_peachtree_orig.copy()

In [275]:
# Number of columns in the working frame: the desired CSV columns plus the
# 'season' column added while loading.
df_peachtree.shape[1]

53

In [276]:
# Normalize the headers: lower-case them and turn spaces into underscores.
# Other punctuation is kept, e.g. 'Sport ' -> 'sport_' and
# 'Previous Seasons?' -> 'previous_seasons?'.
df_peachtree.columns = df_peachtree.columns.str.lower().str.replace(' ', '_')

In [277]:
# Remove commas from the raw 'registration_date' strings so they match the
# '%m/%d/%Y %I:%M%p' format that parse_and_localize expects.
df_peachtree['registration_date'] = df_peachtree['registration_date'].str.replace(',', '')

In [278]:
# Turn every raw timestamp string into a Timestamp (timezone suffix dropped).
parsed_registration_dates = df_peachtree['registration_date'].apply(parse_and_localize)

# Store the result as a datetime column without timezone information.
df_peachtree['registration_date'] = pd.to_datetime(parsed_registration_dates, format='%m/%d/%Y %I:%M%p')

In [279]:
# Keep only the calendar date of each registration: the time of day is reset to
# midnight while the column stays in datetime dtype (no detour through Python
# date objects).
df_peachtree['registration_date'] = df_peachtree['registration_date'].dt.normalize()

# Trim leading/trailing whitespace from every string cell; non-string values
# pass through unchanged.
df_peachtree = df_peachtree.map(lambda x: x.strip() if isinstance(x, str) else x)

# Shorten the two awkward headers left over from the header normalization.
df_peachtree = df_peachtree.rename(columns={'division_(baseball_and_t-ball)': 'division'
                                            , 'sport_' : 'sport'
                                            })

# Replace underscores in the 'season' label with spaces (e.g. 'fall_2024' -> 'fall 2024').
df_peachtree['season'] = df_peachtree['season'].str.replace('_', ' ')

# Title-case player names so differently-cased entries agree.
df_peachtree['first_name'] = df_peachtree['first_name'].str.title()
df_peachtree['last_name'] = df_peachtree['last_name'].str.title()

# Softball rows without a 'division' value take it from the softball-specific column.
softball_missing_division = (df_peachtree['sport'] == 'Softball') & (df_peachtree['division'].isnull())
df_peachtree.loc[softball_missing_division, 'division'] = df_peachtree.loc[softball_missing_division, 'division_(softball)']

# Both helper columns are no longer needed once 'division' is filled.
df_peachtree = df_peachtree.drop(columns=['division_(softball)', 'sport'])

# Translate text labels into database ids. Series.map yields NaN for any label
# that is missing from the lookup dict, so unexpected labels become NaN here.
df_peachtree['division'] = df_peachtree['division'].map(df_peachtree_dict)

df_peachtree['donate'] = df_peachtree['donate'].map(df_peachtree_donation_dict)

df_peachtree['gender'] = df_peachtree['gender'].map(df_peachtree_gender_dict)

def map_values(val, mapping_dict):
    """Translate ``val`` through ``mapping_dict`` with special handling for missing values.

    A missing ``val`` (NaN/None) is looked up under the ``None`` key and yields
    ``None`` when that key is absent. Any other value is looked up directly and
    returned unchanged when the dict has no entry for it.
    """
    is_missing = pd.isna(val)
    lookup_key = None if is_missing else val
    # Missing values have no fallback; present values fall back to themselves.
    fallback = None if is_missing else val
    return mapping_dict.get(lookup_key, fallback)

# Translate the all-girl coach-pitch answer into its id; missing answers use the
# dict's None entry, unmapped answers are left as they are (see map_values).
df_peachtree['all_girl_coach_pitch_team'] = df_peachtree['all_girl_coach_pitch_team'].apply(map_values, args=(df_peachtree_all_girl_dict,))

# Target dtype for each column that needs an explicit conversion.
columns_to_convert = {
    'gross': float,
    'net': float,
    'service_fee': float,
    'discount_amount': float,
    'subtotal': float,
    'refunds': float,
    'entry_number': object,
    'donate': 'int64'
}

for column, dtype in columns_to_convert.items():
    if dtype == float:
        # Money columns: remove '$' and thousands separators before casting.
        df_peachtree[column] = df_peachtree[column].replace(r'[\$,]', '', regex=True).astype(dtype)
    elif dtype == 'int64':
        # Id columns: a missing id becomes 0 so the column can hold integers.
        df_peachtree[column] = df_peachtree[column].fillna(0).astype(dtype)
    else:
        df_peachtree[column] = df_peachtree[column].astype(dtype)

# Division id assigned to registrations with a refund.
# NOTE(review): 8 is presumably the league_id of the 'Refunded' entry in
# peachtree.league_hierarchy — confirm against that table.
REFUNDED_DIVISION_ID = 8

# Any row with a positive refund is moved to the refunded division.
df_peachtree['division'] = df_peachtree['division'].mask(df_peachtree['refunds'] > 0, REFUNDED_DIVISION_ID)

# Number of rows that share each order_number, aligned to the rows of the frame.
order_counts = df_peachtree.groupby('order_number')['order_number'].transform('count')

# Order-level amounts are split evenly across the rows of the same order.
# NOTE(review): 'refunds' and 'discount_amount' are not split — confirm that is intended.
columns_to_divide = ['gross', 'net', 'service_fee', 'subtotal']
df_peachtree[columns_to_divide] = df_peachtree[columns_to_divide].div(order_counts, axis=0)

# The mapped columns now hold ids, so rename them accordingly.
df_peachtree = df_peachtree.rename(columns={'donate': 'donation_id'
                                            ,'gender':'gender_id'
                                            , 'all_girl_coach_pitch_team': 'all_girl_coach_pitch_id'
                                            })

In [280]:
# subtotal_true = subtotal minus the amount of the selected donation category.
# Total 'amount' per donation_id, computed once instead of filtering the
# donation table for every registration row.
donation_amount_by_id = df_peachtree_donations.groupby('donation_id')['amount'].sum()

# Amount to subtract per row: ids without a match contribute 0, and id 0
# (no donation) never subtracts anything.
donation_amount = df_peachtree['donation_id'].map(donation_amount_by_id).fillna(0)
donation_amount = donation_amount.mask(df_peachtree['donation_id'] == 0, 0)

# NOTE(review): 'subtotal' was already divided by the number of rows per order in
# the previous cell, while the full donation amount is subtracted from every
# row — confirm this is correct for orders with several registrations.
df_peachtree['subtotal_true'] = df_peachtree['subtotal'] - donation_amount


In [281]:
# Spot check: show the rows for one last name within a single season.
df_peachtree[(df_peachtree['season']== 'spring 2024') & (df_peachtree['last_name']=="Keller")]

Unnamed: 0,entry_number,sportsengine_id,registration_date,first_name,last_name,preferred_name,birthdate,gender_id,shirt_size,previous_seasons?,previous_spring_team,address_1,city,state,zip,country,school,grade,division,coach_request,player_request,all_girl_tball_team?,all_girl_coach_pitch_id,sibling?,cannot_practice_days,guardian_first_name,guardian_last_name,parent/guardian_cell_phone,guardian_home_phone,guardian_email,coaching_interest?,head_vs_assistant_coach,first_name_of_potential_coach,last_name_of_potential_coach,shirt_size_coach,sponsor_interest?,order_number,account_email,entry_status,order_status,gross,net,service_fee,discount_amount,discount_names,subtotal,refunds,donation_id,season_team_name,season_team_division_name,season,subtotal_true
408,124209346,SN-PQL662446,2023-12-27,Eli,Keller,,12/01/2017,2,Y-S,3,Coach Pitch,1337 Gate Post Lane,Charlottesville,VA,22901,United States,Crozet Elementary,K,5,,"Gavin Mims, Ava Calabrese, Logan Fore",,3,,,Joe,Keller,5402230545,5402230545,kellerfam23@gmail.com,Yes,Assistant Coach,Joe,Keller,A-L,No,SMYK24146,,Active,Paid,105.0,100.09,4.91,0.0,,105.0,0.0,0,CP Tigers,Coach Pitch Tball,spring 2024,105.0


In [282]:
# def summarize_dataframe(df):
#     """Summarize a dataframe, and report missing values."""
#     missing_values = pd.concat([
#         pd.DataFrame(df.columns, columns=['Variable Name']),
#         pd.DataFrame(df.dtypes.values.reshape([-1,1]), columns=['Data Type']),
#         pd.DataFrame(df.isnull().sum().values, columns=['Missing Values']),
#         pd.DataFrame([df[name].nunique() for name in df.columns], columns=['Unique Values'])
#     ], axis=1).set_index('Variable Name')

#     with pd.option_context("display.max_rows", 1000):
#         summary = pd.concat([missing_values, df.describe(include='all').transpose()], axis=1).infer_objects(copy=False).fillna("")
#         display(summary)


In [283]:
# summarize_dataframe(df_peachtree)

In [284]:
# Spot check: show all rows for one last name across seasons. The commented
# lines below are alternative ad-hoc checks kept for reference.
df_peachtree[df_peachtree['last_name']=='Giordano']
# df_peachtree[df_peachtree['registration_date'] == '2023-11-30']
# df_peachtree['division'].value_counts()
# df_peachtree[df_peachtree['division']==6]
# df_peachtree.value_counts('gender')

Unnamed: 0,entry_number,sportsengine_id,registration_date,first_name,last_name,preferred_name,birthdate,gender_id,shirt_size,previous_seasons?,previous_spring_team,address_1,city,state,zip,country,school,grade,division,coach_request,player_request,all_girl_tball_team?,all_girl_coach_pitch_id,sibling?,cannot_practice_days,guardian_first_name,guardian_last_name,parent/guardian_cell_phone,guardian_home_phone,guardian_email,coaching_interest?,head_vs_assistant_coach,first_name_of_potential_coach,last_name_of_potential_coach,shirt_size_coach,sponsor_interest?,order_number,account_email,entry_status,order_status,gross,net,service_fee,discount_amount,discount_names,subtotal,refunds,donation_id,season_team_name,season_team_division_name,season,subtotal_true
126,135367389,SN-PRY237084,2024-07-26,Jack,Giordano,,09/20/2019,2,Y-XS,2,Intermediate T-Ball; Cubs Team,1264 Blue Ridge Avenue,Crozet,VA,22932,United States,Daylily Presschool,K,5,,Mackie Rector,,3,,,Isabelle,Giordano,434-249-5933,434-249-5933,isabelle.marshall@gmail.com,Yes,Assistant Coach,Andrew,Giordano,A-M,Yes,JXPB97309,isabelle.marshall@gmail.com,Active,Paid,98.0,93.32,4.68,0.0,,98.0,0.0,0,Tball Phillies,Tball,fall 2024,98.0
313,123466769,SN-PRY237084,2023-12-04,Jack,Giordano,,09/20/2019,2,Y-XS,1,,1264 Blue Ridge Avenue,Crozet,VA,22932,United States,Daylily Preschool,K,7,Andrew Giordano,Jack Bollier,,3,,,Isabelle,Giordano,434-249-5933,434-249-5933,isabelle.marshall@gmail.com,Yes,Head Coach,Andrew,Giordano,A-S,Yes,GXZJ62465,,Active,Paid,105.0,100.09,4.91,0.0,,105.0,0.0,0,Int Cubs,Intermediate Tball,spring 2024,105.0
1068,141185704,SN-PNL764028,2024-12-10,Jack,Giordano,,09/20/2019,2,Y-XS,3,Phillies/tball,1264 BLUE RIDGE AVE,CROZET,VA,22932,United States,Daylily Preschool,K,6,Josh Rector,"Will Zilenski, Benton Grammo, Sterling Villalobos",,3,No,Friday,Andrew,Giordano,7033441243,7033441243,giordano.andrew@gmail.com,Yes,Assistant Coach,Andrew,Giordano,A-M,Yes,VLOD76234,giordano.andrew@gmail.com,Active,Paid,130.0,124.28,5.72,0.0,,130.0,0.0,1,,,spring 2025,105.0


In [285]:
# # Get the total sum of 'gross' grouped by 'season'
# gross_totals_by_season = df_peachtree.groupby('season')['gross'].sum()

# # Display the result
# print(gross_totals_by_season)

In [286]:
# Display the cleaned frame as a final visual check before it is written to the
# database in the cells below.
df_peachtree

Unnamed: 0,entry_number,sportsengine_id,registration_date,first_name,last_name,preferred_name,birthdate,gender_id,shirt_size,previous_seasons?,previous_spring_team,address_1,city,state,zip,country,school,grade,division,coach_request,player_request,all_girl_tball_team?,all_girl_coach_pitch_id,sibling?,cannot_practice_days,guardian_first_name,guardian_last_name,parent/guardian_cell_phone,guardian_home_phone,guardian_email,coaching_interest?,head_vs_assistant_coach,first_name_of_potential_coach,last_name_of_potential_coach,shirt_size_coach,sponsor_interest?,order_number,account_email,entry_status,order_status,gross,net,service_fee,discount_amount,discount_names,subtotal,refunds,donation_id,season_team_name,season_team_division_name,season,subtotal_true
0,134707507,SN-PLJ375188,2024-07-15,Ainsley,Dority,,10/07/2016,1,Y-M,3,8U Pumas,PO BOX 411,Batesville,VA,22924,United States,Brownsville Elementary,2nd,8,,,,3,,,Mandy,Dority,434 2577092,4342577092,mandydority@icloud.com,Yes,Assistant Coach,Mandy,Dority,,,EIVT62700,mandydority@icloud.com,Active,Paid,120.0,115.32,4.68,0.0,,120.0,22.0,0,8U Penguins,8U Softball,fall 2024,120.0
1,134708124,SN-PHX152990,2024-07-15,Addison,Pietro,,09/08/2016,1,Y-M,0,,2939 Rambling Brook Lane,Crozet,VA,22932,United States,Brownsville Elementary School,3rd,8,,,,3,No,No,Justin,Pietro,4349872578,4349872578,jpietro@gmail.com,No,,,,,No,LDRS69789,gemmaapietro@gmail.com,Active,Paid,120.0,115.32,4.68,0.0,,120.0,22.0,0,8U Penguins,8U Softball,fall 2024,120.0
2,134708295,SN-PZT444902,2024-07-15,William,Obrien,James,05/01/2018,2,Y-S,0,,1402 Stillhouse Ridge Ln,Charlottesville,VA,22903,United States,Brownsville Elementary,1st,8,,Request James play with other children from Br...,,3,,,John,OBrien,434-906-7884,434-906-7884,jmobrien88@gmail.com,No,,,,,No,IACM98013,jmobrien88@gmail.com,Active,Paid,105.0,100.32,4.68,0.0,,105.0,7.0,0,CP Yankees,Coach Pitch,fall 2024,105.0
3,134708910,SN-PNE750392,2024-07-15,Genevieve,Fosdick,,09/04/2020,1,Y-XS,0,,1554 Wickham Pond Drive,Charlottesville,VA,22901,United States,Bright Beginnings Preschool,K,8,,,Yes,3,No,,Jessica,Fosdick,5406886345,5406886345,jefosdick@gmail.com,No,,,,,No,JTXW58280,jefosdick@gmail.com,Active,Paid,105.0,100.32,4.68,0.0,,105.0,7.0,0,Tball Athletics,Tball,fall 2024,105.0
4,134861403,SN-PDR648385,2024-07-17,James,Buckett,,08/30/2017,2,Y-M,3,Twins Tball,1042 Rolling Meadow Ln,Crozet,VA,22932,United States,Crozet Elementary,1st,4,,,,3,No,Monday and Wednesday,Jennifer,Buckett,434-326-6315,4343266315,jenbuckett@gmail.com,No,,,,,Yes,YPDC95639,jenbuckett@gmail.com,Active,Paid,123.0,117.50,5.50,0.0,,123.0,0.0,1,Rookie Yankees,Rookie,fall 2024,98.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1127,140856692,SN-PRC230852,2024-12-01,Maxwell,Dailey,,06/27/2013,2,Y-XL,4,Minors-Bulls,1091 Haden Ter,Crozet,VA,22932,United States,Brownsville Elementary School,5th,2,,,,3,,,Paige,Dailey,210-727-9789,210-727-9789,paigebennettdailey@gmail.com,,,,,,,HOZE71302,paigebennettdailey@gmail.com,Active,Paid,120.0,114.60,5.40,0.0,,120.0,0.0,0,,,spring 2025,120.0
1128,140854786,SN-PFD695559,2024-12-01,Dominic,Trujillo,,06/03/2015,2,Y-L,1,,6124 Westhall Dr.,Crozet,VA,22932,United States,Homeschool,4th,3,,,,3,,Thursday,Lauren,Trujillo,5404715362,5404715362,lauren.trujillo@gmail.com,No,,,,,No,GKYM70307,lauren.trujillo@gmail.com,Active,Paid,120.0,114.60,5.40,0.0,,120.0,0.0,0,,,spring 2025,120.0
1129,140853475,SN-PUD250321,2024-12-01,Verick,Durrer,,02/01/2019,2,Y-S,1,Beginner T ball,1722 Albemarle Pippin Ct,Crozet,VA,22932,United States,Home School,K,6,John Kronstain,Matthew Kronstain,,3,,,Evan,Durrer,4342499122,4342499122,e.r.durrer@gmail.com,Yes,Assistant Coach,John,Kronstain,A-L,Yes,TPSB47528,e.r.durrer@gmail.com,Active,Paid,105.0,100.09,4.91,0.0,,105.0,0.0,0,,,spring 2025,105.0
1130,140847340,SN-PMG577584,2024-12-01,Luka,Circh,,04/22/2020,2,Y-XS,0,,5316 Raven Stone Rd,Crozet,VA,22932,United States,Crozet Elementary,K,7,,,,3,,,Ryan,Circh,5857384615,5857384615,ryancirch@gmail.com,Yes,Assistant Coach,Ryan,Circh,A-M,No,TXKC96801,ryancirch@yahoo.com,Active,Paid,130.0,124.28,5.72,0.0,,130.0,0.0,1,,,spring 2025,105.0


In [287]:
# SQLAlchemy engine for the local PostgreSQL database '1264bra'; it is used by the
# to_sql call in the last cell.
# NOTE(review): the password is embedded in the URL — consider reading it from an
# environment variable or secrets store before sharing this notebook.
engine = create_engine('postgresql://postgres:password@localhost:5432/1264bra')

In [288]:
# Settings consumed by the to_sql call in the next cell:
# target schema and table, and what to do when the table already exists.
destinationschema, destinationtable = 'peachtree', 'player_registrations'
ReplaceOrAppend = 'replace'

In [289]:
# Write the cleaned frame to <destinationschema>.<destinationtable> over a
# connection that is closed when the block exits. The DataFrame index is not written.
# NOTE: with if_exists='replace' (the value of ReplaceOrAppend) pandas drops the
# existing table before inserting the new rows.
with engine.connect() as conn:
    df_peachtree.to_sql(destinationtable, con=conn, schema=destinationschema, if_exists=ReplaceOrAppend, index=False)