In [1]:
import getpass
import json
from datetime import datetime
from urllib.parse import quote

import numpy as np
import pandas as pd
import psycopg2 
import requests

# Variables
weekly_payout = 15.51

# Define PostgreSQL database connection parameters
# Every secret is prompted for with getpass so nothing sensitive is stored in the notebook.
# user = input("username")
api_key = getpass.getpass("Enter DataGolf API key:")
host = getpass.getpass("Enter Database Host:")
port = "5432" # The default port for PostgreSQL Server
dbname = 'postgres'
user = getpass.getpass("Enter Username:")
password = getpass.getpass("Enter Password:")

# Define a SQLAlchemy URI string for connecting to the database
# The URI structure is [DB_FLAVOR]+[DB_PYTHON_LIBRARY]://[USERNAME]:[PASSWORD]@[DB_HOST]:[PORT]/[DB_NAME]
# The username and password are percent-encoded (safe='' also encodes '/') because characters
# such as '@', ':', '/' or '%' would otherwise be read as URI delimiters and corrupt the URI.
# The raw `user` / `password` values are left untouched for the direct psycopg2.connect() calls.
db_URI = f"postgresql+psycopg2://{quote(user, safe='')}:{quote(password, safe='')}@{host}:{port}/{dbname}"

In [None]:
# A helper function to open a psycopg2 connection, set auto-commit to true, execute the sql, and close the connection.
# This will mainly be used for writing to the db
# def execute_sql(sql, echo=False):
#     try:
#         pg_conn = psycopg2.connect(
#             dbname=dbname,
#             user=user,
#             password=password,
#             host=host,
#             port=port
#         )
#     except psycopg2.Error as e:
#         error_message = e.pgerror
#         print("Error Connecting:", error_message)
   
#     try:
#         # Set the connection to autocommit (everything is treated as an individual transaction)
#         pg_conn.set_session(autocommit=True)
        
#         # The cursor is used to execute ddl statements.
#         pg_cursor = pg_conn.cursor() 

#         pg_cursor.execute(sql)
#         if echo:
#             print(sql)

#         results = pg_cursor.fetchall()
#         return results
    
#     except psycopg2.Error as e:
#         error_message = e.pgerror
#         print("SQL Failed:", error_message)
#         return []
    
#     finally:
#         if pg_cursor:
#             pg_cursor.close()
#         if pg_conn:
#             pg_conn.close()

In [2]:
#function for uploading df to aws PostgreSQL database
def insert_df_to_sql(table_name, df):
    """Insert every row of ``df`` into ``table_name`` as one transaction.

    The DataFrame's column names are used verbatim as the target column list.
    Connection settings are read from the notebook-level ``dbname``, ``user``,
    ``password``, ``host`` and ``port`` variables.

    Database errors are reported with ``print`` instead of being raised, so a
    failed upload does not stop the notebook; check the printed message.

    Parameters
    ----------
    table_name : str
        Name of the destination table. It is interpolated into the SQL text
        (identifiers cannot be bound as query parameters), so only pass
        trusted, hard-coded names.
    df : pandas.DataFrame
        Rows to insert; one INSERT is executed per row via ``executemany``.

    Returns
    -------
    None
    """
    try:
        pg_conn = psycopg2.connect(
            dbname=dbname,
            user=user,
            password=password,
            host=host,
            port=port
        )
    except psycopg2.Error as e:
        # pgerror may be None, so fall back to the exception text itself.
        print("Error Connecting:", e.pgerror or e)
        # Without a connection there is nothing to insert into or to close.
        return

    try:
        # Values are bound through %s placeholders; only the identifiers are formatted in.
        insert_query = f"INSERT INTO {table_name} ({', '.join(df.columns)}) VALUES ({', '.join(['%s'] * len(df.columns))})"

        # Convert the DataFrame to a list of tuples for insertion
        data_to_insert = df.to_records(index=False).tolist()

        # The cursor context manager closes the cursor even if executemany() fails
        with pg_conn.cursor() as cursor:
            cursor.executemany(insert_query, data_to_insert)

        # Commit the transaction
        pg_conn.commit()
        print("Data inserted successfully!")
    except psycopg2.Error as e:
        # Build the message without string concatenation so a None pgerror cannot raise TypeError.
        print(f"Exception uploading to {table_name} table. {e.pgerror or e}")
    finally:
        # Closing the connection discards any uncommitted work from a failed insert.
        pg_conn.close()


In [None]:
# def psyco_read_sql(sql):
#     try:
#         pg_conn = psycopg2.connect(
#             dbname=dbname,
#             user=user,
#             password=password,
#             host=host,
#             port=port
#         )
#     except psycopg2.Error as e:
#         error_message = e.pgerror
#         print("Error Connecting:", error_message)

#     # Use pd.read_sql to execute the query and load data into a DataFrame
#     df = pd.read_sql(sql, pg_conn)

#     # Print the DataFrame
#     print(df)

#     # Close the database connection
#     pg_conn.close()


In [3]:
#function to gather df from json response
def df_from_json(feed, timeout=30):
    """Download a JSON feed and return its payload as a DataFrame.

    Parameters
    ----------
    feed : str
        Full URL of the JSON endpoint to request.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30), so a
        stalled request cannot hang the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        ``pd.DataFrame`` built directly from the decoded JSON body.

    Raises
    ------
    requests.HTTPError
        If the server answers with an unsuccessful (4xx/5xx) status code.
        ``requests.get`` may additionally raise its own connection or
        timeout exceptions.
    """
    response = requests.get(feed, timeout=timeout)
    if not response.ok:
        # The message is built by hand instead of calling raise_for_status(): that helper
        # embeds the full request URL, and the callers in this notebook put the API key
        # in the query string, so it would leak into the cell output.
        raise requests.HTTPError(
            f"Request failed with HTTP status {response.status_code}",
            response=response,
        )
    return pd.DataFrame(response.json())

# Check for missing players

In [19]:
#download the full player list published by datagolf
players_dg = df_from_json(f'https://feeds.datagolf.com/get-player-list?file_format=json&key={api_key}')

#read every distinct dg_id that is already stored in the aws player table
players_aws = pd.read_sql(
    con=db_URI,
    sql="""
                          select distinct dg_id from player;
                          """,
)

In [20]:
#keep only the datagolf players whose dg_id does not appear in the aws db
#(isin marks the players aws already has; ~ flips that mask to select the rest)
missing_players = players_dg.loc[lambda dg: ~dg['dg_id'].isin(players_aws['dg_id'])]
missing_players

Unnamed: 0,amateur,country,country_code,dg_id,player_name
673,1,United States,USA,33459,"Cui, Edan"
812,1,United States,USA,33466,"Doyal, Connor"
1481,1,Dominican Republic,DOM,33467,"Huerta, Rodrigo"
1959,0,Korea - Republic of,KOR,29639,"Lee, Junseo"
2010,1,China,CHN,33457,"Li, Zhengda"
2011,1,China,CHN,33458,"Li, Zhengqian"


In [21]:
#upload the missing players to the aws db; skipped when there are none
if not missing_players['player_name'].empty:
    insert_df_to_sql(table_name='player', df=missing_players)

Data inserted successfully!


# Check for Event Updates

In [22]:
#dynamic current year so script can be re-used next year
current_year = datetime.now().year

#capture datagolf events and the current year events already uploaded to aws
events_dg = df_from_json('https://feeds.datagolf.com/historical-raw-data/event-list?file_format=json&key='+api_key)
events_aws_current_year = pd.read_sql(sql = f"""
                          select event_id from event where calendar_year = {current_year};
                          """, con=db_URI)

#filter datagolf historical events to current year, pga
#The year is compared with current_year rather than the newest year present in the feed, so this
#filter, the aws query above and the per-event requests in later cells all refer to the same year.
#astype(int) keeps the comparison valid whether the feed delivers the year as a number or as text.
events_dg_current_year = events_dg[(events_dg['tour'] == 'pga') & (events_dg['calendar_year'].astype(int) == current_year)]

In [23]:
#keep the datagolf events of this year whose event_id has not been uploaded to aws yet
missing_events = events_dg_current_year.loc[lambda dg: ~dg['event_id'].isin(events_aws_current_year['event_id'])]
missing_events

Unnamed: 0,calendar_year,date,event_id,event_name,sg_categories,tour,traditional_stats
0,2025,2025-04-20,522,Corales Puntacana Championship,no,pga,basic
1,2025,2025-04-20,12,RBC Heritage,yes,pga,yes


# Check for DFS event

In [24]:
#create array to hold missing dfs event ids
no_dfs = []

#loop through each missing event and probe its fanduel DFS feed
for id_ev in missing_events['event_id']:
    feed = 'https://feeds.datagolf.com/historical-dfs-data/points?tour=pga&site=fanduel&event_id='+str(id_ev)+'&year='+str(current_year)+'&file_format=json&key='+api_key
    #the timeout keeps one stalled request from hanging the whole notebook
    response = requests.get(feed, timeout=30)

    #Any unsuccessful status (not only 400) means the DFS data cannot be loaded right now.
    #Later cells insert the event row first and fetch this same feed afterwards, so letting
    #such an event through would leave an event in aws whose DFS upload then fails.
    if not response.ok:
        no_dfs.append(id_ev)

#remove any missing events from missing_events df as the DFS event may not be ready yet (or will never exist)
missing_events = missing_events[~missing_events['event_id'].isin(no_dfs)]
missing_events

Unnamed: 0,calendar_year,date,event_id,event_name,sg_categories,tour,traditional_stats
0,2025,2025-04-20,522,Corales Puntacana Championship,no,pga,basic
1,2025,2025-04-20,12,RBC Heritage,yes,pga,yes


In [None]:
#upload any events that are still missing to the aws db
if not missing_events['event_id'].empty:
    #drop the datagolf-only columns and add a dfs_payout column filled with None (NULL in the db)
    missing_events_w_dfs_payout = (
        missing_events
        .drop(columns=['sg_categories', 'tour', 'traditional_stats'])
        .assign(dfs_payout=None)
    )
    insert_df_to_sql(table_name='event', df=missing_events_w_dfs_payout)

Data inserted successfully!


In [26]:
#preview this year's events in aws, newest first; uses current_year so the cell keeps working next year
pd.read_sql(f"""select * from event where calendar_year = {current_year} order by date desc""", con=db_URI)

Unnamed: 0,id_event,calendar_year,event_id,date,event_name,dfs_payout
0,8c46912d-245a-4ff4-b971-e0c209a224b2,2025,12,2025-04-20,RBC Heritage,15.51
1,60e782da-5a5d-432f-9860-3e3bcb4fe215,2025,522,2025-04-20,Corales Puntacana Championship,15.51
2,4d9eff7e-ef77-470e-b215-00c3993e2eec,2025,14,2025-04-13,Masters Tournament,15.51
3,0e770c7a-3347-4db5-8635-9045ffea4e6b,2025,41,2025-04-06,Valero Texas Open,15.51
4,f3ff5440-abd5-4036-b4de-e86901c04e44,2025,20,2025-03-30,Texas Children's Houston Open,15.51
5,95b17445-b05c-435b-9bbe-a0bc955f43a2,2025,475,2025-03-23,Valspar Championship,15.51
6,f1820b22-2ac1-4c9d-85e3-e0934edb1c97,2025,11,2025-03-16,THE PLAYERS Championship,15.51
7,0166a697-e2aa-40d6-8360-30968bdb3762,2025,483,2025-03-09,Puerto Rico Open,
8,1e8ca378-d675-4964-8bf2-a637f6a818f6,2025,9,2025-03-09,Arnold Palmer Invitational presented by Master...,15.51
9,e205d69a-e6d0-4ea0-b0f2-9ac184182188,2025,10,2025-03-02,Cognizant Classic in The Palm Beaches,15.51


# Parse Raw Player Scoring & Data

In [27]:
#get headers from an event with everything documented (event 2 in 2025 is one example)
event_for_names = df_from_json('https://feeds.datagolf.com/historical-raw-data/rounds?tour=pga&event_id=2&year=2025&file_format=json&key='+api_key)
basic_names = ['event_id', 'calendar_year', 'dg_id', 'round', 'fin_text']
stat_names = basic_names + pd.json_normalize(event_for_names['scores'].loc[0]['round_1'], max_level=0).columns.tolist()
stat_names.sort()

#collect one plain dict per player-round and build the DataFrame once at the end.
#Rows are keyed by stat name instead of by position, so a round with fewer stats is filled
#with NA and a round with an unexpected extra stat cannot shift values into the wrong column.
round_records = []

for id_ev in missing_events['event_id']:
    #get the event historical data
    event_temp = df_from_json('https://feeds.datagolf.com/historical-raw-data/rounds?tour=pga&event_id='+str(id_ev)+'&year='+str(current_year)+'&file_format=json&key='+api_key)

    #each entry of 'scores' is one player's summary dict holding up to four round_N entries
    for player in event_temp['scores']:
        for round_num in range(1, 5):
            round_stats = player.get(f'round_{round_num}')

            #a round that was not played (CUT, WD) is absent or null - do not add it
            if not isinstance(round_stats, dict):
                continue

            #the round number is taken from the round_N key itself, so a gap in the data
            #(e.g. round_1 missing but round_2 present) never renumbers the later rounds
            round_records.append({
                **round_stats,
                'dg_id': player['dg_id'],
                'fin_text': player['fin_text'],
                'round': round_num,
                'event_id': id_ev,
                'calendar_year': current_year,
            })

#convert to dataframe (stats missing from a round become NA) and normalize NAs
rounds = pd.DataFrame(round_records, columns=stat_names).replace([np.nan, 'missing'], None, inplace=False)

#preview at most 10 random rows; capping the size keeps the cell from failing on small or empty results
rounds.sample(min(10, len(rounds)))

Unnamed: 0,birdies,bogies,calendar_year,course_name,course_num,course_par,dg_id,doubles_or_worse,driving_acc,driving_dist,...,score,scrambling,sg_app,sg_arg,sg_ott,sg_putt,sg_t2g,sg_total,start_hole,teetime
47,1,2,2025,Puntacana Resort (Corales Golf Course),244,72,16433,0,0.786,302.5,...,73,,,,,,,0.129,1,1:00pm
541,3,1,2025,Harbour Town Golf Links,12,71,14181,0,0.5,291.0,...,69,0.8,-0.73,0.22,0.088,0.519,-0.422,0.097,1,11:20am
333,3,3,2025,Puntacana Resort (Corales Golf Course),244,72,23542,0,0.714,327.0,...,72,,,,,,,-0.634,1,6:57am
426,5,3,2025,Harbour Town Golf Links,12,71,8825,0,0.643,290.4,...,69,0.714,0.099,1.253,0.716,-1.317,2.068,0.75,1,1:35pm
3,3,3,2025,Puntacana Resort (Corales Golf Course),244,72,24342,0,0.857,321.5,...,72,,,,,,,1.129,1,1:30pm
652,1,3,2025,Harbour Town Golf Links,12,71,12577,0,0.5,279.6,...,73,0.615,-1.063,-1.162,-0.972,-0.47,-3.197,-3.667,1,2:10pm
268,6,3,2025,Puntacana Resort (Corales Golf Course),244,72,13831,0,0.714,289.5,...,69,,,,,,,0.568,10,6:45am
622,1,4,2025,Harbour Town Golf Links,12,71,24304,0,0.571,278.5,...,74,0.571,-0.056,-0.98,-1.314,-1.899,-2.35,-4.25,1,10:45am
400,3,5,2025,Puntacana Resort (Corales Golf Course),244,72,15330,0,0.786,287.5,...,74,,,,,,,-4.432,10,6:45am
83,2,3,2025,Puntacana Resort (Corales Golf Course),244,72,17639,0,0.857,280.0,...,73,,,,,,,0.129,1,12:35pm


## Upload Any Missing Courses

In [28]:
#one row per distinct course_num seen in the newly parsed rounds
new_courses = (
    rounds
    .drop_duplicates('course_num')
    .loc[:, ['course_name', 'course_num', 'course_par']]
    .reset_index(drop=True)
)

#courses already stored in aws
courses_aws = pd.read_sql(con=db_URI, sql="""SELECT * FROM COURSE;""")

#new courses whose course_num is not in the aws course table yet
missing_courses = new_courses.loc[lambda new: ~new['course_num'].isin(courses_aws['course_num'])]
missing_courses

Unnamed: 0,course_name,course_num,course_par


In [29]:
#upload the missing courses to the aws db; skipped when there are none
if not missing_courses['course_num'].empty:
    insert_df_to_sql(table_name='course', df=missing_courses)

## Upload Player Scoring

In [30]:
#get necessary event information for the year being loaded
#(current_year instead of a hard-coded year, so the merge below still finds its events next year)
events_aws = pd.read_sql(sql=f"""SELECT id_event, event_id, calendar_year FROM EVENT WHERE CALENDAR_YEAR = {current_year};""", con=db_URI)
events_aws['id_event'] = events_aws['id_event'].astype(str)

#get necessary course information
courses_aws = pd.read_sql(sql="""SELECT id_course, course_num FROM COURSE;""", con=db_URI)

#cleanup rounds df, then replace the dg identifiers with the aws ids:
#  event_id + calendar_year -> id_event, course_num -> id_course
#Both merges are left joins, so every round row is kept.
rounds_clean = (
    rounds
    .drop(['course_name', 'course_par', 'fin_text'], axis=1)
    .merge(events_aws, on=['event_id', 'calendar_year'], how='left')
    .drop(['event_id', 'calendar_year'], axis=1)
    .merge(courses_aws, on=['course_num'], how='left')
    .drop('course_num', axis=1)
)

#preview at most 10 random rows; capping the size keeps the cell from failing on small or empty results
rounds_clean.sample(min(10, len(rounds_clean)))

Unnamed: 0,birdies,bogies,dg_id,doubles_or_worse,driving_acc,driving_dist,eagles_or_better,gir,great_shots,pars,...,sg_app,sg_arg,sg_ott,sg_putt,sg_t2g,sg_total,start_hole,teetime,id_event,id_course
373,4,8,10498,0,0.786,278.5,0,0.5,,6,...,,,,,,-4.634,10,8:21am,60e782da-5a5d-432f-9860-3e3bcb4fe215,392e390e-cb09-49e8-9d73-f3ef18fa62fb
536,3,2,18628,0,0.714,266.6,0,0.667,2.0,13,...,-0.504,1.121,-0.601,-0.683,0.016,-0.667,1,11:20am,8c46912d-245a-4ff4-b971-e0c209a224b2,fa1be5e0-7034-4248-946c-49afe94ce935
625,3,0,17543,1,0.929,278.0,0,0.778,2.0,14,...,-0.311,-1.083,1.159,-0.668,-0.235,-0.903,1,8:50am,8c46912d-245a-4ff4-b971-e0c209a224b2,fa1be5e0-7034-4248-946c-49afe94ce935
660,3,2,13965,2,0.5,290.2,0,0.444,2.0,11,...,-5.393,0.325,-0.098,0.499,-5.166,-4.667,1,12:05pm,8c46912d-245a-4ff4-b971-e0c209a224b2,fa1be5e0-7034-4248-946c-49afe94ce935
661,5,2,13965,0,0.643,286.1,0,0.667,3.0,11,...,-0.002,-0.501,0.345,1.255,-0.158,1.097,1,8:00am,8c46912d-245a-4ff4-b971-e0c209a224b2,fa1be5e0-7034-4248-946c-49afe94ce935
288,2,2,18187,0,0.714,283.5,0,0.667,,14,...,,,,,,-2.432,10,11:57am,60e782da-5a5d-432f-9860-3e3bcb4fe215,392e390e-cb09-49e8-9d73-f3ef18fa62fb
79,3,2,19840,0,0.643,299.5,0,0.722,,13,...,,,,,,2.129,1,11:30am,60e782da-5a5d-432f-9860-3e3bcb4fe215,392e390e-cb09-49e8-9d73-f3ef18fa62fb
432,4,1,14578,0,0.571,272.7,0,0.611,3.0,13,...,-0.296,1.642,-0.711,0.698,0.635,1.333,1,1:10pm,8c46912d-245a-4ff4-b971-e0c209a224b2,fa1be5e0-7034-4248-946c-49afe94ce935
177,4,3,6419,0,0.786,308.0,0,0.5,,11,...,,,,,,0.366,1,8:33am,60e782da-5a5d-432f-9860-3e3bcb4fe215,392e390e-cb09-49e8-9d73-f3ef18fa62fb
399,3,6,33467,1,0.786,293.0,0,0.444,,8,...,,,,,,-5.634,1,8:33am,60e782da-5a5d-432f-9860-3e3bcb4fe215,392e390e-cb09-49e8-9d73-f3ef18fa62fb


In [31]:
#upload the cleaned player rounds to the aws round table
insert_df_to_sql(table_name='round', df=rounds_clean)

Data inserted successfully!


# Upload Player DFS

In [32]:
#get all stat headers for dfs scoring (they all exist for event 2 in 2025)
dfs = df_from_json('https://feeds.datagolf.com/historical-dfs-data/points?tour=pga&site=fanduel&event_id=2&year=2025&file_format=json&key='+api_key)
stat_names = pd.json_normalize(dfs['dfs_points'].loc[0], max_level=0).drop(['ownership', 'player_name'], axis=1).columns.tolist()+['id_event']
stat_names.sort()

#collect one plain dict per golfer and build the DataFrame once at the end.
#Rows are keyed by stat name instead of by position, so missing stats are filled with NA
#and an unexpected extra stat cannot shift values into the wrong column.
dfs_records = []

#loop through the missing events
for id_ev in missing_events['event_id']:

    #get the event historical data
    event_temp = df_from_json('https://feeds.datagolf.com/historical-dfs-data/points?tour=pga&site=fanduel&event_id='+str(id_ev)+'&year='+str(current_year)+'&file_format=json&key='+api_key)

    #look up the aws id_event once per event instead of merging once per golfer
    event_match = events_aws.loc[
        (events_aws['event_id'] == id_ev) & (events_aws['calendar_year'] == current_year),
        'id_event',
    ]
    if event_match.empty:
        #fail with a readable message rather than an opaque KeyError further down
        raise LookupError(f"No aws event row found for event_id {id_ev} in {current_year}")
    id_event = event_match.iloc[0]

    #loop through golfer's dfs event summary
    for player in event_temp['dfs_points']:
        #keep the dfs points, leave out the fields that are not uploaded, and attach the aws id_event
        dfs_pts = {key: value for key, value in player.items() if key not in ('player_name', 'ownership')}
        dfs_pts['id_event'] = id_event
        dfs_records.append(dfs_pts)

#convert to dataframe (stats missing for an event become NA) and normalize NAs
dfs_array = pd.DataFrame(dfs_records, columns=stat_names).replace([np.nan, 'missing'], None, inplace=False)

#preview at most 10 random rows; capping the size keeps the cell from failing on small or empty results
dfs_array.sample(min(10, len(dfs_array)))
    

Unnamed: 0,bogey_free_pts,bounce_back_pts,dg_id,fin_text,finish_pts,five_birdie_pts,hole_score_pts,id_event,salary,streak_pts,total_pts
103,0,0.6,11955,CUT,0,4,20.3,60e782da-5a5d-432f-9860-3e3bcb4fe215,,1.2,26.1
63,0,0.3,30892,70,0,4,47.2,60e782da-5a5d-432f-9860-3e3bcb4fe215,,2.4,53.9
50,0,1.2,25363,T49,0,4,56.3,60e782da-5a5d-432f-9860-3e3bcb4fe215,,1.8,63.3
104,0,0.0,11117,CUT,0,0,26.1,60e782da-5a5d-432f-9860-3e3bcb4fe215,,0.0,26.1
109,0,1.2,14140,CUT,0,0,23.7,60e782da-5a5d-432f-9860-3e3bcb4fe215,,0.0,24.9
96,0,0.3,11328,CUT,0,0,27.6,60e782da-5a5d-432f-9860-3e3bcb4fe215,,0.6,28.5
91,0,0.6,7960,CUT,0,0,27.8,60e782da-5a5d-432f-9860-3e3bcb4fe215,,1.2,29.6
36,5,1.2,27996,T41,0,4,59.4,60e782da-5a5d-432f-9860-3e3bcb4fe215,,2.4,72.0
8,0,1.2,15004,T7,10,8,80.3,60e782da-5a5d-432f-9860-3e3bcb4fe215,,2.4,101.9
97,0,0.3,17266,CUT,0,0,27.6,60e782da-5a5d-432f-9860-3e3bcb4fe215,,0.6,28.5


In [36]:
#upload the per-golfer dfs scoring to the aws dfs_total table
insert_df_to_sql(table_name='dfs_total', df=dfs_array)

Data inserted successfully!
