In [1]:
# imports
import pandas as pd
import numpy as np
import os
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
# load each file into a dict of dataframes
data = {}
file_names = os.listdir('data/afl')
for file_name in file_names:
    with open('data/afl/' + file_name) as f:
        data[file_name[:-4]] = pd.read_csv(f)
        # rename the ID column to Player ID
        data[file_name[:-4]].rename(columns={'ID': 'Player ID'}, inplace=True)
        # snake case the column names
        data[file_name[:-4]].columns = data[file_name[:-4]].columns.str.lower().str.replace(' ', '_')
        # add year column to each dataframe
        data[file_name[:-4]]['year'] = file_name[0:4]

In [3]:
data['2010_stats'].tail()

Unnamed: 0,player,player_id,team,opponent,round,kicks,marks,hand_balls,disp,goals,behinds,hit_outs,tackles,rebounds,inside_50,clearances,clangers,frees_for,frees_against,brownlow,contested_possessions,uncontested_possessions,contested_marks,marks_inside_50,one_percenters,bounces,goal_assists,%_time_played,year
8179,Steele Sidebottom,11751,CW,SK,GF,16,6,9,25,2,0,0,4,3,3,2,0,1,0,0,9,16,0,2,2,0,1,80,2010
8180,Dane Swan,1460,CW,SK,GF,18,6,8,26,1,0,0,11,0,4,7,1,3,0,0,9,16,0,0,2,1,0,90,2010
8181,Dale Thomas,4152,CW,SK,GF,20,8,7,27,1,2,1,1,2,3,4,3,1,0,0,11,17,0,1,3,2,0,80,2010
8182,Alan Toovey,11553,CW,SK,GF,8,7,7,15,0,0,0,9,0,1,1,4,0,0,0,4,11,0,1,1,0,0,100,2010
8183,Sharrod Wellingham,11654,CW,SK,GF,11,3,9,20,2,1,0,8,0,3,7,1,1,0,0,11,9,0,1,2,0,1,70,2010


In [4]:
# get distinct Teams from data['2023_stats]
teams = data['2023_stats']['team'].unique()
teams

array(['RI', 'CA', 'CW', 'GE', 'NM', 'WC', 'PA', 'BL', 'SY', 'GC', 'ME',
       'WB', 'AD', 'GW', 'HW', 'ES', 'SK', 'FR'], dtype=object)

In [5]:
# Our composite key will be player, year and round.
# In 2010 there were 2 grand finals, so we need to change the second `round` to "GF2", but keep the first as "GF"
# Step 1: Find all the rounds that are GF in 2010
data['2010_stats'][data['2010_stats']['round'].str.contains('GF')]

# Step 2: Edit the round column to have GF and GF2
mask = data['2010_stats']['round'].str.contains('GF')
data['2010_stats'].loc[mask & (data['2010_stats'].index < 8140), 'round'] = 'GF'
data['2010_stats'].loc[mask & (data['2010_stats'].index >= 8140), 'round'] = 'GF2'

In [6]:
data['2010_stats'][data['2010_stats']['round'].str.contains('GF')]

Unnamed: 0,player,player_id,team,opponent,round,kicks,marks,hand_balls,disp,goals,behinds,hit_outs,tackles,rebounds,inside_50,clearances,clangers,frees_for,frees_against,brownlow,contested_possessions,uncontested_possessions,contested_marks,marks_inside_50,one_percenters,bounces,goal_assists,%_time_played,year
8096,Steven Baker,967,SK,CW,GF,4,2,4,8,0,0,0,4,2,0,1,0,1,0,0,4,6,1,0,3,0,0,83,2010
8097,Jason Blake,974,SK,CW,GF,5,3,7,12,0,0,14,5,1,0,2,2,1,1,0,5,6,0,0,6,0,0,65,2010
8098,Nick Dal Santo,1132,SK,CW,GF,10,1,8,18,0,0,1,8,2,2,7,2,1,0,0,10,9,0,0,0,0,1,72,2010
8099,Zac Dawson,4141,SK,CW,GF,3,4,5,8,0,0,0,2,3,0,0,0,0,0,0,5,3,1,0,9,0,0,100,2010
8100,Sean Dempster,4073,SK,CW,GF,9,5,7,16,0,0,0,3,3,2,0,1,1,0,0,7,8,0,0,4,0,0,88,2010
8101,Robert Eddy,11681,SK,CW,GF,3,2,7,10,0,1,0,3,0,0,0,2,1,1,0,5,6,0,0,0,0,0,67,2010
8102,Sam Fisher,3975,SK,CW,GF,22,8,3,25,0,0,0,2,12,3,1,6,1,1,0,10,15,5,0,5,0,1,100,2010
8103,Michael Gardiner,1056,SK,CW,GF,2,0,2,4,0,0,10,0,0,0,1,0,0,0,0,1,3,0,0,1,0,0,32,2010
8104,Sam Gilbert,4178,SK,CW,GF,9,4,7,16,1,0,0,1,2,1,1,3,0,2,0,8,8,1,0,4,1,1,93,2010
8105,Brendon Goddard,1443,SK,CW,GF,18,5,13,31,2,0,5,5,6,2,5,5,2,2,0,14,14,3,2,3,4,0,88,2010


### Now insert data into database.

In [9]:
import sqlite3

# Create afl database
conn = sqlite3.connect('afl.db')
c = conn.cursor()

PLAYER_STATS_TABLE = """
CREATE TABLE player_stats (
    round INT,
    year INT,
    player_id INT,
    player_name VARCHAR(255),
    team VARCHAR(50),
    opponent VARCHAR(50),
    kicks INT,
    marks INT,
    hand_balls INT,
    disp INT,
    goals INT,
    behinds INT,
    hit_outs INT,
    tackles INT,
    rebounds INT,
    inside_50 INT,
    clearances INT,
    clangers INT,
    frees_for INT,
    frees_against INT,
    brownlow INT,
    contested_possessions INT,
    uncontested_possessions INT,
    contested_marks INT,
    marks_inside_50 INT,
    one_percenters INT,
    bounces INT,
    goal_assists INT,
    percent_time_played INT,
    PRIMARY KEY (round, year, player_id)
);
"""

c.execute(PLAYER_STATS_TABLE)


<sqlite3.Cursor at 0x29368ededc0>

In [10]:
def format_dataframe(df, year):
    # Rename columns to match the database schema
    # Assuming your DataFrame has columns named similarly to your database schema
    # Update this dictionary with the correct mappings
    column_mappings = {
        'player': 'player_name',
        'player_id': 'player_id',
        '%_time_played': 'percent_time_played',
        # Add other columns as necessary
    }
    df = df.rename(columns=column_mappings)

    # Convert data types if necessary
    # For example: df['goals'] = df['goals'].astype(int)

    # Add the 'year' column
    df['year'] = year

    return df

In [11]:
# Iterate over each DataFrame and insert into the database
for filename, df in data.items():
    year = filename[0:4]
    print('Inserting data for year: ' + year)

    # Add the 'year' column to the DataFrame
    formatted_df = format_dataframe(df, year)

    # Insert into afl.db using sqlite3
    formatted_df.to_sql('player_stats', conn, if_exists='append', index=False)

print("Data insertion complete.")

Inserting data for year: 1996
Inserting data for year: 1997
Inserting data for year: 1998
Inserting data for year: 1999
Inserting data for year: 2000
Inserting data for year: 2001
Inserting data for year: 2002
Inserting data for year: 2003
Inserting data for year: 2004
Inserting data for year: 2005
Inserting data for year: 2006
Inserting data for year: 2007
Inserting data for year: 2008
Inserting data for year: 2009
Inserting data for year: 2010
Inserting data for year: 2011
Inserting data for year: 2012
Inserting data for year: 2013
Inserting data for year: 2014
Inserting data for year: 2015
Inserting data for year: 2016
Inserting data for year: 2017
Inserting data for year: 2018
Inserting data for year: 2019
Inserting data for year: 2020
Inserting data for year: 2021
Inserting data for year: 2022
Inserting data for year: 2023
Data insertion complete.


In [17]:
# test query to see if data is in the database
c.execute("SELECT * FROM player_stats LIMIT 5;")
results = c.fetchall()

### Below is OLD, for AWS RDS

In [56]:
import pandas as pd
from sqlalchemy import create_engine
import psycopg2

# PostgreSQL connection information
username = 'AFLadmin'
password = 'tQpwdxkTbxdnDzCh73Qr'
host = 'afldatabase.cahwanquwzda.ap-southeast-2.rds.amazonaws.com'
port = '5432'
database = 'afldatabase'

# Create a database connection using psycopg2
conn = psycopg2.connect(
    user=username,
    password=password,
    host=host,
    port=port,
    database=database
)

# Create a database engine
engine = create_engine(f'postgresql+psycopg2://{username}:{password}@{host}:{port}/{database}')

In [57]:
def format_dataframe(df, year):
    # Rename columns to match the database schema
    # Assuming your DataFrame has columns named similarly to your database schema
    # Update this dictionary with the correct mappings
    column_mappings = {
        'player': 'player_name',
        'player_id': 'player_id',
        '%_time_played': 'percent_time_played',
        # Add other columns as necessary
    }
    df = df.rename(columns=column_mappings)

    # Convert data types if necessary
    # For example: df['goals'] = df['goals'].astype(int)

    # Add the 'year' column
    df['year'] = year

    return df

In [58]:
# Remove all rows from the database table: player_stats
engine.execute('DELETE FROM player_stats')

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x14ae250d610>

In [59]:
# Iterate over each DataFrame and insert into the database
for filename, df in data.items():
    year = filename[0:4]
    print('Inserting data for year: ' + year)

    # Add the 'year' column to the DataFrame
    formatted_df = format_dataframe(df, year)

    # print(formatted_df.head(1))

    # Insert the data into the 'player_stats' table
    formatted_df.to_sql('player_stats', engine, if_exists='append', index=False)

print("Data insertion complete.")

Inserting data for year: 1996
Inserting data for year: 1997
Inserting data for year: 1998
Inserting data for year: 1999
Inserting data for year: 2000
Inserting data for year: 2001
Inserting data for year: 2002
Inserting data for year: 2003
Inserting data for year: 2004
Inserting data for year: 2005
Inserting data for year: 2006
Inserting data for year: 2007
Inserting data for year: 2008
Inserting data for year: 2009
Inserting data for year: 2010
Inserting data for year: 2011
Inserting data for year: 2012
Inserting data for year: 2013
Inserting data for year: 2014
Inserting data for year: 2015
Inserting data for year: 2016
Inserting data for year: 2017
Inserting data for year: 2018
Inserting data for year: 2019
Inserting data for year: 2020
Inserting data for year: 2021
Inserting data for year: 2022
Inserting data for year: 2023
Data insertion complete.
