In [12]:
# imports
import pandas as pd
import numpy as np
import os
# Lift pandas' display truncation limits for both columns and rows.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [13]:
# Load every CSV in ../data/afl into a dict of DataFrames keyed by the file
# name without its extension (e.g. '2023_stats').
data = {}
# os.listdir returns entries in arbitrary order and may include non-CSV
# entries (checkpoint folders, OS metadata files), so filter and sort to get
# a reproducible load that only touches the CSV files.
file_names = sorted(
    name for name in os.listdir('../data/afl') if name.lower().endswith('.csv')
)
for file_name in file_names:
    dataset_key = os.path.splitext(file_name)[0]
    with open('../data/afl/' + file_name) as f:
        season = pd.read_csv(f)
    # Rename the ID column to Player ID, then snake_case every column name
    # (so 'Player ID' ends up as 'player_id').
    season = season.rename(columns={'ID': 'Player ID'})
    season.columns = season.columns.str.lower().str.replace(' ', '_')
    # The first four characters of the file name are stored as the year
    # (kept as a string, exactly as sliced from the name).
    season['year'] = file_name[0:4]
    data[dataset_key] = season

In [14]:
# Distinct team codes present in data['2023_stats'].
teams = data['2023_stats'].loc[:, 'team'].unique()

# Two-letter club code -> full club name.
mapping = {
    'RI': 'Richmond',
    'CA': 'Carlton',
    'ES': 'Essendon',
    'AD': 'Adelaide',
    'WB': 'Western Bulldogs',
    'CW': 'Collingwood',
    'WC': 'West Coast',
    'ME': 'Melbourne',
    'SY': 'Sydney',
    'GE': 'Geelong',
    'NM': 'North Melbourne',
    'FR': 'Fremantle',
    'PA': 'Port Adelaide',
    'GC': 'Gold Coast',
    'BL': 'Brisbane Lions',
    'SK': 'St Kilda',
    'HW': 'Hawthorn',
    'GW': 'GWS Giants',
    'FI': 'Fitzroy',
    'BB': 'Brisbane Bears',
}

In [15]:
# Replace the two-letter club codes in 'team' and 'opponent' with full names.
# Values that are already full club names are accepted and left unchanged so
# this cell can be re-run without raising KeyError on its own earlier output.
# Every frame is validated before any frame is modified, so an unexpected
# code is reported up front instead of leaving the data half converted.
full_names = set(mapping.values())
for key, df in data.items():
    for column in ('team', 'opponent'):
        unknown = set(df[column].unique()) - set(mapping) - full_names
        if unknown:
            raise KeyError(
                f"Unmapped club code(s) in {key}[{column!r}]: {sorted(unknown, key=str)}"
            )

for key, df in data.items():
    for column in ('team', 'opponent'):
        # .get(x, x) passes already-expanded names through untouched.
        df[column] = df[column].map(lambda x: mapping.get(x, x))

In [16]:
# Handle grand final replay
# Rows of the 2010 frame whose round contains 'GF' are split by row label:
# those before REPLAY_FIRST_ROW are set to 'GF', those from REPLAY_FIRST_ROW
# onward are set to 'GF2'.
# NOTE(review): 8140 is a hard-coded row label tied to the current 2010 CSV;
# re-check it if that file is ever regenerated or re-ordered.
REPLAY_FIRST_ROW = 8140
stats_2010 = data['2010_stats']

# na=False keeps the mask strictly boolean even if a round value is missing.
mask = stats_2010['round'].str.contains('GF', na=False)
is_replay_row = stats_2010.index >= REPLAY_FIRST_ROW
stats_2010.loc[mask & ~is_replay_row, 'round'] = 'GF'
stats_2010.loc[mask & is_replay_row, 'round'] = 'GF2'

In [17]:
# Finals round code -> full round name ('GF2' is the grand final replay).
finals_mapping = {
    'QF': 'Qualifying Final',
    'EF': 'Elimination Final',
    'GF': 'Grand Final',
    'GF2': 'Grand Final Replay',
    'SF': 'Semi Final',
    'PF': 'Preliminary Final',
}

In [18]:
# Expand finals codes to their full names in every frame; round values that
# have no entry in finals_mapping are passed through unchanged.
for df in data.values():
    df['round'] = df['round'].apply(lambda code: finals_mapping.get(code, code))

### Now insert the data into the database.

In [19]:
import sqlite3

# Create afl database
# Open a connection to '../afl.db' and take a cursor from it. Both `conn` and
# `c` are module-level names that later cells reuse (the insert loop passes
# `conn` to to_sql, the test query runs on `c`).
conn = sqlite3.connect('../afl.db')
c = conn.cursor()

# DDL for the player_stats table: one row per (round, year, player_id), which
# is also the composite primary key.
# NOTE(review): `round` is declared INT, but earlier cells write text labels
# such as 'Grand Final' into the round column. Presumably this relies on
# SQLite's flexible column typing - confirm that is intended.
PLAYER_STATS_TABLE = """
CREATE TABLE player_stats (
    round INT,
    year INT,
    player_id INT,
    player_name VARCHAR(255),
    team VARCHAR(50),
    opponent VARCHAR(50),
    kicks INT,
    marks INT,
    hand_balls INT,
    disp INT,
    goals INT,
    behinds INT,
    hit_outs INT,
    tackles INT,
    rebounds INT,
    inside_50 INT,
    clearances INT,
    clangers INT,
    frees_for INT,
    frees_against INT,
    brownlow INT,
    contested_possessions INT,
    uncontested_possessions INT,
    contested_marks INT,
    marks_inside_50 INT,
    one_percenters INT,
    bounces INT,
    goal_assists INT,
    percent_time_played INT,
    PRIMARY KEY (round, year, player_id)
);
"""

# Drop the player_stats table if it already exists, so the CREATE TABLE below
# does not fail when this cell is re-run (only this one table is dropped).
c.execute('DROP TABLE IF EXISTS player_stats')

# Recreate the table; as the last expression, this call's return value is what
# the cell displays.
c.execute(PLAYER_STATS_TABLE)


<sqlite3.Cursor at 0x2040f21bd50>

In [20]:
def format_dataframe(df, year):
    """Return a copy of ``df`` with columns named for the player_stats table.

    Renames 'player' to 'player_name' and '%_time_played' to
    'percent_time_played', then sets the 'year' column to ``year`` on every
    row (overwriting an existing 'year' column, or adding one if absent).
    All other columns are left as they are, and the frame passed in is not
    modified.

    Parameters:
        df: DataFrame holding the player statistics.
        year: Value written to the 'year' column of every row.

    Returns:
        The renamed DataFrame including the 'year' column.
    """
    schema_names = {
        'player': 'player_name',
        '%_time_played': 'percent_time_played',
    }
    return df.rename(columns=schema_names).assign(year=year)

In [21]:
# Append every loaded frame to the player_stats table, one frame at a time.
for filename, df in data.items():
    # The leading four characters of the dict key are used as the year.
    year = filename[:4]
    print(f'Inserting data for year: {year}')

    # Rename columns to the table's column names and set the 'year' column.
    formatted_df = format_dataframe(df, year)

    # Append through the open sqlite3 connection; the DataFrame index is not written.
    formatted_df.to_sql(name='player_stats', con=conn, if_exists='append', index=False)

print("Data insertion complete.")

Inserting data for year: 1996
Inserting data for year: 1997
Inserting data for year: 1998
Inserting data for year: 1999
Inserting data for year: 2000
Inserting data for year: 2001
Inserting data for year: 2002
Inserting data for year: 2003
Inserting data for year: 2004
Inserting data for year: 2005
Inserting data for year: 2006
Inserting data for year: 2007
Inserting data for year: 2008
Inserting data for year: 2009
Inserting data for year: 2010
Inserting data for year: 2011
Inserting data for year: 2012
Inserting data for year: 2013
Inserting data for year: 2014
Inserting data for year: 2015
Inserting data for year: 2016
Inserting data for year: 2017
Inserting data for year: 2018
Inserting data for year: 2019
Inserting data for year: 2020
Inserting data for year: 2021
Inserting data for year: 2022
Inserting data for year: 2023
Data insertion complete.


In [22]:
# Sanity check: fetch the first five rows of player_stats to confirm the insert worked.
results = c.execute("SELECT * FROM player_stats LIMIT 5;").fetchall()
results

[(1,
  1996,
  720,
  'Alastair Clarkson',
  'Melbourne',
  'Geelong',
  19,
  6,
  5,
  24,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  2,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0),
 (1,
  1996,
  718,
  'David Cockatoo-Collins',
  'Melbourne',
  'Geelong',
  3,
  0,
  2,
  5,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0),
 (1,
  1996,
  719,
  'Don Cockatoo-Collins',
  'Melbourne',
  'Geelong',
  5,
  3,
  1,
  6,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  2,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0),
 (1,
  1996,
  715,
  'Greg Doyle',
  'Melbourne',
  'Geelong',
  8,
  4,
  1,
  9,
  4,
  1,
  7,
  1,
  0,
  0,
  0,
  0,
  2,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0),
 (1,
  1996,
  712,
  'Damien Gaspar',
  'Melbourne',
  'Geelong',
  4,
  3,
  3,
  7,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0)]