In [43]:
import requests
import pandas as pd
import datetime
import time
from tqdm import tqdm
# Display setting: lift the column cap (None) so wide frames such as the
# match table previewed at the end of the notebook are shown in full.
pd.set_option('display.max_columns', None)

In [44]:
def get_all_match_data(headers, start_year=1996, max_round=28, delay=1, timeout=30):
    """Download the games of every round of every season from the Squiggle API.

    Parameters
    ----------
    headers : dict
        HTTP headers sent with every request.
    start_year : int, optional
        First season to request (default 1996). Seasons up to and including
        the current calendar year are requested.
    max_round : int, optional
        Highest round number requested per season (default 28); rounds
        1..max_round are requested.
    delay : float, optional
        Seconds to sleep after each request (default 1).
    timeout : float, optional
        Per-request timeout in seconds (default 30). Without a timeout a
        stalled connection would block the whole download indefinitely.

    Returns
    -------
    dict
        Maps '<year>-<round>' to a DataFrame built from the 'games' list of
        the corresponding response.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    data = {}
    current_year = datetime.datetime.now().year
    for year in tqdm(range(start_year, current_year + 1)):
        # Named `round_number` so the builtin `round` is not shadowed.
        for round_number in range(1, max_round + 1):
            r = requests.get(
                'https://api.squiggle.com.au/?q=games;year={};round={}'.format(year, round_number),
                headers=headers,
                timeout=timeout,
            )
            # Surface HTTP failures here rather than as a confusing JSON or
            # KeyError a few lines further down.
            r.raise_for_status()
            games = r.json()
            data['{}-{}'.format(year, round_number)] = pd.DataFrame(games['games'])
            # Throttle: wait before issuing the next request.
            time.sleep(delay)
    return data

# HTTP headers intended for get_all_match_data; the User-Agent string names
# this client and carries a contact e-mail address.
headers = {
    'User-Agent': 'AFLNaturalLanguageQueryBuilder/1.0 (tobyprofitt1@gmail.com)'
}

def load_all_from_csv(path):
    """Load the per-season CSV files found under ``path``.

    For every year from 1996 to the current calendar year the file
    ``'<path><year>.csv'`` is read. ``path`` is used as a plain string
    prefix, so a directory must include its trailing separator.

    Parameters
    ----------
    path : str
        Prefix prepended to ``'<year>.csv'``.

    Returns
    -------
    dict
        Maps ``'<year>'`` to the DataFrame read from that season's file.
        Seasons whose file is missing or completely empty are left out.

    Raises
    ------
    Exception
        Any other read failure (for example a malformed CSV) propagates
        instead of silently dropping the season.
    """
    data = {}
    current_year = datetime.datetime.now().year
    years = range(1996, current_year+1)
    for year in tqdm(years):
        try:
            games_df = pd.read_csv('{}{}.csv'.format(path, year))
        except (FileNotFoundError, pd.errors.EmptyDataError):
            # Best effort: a season with no usable file is simply skipped.
            continue
        data['{}'.format(year)] = games_df
    return data

# Data source: the live API download below is left commented out and the
# per-year CSV files under path_to_data are loaded instead.
# NOTE: the two loaders key their dicts differently - load_all_from_csv uses
# '<year>', get_all_match_data uses '<year>-<round>'.
# data = get_all_match_data(headers)
path_to_data = '../data/matchinfo/'
data = load_all_from_csv(path_to_data)

100%|██████████| 28/28 [00:00<00:00, 95.61it/s]


In [45]:
def save_to_csv(match_data=None, out_dir='../data/matchinfo/'):
    """Write one CSV per season (1996-2023) from a dict of match frames.

    Both key layouts used in this notebook are accepted:

    * ``'<year>'`` - one frame per season; written as is.
    * ``'<year>-<round>'`` - one frame per round; the rounds of a season
      (1..28) are stacked on top of each other before writing.

    The previous version concatenated ``data['<year>']`` with itself once
    per round, writing every row 28 times.

    Parameters
    ----------
    match_data : dict, optional
        Frames to save. Defaults to the notebook-level ``data`` dict, which
        keeps the original zero-argument call working.
    out_dir : str, optional
        Prefix prepended to ``'<year>.csv'`` (include the trailing
        separator).

    Seasons with no frames, or only empty frames, produce no file.
    """
    if match_data is None:
        match_data = data
    years = range(1996, 2024)
    rounds = range(1, 29)
    for year in years:
        season_key = '{}'.format(year)
        if season_key in match_data:
            frames = [match_data[season_key]]
        else:
            # Stack each round of the year on top of each other.
            round_keys = ('{}-{}'.format(year, round_number) for round_number in rounds)
            frames = [match_data[key] for key in round_keys if key in match_data]

        # Drop empty frames; pd.concat also refuses an empty list, so a
        # season with nothing to write is skipped outright.
        frames = [frame for frame in frames if not frame.empty]
        if not frames:
            continue
        pd.concat(frames).to_csv('{}{}.csv'.format(out_dir, year), index=False)

## Now add to database

In [46]:
import sqlite3

# Connection to the SQLite database file at ../afl.db; the cells below use it
# to recreate and fill the `game` table.
conn = sqlite3.connect('../afl.db')

In [47]:
# DDL for the `game` table, keyed by game_id (the primary key).
# NOTE(review): `round` is declared INTEGER, but the cell that later rebuilds
# df['round'] derives it from roundname strings (e.g. 'Semi Final'), so the
# column will not hold only integers - confirm this is intended.
MATCHINFO = '''
CREATE TABLE game (
    round INTEGER,
    agoals INTEGER,
    game_id INTEGER PRIMARY KEY,
    unixtime INTEGER,
    date TEXT,
    hteamid INTEGER,
    ateam TEXT,
    tz TEXT,
    venue TEXT,
    localtime TEXT,
    winner TEXT, -- draw if winner = None
    complete INTEGER,
    hbehinds INTEGER,
    ascore INTEGER,
    roundname TEXT,
    year INTEGER,
    is_final INTEGER,
    hscore INTEGER,
    abehinds INTEGER,
    is_grand_final INTEGER,
    updated TEXT,
    hgoals INTEGER,
    timestr TEXT,
    hteam TEXT,
    winnerteamid REAL,
    ateamid INTEGER
);
'''

# Drop any existing table first so the CREATE below does not fail when this
# cell is re-run; all previously inserted rows are discarded.
conn.execute("DROP TABLE IF EXISTS game")

# Create the empty table; being the last expression, the returned cursor is
# what the cell displays.
conn.execute(MATCHINFO)

<sqlite3.Cursor at 0x1e56008f6c0>

In [48]:
# Stack the per-year frames on top of each other, tagging every row with its
# season in a `year` column.
years = range(1996, 2024)
rounds = range(1, 29)
yearly_frames = []
for year in years:
    # .assign returns a copy, so the cached frames in `data` are not mutated.
    yearly_frames.append(data['{}'.format(year)].assign(year=year))
# One concat over the whole list avoids re-copying the accumulated frame on
# every iteration.
df = pd.concat(yearly_frames)
# Rename id column to game_id
df = df.rename(columns={'id': 'game_id'})

In [49]:
# Derive `round` from `roundname`:
#   1. strip the "Round " text, so "Round 7" becomes "7";
#   2. rename the finals labels listed in `mapping`; every other value is
#      left exactly as it is.
mapping = {
    'Semi-Finals': 'Semi Final',
    'Preliminary Finals': 'Preliminary Final',
}

round_labels = df['roundname'].str.replace('Round ', '')
df['round'] = round_labels.replace(mapping)


In [50]:
# Append the rows of df to the existing `game` table; the DataFrame index is
# not written as a column.
df.to_sql(name='game', con=conn, if_exists='append', index=False)

5419

In [51]:
# Check the data has been inserted: let SQLite count the rows instead of
# fetching every row into Python just to measure the list.
print(conn.execute('SELECT COUNT(*) FROM game').fetchone()[0])

5419


In [55]:
# Read the whole `game` table back into a DataFrame and preview its first row.
dbdata = pd.read_sql(sql='SELECT * FROM game', con=conn)
dbdata.head(1)

Unnamed: 0,round,agoals,game_id,unixtime,date,hteamid,ateam,tz,venue,localtime,winner,complete,hbehinds,ascore,roundname,year,is_final,hscore,abehinds,is_grand_final,updated,hgoals,timestr,hteam,winnerteamid,ateamid
0,1,20,1,1490257200,2017-03-23 19:20:00,3,Richmond,+11:00,M.C.G.,2017-03-23 19:20:00,Richmond,100,5,132,Round 1,2017,0,89,12,0,2017-04-15 15:59:16,14,Full Time,Carlton,14.0,14
