In [1]:
import os
import sys
from urllib.parse import quote

import numpy as np
import pandas as pd
import psycopg2
import psycopg2.extras as extras
from sqlalchemy import create_engine, inspect

from password import password

### Load CSVs to Dataframes

In [2]:
# load data/team_info.csv into a DataFrame and preview the first five rows
team_info_df = pd.read_csv(filepath_or_buffer="data/team_info.csv")
team_info_df.head(n=5)

Unnamed: 0,team_id,franchiseId,shortName,teamName,abbreviation,link
0,1,23,New Jersey,Devils,NJD,/api/v1/teams/1
1,4,16,Philadelphia,Flyers,PHI,/api/v1/teams/4
2,26,14,Los Angeles,Kings,LAK,/api/v1/teams/26
3,14,31,Tampa Bay,Lightning,TBL,/api/v1/teams/14
4,6,6,Boston,Bruins,BOS,/api/v1/teams/6


In [3]:
# load data/game.csv into a DataFrame and preview the first five rows
game_df = pd.read_csv(filepath_or_buffer="data/game.csv")
game_df.head(n=5)

Unnamed: 0,game_id,season,type,date_time_GMT,away_team_id,home_team_id,away_goals,home_goals,outcome,home_rink_side_start,venue,venue_link,venue_time_zone_id,venue_time_zone_offset,venue_time_zone_tz
0,2016020045,20162017,R,2016-10-19,4,16,4,7,home win REG,right,United Center,/api/v1/venues/null,America/Chicago,-5,CDT
1,2017020812,20172018,R,2018-02-07,24,7,4,3,away win OT,left,KeyBank Center,/api/v1/venues/null,America/New_York,-4,EDT
2,2015020314,20152016,R,2015-11-24,21,52,4,1,away win REG,right,MTS Centre,/api/v1/venues/null,America/Winnipeg,-5,CDT
3,2015020849,20152016,R,2016-02-17,52,12,1,2,home win REG,right,PNC Arena,/api/v1/venues/null,America/New_York,-4,EDT
4,2017020586,20172018,R,2017-12-30,20,24,1,2,home win REG,left,Honda Center,/api/v1/venues/null,America/Los_Angeles,-7,PDT


In [4]:
# load data/game_plays.csv into a DataFrame and preview the first five rows
game_plays_df = pd.read_csv(filepath_or_buffer="data/game_plays.csv")
game_plays_df.head(n=5)

Unnamed: 0,play_id,game_id,team_id_for,team_id_against,event,secondaryType,x,y,period,periodType,periodTime,periodTimeRemaining,dateTime,goals_away,goals_home,description,st_x,st_y
0,2016020045_1,2016020045,,,Game Scheduled,,,,1,REGULAR,0,1200.0,2016-10-18 23:40:58,0,0,Game Scheduled,,
1,2016020045_2,2016020045,,,Period Ready,,,,1,REGULAR,0,1200.0,2016-10-19 01:35:28,0,0,Period Ready,,
2,2016020045_3,2016020045,,,Period Start,,,,1,REGULAR,0,1200.0,2016-10-19 01:40:50,0,0,Period Start,,
3,2016020045_4,2016020045,16.0,4.0,Faceoff,,0.0,0.0,1,REGULAR,0,1200.0,2016-10-19 01:40:50,0,0,Jonathan Toews faceoff won against Claude Giroux,0.0,0.0
4,2016020045_5,2016020045,16.0,4.0,Shot,Wrist Shot,-71.0,9.0,1,REGULAR,54,1146.0,2016-10-19 01:41:44,0,0,Artem Anisimov Wrist Shot saved by Michal Neuv...,71.0,-9.0


In [5]:
# load data/game_teams_stats.csv into a DataFrame and preview the first five rows
game_teams_stats_df = pd.read_csv(filepath_or_buffer="data/game_teams_stats.csv")
game_teams_stats_df.head(n=5)

Unnamed: 0,game_id,team_id,HoA,won,settled_in,head_coach,goals,shots,hits,pim,powerPlayOpportunities,powerPlayGoals,faceOffWinPercentage,giveaways,takeaways,blocked,startRinkSide
0,2016020045,4,away,False,REG,Dave Hakstol,4.0,27.0,30.0,6.0,4.0,2.0,50.9,12.0,9.0,11.0,left
1,2016020045,16,home,True,REG,Joel Quenneville,7.0,28.0,20.0,8.0,3.0,2.0,49.1,16.0,8.0,9.0,left
2,2017020812,24,away,True,OT,Randy Carlyle,4.0,34.0,16.0,6.0,3.0,1.0,43.8,7.0,4.0,14.0,right
3,2017020812,7,home,False,OT,Phil Housley,3.0,33.0,17.0,8.0,2.0,1.0,56.2,5.0,6.0,14.0,right
4,2015020314,21,away,True,REG,Patrick Roy,4.0,29.0,17.0,9.0,3.0,1.0,45.7,13.0,5.0,20.0,left


In [6]:
# load data/game_scratches.csv into a DataFrame and preview the first five rows
game_scratches_df = pd.read_csv(filepath_or_buffer="data/game_scratches.csv")
game_scratches_df.head(n=5)

Unnamed: 0,game_id,team_id,player_id
0,2016020045,16,8477845
1,2016020045,16,8477451
2,2016020045,16,8465058
3,2016020045,4,8476393
4,2016020045,4,8475462


In [7]:
# load data/game_officials.csv into a DataFrame and preview the first five rows
game_officials_df = pd.read_csv(filepath_or_buffer="data/game_officials.csv")
game_officials_df.head(n=5)

Unnamed: 0,game_id,official_name,official_type
0,2016020045,Dan O'Rourke,Referee
1,2016020045,Trevor Hanson,Referee
2,2016020045,Scott Driscoll,Linesman
3,2016020045,Lonnie Cameron,Linesman
4,2017020812,Justin St. Pierre,Referee


In [8]:
# load data/game_shifts.csv into a DataFrame and preview the first five rows
game_shifts_df = pd.read_csv(filepath_or_buffer="data/game_shifts.csv")
game_shifts_df.head(n=5)

Unnamed: 0,game_id,player_id,period,shift_start,shift_end
0,2018020001,8466139,1,0,42.0
1,2018020001,8466139,1,207,247.0
2,2018020001,8466139,1,375,413.0
3,2018020001,8466139,1,556,574.0
4,2018020001,8466139,1,605,631.0


In [9]:
# load data/game_penalties.csv into a DataFrame and preview the first five rows
game_penalties_df = pd.read_csv(filepath_or_buffer="data/game_penalties.csv")
game_penalties_df.head(n=5)

Unnamed: 0,play_id,penaltySeverity,penaltyMinutes
0,2016020045_41,Minor,2
1,2016020045_101,Minor,2
2,2016020045_134,Minor,2
3,2016020045_174,Minor,2
4,2016020045_189,Minor,2


In [10]:
# load data/game_skater_stats.csv into a DataFrame and preview the first five rows
game_skater_stats_df = pd.read_csv(filepath_or_buffer="data/game_skater_stats.csv")
game_skater_stats_df.head(n=5)

Unnamed: 0,game_id,player_id,team_id,timeOnIce,assists,goals,shots,hits,powerPlayGoals,powerPlayAssists,...,faceoffTaken,takeaways,giveaways,shortHandedGoals,shortHandedAssists,blocked,plusMinus,evenTimeOnIce,shortHandedTimeOnIce,powerPlayTimeOnIce
0,2016020045,8468513,4,955,1,0,0,2.0,0,0,...,0,1.0,1.0,0,0,1.0,1,858,97,0
1,2016020045,8476906,4,1396,1,0,4,2.0,0,0,...,0,1.0,2.0,0,0,2.0,0,1177,0,219
2,2016020045,8474668,4,915,0,0,1,1.0,0,0,...,0,2.0,0.0,0,0,0.0,-1,805,0,110
3,2016020045,8473512,4,1367,3,0,0,0.0,0,2,...,27,0.0,0.0,0,0,0.0,-1,1083,19,265
4,2016020045,8471762,4,676,0,0,3,2.0,0,0,...,0,0.0,1.0,0,0,0.0,-1,613,63,0


### Connection to Postgres Database

In [11]:
# create connector
# percent-encode the password so characters such as '@', ':', '/' or '%'
# are not misread as URL structure when SQLAlchemy parses the string
encoded_password = quote(password, safe="")
engine = create_engine(f'postgresql://postgres:{encoded_password}@localhost:5432/hockey_db')

# display tables in database
# (Engine.table_names() was deprecated in SQLAlchemy 1.4 and removed in 2.0;
#  the inspector API is available in both)
inspect(engine).get_table_names()

['game',
 'game_plays',
 'game_teams_stats',
 'game_scratches',
 'team_info',
 'game_officials',
 'game_shifts',
 'game_penalties',
 'game_skater_stats']

### Load Dataframes into Postgres Database

###### Team Info table

In [12]:
# append the team_info rows to public.team_info; the DataFrame index is not written
team_info_df.to_sql(name='team_info', con=engine, schema='public',
                    if_exists='append', index=False)

###### Game table

In [13]:
# keep only the first row seen for each game_id
game_df = game_df.drop_duplicates(subset=['game_id'], keep='first')

In [14]:
# append the game rows to public.game; the DataFrame index is not written
game_df.to_sql(name='game', con=engine, schema='public',
               if_exists='append', index=False)

###### Game Plays table

In [15]:
# keep only the first row seen for each play_id
game_plays_df = game_plays_df.drop_duplicates(subset=['play_id'], keep='first')

In [16]:
# this one takes forever to load - takes 15-20 minutes using pd.to_sql()
# using execute_values() at bottom of notebook, which runs 4x-5x faster


# load df to sql db table
# game_plays_df.to_sql(
#     'game_plays',
#     con = engine,
#     schema = 'public',
#     if_exists = 'append',
#     index = False,
#     method = 'multi',
#     chunksize = 25000
# )

###### Game Teams Stats table

In [17]:
# append the game_teams_stats rows to public.game_teams_stats; the DataFrame index is not written
game_teams_stats_df.to_sql(name='game_teams_stats', con=engine, schema='public',
                           if_exists='append', index=False)

###### Game Scratches table

In [18]:
# append the game_scratches rows to public.game_scratches; the DataFrame index is not written
game_scratches_df.to_sql(name='game_scratches', con=engine, schema='public',
                         if_exists='append', index=False)

###### Game Officials table

In [19]:
# append the game_officials rows to public.game_officials; the DataFrame index is not written
game_officials_df.to_sql(name='game_officials', con=engine, schema='public',
                         if_exists='append', index=False)

###### Game Shifts table

In [20]:
# this one takes forever to load - took 19 minutes
# using execute_values() at bottom of notebook, which runs 4x-5x faster

# load df to sql db table
# game_shifts_df.to_sql(
#     name = 'game_shifts',
#     con = engine,
#     schema = 'public',
#     if_exists = 'append',
#     index = False,
#     method = 'multi',
#     chunksize = 25000
# )

###### Game Penalties table

In [21]:
# append the game_penalties rows to public.game_penalties; the DataFrame index is not written
game_penalties_df.to_sql(name='game_penalties', con=engine, schema='public',
                         if_exists='append', index=False)

###### Game Skater Stats table

In [22]:
# append the game_skater_stats rows to public.game_skater_stats using
# multi-row INSERT statements, 25000 rows per chunk; the DataFrame index is not written
game_skater_stats_df.to_sql(name="game_skater_stats", con=engine, schema='public',
                            if_exists='append', index=False,
                            method='multi', chunksize=25000)

# Alternative Method to Load to SQL (4-5x faster)

In [23]:
# keyword arguments handed to psycopg2.connect() by connect()
# next time, use config.py instead of password.py
param_dic = dict(
    host="localhost",
    database="hockey_db",
    user="postgres",
    password=password,
)

In [24]:
# create connection function which prints success/error
def connect(params):
    """Connect to the PostgreSQL database server.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded unchanged to ``psycopg2.connect``.

    Returns
    -------
    The connection object returned by ``psycopg2.connect``.

    Raises
    ------
    SystemExit
        With exit code 1 when the connection attempt raises; the error is
        printed before exiting.
    """
    print('Connecting to the PostgreSQL database...')
    try:
        # connect to the PostgreSQL server
        conn = psycopg2.connect(**params)
    except Exception as error:
        # psycopg2.DatabaseError derives from Exception, so catching
        # Exception covers both members of the original tuple
        print(error)
        sys.exit(1)
    print("Connection successful")
    return conn

In [25]:
# open the psycopg2 connection (also confirms the database is reachable)
conn = connect(params=param_dic)

Connecting to the PostgreSQL database...
Connection successful


In [26]:
# function for executing queries
def execute_query(conn, query):
    """Run a single SQL statement on ``conn`` and commit it.

    Parameters
    ----------
    conn
        An open DB-API connection (``cursor``, ``commit`` and ``rollback``
        are called on it).
    query : str
        The SQL text passed to ``cursor.execute``.

    Returns
    -------
    list
        The rows from ``cursor.fetchall()`` when the statement produced a
        result set.
    int
        ``0`` when the statement succeeded without a result set, ``1`` when
        it raised (the error is printed and the transaction rolled back).
    """
    cursor = conn.cursor()
    try:
        cursor.execute(query)
        # cursor.description is None exactly when the statement returned no
        # rows; testing it avoids calling fetchall() on e.g.
        # "INSERT ... SELECT", which has no result set to fetch
        if cursor.description is not None:
            ret = cursor.fetchall()
        else:
            ret = 0
        conn.commit()
    except Exception as error:
        print("Error: %s" % error)
        conn.rollback()
        return 1
    finally:
        # runs on both the success and the error path
        cursor.close()
    return ret

In [27]:
# function for reading dataframe, loading to sql database
def execute_values(conn, df, table):
    """Bulk-insert every row of ``df`` into ``table`` via psycopg2's execute_values.

    Parameters
    ----------
    conn
        An open psycopg2 connection.
    df : pandas.DataFrame
        Rows to insert; its column labels are used verbatim as the target
        column list.
    table : str
        Target table name, interpolated verbatim into the INSERT statement
        (not quoted or escaped, so it must come from trusted code).

    Returns
    -------
    None on success (after committing); ``1`` if the insert raised, in which
    case the error is printed and the transaction is rolled back.
    """
    # Cast to object so every cell is a native Python scalar (psycopg2 cannot
    # adapt numpy integer scalars, which to_numpy() yields for all-integer
    # frames), then swap missing values (NaN/NaT) for None so they are sent as
    # SQL NULL, matching what DataFrame.to_sql() does.
    records = df.astype(object).where(df.notna(), None)
    # create list of tuples from dataframe values
    tuples = list(records.itertuples(index=False, name=None))
    # comma-separated dataframe columns
    cols = ','.join(df.columns)
    # SQL query to execute; the doubled %% leaves a literal %s placeholder
    # for extras.execute_values to expand into the VALUES list
    query = "INSERT INTO %s(%s) VALUES %%s" % (table, cols)
    cursor = conn.cursor()
    try:
        extras.execute_values(cursor, query, tuples)
        conn.commit()
    except Exception as error:
        print("Error: %s" % error)
        conn.rollback()
        return 1
    finally:
        # runs on both the success and the error path
        cursor.close()
    print("CSV to SQL execute_values() process complete")

In [28]:
# bulk-insert game_plays_df into the game_plays table
# this runs in 4min 15sec, which is 3x-4x faster than to_sql()
execute_values(conn=conn, df=game_plays_df, table='game_plays')

CSV to SQL execute_values() process complete


In [29]:
# bulk-insert game_shifts_df into the game_shifts table
# this runs in 6min 1sec, which is 3-4x faster than to_sql()
execute_values(conn=conn, df=game_shifts_df, table='game_shifts')

CSV to SQL execute_values() process complete
