In [5]:
import sqlite3
import pandas as pd
import datetime as dt

In [3]:
# Open the SQLite database file used by every query in this notebook
# (sqlite3.connect creates the file if it does not exist yet) and keep a
# cursor on that connection available for raw SQL.
DB_FILE = 'nbadata.db'
connection = sqlite3.connect(DB_FILE)
cursor = connection.cursor()

In [4]:
# IPython shell escape: run `ls` to list the files in the current working directory.
# Presumably a sanity check that the CSV inputs and nbadata.db are present -- confirm.
!ls

2018 NBA Hackathon Application - Business Track Prompt.pdf
game_data.csv
NBA Business Analytics.ipynb
nbadata.db
player_data.csv
test_set.csv
training_set.csv


In [7]:
# Stream training_set.csv into the SQLite table 'test_data' chunk by chunk, so the
# whole file never has to be held in memory at once.
# NOTE(review): the table is named 'test_data' although it is filled from
# training_set.csv; the name is kept because the queries below read from it.
start = dt.datetime.now()
chunksize = 20000
j = 0             # number of chunks written so far
index_start = 1   # 1-based row label for the first row of the next chunk
rows_written = 0  # rows actually written (the final chunk may be shorter than chunksize)

# Columns to keep (names as they appear after spaces are stripped); loop-invariant
columns = ['Season', 'Game_ID', 'Game_Date', 'Away_Team', 'Home_Team', 'Country', 'RoundedViewers']

for df in pd.read_csv('training_set.csv', chunksize=chunksize, iterator=True, encoding='utf-8'):

    df = df.rename(columns={c: c.replace(' ', '') for c in df.columns}) # Remove spaces from columns

    df['Game_Date'] = pd.to_datetime(df['Game_Date']) # Convert to datetimes

    # Label rows explicitly and contiguously. Chunks from read_csv already carry a
    # running index, so adding index_start on top of it (as before) left gaps
    # between chunks in the stored 'index' column.
    df.index = pd.RangeIndex(index_start, index_start + len(df))

    # Drop everything that is not in the keep-list, in one call
    df = df.drop([c for c in df.columns if c not in columns], axis=1)

    # Replace the table on the first chunk so re-running this cell does not
    # duplicate every row; append for all following chunks.
    df.to_sql('test_data', connection, if_exists='replace' if j == 0 else 'append')

    j += 1
    rows_written += len(df)
    index_start += len(df) # update index start

    # Report the real row count and the full elapsed time (timedelta.seconds drops days)
    elapsed_seconds = int((dt.datetime.now() - start).total_seconds())
    print('{} seconds: completed {} rows'.format(elapsed_seconds, rows_written))

3 seconds: completed 5000 rows
6 seconds: completed 10000 rows
9 seconds: completed 15000 rows
13 seconds: completed 20000 rows
16 seconds: completed 25000 rows
19 seconds: completed 30000 rows
23 seconds: completed 35000 rows
26 seconds: completed 40000 rows
29 seconds: completed 45000 rows
33 seconds: completed 50000 rows
36 seconds: completed 55000 rows
39 seconds: completed 60000 rows
43 seconds: completed 65000 rows
46 seconds: completed 70000 rows
49 seconds: completed 75000 rows
53 seconds: completed 80000 rows
56 seconds: completed 85000 rows
59 seconds: completed 90000 rows
63 seconds: completed 95000 rows
66 seconds: completed 100000 rows
69 seconds: completed 105000 rows
73 seconds: completed 110000 rows
76 seconds: completed 115000 rows
80 seconds: completed 120000 rows
83 seconds: completed 125000 rows
87 seconds: completed 130000 rows
91 seconds: completed 135000 rows
95 seconds: completed 140000 rows
99 seconds: completed 145000 rows
103 seconds: completed 150000 rows
10

In [16]:
# Preview the first five rows of the loaded table. The limit is applied in SQL so
# only those rows are fetched, instead of reading the whole table and then
# discarding all but the head.
pd.read_sql_query('SELECT * FROM test_data LIMIT 5', connection)

Unnamed: 0,index,Season,Game_ID,Game_Date,Away_Team,Home_Team,Country,RoundedViewers
0,1,2016-17,21600001,2016-10-25 00:00:00,NYK,CLE,C113,18
1,2,2016-17,21600001,2016-10-25 00:00:00,NYK,CLE,C193,1
2,3,2016-17,21600001,2016-10-25 00:00:00,NYK,CLE,C126,4
3,4,2016-17,21600001,2016-10-25 00:00:00,NYK,CLE,C163,11
4,5,2016-17,21600001,2016-10-25 00:00:00,NYK,CLE,C73,3


In [17]:
# Total Viewership by Game
# Sums RoundedViewers over all rows of each game and previews the first five games.
# ORDER BY makes the preview deterministic (row order is otherwise unspecified) and
# LIMIT keeps the preview from materialising every game in pandas first.
# NOTE(review): Game_Date, Away_Team and Home_Team are selected without being
# grouped or aggregated; SQLite permits this and takes them from a row of the
# group -- assumes they are constant per Game_ID, confirm against the data.
pd.read_sql_query(
    'SELECT Game_ID, Game_Date, Away_Team, Home_Team, sum(RoundedViewers) '
    'FROM test_data GROUP BY Game_ID ORDER BY Game_ID LIMIT 5',
    connection,
)

Unnamed: 0,Game_ID,Game_Date,Away_Team,Home_Team,sum(RoundedViewers)
0,21600001,2016-10-25 00:00:00,NYK,CLE,115382
1,21600002,2016-10-25 00:00:00,UTA,POR,81480
2,21600003,2016-10-25 00:00:00,SAS,GSW,120856
3,21600004,2016-10-26 00:00:00,MIA,ORL,45512
4,21600005,2016-10-26 00:00:00,DAL,IND,67844


In [14]:
# Total viewership per home team, largest first.
# The SQL is assembled from adjacent string literals, one clause per line.
home_team_views_sql = (
    'SELECT Home_Team, sum(RoundedViewers) as TotalViews '
    'FROM test_data '
    'GROUP BY Home_Team '
    'ORDER BY TotalViews desc'
)
pd.read_sql_query(sql=home_team_views_sql, con=connection)

Unnamed: 0,Home_Team,TotalViews
0,CLE,2321753
1,GSW,2203672
2,OKC,1842342
3,BOS,1548213
4,LAL,1523875
5,NYK,1327579
6,HOU,1325614
7,SAS,1319997
8,PHI,1293910
9,TOR,1287139


In [19]:
# Total viewership per away team, largest first.
# The SQL is assembled from adjacent string literals, one clause per line.
away_team_views_sql = (
    'SELECT Away_Team, sum(RoundedViewers) as TotalViews '
    'FROM test_data '
    'GROUP BY Away_Team '
    'ORDER BY TotalViews desc'
)
pd.read_sql_query(sql=away_team_views_sql, con=connection)

Unnamed: 0,Away_Team,TotalViews
0,GSW,2656232
1,CLE,2299218
2,OKC,1721317
3,SAS,1467414
4,BOS,1460833
5,HOU,1415267
6,LAL,1353337
7,NYK,1260201
8,PHI,1198370
9,MIN,1183195
