In [None]:
# import dependencies
import pandas as pd
import nba_api
import time
from nba_api.stats.static import players, teams
from nba_api.stats.endpoints import commonplayerinfo, teamgamelogs, playercareerstats, playergamelogs, \
leaguegamelog, leaguegamefinder, playerdashboardbyyearoveryear
from nba_api.stats.library.parameters import SeasonAll
from itertools import zip_longest
import os

In [None]:
# Browser-like HTTP request headers aimed at the stats.nba.com host.
# NOTE(review): no visible cell passes this dict to an endpoint call — confirm
# whether it is still required or should be handed to the requests below.
custom_headers = {
    'Host': 'stats.nba.com',
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
    'Upgrade-Insecure-Requests': '1',
    # long values are split with implicit string concatenation; the resulting
    # strings are unchanged
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/73.0.3683.86 Safari/537.36'
    ),
    'Accept': (
        'text/html,application/xhtml+xml,application/xml;q=0.9,'
        'image/webp,image/apng,*/*;q=0.8,'
        'application/signed-exchange;v=b3'
    ),
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.9',
}

In [None]:
# Get all players: a sequence with one dict per player (later cells read the
# 'id' and 'full_name' keys of each entry).
all_players_dict = players.get_players()
# Preview a few entries only; echoing the whole collection floods the cell output.
all_players_dict[:5]

In [None]:
# Turn the player dicts into a DataFrame, one row per dict.
# json_normalize flattens nested keys into columns; it does not melt/unpivot the data.
all_players_df = pd.json_normalize(all_players_dict)
all_players_df

In [None]:
# Get all teams (one dict per team; later cells read 'full_name' and 'id').
all_teams_dict = teams.get_teams()

# Turn the team dicts into a DataFrame, one row per dict.
all_teams_df = pd.json_normalize(all_teams_dict)

In [None]:
# preview the first rows of the teams table
all_teams_df.head()

In [None]:
# Look up a single player by exact full name, using LeBron James as the example.
# Taking element [0] raises IndexError when nothing matches.
lebron_matches = [entry for entry in all_players_dict if entry['full_name'] == 'LeBron James']
bron = lebron_matches[0]
bron

In [None]:
# LeBron's player id, converted to a string before it is passed to the endpoint below.
bron_id = str(bron['id'])
bron_id

In [None]:
# Look up the Golden State Warriors entry by exact full name.
# Taking element [0] raises IndexError when nothing matches.
warriors_matches = [entry for entry in all_teams_dict if entry['full_name'] == "Golden State Warriors"]
gsw = warriors_matches[0]
gsw

In [None]:
# GSW team id, kept in whatever type the team entry stores it as.
gsw_id = gsw['id']
gsw_id

In [None]:
# Request LeBron's year-over-year dashboard from the API.
# The last line echoes the endpoint object itself; the tables are extracted in the next cell.
gamelog_bron = playerdashboardbyyearoveryear.PlayerDashboardByYearOverYear(player_id = bron_id)
gamelog_bron

In [None]:
# The endpoint result is not tabular yet, so convert it into DataFrames and keep one.
# [1] selects the second frame returned by the endpoint.
# NOTE(review): presumably the per-season table (a later cell inspects GROUP_VALUE) — confirm
# against the endpoint documentation.
gamelog_bron_df = gamelog_bron.get_data_frames()[1]
gamelog_bron_df

In [None]:
# Inspect which features/stats the API returns: print how many columns there are, then list them.
# `columns` is reused later to build the empty accumulator frame.
columns = gamelog_bron_df.columns
print(len(columns))
columns

In [None]:
# Collect every player's id into a list so each player's stats can be requested in turn.
player_ids = [player_entry['id'] for player_entry in all_players_dict]
# Show a short sample rather than printing the entire list.
player_ids[:10]

In [None]:
# how many player ids were collected
print(len(player_ids))

In [None]:
# grouper batches the player ids into fixed-size tuples so the API can be
# queried in smaller pieces instead of all at once.
def grouper(iterable, n, fillvalue=None):
    """Return an iterator of n-item tuples drawn from ``iterable``.

    The same underlying iterator is handed to ``zip_longest`` n times, so each
    output tuple consumes the next n items. When the input runs out mid-tuple,
    the remaining slots are padded with ``fillvalue``.
    """
    shared_iter = iter(iterable)
    return zip_longest(*(shared_iter for _ in range(n)), fillvalue=fillvalue)

In [None]:
# chunker splits a sequence into consecutive slices; unlike grouper it does not
# pad the last piece, and each piece has the same type as the input sequence.
def chunker(seq, size):
    """Return a lazy generator of consecutive slices of ``seq``, each up to ``size`` long.

    The final slice is shorter when ``len(seq)`` is not a multiple of ``size``.
    """
    starts = range(0, len(seq), size)
    return (seq[start:start + size] for start in starts)

In [None]:
# Check that chunker works, using a small sample of ids so the cell output stays
# short (chunking every id prints one line per five players).
for chunk in chunker(player_ids[:12], 5):
    print(chunk)

In [None]:
# Check that grouper works, using a small sample of ids so the cell output stays
# short; 25 ids in groups of 10 also shows the padding of the last tuple.
for group in grouper(player_ids[:25], 10):
    print(group)

In [None]:
# GSW_games = leaguegamefinder.LeagueGameFinder(team_id_nullable=gsw_id).get_data_frames()[0]
# GSW_games.head()

In [None]:
# Empty accumulator frame that the per-player results are appended to.
# It has the same columns as the sample result plus a trailing 'id' column,
# because the fetch loops tag every fetched row with the player id.
gamelog_players_df = pd.DataFrame(columns = columns).assign(id = '')
gamelog_players_df

In [None]:
# Connect to the API and fetch the stats of the first 100 players.
# The slice starts at index 0: `[1:100]` skipped the very first player, and no
# later chunk covers that index either.
# NOTE: re-running this cell appends the same players again.
fetched_frames = []
try:
    for player in player_ids[:100]:
        gamelog_players = playerdashboardbyyearoveryear.PlayerDashboardByYearOverYear(player_id = player)
        temp_df = gamelog_players.get_data_frames()[1]
        # tag every row with the player id it was fetched for
        temp_df['id'] = player
        fetched_frames.append(temp_df)
finally:
    # Concatenate once per chunk instead of once per player (no repeated copying
    # of the growing frame); doing it in `finally` keeps whatever was fetched
    # even if a request fails part-way through.
    gamelog_players_df = pd.concat([gamelog_players_df, *fetched_frames], ignore_index = True)

gamelog_players_df.sample(5)

In [None]:
# check that rows were actually pulled from the API: (row count, column count)
gamelog_players_df.shape

In [None]:
# Fetch a bigger chunk (players 100-499) now that the small one worked without a timeout.
# NOTE: re-running this cell appends the same players again.
fetched_frames = []
try:
    for player in player_ids[100:500]:
        gamelog_players = playerdashboardbyyearoveryear.PlayerDashboardByYearOverYear(player_id = player)
        temp_df = gamelog_players.get_data_frames()[1]
        # tag every row with the player id it was fetched for
        temp_df['id'] = player
        fetched_frames.append(temp_df)
finally:
    # Concatenate once per chunk instead of once per player; doing it in
    # `finally` keeps whatever was fetched even if a request fails part-way.
    gamelog_players_df = pd.concat([gamelog_players_df, *fetched_frames], ignore_index = True)

gamelog_players_df.sample(5)

In [None]:
# (row count, column count) after the second chunk
gamelog_players_df.shape

In [None]:
# Push the chunk size further: players 500-1999.
# NOTE: re-running this cell appends the same players again.
fetched_frames = []
try:
    for player in player_ids[500:2000]:
        gamelog_players = playerdashboardbyyearoveryear.PlayerDashboardByYearOverYear(player_id = player)
        temp_df = gamelog_players.get_data_frames()[1]
        # tag every row with the player id it was fetched for
        temp_df['id'] = player
        fetched_frames.append(temp_df)
finally:
    # Concatenate once per chunk instead of once per player; doing it in
    # `finally` keeps whatever was fetched even if a request fails part-way.
    gamelog_players_df = pd.concat([gamelog_players_df, *fetched_frames], ignore_index = True)

gamelog_players_df.tail(5)

In [None]:
# confirm the frame keeps growing: (row count, column count)
gamelog_players_df.shape

In [None]:
# Another chunk of 1500: players 2000-3499.
# NOTE: re-running this cell appends the same players again.
fetched_frames = []
try:
    for player in player_ids[2000:3500]:
        gamelog_players = playerdashboardbyyearoveryear.PlayerDashboardByYearOverYear(player_id = player)
        temp_df = gamelog_players.get_data_frames()[1]
        # tag every row with the player id it was fetched for
        temp_df['id'] = player
        fetched_frames.append(temp_df)
finally:
    # Concatenate once per chunk instead of once per player; doing it in
    # `finally` keeps whatever was fetched even if a request fails part-way.
    gamelog_players_df = pd.concat([gamelog_players_df, *fetched_frames], ignore_index = True)

gamelog_players_df.tail(5)

In [None]:
# (row count, column count) after the fourth chunk
gamelog_players_df.shape

In [None]:
# Last chunk: every remaining player id from index 3500 onward.
# NOTE: re-running this cell appends the same players again.
fetched_frames = []
try:
    for player in player_ids[3500:]:
        gamelog_players = playerdashboardbyyearoveryear.PlayerDashboardByYearOverYear(player_id = player)
        temp_df = gamelog_players.get_data_frames()[1]
        # tag every row with the player id it was fetched for
        temp_df['id'] = player
        fetched_frames.append(temp_df)
finally:
    # Concatenate once per chunk instead of once per player; doing it in
    # `finally` keeps whatever was fetched even if a request fails part-way.
    gamelog_players_df = pd.concat([gamelog_players_df, *fetched_frames], ignore_index = True)

gamelog_players_df.tail(5)

In [None]:
# final size of the data set: (row count, column count)
gamelog_players_df.shape

In [None]:
# Save the data to a CSV in the current working directory so the API does not
# have to be queried again. The header row is written; the row index is not.
pwd = os.getcwd()
save_file_path = os.path.join(pwd,'playerstatsbyseason.csv')
gamelog_players_df.to_csv(save_file_path, index = False)

In [None]:
# check the dtype of every column before converting 'id'
gamelog_players_df.dtypes

In [None]:
# Convert the 'id' column to integers (via its string form) so rows can be
# filtered by numeric player id with .loc.
id_as_text = gamelog_players_df["id"].astype(str)
gamelog_players_df["id"] = id_as_text.astype(int)

In [None]:
# re-check the dtypes to confirm 'id' is now an integer column
gamelog_players_df.dtypes

In [None]:
# Check how far back the data goes by listing the distinct GROUP_VALUE entries.
# NOTE(review): GROUP_VALUE presumably holds the season label — confirm against the API output.
gamelog_players_df['GROUP_VALUE'].unique()

In [None]:
# Push the data into our AWS-hosted PostgreSQL database for the team to use.
from sqlalchemy import create_engine
import psycopg2  # not referenced directly; kept so a missing driver fails here, up front
import io
from config import password

DB_address = 'nbadb.ca9dadq6ltaa.us-east-2.rds.amazonaws.com'
engine = create_engine(f'postgresql://team:{password}@{DB_address}:5432/NBA_database')
# drops the old table and creates a new empty one with the frame's columns
gamelog_players_df.head(0).to_sql('playerstatsbyseason', engine, if_exists='replace',index=False)

# Serialize the rows into the in-memory buffer that COPY reads from.
# Previously the CSV was written to `save_file_path` instead, which left the
# buffer empty (nothing was loaded) and overwrote the saved CSV without a header.
# Tab-separated because copy_from's default column separator is a tab.
# NOTE: values containing tabs, newlines or backslashes would need escaping.
output = io.StringIO()
gamelog_players_df.to_csv(output, sep='\t', header=False, index=False)
output.seek(0)  # rewind so copy_from reads from the start of the buffer

conn = engine.raw_connection()
try:
    cur = conn.cursor()
    try:
        # null="": empty fields in the buffer are loaded as SQL NULL
        cur.copy_from(output, 'playerstatsbyseason', null="")
        conn.commit()
    finally:
        cur.close()
finally:
    # always release the connection, even when the load fails
    conn.close()