In [None]:
# import dependencies
import pandas as pd
import nba_api
import time
from nba_api.stats.static import players, teams
from nba_api.stats.endpoints import commonplayerinfo, teamgamelogs, playercareerstats, playergamelogs, \
leaguegamelog, leaguegamefinder, playerdashboardbyyearoveryear
from nba_api.stats.library.parameters import SeasonAll
from itertools import zip_longest
import os
import numpy as np

In [None]:
# Working directory of the notebook; the data file paths in later cells are built relative to it.
pwd = os.getcwd()

In [None]:
# Browser-style HTTP request headers addressed to the stats.nba.com host.
custom_headers = dict([
    ('Host', 'stats.nba.com'),
    ('Connection', 'keep-alive'),
    ('Cache-Control', 'max-age=0'),
    ('Upgrade-Insecure-Requests', '1'),
    ('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'),
    ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3'),
    ('Accept-Encoding', 'gzip, deflate, br'),
    ('Accept-Language', 'en-US,en;q=0.9'),
])

In [None]:
# Fetch every player record from nba_api's static `players` module.
# NOTE(review): despite the `_dict` suffix this is presumably a list of per-player
# dicts (it is handed to pd.json_normalize in the next cell) — confirm.
all_players_dict = players.get_players()
# Displays the whole object as the cell output.
all_players_dict

In [None]:
# Flatten the player records into a tabular DataFrame with pd.json_normalize
# (this is record flattening, not a wide-to-long "melt").
all_players_df = pd.json_normalize(all_players_dict)
all_players_df

In [None]:
# Read the salaries CSV from the Resources folder next to the notebook.
salaries_path = os.path.join(pwd, 'Resources', 'salaries_1985to2022.csv')
salaries_df = pd.read_csv(salaries_path)
salaries_df

In [None]:
# Salary rows whose season starts in 2000 or later.
modern_salaries_df = salaries_df[salaries_df['season_start'].ge(2000)]
modern_salaries_df

In [None]:
# Load the "modern" RAPTOR by-player CSV from the Resources folder.
rap_mod = os.path.join(pwd, 'Resources', 'modern_RAPTOR_by_player.csv')
mod_rap_df = pd.read_csv(rap_mod)

In [None]:
# Load the "historical" RAPTOR by-player CSV from the Resources folder.
rap_hist = os.path.join(pwd, 'Resources', 'historical_RAPTOR_by_player.csv')
hist_rap_df = pd.read_csv(rap_hist)

In [None]:
# Season coverage of the modern RAPTOR data: latest season first, then earliest.
print(mod_rap_df['season'].max(), mod_rap_df['season'].min(), sep='\n')

In [None]:
# Season coverage of the historical RAPTOR data: latest season first, then earliest.
print(hist_rap_df['season'].max(), hist_rap_df['season'].min(), sep='\n')

In [None]:
# Preview the first 15 rows of the historical RAPTOR data.
hist_rap_df.head(15)

In [None]:
# Preview the first 15 rows of the modern RAPTOR data.
mod_rap_df.head(15)

In [None]:
# Spot check: look up one player by name in the historical data to see
# whether it overlaps with what the modern data contains.
hist_rap_df[hist_rap_df['player_name'].eq('Alex Abrines')]

In [None]:
# Restrict the historical RAPTOR data to seasons 2000 and later.
two_thousands_rap_df = hist_rap_df[hist_rap_df['season'].ge(2000)]
two_thousands_rap_df

In [None]:
# Sanity check: the earliest season left after filtering should be 2000.
two_thousands_rap_df['season'].min()

In [None]:
# List the available columns before deciding which ones to drop.
list(two_thousands_rap_df.columns)

In [None]:
# Remove the RAPTOR columns that are not carried into the final table.
# Note: this reassigns two_thousands_rap_df, so re-running the cell without
# re-running the filter cell above raises KeyError for the missing columns.
two_thousands_rap_df = two_thousands_rap_df.drop(
    [
        'poss',
        'war_reg_season',
        'war_playoffs',
        'predator_offense',
        'predator_defense',
        'pace_impact',
        'predator_total',
    ],
    axis=1,
)

In [None]:
# Display the trimmed RAPTOR frame to confirm the columns were dropped.
two_thousands_rap_df

In [None]:
# Load the player main stats from the CSV in the Resources folder
# (reading the saved file means the stats do not have to be rebuilt each run).
path = os.path.join(pwd, 'Resources', 'playermainstats.csv')
main_stats = pd.read_csv(path)
main_stats

In [None]:
# Keep only seasons from 2000 onward. GROUP_VALUE is a string, so test its
# leading character. The earlier str.contains(r'2(?!$)') matched a "2" at any
# position except the last one, which also let through values such as "1992-93".
# na=False makes rows with a missing GROUP_VALUE drop out instead of producing
# a NaN mask that cannot be used for boolean indexing.
recent_stats = main_stats[main_stats['GROUP_VALUE'].str.startswith('2', na=False)]
recent_stats

In [None]:
# Load the salaries dataset.
# The folder name is spelled 'Resources' (it was misspelled here before), and the
# path built in this cell is the one actually passed to read_csv, so the cell no
# longer depends on a path variable defined in an earlier cell.
salary_path = os.path.join(pwd, 'Resources', 'salaries_1985to2022.csv')
salaries_df = pd.read_csv(salary_path)
salaries_df

In [None]:
# Salary rows for seasons that start in 2000 or later.
recent_salaries_df = salaries_df[salaries_df['season_start'].ge(2000)]
recent_salaries_df

In [None]:
# Preview the first rows of the filtered stats.
recent_stats.head(5)

In [None]:
# Split GROUP_VALUE on "-" into two integer columns; the first part is the
# season start year ('season'), the second part is labelled 'drop' because it
# is discarded in the next cell.
dates_df = (
    recent_stats['GROUP_VALUE']
    .str.split(pat="-", expand=True)
    .astype(int)
    .set_axis(['season', 'drop'], axis=1)
)
dates_df.head()

In [None]:
# Add the integer 'season' column to recent_stats and remove the original
# GROUP_VALUE string column it was derived from.
dates_df = dates_df.drop(columns=['drop'])
recent_stats = pd.concat([recent_stats, dates_df], axis=1).drop(columns=['GROUP_VALUE'])

In [None]:
# Confirm that 'season' was added and GROUP_VALUE removed.
recent_stats.head(5)

In [None]:
# Drop the 'season' and 'season_end' columns, then rename 'season_start' to
# 'season' so it lines up with the merge key used by the stats data.
recent_salaries_df = (
    recent_salaries_df
    .drop(['season', 'season_end'], axis=1)
    .rename(columns={'season_start': 'season'})
)
recent_salaries_df.head(5)

In [None]:
# Split the full name on whitespace into first and last name for easier merging.
# Only the first two tokens are kept; any further tokens are discarded.
# Columns 0 and 1 are selected explicitly instead of dropping columns 2 and 3,
# so this works no matter how many tokens the longest name in the data has.
# The column is accessed with ['name'] rather than attribute access.
name_split = recent_salaries_df['name'].str.split(expand=True)[[0, 1]]
name_split.columns = ['first_name', 'last_name']
name_split.head(5)

In [None]:
# Append the first_name / last_name columns to the salaries frame
# (column-wise concat, aligned on the row index).
recent_salaries_df = pd.concat([recent_salaries_df,name_split], axis = 1)
recent_salaries_df.head()

In [None]:
# Inner-join salaries with stats on season + first/last name; only rows with a
# match on all three keys in both frames are kept.
salary_stat = recent_salaries_df.merge(
    recent_stats,
    how='inner',
    on=['season', 'first_name', 'last_name'],
)

In [None]:
# Spot check the merged rows for first name 'LeBron'.
salary_stat[salary_stat['first_name'].eq('LeBron')]

In [None]:
# Report the merged frame's (rows, columns) and preview the first 15 rows.
print(salary_stat.shape)
salary_stat.head(15)

In [None]:
# Split the RAPTOR player_name on whitespace into first and last name.
# Only the first two tokens are kept; any further tokens are discarded.
# Columns 0 and 1 are selected explicitly instead of dropping columns 2 and 3,
# so this works no matter how many tokens the longest name in the data has.
rap_name_split = two_thousands_rap_df['player_name'].str.split(expand=True)[[0, 1]]
rap_name_split.columns = ['first_name', 'last_name']
rap_name_split.head(5)

In [None]:
# Append the first and last name columns to the recent RAPTOR data
# (column-wise concat, aligned on the row index).
two_thousands_rap_df = pd.concat([two_thousands_rap_df,rap_name_split], axis = 1)
two_thousands_rap_df.head()

In [None]:
# Inner-join the salary + stats frame with the RAPTOR data on season and
# first/last name, then report the result's shape and preview it.
final_df = salary_stat.merge(
    two_thousands_rap_df,
    how='inner',
    on=['season', 'first_name', 'last_name'],
)

print(final_df.shape)
final_df.head(15)

In [None]:
# Spot check the inner join using the rows whose first name is 'LeBron'.
final_df[final_df['first_name'].eq('LeBron')]

In [None]:
# List the merged frame's columns before pruning them.
list(final_df.columns)

In [None]:
# Remove identifier and duplicate-name columns that are not needed in the output.
# `columns=` already selects the axis, so no separate axis argument is passed.
columns_drop = ['id', 'player_name', 'player_id', 'name']
final_df = final_df.drop(columns=columns_drop)
final_df

In [None]:
# Show the remaining columns after the drop.
final_df.columns

In [None]:
# Put the columns in a reader-friendly order: identity and team first, then
# salary, box-score stats, and finally the RAPTOR / WAR metrics.
columns_reorder = [
    'season', 'team', 'TEAM_ABBREVIATION', 'first_name', 'last_name',
    'full_name', 'position', 'salary',
    'MIN', 'FG_PCT', 'FG3_PCT', 'FT_PCT',
    'REB', 'AST', 'TOV', 'STL', 'BLK', 'PTS',
    'mp', 'raptor_offense', 'raptor_defense', 'raptor_total', 'war_total',
]
final_df = final_df.loc[:, columns_reorder]
final_df

In [None]:
# Write the final table to salraptstats.csv in the working directory,
# without the DataFrame index column.
save_file_path = os.path.join(pwd,'salraptstats.csv')
final_df.to_csv(save_file_path, index = False)

In [None]:
# The RAPTOR authors provide an equation that calculates WAR from RAPTOR.
# If I'm interpreting it correctly, RAPTOR is the assessment of a player's play, while
# WAR is the cumulative value relative to a replacement player.
# "WAR" has proven to be a confusing name for an NBA stat because all of the metrics have
# a variation that attempts to estimate wins above replacement.

In [None]:
# next steps: 
# merge salaries with stats in Postgres from 2000-current
# load in merged data
# clean raptor data so only has raptor offense, defense, total, war total
# add cleaned raptor to salary stats table
# clean table so it is easy to read with only a few features
# build dashboard

In [None]:
# Store the final table in the Postgres database.
from sqlalchemy import create_engine
import psycopg2
import io
from config import password
import time

DB_address = 'nbadb.ca9dadq6ltaa.us-east-2.rds.amazonaws.com'
engine = create_engine(f'postgresql://team:{password}@{DB_address}:5432/NBA_database')

# Drop the old table and create a new EMPTY table that has final_df's columns.
# head(0) keeps the columns but no rows. Writing the full frame here and then
# appending the same data from the CSV below would store every row twice.
final_df.head(0).to_sql('salraptstats', engine, if_exists='replace', index=False)

# Import the saved CSV in chunks so a large file is never loaded all at once.
# Running count of rows written so far.
rows_imported = 0
# Reference point for the elapsed-time messages.
start_time = time.time()
for data in pd.read_csv(os.path.join(pwd, 'salraptstats.csv'), chunksize=1000000):

    # print out the range of rows that are being imported
    print(f'importing rows {rows_imported} to {rows_imported + len(data)}...', end='')
    data.to_sql(name='salraptstats', con=engine, if_exists='append', index=False)

    # advance the counter by the number of rows actually in this chunk
    rows_imported += len(data)
    # print that the rows have finished importing
    print(f'Done. {time.time()- start_time} total seconds elapsed')