In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import sqlite3
import requests

url = 'https://raw.githubusercontent.com/xlisouski/DataCoder/main/nba_salary.sqlite'

# Nos vamos a conectar a una base de SQLite
consulta = "SELECT name FROM sqlite_master WHERE type='table'"

# Descargar la base de datos desde la URL
response = requests.get(url)

# Guardar el contenido descargado en un archivo local
with open('nba_salary.sqlite', 'wb') as f:
    f.write(response.content)
    
# Conectar a la base de datos SQLite descargada
conexion = sqlite3.connect('nba_salary.sqlite')

# Levanto la data
tablas_disponibles = pd.read_sql_query(consulta,conexion)

# Cerrar la conexión
conexion.close()

In [3]:
tablas_disponibles

Unnamed: 0,name
0,NBA_season1718_salary
1,Seasons_Stats


In [4]:
con = sqlite3.connect("nba_salary.sqlite")
df = pd.read_sql_query('SELECT * FROM NBA_season1718_salary',con)
df1 = pd.read_sql_query('SELECT * From Seasons_Stats',con)
con.close()

In [5]:
df.head()

Unnamed: 0,X1,Player,Tm,season17_18
0,1.0,Stephen Curry,GSW,34682550.0
1,2.0,LeBron James,CLE,33285709.0
2,3.0,Paul Millsap,DEN,31269231.0
3,4.0,Gordon Hayward,BOS,29727900.0
4,5.0,Blake Griffin,DET,29512900.0


In [6]:
df1.head()

Unnamed: 0,X1,Year,Player,Pos,Age,Tm,G,GS,MP,PER,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,0.0,1950.0,Curly Armstrong,G-F,31.0,FTW,63.0,,,,...,0.705,,,,176.0,,,,217.0,458.0
1,1.0,1950.0,Cliff Barker,SG,29.0,INO,49.0,,,,...,0.708,,,,109.0,,,,99.0,279.0
2,2.0,1950.0,Leo Barnhorst,SF,25.0,CHS,67.0,,,,...,0.698,,,,140.0,,,,192.0,438.0
3,3.0,1950.0,Ed Bartels,F,24.0,TOT,15.0,,,,...,0.559,,,,20.0,,,,29.0,63.0
4,4.0,1950.0,Ed Bartels,F,24.0,DNN,13.0,,,,...,0.548,,,,20.0,,,,27.0,59.0


# Valores vacíos 

In [43]:
# NBA_season1718_salary no tiene vacíos 
df.isnull().values.any()

False

In [44]:
df.isnull().sum()

X1             0
Player         0
Tm             0
season17_18    0
dtype: int64

In [45]:
# Nulos en Seasons_Stats
nulos = df1.isnull().sum()
nulos_pct = nulos / df1.shape[0]
nulos_resumen = pd.DataFrame({'Cant_Nulos': nulos,
                              'Pct_Nulos': nulos_pct})
nulos_resumen = nulos_resumen.loc[nulos_resumen['Cant_Nulos'] > 0,]
nulos_resumen = nulos_resumen.sort_values(by = 'Cant_Nulos',ascending=False)
nulos_resumen

Unnamed: 0,Cant_Nulos,Pct_Nulos
blanl,24691,1.0
blank2,24691,1.0
USG%,24633,0.997651
TRB%,24436,0.989672
DRB%,24320,0.984974
BPM,24279,0.983314
TOV%,24241,0.981775
OBPM,24191,0.97975
DBPM,23975,0.971002
DRB,23956,0.970232


In [47]:
# Eliminar columnas con al menos 60% de vacíos 
col_del = nulos_resumen.index[nulos_resumen['Pct_Nulos']>0.6]
df1_sinnulls = df1.drop(labels = col_del, axis=1)
print(df1.shape,df1_sinnulls.shape)

(24691, 53) (24691, 29)


In [46]:
col_del

Index(['blanl', 'blank2', 'USG%', 'TRB%', 'DRB%', 'BPM', 'TOV%', 'OBPM',
       'DBPM', 'DRB', 'AST%', 'TOV', 'ORB%', 'ORB', 'STL', 'STL%', 'BLK%',
       'VORP', '3PAr', 'BLK', '3P%', '3PA', 'GS', '3P'],
      dtype='object')

# Seasons_Stats con año 2017

In [48]:
df1_2017 = df1_sinnulls[df1_sinnulls['Year'] == 2017]
df1_2017.head()

Unnamed: 0,X1,Year,Player,Pos,Age,Tm,G,MP,PER,TS%,...,2PA,2P%,eFG%,FT,FTA,FT%,TRB,AST,PF,PTS
24096,24096.0,2017.0,Alex Abrines,SG,23.0,OKC,68.0,1055.0,10.1,0.56,...,94.0,0.426,0.531,44.0,49.0,0.898,86.0,40.0,114.0,406.0
24097,24097.0,2017.0,Quincy Acy,PF,26.0,TOT,38.0,558.0,11.8,0.565,...,80.0,0.413,0.521,45.0,60.0,0.75,115.0,18.0,67.0,222.0
24098,24098.0,2017.0,Quincy Acy,PF,26.0,DAL,6.0,48.0,-1.4,0.355,...,10.0,0.4,0.324,2.0,3.0,0.667,8.0,0.0,9.0,13.0
24099,24099.0,2017.0,Quincy Acy,PF,26.0,BRK,32.0,510.0,13.1,0.587,...,70.0,0.414,0.542,43.0,57.0,0.754,107.0,18.0,58.0,209.0
24100,24100.0,2017.0,Steven Adams,C,23.0,OKC,80.0,2389.0,16.5,0.589,...,654.0,0.572,0.571,157.0,257.0,0.611,615.0,86.0,195.0,905.0


In [49]:
df1_2017.loc[df1_2017['Player'].duplicated(),:].sort_values(by='Player')

Unnamed: 0,X1,Year,Player,Pos,Age,Tm,G,MP,PER,TS%,...,2PA,2P%,eFG%,FT,FTA,FT%,TRB,AST,PF,PTS
24155,24155.0,2017.0,Andrew Bogut,C,32.0,DAL,26.0,582.0,9.4,0.460,...,80.0,0.475,0.469,3.0,11.0,0.273,218.0,49.0,84.0,79.0
24156,24156.0,2017.0,Andrew Bogut,C,32.0,CLE,1.0,1.0,-35.3,,...,0.0,,,0.0,0.0,,0.0,0.0,2.0,0.0
24492,24492.0,2017.0,Andrew Nicholson,PF,27.0,BRK,10.0,111.0,5.0,0.430,...,23.0,0.478,0.412,2.0,2.0,1.000,27.0,3.0,18.0,30.0
24491,24491.0,2017.0,Andrew Nicholson,PF,27.0,WAS,28.0,231.0,6.3,0.425,...,61.0,0.443,0.409,7.0,12.0,0.583,34.0,7.0,37.0,70.0
24167,24167.0,2017.0,Anthony Brown,SF,24.0,NOP,9.0,143.0,5.9,0.415,...,17.0,0.471,0.415,0.0,0.0,,26.0,6.0,13.0,34.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24250,24250.0,2017.0,Tyreke Evans,SF,27.0,NOP,26.0,474.0,15.8,0.491,...,157.0,0.446,0.447,45.0,58.0,0.776,86.0,91.0,41.0,248.0
24581,24581.0,2017.0,Wayne Selden,SG,22.0,MEM,11.0,189.0,6.2,0.475,...,29.0,0.586,0.430,12.0,18.0,0.667,11.0,12.0,12.0,55.0
24580,24580.0,2017.0,Wayne Selden,SG,22.0,NOP,3.0,47.0,10.0,0.820,...,1.0,1.000,0.875,2.0,4.0,0.500,5.0,1.0,6.0,16.0
24261,24261.0,2017.0,Yogi Ferrell,PG,23.0,DAL,36.0,1046.0,14.1,0.541,...,196.0,0.418,0.499,64.0,73.0,0.877,99.0,155.0,79.0,408.0


# NBA_season1718_salary jugadores duplicados


In [50]:
df_duplis = df.loc[df['Player'].duplicated(),:].sort_values(by='Player')
df_duplis.head()

Unnamed: 0,X1,Player,Tm,season17_18
331,332.0,Anthony Tolliver,SAC,2000000.0
555,556.0,Antonius Cleveland,DAL,50000.0
377,378.0,Arron Afflalo,SAC,1500000.0
552,553.0,Briante Weber,HOU,50000.0
550,551.0,Briante Weber,LAL,50000.0


In [51]:
# Ejemplo:
df.loc[df['Player'] == 'Briante Weber']

Unnamed: 0,X1,Player,Tm,season17_18
527,528.0,Briante Weber,MEM,83129.0
550,551.0,Briante Weber,LAL,50000.0
552,553.0,Briante Weber,HOU,50000.0


In [52]:
df1_2017.loc[df1_2017['Player'] == 'Briante Weber']

Unnamed: 0,X1,Year,Player,Pos,Age,Tm,G,MP,PER,TS%,...,2PA,2P%,eFG%,FT,FTA,FT%,TRB,AST,PF,PTS
24650,24650.0,2017.0,Briante Weber,PG,24.0,TOT,20.0,205.0,11.0,0.462,...,50.0,0.48,0.425,11.0,16.0,0.688,26.0,21.0,16.0,62.0
24651,24651.0,2017.0,Briante Weber,PG,24.0,GSW,7.0,46.0,5.9,0.392,...,11.0,0.455,0.357,2.0,3.0,0.667,4.0,5.0,4.0,12.0
24652,24652.0,2017.0,Briante Weber,PG,24.0,CHO,13.0,159.0,12.5,0.483,...,39.0,0.487,0.446,9.0,13.0,0.692,22.0,16.0,12.0,50.0


# Joins

In [53]:
# ¿Qué columnas tenemos? 
print(df.columns)
print(df1.columns)

Index(['X1', 'Player', 'Tm', 'season17_18'], dtype='object')
Index(['X1', 'Year', 'Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'PER',
       'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%',
       'TOV%', 'USG%', 'blanl', 'OWS', 'DWS', 'WS', 'WS/48', 'blank2', 'OBPM',
       'DBPM', 'BPM', 'VORP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P',
       '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST',
       'STL', 'BLK', 'TOV', 'PF', 'PTS'],
      dtype='object')


In [54]:
# ¿Qué columnas coinciden en ambos datasets?
lista = df.columns
lista1 = df1.columns
list(set(lista1).intersection(lista))

['X1', 'Tm', 'Player']

In [55]:
# Mergeo 
df_merge = pd.merge(df1, df, 
                    on =['Player'],
                    how= 'inner')
df_merge.head()

Unnamed: 0,X1_x,Year,Player,Pos,Age,Tm_x,G,GS,MP,PER,...,TRB,AST,STL,BLK,TOV,PF,PTS,X1_y,Tm_y,season17_18
0,4738.0,1977.0,Mike Dunleavy,PG,22.0,PHI,32.0,,359.0,11.9,...,34.0,56.0,,,,64.0,154.0,352.0,ATL,1662500.0
1,5108.0,1978.0,Mike Dunleavy,PG,23.0,TOT,15.0,,119.0,15.9,...,10.0,28.0,,1.0,,12.0,53.0,352.0,ATL,1662500.0
2,5109.0,1978.0,Mike Dunleavy,PG,23.0,PHI,4.0,,17.0,17.4,...,1.0,6.0,1.0,0.0,,0.0,8.0,352.0,ATL,1662500.0
3,5110.0,1978.0,Mike Dunleavy,PG,23.0,HOU,11.0,,102.0,15.7,...,9.0,22.0,,1.0,,12.0,45.0,352.0,ATL,1662500.0
4,5471.0,1979.0,Mike Dunleavy,SG,24.0,HOU,74.0,,1486.0,14.9,...,128.0,324.0,,,,168.0,589.0,352.0,ATL,1662500.0


In [56]:
# Me traigo solo los campos que me interesan:
df_merge = df_merge[['Player','Year','PTS','season17_18']]
df_merge.head()

Unnamed: 0,Player,Year,PTS,season17_18
0,Mike Dunleavy,1977.0,154.0,1662500.0
1,Mike Dunleavy,1978.0,53.0,1662500.0
2,Mike Dunleavy,1978.0,8.0,1662500.0
3,Mike Dunleavy,1978.0,45.0,1662500.0
4,Mike Dunleavy,1979.0,589.0,1662500.0


In [57]:
agrupacion = df_merge.groupby(['Player','Year']).agg(      
    Salario_Total = ('season17_18', 'sum'),
    Cant_Puntos = ('PTS', 'sum')
).reset_index()
agrupacion

Unnamed: 0,Player,Year,Salario_Total,Cant_Puntos
0,A.J. Hammons,2017.0,1312611.0,48.0
1,Aaron Brooks,2008.0,2116955.0,264.0
2,Aaron Brooks,2009.0,2116955.0,894.0
3,Aaron Brooks,2010.0,2116955.0,1604.0
4,Aaron Brooks,2011.0,6350865.0,1268.0
...,...,...,...,...
2584,Zaza Pachulia,2013.0,3477600.0,306.0
2585,Zaza Pachulia,2014.0,3477600.0,408.0
2586,Zaza Pachulia,2015.0,3477600.0,606.0
2587,Zaza Pachulia,2016.0,3477600.0,650.0
