In [72]:
import pandas as pd
import numpy as np
import pprint

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import *
import statsmodels.api as sm

from xgboost import XGBClassifier
import xgboost as xgb

import seaborn as sns
import matplotlib.pyplot as plt
plt.rc("font", size=14)

pd.set_option("display.max_columns", 120)
pd.set_option("display.max_rows", 120)

In [73]:
df1 = pd.read_csv('NBA Player Stats(1950 - 2022).csv', header=0)
df2 = pd.read_csv('NBA Salaries(1990-2023).csv', header=0)
print(f'Rows : {df1.shape[0]}')
print(f'Columns : {df1.shape[1]}')
print(f'Rows : {df2.shape[0]}')
print(f'Columns : {df2.shape[1]}')

pprint.pprint(df2.dtypes)


Rows : 28237
Columns : 32
Rows : 15857
Columns : 5
Unnamed: 0             int64
playerName            object
seasonStartYear        int64
salary                object
inflationAdjSalary    object
dtype: object


In [74]:
df3 = df1[df1['Season'] > 1989]
df3 = df3.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'])
df2 = df2.drop(columns=['Unnamed: 0'])
df2 = df2.rename(columns={"playerName": "Player", "seasonStartYear": "Season"})
print(df2)
print(df3)

                  Player  Season      salary inflationAdjSalary
0          Patrick Ewing    1990  $4,250,000         $9,694,547
1       Hot Rod Williams    1990  $3,785,000         $8,633,850
2        Hakeem Olajuwon    1990  $3,175,000         $7,242,397
3        Charles Barkley    1990  $2,900,000         $6,615,103
4           Chris Mullin    1990  $2,850,000         $6,501,049
...                  ...     ...         ...                ...
15852    Jaime Echenique    2021     $53,176            $57,993
15853       Luca Vildoza    2021     $42,789            $46,665
15854     Zavier Simpson    2021     $37,223            $40,595
15855  Mfiondu Kabengele    2021     $19,186            $20,924
15856     Melvin Frazier    2021     $13,294            $14,498

[15857 rows x 4 columns]
       Season          Player Pos   Age   Tm     G    GS      MP     FG  \
9580     1990      Mark Acres   C  27.0  ORL  80.0  50.0  1691.0  138.0   
9581     1990   Michael Adams  PG  27.0  DEN  79.0  74.0

In [75]:
df = pd.merge(df2, df3, on=['Player', 'Season'])
df['Isalary'] = df['inflationAdjSalary'].replace('[\$,]', '', regex=True).astype(int)
df

Unnamed: 0,Player,Season,salary,inflationAdjSalary,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Isalary
0,Patrick Ewing,1990,"$4,250,000","$9,694,547",C,27.0,NYK,82.0,82.0,3165.0,922.0,1673.0,0.551,1.0,4.0,0.250,921.0,1669.0,0.552,0.551,502.0,648.0,0.775,235.0,658.0,893.0,182.0,78.0,327.0,278.0,325.0,2347.0,9694547
1,Hot Rod Williams,1990,"$3,785,000","$8,633,850",PF,27.0,CLE,82.0,29.0,2776.0,528.0,1070.0,0.493,0.0,0.0,,528.0,1070.0,0.493,0.493,325.0,440.0,0.739,220.0,443.0,663.0,168.0,86.0,167.0,143.0,214.0,1381.0,8633850
2,Hakeem Olajuwon,1990,"$3,175,000","$7,242,397",C,27.0,HOU,82.0,82.0,3124.0,806.0,1609.0,0.501,1.0,6.0,0.167,805.0,1603.0,0.502,0.501,382.0,536.0,0.713,299.0,850.0,1149.0,234.0,174.0,376.0,316.0,314.0,1995.0,7242397
3,Charles Barkley,1990,"$2,900,000","$6,615,103",SF,26.0,PHI,79.0,79.0,3085.0,706.0,1177.0,0.600,20.0,92.0,0.217,686.0,1085.0,0.632,0.608,557.0,744.0,0.749,361.0,548.0,909.0,307.0,148.0,50.0,243.0,250.0,1989.0,6615103
4,Chris Mullin,1990,"$2,850,000","$6,501,049",SF,26.0,GSW,78.0,78.0,2830.0,682.0,1272.0,0.536,87.0,234.0,0.372,595.0,1038.0,0.573,0.570,505.0,568.0,0.889,130.0,333.0,463.0,319.0,123.0,45.0,239.0,142.0,1956.0,6501049
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13932,Mfiondu Kabengele,2021,"$19,186","$20,924",PF,23.0,LAC,23.0,0.0,94.0,9.0,32.0,0.281,4.0,18.0,0.222,5.0,14.0,0.357,0.344,5.0,6.0,0.833,1.0,13.0,14.0,5.0,2.0,3.0,6.0,18.0,27.0,20924
13933,Mfiondu Kabengele,2021,"$19,186","$20,924",PF,23.0,CLE,16.0,0.0,186.0,24.0,57.0,0.421,9.0,32.0,0.281,15.0,25.0,0.600,0.500,11.0,14.0,0.786,11.0,35.0,46.0,12.0,6.0,10.0,9.0,18.0,68.0,20924
13934,Mfiondu Kabengele,2021,"$19,186","$20,924",PF,23.0,TOT,39.0,0.0,280.0,33.0,89.0,0.371,13.0,50.0,0.260,20.0,39.0,0.513,0.444,16.0,20.0,0.800,12.0,48.0,60.0,17.0,8.0,13.0,15.0,36.0,95.0,20924
13935,Mfiondu Kabengele,2021,"$19,186","$20,924",PF,23.0,LAC,23.0,0.0,94.0,9.0,32.0,0.281,4.0,18.0,0.222,5.0,14.0,0.357,0.344,5.0,6.0,0.833,1.0,13.0,14.0,5.0,2.0,3.0,6.0,18.0,27.0,20924


In [76]:
print(df.isnull().sum())

Player                   0
Season                   0
salary                   0
inflationAdjSalary       0
Pos                      0
Age                      0
Tm                       0
G                        0
GS                       0
MP                       0
FG                       0
FGA                      0
FG%                     35
3P                       0
3PA                      0
3P%                   1842
2P                       0
2PA                      0
2P%                     49
eFG%                    35
FT                       0
FTA                      0
FT%                    328
ORB                      0
DRB                      0
TRB                      0
AST                      0
STL                      0
BLK                      0
TOV                      0
PF                       0
PTS                      0
Isalary                  0
dtype: int64


In [77]:
df['FG%'].fillna(0,inplace=True)
df['3P%'].fillna(0,inplace=True)
df['2P%'].fillna(0,inplace=True)
df['eFG%'].fillna(0,inplace=True)
df['FT%'].fillna(0,inplace=True)

In [78]:
df.corr()

  df.corr()


Unnamed: 0,Season,Age,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Isalary
Season,1.0,-0.104368,-0.155916,-0.089876,-0.137449,-0.10657,-0.104176,0.00454,0.207663,0.210646,0.185735,-0.18221,-0.207537,0.142508,0.167357,-0.141442,-0.155147,0.02521,-0.200549,-0.051194,-0.100901,-0.091752,-0.153068,-0.084131,-0.16894,-0.218856,-0.090202,0.233951
Age,-0.104368,1.0,0.052855,0.066988,0.071367,0.022957,0.023725,0.007016,0.058906,0.04757,0.031523,0.008051,0.009406,-0.01656,0.035833,0.007045,-0.007673,0.102843,0.000618,0.066511,0.04773,0.071845,0.038136,-0.009724,0.008796,0.042708,0.024418,0.190271
G,-0.155916,0.052855,1.0,0.619871,0.850933,0.714005,0.718734,0.260121,0.41119,0.427557,0.169304,0.678147,0.686558,0.217779,0.272134,0.59402,0.609596,0.310559,0.583884,0.684544,0.676459,0.537765,0.691496,0.443833,0.70273,0.849496,0.703222,0.216461
GS,-0.089876,0.066988,0.619871,1.0,0.866626,0.817582,0.811068,0.217812,0.427756,0.442645,0.133658,0.789492,0.790545,0.179771,0.208128,0.717742,0.727206,0.208933,0.583079,0.747943,0.721509,0.639677,0.72782,0.477833,0.788565,0.723234,0.809442,0.460475
MP,-0.137449,0.071367,0.850933,0.866626,1.0,0.921075,0.925617,0.237674,0.532917,0.553415,0.215988,0.874071,0.883075,0.203822,0.256219,0.807772,0.814423,0.312103,0.631369,0.810313,0.781568,0.719041,0.842431,0.487922,0.887389,0.859271,0.916777,0.422203
FG,-0.10657,0.022957,0.714005,0.817582,0.921075,1.0,0.989073,0.253096,0.520663,0.540123,0.212582,0.966405,0.963915,0.217292,0.252874,0.890967,0.89003,0.302654,0.583465,0.775028,0.740989,0.689958,0.780182,0.462721,0.89729,0.752467,0.992748,0.503889
FGA,-0.104176,0.023725,0.718734,0.811068,0.925617,0.989073,1.0,0.182505,0.584058,0.609625,0.252632,0.935048,0.949388,0.161004,0.206833,0.880922,0.871187,0.327734,0.523664,0.735699,0.693705,0.716994,0.798273,0.407396,0.902434,0.737728,0.988681,0.494368
FG%,0.00454,0.007016,0.260121,0.217812,0.237674,0.253096,0.182505,1.0,-0.062665,-0.079922,-0.069817,0.30313,0.249023,0.883746,0.902298,0.21151,0.244383,0.084284,0.378622,0.329997,0.357255,0.069225,0.144292,0.315372,0.196262,0.307096,0.22882,0.142207
3P,0.207663,0.058906,0.41119,0.427756,0.532917,0.520663,0.584058,-0.062665,1.0,0.990947,0.507185,0.283735,0.302922,0.048068,0.225094,0.391754,0.335521,0.335596,-0.079498,0.240753,0.14656,0.507878,0.495564,-0.046701,0.447401,0.283202,0.572876,0.361142
3PA,0.210646,0.04757,0.427557,0.442645,0.553415,0.540123,0.609625,-0.079922,0.990947,1.0,0.498997,0.308317,0.329783,0.047672,0.200319,0.416462,0.361774,0.339677,-0.070134,0.25462,0.159473,0.537803,0.527133,-0.041635,0.47895,0.298184,0.591956,0.36922


In [79]:
df[['GS', 'FG', 'FGA', '3PA', '2P', 'FT', 'DRB', 'PTS', 'TOV']].corr()['Isalary'].sort_values(ascending=False)

KeyError: 'Isalary'