In [2]:
import numpy as np
import pandas as pd
import unicodedata

In [7]:
# reading in previous datasets
df = pd.read_csv("player_mvp_stats.csv")
mvp_df = pd.read_csv("mvp_votes_1980_1999.csv")
per_game_df = pd.read_csv("per_game_stats_1980_1999.csv")
advanced_stats_df = pd.read_csv("advanced_stats_1980_1999.csv")
team_records = pd.read_csv("records_1980_1999_csv")

# Cleaning the data

In [8]:
mvp_df = mvp_df[["Player", "Year", "Pts Won", "Pts Max", "Share"]]
mvp_df

Unnamed: 0,Player,Year,Pts Won,Pts Max,Share
0,Kareem Abdul-Jabbar,1980,147.0,221,0.665
1,Julius Erving,1980,31.5,221,0.143
2,George Gervin,1980,19.0,221,0.086
3,Larry Bird,1980,15.0,221,0.068
4,Tiny Archibald,1980,2.0,221,0.009
...,...,...,...,...,...
366,Anfernee Hardaway,1999,1.0,1180,0.001
367,Mark Jackson,1999,1.0,1180,0.001
368,Glenn Robinson,1999,1.0,1180,0.001
369,Steve Smith,1999,1.0,1180,0.001


In [12]:
per_game_df = per_game_df.drop(["Unnamed: 0", "Rk"], axis=1)
per_game_df["Player"] = per_game_df["Player"].str.replace("*","",regex=False)

In [15]:
# function to remove duplicate rows in the same year 
# because of players playing for multiple teams in the same year
# if that's the case
def single_row(group):
    if group.shape[0]==1:
        return group
    else:
        row = group[group["Tm"] == "TOT"]
        row["Tm"] = group.iloc[-1,:]["Tm"]
        return row


per_game_df = per_game_df.groupby(["Player", "Year"]).apply(single_row)

In [16]:
# run this twice to get rid of the player and year indexing caused by the groupby
per_game_df.index = per_game_df.index.droplevel()
per_game_df.index = per_game_df.index.droplevel()

In [17]:
# Looking at null values in dataset
per_game_df.isna().sum()

Player       0
Pos          0
Age          0
Tm           0
G            0
GS         543
MP           0
FG           0
FGA          0
FG%         20
3P           0
3PA          0
3P%       1271
2P           0
2PA          0
2P%         24
eFG%        20
FT           0
FTA          0
FT%        162
ORB          0
DRB          0
TRB          0
AST          0
STL          0
BLK          0
TOV          0
PF           0
PTS          0
Year         0
dtype: int64

In [21]:
per_game_df.head(50)

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year
2302,A.C. Green,PF,22,LAL,82,1.0,18.8,2.5,4.7,0.539,...,2.0,2.7,4.6,0.7,0.6,0.6,1.2,2.8,6.4,1986
2686,A.C. Green,PF,23,LAL,79,72.0,28.4,4.0,7.4,0.538,...,2.7,5.1,7.8,1.1,0.9,1.0,1.3,2.2,10.8,1987
3072,A.C. Green,PF,24,LAL,82,64.0,32.1,3.9,7.8,0.503,...,3.0,5.7,8.7,1.1,1.1,0.5,1.5,2.5,11.4,1988
3506,A.C. Green,PF,25,LAL,82,82.0,30.6,4.9,9.2,0.529,...,3.1,5.9,9.0,1.3,1.1,0.7,1.5,2.1,13.3,1989
3951,A.C. Green,PF,26,LAL,82,82.0,33.0,4.7,9.8,0.478,...,3.2,5.5,8.7,1.1,0.8,0.6,1.4,2.5,12.9,1990
4428,A.C. Green,PF,27,LAL,82,21.0,26.4,3.1,6.6,0.476,...,2.5,3.8,6.3,0.9,0.7,0.3,1.2,1.4,9.1,1991
4878,A.C. Green,PF,28,LAL,82,53.0,35.4,4.7,9.8,0.476,...,3.7,5.6,9.3,1.4,1.1,0.4,1.4,1.7,13.6,1992
5320,A.C. Green,PF,29,LAL,82,55.0,34.4,4.6,8.6,0.537,...,3.5,5.2,8.7,1.4,1.1,0.5,1.4,1.8,12.8,1993
5788,A.C. Green,PF,30,PHO,82,55.0,34.5,5.7,11.3,0.502,...,3.4,5.8,9.2,1.7,0.9,0.5,1.2,1.7,14.7,1994
6257,A.C. Green,SF,31,PHO,82,52.0,32.8,3.8,7.5,0.504,...,2.4,5.8,8.2,1.5,0.7,0.4,1.4,1.8,11.2,1995


In [27]:
# Filling in all null values for per game dataframe since they relate to percentages and those null values
# represent players who did not take those shots
per_game_df[per_game_df['FG%'].isna()]
per_game_df['FG%'] = per_game_df['FG%'].fillna(0)
per_game_df['3P%'] = per_game_df['3P%'].fillna(0)
per_game_df['2P%'] = per_game_df['2P%'].fillna(0)
per_game_df['eFG%'] = per_game_df['eFG%'].fillna(0)
per_game_df["GS"] = df.groupby("Year")["GS"].transform(lambda x: x.fillna(x.mean()))

In [29]:
advanced_stats_df.columns

Index(['Unnamed: 0', 'Rk', 'Player', 'Pos', 'Age', 'Tm', 'G', 'MP', 'PER',
       'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%',
       'TOV%', 'USG%', 'Unnamed: 19', 'OWS', 'DWS', 'WS', 'WS/48',
       'Unnamed: 24', 'OBPM', 'DBPM', 'BPM', 'VORP', 'Year'],
      dtype='object')

In [30]:
# Dropping weird columns
advanced_stats_df = advanced_stats_df.drop(['Unnamed: 0', 'Rk', 'Unnamed: 19', 'Unnamed: 24'], axis=1)

In [31]:
advanced_stats_df.isna().sum()

Player     0
Pos        0
Age        0
Tm         0
G          0
MP         0
PER        0
TS%       26
3PAr      35
FTr       35
ORB%       0
DRB%       0
TRB%       0
AST%       0
STL%       0
BLK%       0
TOV%      21
USG%       0
OWS        0
DWS        0
WS         0
WS/48      0
OBPM       0
DBPM       0
BPM        0
VORP       0
Year       0
dtype: int64

In [32]:
# first line drops the 5 rows which are identical. represents players 
# fill na values with 0 since it references players who barely played
advanced_stats_df = advanced_stats_df.dropna(subset=['PER', 'WS/48'])
advanced_stats_df['TS%'] = advanced_stats_df['TS%'].fillna(0)
advanced_stats_df['3PAr'] = advanced_stats_df['3PAr'].fillna(0)
advanced_stats_df['FTr'] = advanced_stats_df['FTr'].fillna(0)
advanced_stats_df['TOV%'] = advanced_stats_df['TOV%'].fillna(0)

In [33]:
advanced_stats_df = advanced_stats_df.groupby(["Player", "Year"]).apply(single_row)

In [34]:
advanced_stats_df.index = advanced_stats_df.index.droplevel()
advanced_stats_df.index = advanced_stats_df.index.droplevel()

In [35]:
# Fixing asterisk problem in this dataset as well
advanced_stats_df["Player"] = advanced_stats_df["Player"].str.replace("*","",regex=False)

In [44]:
# Merge advanced stats with MVP's to get the advanced stats necessary
advanced_mvp = advanced_stats_df.merge(mvp_df, how="outer", on=["Player", "Year"])
advanced_mvp

Unnamed: 0,Player,Pos,Age,Tm,G,MP,PER,TS%,3PAr,FTr,...,WS,WS/48,OBPM,DBPM,BPM,VORP,Year,Pts Won,Pts Max,Share
0,A.C. Green,PF,22.0,LAL,82.0,1542.0,11.8,0.564,0.015,0.430,...,3.3,0.103,-1.6,0.1,-1.6,0.2,1986,,,
1,A.C. Green,PF,23.0,LAL,79.0,2240.0,15.7,0.599,0.009,0.480,...,7.6,0.163,0.9,0.1,1.0,1.7,1987,,,
2,A.C. Green,PF,24.0,LAL,82.0,2636.0,14.5,0.581,0.003,0.592,...,7.9,0.144,0.4,-0.1,0.3,1.5,1988,,,
3,A.C. Green,PF,25.0,LAL,82.0,2510.0,17.8,0.594,0.022,0.474,...,9.4,0.179,1.8,-0.2,1.6,2.3,1989,,,
4,A.C. Green,PF,26.0,LAL,82.0,2709.0,14.7,0.548,0.057,0.459,...,7.7,0.137,0.3,-0.9,-0.6,1.0,1990,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7260,Žan Tabak,C,25.0,TOR,67.0,1332.0,12.4,0.554,0.002,0.275,...,1.1,0.038,-3.0,-1.5,-4.6,-0.9,1996,,,
7261,Žan Tabak,C,26.0,TOR,13.0,218.0,12.5,0.501,0.000,0.408,...,0.1,0.032,-3.6,0.0,-3.5,-0.1,1997,,,
7262,Žan Tabak,C,27.0,BOS,57.0,984.0,9.7,0.464,0.003,0.201,...,0.0,-0.001,-4.6,-1.3,-6.0,-1.0,1998,,,
7263,Žarko Paspalj,SF,23.0,SAS,28.0,181.0,4.3,0.406,0.013,0.278,...,-0.3,-0.072,-8.2,-2.2,-10.4,-0.4,1990,,,


In [45]:
# Deleting duplicate columns
advanced_mvp = advanced_mvp.drop(columns=["Pos", "Age", "Tm", "G", "MP"])

In [49]:
# Dropping the player who has null values for everything
advanced_mvp = advanced_mvp.dropna(subset=['PER', 'WS/48'])

In [50]:
# Now need to merge advanced and mvp's with per game
combined = per_game_df.merge(advanced_mvp, how="outer", on=["Player", "Year"])
combined

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,Pts Won,Pts Max,Share
0,A.C. Green,PF,22,LAL,82,24,18.8,2.5,4.7,0.539,...,2.0,3.3,0.103,-1.6,0.1,-1.6,0.2,,,
1,A.C. Green,PF,23,LAL,79,27,28.4,4.0,7.4,0.538,...,3.3,7.6,0.163,0.9,0.1,1.0,1.7,,,
2,A.C. Green,PF,24,LAL,82,0,32.1,3.9,7.8,0.503,...,3.4,7.9,0.144,0.4,-0.1,0.3,1.5,,,
3,A.C. Green,PF,25,LAL,82,3,30.6,4.9,9.2,0.529,...,3.5,9.4,0.179,1.8,-0.2,1.6,2.3,,,
4,A.C. Green,PF,26,LAL,82,0,33.0,4.7,9.8,0.478,...,3.3,7.7,0.137,0.3,-0.9,-0.6,1.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7259,Žan Tabak,C,24,HOU,37,11,4.9,0.6,1.4,0.453,...,0.2,0.2,0.043,-6.3,-2.1,-8.4,-0.3,,,
7260,Žan Tabak,C,25,TOR,67,23,19.9,3.4,6.2,0.543,...,0.6,1.1,0.038,-3.0,-1.5,-4.6,-0.9,,,
7261,Žan Tabak,C,26,TOR,13,3,16.8,2.5,5.5,0.451,...,0.2,0.1,0.032,-3.6,0.0,-3.5,-0.1,,,
7262,Žan Tabak,C,27,BOS,57,1,17.3,2.5,5.3,0.467,...,0.4,0.0,-0.001,-4.6,-1.3,-6.0,-1.0,,,


In [51]:
# Dropping rows where there are no advanced stats (probably irrelevant for MVP)
combined = combined.dropna(subset=['PER', 'WS/48'])

In [52]:
# Filling in 0's for those who did not get any MVP Pts Won
combined[["Pts Won", "Pts Max", "Share"]] = combined[["Pts Won", "Pts Max", "Share"]].fillna(0)

In [53]:
combined

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,Pts Won,Pts Max,Share
0,A.C. Green,PF,22,LAL,82,24,18.8,2.5,4.7,0.539,...,2.0,3.3,0.103,-1.6,0.1,-1.6,0.2,0.0,0.0,0.0
1,A.C. Green,PF,23,LAL,79,27,28.4,4.0,7.4,0.538,...,3.3,7.6,0.163,0.9,0.1,1.0,1.7,0.0,0.0,0.0
2,A.C. Green,PF,24,LAL,82,0,32.1,3.9,7.8,0.503,...,3.4,7.9,0.144,0.4,-0.1,0.3,1.5,0.0,0.0,0.0
3,A.C. Green,PF,25,LAL,82,3,30.6,4.9,9.2,0.529,...,3.5,9.4,0.179,1.8,-0.2,1.6,2.3,0.0,0.0,0.0
4,A.C. Green,PF,26,LAL,82,0,33.0,4.7,9.8,0.478,...,3.3,7.7,0.137,0.3,-0.9,-0.6,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7259,Žan Tabak,C,24,HOU,37,11,4.9,0.6,1.4,0.453,...,0.2,0.2,0.043,-6.3,-2.1,-8.4,-0.3,0.0,0.0,0.0
7260,Žan Tabak,C,25,TOR,67,23,19.9,3.4,6.2,0.543,...,0.6,1.1,0.038,-3.0,-1.5,-4.6,-0.9,0.0,0.0,0.0
7261,Žan Tabak,C,26,TOR,13,3,16.8,2.5,5.5,0.451,...,0.2,0.1,0.032,-3.6,0.0,-3.5,-0.1,0.0,0.0,0.0
7262,Žan Tabak,C,27,BOS,57,1,17.3,2.5,5.3,0.467,...,0.4,0.0,-0.001,-4.6,-1.3,-6.0,-1.0,0.0,0.0,0.0


In [55]:
# Cleaning team data
team_records = team_records.drop(["Unnamed: 0"], axis=1)

In [56]:
team_records["Team"] = team_records["Team"].str.replace("*","",regex=False)
team_records["Team"] = team_records["Team"].str.replace("\((\d+)\)","",regex=True)

In [57]:
team_records["Team"].unique()

array(['Boston Celtics', 'Philadelphia 76ers', 'Washington Bullets',
       'New York Knicks', 'New Jersey Nets', 'Atlanta Hawks',
       'Houston Rockets', 'San Antonio Spurs', 'Indiana Pacers',
       'Cleveland Cavaliers', 'Detroit Pistons', 'Milwaukee Bucks',
       'Kansas City Kings', 'Denver Nuggets', 'Chicago Bulls',
       'Utah Jazz', 'Los Angeles Lakers', 'Seattle SuperSonics',
       'Phoenix Suns', 'Portland Trail Blazers', 'San Diego Clippers',
       'Golden State Warriors', 'Dallas Mavericks',
       'Los Angeles Clippers', 'Sacramento Kings', 'Charlotte Hornets',
       'Miami Heat', 'Orlando Magic', 'Minnesota Timberwolves',
       'Toronto Raptors', 'Vancouver Grizzlies', 'Washington Wizards'],
      dtype=object)

In [58]:
# isolating series and replacing the xa0
xa0_version = team_records["Team"].to_numpy()
new_array = []
for team in xa0_version:
    new_team = team.replace(u'\xa0', "")
    new_array.append(new_team)
# THIS NEW ARRAY IS THE CORRECT TEAM NAMES
#team_records_2023["Team"] = new_array
#team_records_2023
team_records["Team"] = new_array

In [59]:
# Sanity check to make sure the xa0 was removed
team_records["Team"].unique()

array(['Boston Celtics', 'Philadelphia 76ers', 'Washington Bullets',
       'New York Knicks', 'New Jersey Nets', 'Atlanta Hawks',
       'Houston Rockets', 'San Antonio Spurs', 'Indiana Pacers',
       'Cleveland Cavaliers', 'Detroit Pistons', 'Milwaukee Bucks',
       'Kansas City Kings', 'Denver Nuggets', 'Chicago Bulls',
       'Utah Jazz', 'Los Angeles Lakers', 'Seattle SuperSonics',
       'Phoenix Suns', 'Portland Trail Blazers', 'San Diego Clippers',
       'Golden State Warriors', 'Dallas Mavericks',
       'Los Angeles Clippers', 'Sacramento Kings', 'Charlotte Hornets',
       'Miami Heat', 'Orlando Magic', 'Minnesota Timberwolves',
       'Toronto Raptors', 'Vancouver Grizzlies', 'Washington Wizards'],
      dtype=object)

In [60]:
combined["Tm"].unique()

array(['LAL', 'PHO', 'DAL', 'WSB', 'BOS', 'POR', 'DET', 'PHI', 'UTA',
       'MIL', 'VAN', 'SEA', 'TOR', 'ATL', 'GSW', 'DEN', 'NJN', 'MIN',
       'HOU', 'IND', 'SDC', 'MIA', 'SAS', 'NYK', 'CHH', 'ORL', 'SAC',
       'CLE', 'CHI', 'LAC', 'WAS', 'KCK'], dtype=object)

In [80]:
nickname_df = pd.read_csv("nicknames.csv")
new_nicknames = {"Abbreviation": ['SDC', 'KCK'],
                 "Name": ['San Diego Clippers', 'Kansas City Kings']}
nickname_df.loc[len(nickname_df.index)] = ['SDC', 'San Diego Clippers']
nickname_df.loc[len(nickname_df.index)] = ['KCK', 'Kansas City Kings']
nickname_df

Unnamed: 0,Abbreviation,Name
0,ATL,Atlanta Hawks
1,BRK,Brooklyn Nets
2,BKN,Brooklyn Nets
3,BOS,Boston Celtics
4,CHA,Charlotte Bobcats
5,CHH,Charlotte Hornets
6,CHO,Charlotte Hornets
7,CHI,Chicago Bulls
8,CLE,Cleveland Cavaliers
9,DAL,Dallas Mavericks


In [81]:
nickname_df.to_csv("updated_nicknames.csv")

In [83]:
nicknames = {}

with open("updated_nicknames.csv") as f:
    lines = f.readlines()
    print(lines)
    for line in lines[1:]:
        index, prefix, name = line.replace("\n", "").split(",")
        nicknames[prefix] = name

[',Abbreviation,Name\n', '0,ATL,Atlanta Hawks\n', '1,BRK,Brooklyn Nets\n', '2,BKN,Brooklyn Nets\n', '3,BOS,Boston Celtics\n', '4,CHA,Charlotte Bobcats\n', '5,CHH,Charlotte Hornets\n', '6,CHO,Charlotte Hornets\n', '7,CHI,Chicago Bulls\n', '8,CLE,Cleveland Cavaliers\n', '9,DAL,Dallas Mavericks\n', '10,DEN,Denver Nuggets\n', '11,DET,Detroit Pistons\n', '12,GSW,Golden State Warriors\n', '13,HOU,Houston Rockets\n', '14,IND,Indiana Pacers\n', '15,LAC,Los Angeles Clippers\n', '16,LAL,Los Angeles Lakers\n', '17,MEM,Memphis Grizzlies\n', '18,MIA,Miami Heat\n', '19,MIL,Milwaukee Bucks\n', '20,MIN,Minnesota Timberwolves\n', '21,NJN,New Jersey Nets\n', '22,NOH,New Orleans Hornets\n', '23,NOP,New Orleans Pelicans\n', '24,NOK,New Orleans/Oklahoma City Hornets\n', '25,NYK,New York Knicks\n', '26,OKC,Oklahoma City Thunder\n', '27,ORL,Orlando Magic\n', '28,PHI,Philadelphia 76ers\n', '29,PHX,Phoenix Suns\n', '30,PHO,Phoenix Suns\n', '31,POR,Portland Trail Blazers\n', '32,SEA,Seattle SuperSonics\n', '33,

In [84]:
nicknames

{'ATL': 'Atlanta Hawks',
 'BRK': 'Brooklyn Nets',
 'BKN': 'Brooklyn Nets',
 'BOS': 'Boston Celtics',
 'CHA': 'Charlotte Bobcats',
 'CHH': 'Charlotte Hornets',
 'CHO': 'Charlotte Hornets',
 'CHI': 'Chicago Bulls',
 'CLE': 'Cleveland Cavaliers',
 'DAL': 'Dallas Mavericks',
 'DEN': 'Denver Nuggets',
 'DET': 'Detroit Pistons',
 'GSW': 'Golden State Warriors',
 'HOU': 'Houston Rockets',
 'IND': 'Indiana Pacers',
 'LAC': 'Los Angeles Clippers',
 'LAL': 'Los Angeles Lakers',
 'MEM': 'Memphis Grizzlies',
 'MIA': 'Miami Heat',
 'MIL': 'Milwaukee Bucks',
 'MIN': 'Minnesota Timberwolves',
 'NJN': 'New Jersey Nets',
 'NOH': 'New Orleans Hornets',
 'NOP': 'New Orleans Pelicans',
 'NOK': 'New Orleans/Oklahoma City Hornets',
 'NYK': 'New York Knicks',
 'OKC': 'Oklahoma City Thunder',
 'ORL': 'Orlando Magic',
 'PHI': 'Philadelphia 76ers',
 'PHX': 'Phoenix Suns',
 'PHO': 'Phoenix Suns',
 'POR': 'Portland Trail Blazers',
 'SEA': 'Seattle SuperSonics',
 'SAC': 'Sacramento Kings',
 'SAS': 'San Antonio Spu

In [85]:
combined["Team"] = combined["Tm"].map(nicknames)

In [86]:
stats = combined.merge(team_records, how="outer", on=["Team", "Year"])

In [88]:
stats[stats["Year"] == 1980]

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,Pts Max,Share,Team,W,L,W/L%,GB,PS/G,PA/G,SRS
394,Abdul Jeelani,SF,25,POR,77,62,16.7,3.7,7.3,0.510,...,0.0,0.0,Portland Trail Blazers,38,44,0.463,22.0,102.5,103.3,-0.87
395,Billy Ray Bates,SG,23,POR,16,1,14.7,4.5,9.1,0.493,...,0.0,0.0,Portland Trail Blazers,38,44,0.463,22.0,102.5,103.3,-0.87
396,Bob Gross,SF,26,POR,62,0,25.5,3.6,7.6,0.468,...,0.0,0.0,Portland Trail Blazers,38,44,0.463,22.0,102.5,103.3,-0.87
397,Calvin Natt,SF,23,POR,78,74,36.6,8.0,16.6,0.479,...,0.0,0.0,Portland Trail Blazers,38,44,0.463,22.0,102.5,103.3,-0.87
398,Dave Twardzik,PG,29,POR,67,20,23.8,2.7,5.9,0.464,...,0.0,0.0,Portland Trail Blazers,38,44,0.463,22.0,102.5,103.3,-0.87
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7259,Mike Gale,PG,29,SAS,67,79,22.0,2.6,5.6,0.454,...,0.0,0.0,San Antonio Spurs,41,41,0.500,9.0,119.4,119.7,-0.24
7260,Paul Griffin,C,26,SAS,82,0,22.1,2.1,3.8,0.553,...,0.0,0.0,San Antonio Spurs,41,41,0.500,9.0,119.4,119.7,-0.24
7261,Sylvester Norris,C,22,SAS,17,71,11.1,1.1,2.5,0.419,...,0.0,0.0,San Antonio Spurs,41,41,0.500,9.0,119.4,119.7,-0.24
7262,Tim Bassett,PF,28,SAS,12,79,13.7,1.0,2.8,0.353,...,0.0,0.0,San Antonio Spurs,41,41,0.500,9.0,119.4,119.7,-0.24


In [89]:
stats["GB"] = stats["GB"].str.replace('—', "0.0")
stats["GB"].unique()

array(['0.0', '5.0', '14.0', '23.0', '7.0', '40.0', '42.0', '18.0',
       '26.0', '15.0', '20.0', '24.0', '50.0', '10.0', '22.0', '37.0',
       '25.0', '51.0', '36.0', '2.0', '4.0', '16.0', '38.0', '34.0',
       '3.0', '39.0', '11.0', '9.0', '13.0', '33.0', '12.0', '31.0',
       '32.0', '19.0', '17.0', '21.0', '28.0', '35.0', '30.0', '8.0',
       '27.0', '29.0', '6.0', '46.0', '41.0', '1.0', '44.0', '43.0',
       '53.0', '45.0', '47.0'], dtype=object)

In [90]:
stats["GB"] = pd.to_numeric(stats["GB"])

In [91]:
stats.dtypes

Player     object
Pos        object
Age         int64
Tm         object
G           int64
           ...   
W/L%      float64
GB        float64
PS/G      float64
PA/G      float64
SRS       float64
Length: 61, dtype: object

## Combining this dataframe with previous data of 2000-2023

In [93]:
stats.isna().sum()

Player    0
Pos       0
Age       0
Tm        0
G         0
         ..
W/L%      0
GB        0
PS/G      0
PA/G      0
SRS       0
Length: 61, dtype: int64

In [94]:
mvp_2000_df = pd.read_csv("player_mvp_stats.csv")

In [95]:
stats

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,Pts Max,Share,Team,W,L,W/L%,GB,PS/G,PA/G,SRS
0,A.C. Green,PF,22,LAL,82,24,18.8,2.5,4.7,0.539,...,0.0,0.000,Los Angeles Lakers,62,20,0.756,0.0,117.3,109.5,6.84
1,Byron Scott,SG,24,LAL,76,13,28.8,6.7,13.0,0.513,...,0.0,0.000,Los Angeles Lakers,62,20,0.756,0.0,117.3,109.5,6.84
2,James Worthy,SF,24,LAL,75,0,32.7,8.4,14.5,0.579,...,780.0,0.009,Los Angeles Lakers,62,20,0.756,0.0,117.3,109.5,6.84
3,Jerome Henderson,C,26,LAL,1,60,3.0,2.0,3.0,0.667,...,0.0,0.000,Los Angeles Lakers,62,20,0.756,0.0,117.3,109.5,6.84
4,Kareem Abdul-Jabbar,C,38,LAL,79,7,33.3,9.6,16.9,0.564,...,780.0,0.173,Los Angeles Lakers,62,20,0.756,0.0,117.3,109.5,6.84
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7259,Mike Gale,PG,29,SAS,67,79,22.0,2.6,5.6,0.454,...,0.0,0.000,San Antonio Spurs,41,41,0.500,9.0,119.4,119.7,-0.24
7260,Paul Griffin,C,26,SAS,82,0,22.1,2.1,3.8,0.553,...,0.0,0.000,San Antonio Spurs,41,41,0.500,9.0,119.4,119.7,-0.24
7261,Sylvester Norris,C,22,SAS,17,71,11.1,1.1,2.5,0.419,...,0.0,0.000,San Antonio Spurs,41,41,0.500,9.0,119.4,119.7,-0.24
7262,Tim Bassett,PF,28,SAS,12,79,13.7,1.0,2.8,0.353,...,0.0,0.000,San Antonio Spurs,41,41,0.500,9.0,119.4,119.7,-0.24


In [99]:
mvp_2000_df = mvp_2000_df.drop(["Unnamed: 0"], axis=1)
mvp_2000_df

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,Pts Max,Share,Team,W,L,W/L%,GB,PS/G,PA/G,SRS
0,A.C. Green,PF,36,LAL,82,82,23.5,2.1,4.7,0.447,...,0.0,0.0,Los Angeles Lakers,67,15,0.817,0.0,100.8,92.3,8.41
1,Brian Shaw,SG,33,LAL,74,2,16.9,1.7,4.4,0.382,...,0.0,0.0,Los Angeles Lakers,67,15,0.817,0.0,100.8,92.3,8.41
2,Derek Fisher,PG,25,LAL,78,22,23.1,2.1,6.2,0.346,...,0.0,0.0,Los Angeles Lakers,67,15,0.817,0.0,100.8,92.3,8.41
3,Devean George,SF,22,LAL,49,1,7.0,1.1,2.9,0.389,...,0.0,0.0,Los Angeles Lakers,67,15,0.817,0.0,100.8,92.3,8.41
4,Glen Rice,SF,32,LAL,80,80,31.6,5.3,12.3,0.430,...,0.0,0.0,Los Angeles Lakers,67,15,0.817,0.0,100.8,92.3,8.41
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11511,Spencer Hawes,PF,28,MIL,54,1,14.8,2.5,5.1,0.484,...,0.0,0.0,Milwaukee Bucks,42,40,0.512,9.0,103.6,103.8,-0.45
11512,Steve Novak,PF,33,MIL,8,0,2.8,0.3,0.9,0.286,...,0.0,0.0,Milwaukee Bucks,42,40,0.512,9.0,103.6,103.8,-0.45
11513,Terrence Jones,PF,25,MIL,54,12,23.5,4.3,9.1,0.470,...,0.0,0.0,Milwaukee Bucks,42,40,0.512,9.0,103.6,103.8,-0.45
11514,Thon Maker,C,19,MIL,57,34,9.9,1.5,3.2,0.459,...,0.0,0.0,Milwaukee Bucks,42,40,0.512,9.0,103.6,103.8,-0.45


In [100]:
stats.columns

Index(['Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year', 'PER',
       'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%',
       'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM',
       'VORP', 'Pts Won', 'Pts Max', 'Share', 'Team', 'W', 'L', 'W/L%', 'GB',
       'PS/G', 'PA/G', 'SRS'],
      dtype='object')

In [101]:
mvp_2000_df.columns

Index(['Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year', 'PER',
       'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%',
       'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM',
       'VORP', 'Pts Won', 'Pts Max', 'Share', 'Team', 'W', 'L', 'W/L%', 'GB',
       'PS/G', 'PA/G', 'SRS'],
      dtype='object')

In [102]:
stats.columns == mvp_2000_df.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True])

In [103]:
all_players = pd.concat([stats, mvp_2000_df], ignore_index=True)
all_players

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,Pts Max,Share,Team,W,L,W/L%,GB,PS/G,PA/G,SRS
0,A.C. Green,PF,22,LAL,82,24,18.8,2.5,4.7,0.539,...,0.0,0.000,Los Angeles Lakers,62,20,0.756,0.0,117.3,109.5,6.84
1,Byron Scott,SG,24,LAL,76,13,28.8,6.7,13.0,0.513,...,0.0,0.000,Los Angeles Lakers,62,20,0.756,0.0,117.3,109.5,6.84
2,James Worthy,SF,24,LAL,75,0,32.7,8.4,14.5,0.579,...,780.0,0.009,Los Angeles Lakers,62,20,0.756,0.0,117.3,109.5,6.84
3,Jerome Henderson,C,26,LAL,1,60,3.0,2.0,3.0,0.667,...,0.0,0.000,Los Angeles Lakers,62,20,0.756,0.0,117.3,109.5,6.84
4,Kareem Abdul-Jabbar,C,38,LAL,79,7,33.3,9.6,16.9,0.564,...,780.0,0.173,Los Angeles Lakers,62,20,0.756,0.0,117.3,109.5,6.84
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18775,Spencer Hawes,PF,28,MIL,54,1,14.8,2.5,5.1,0.484,...,0.0,0.000,Milwaukee Bucks,42,40,0.512,9.0,103.6,103.8,-0.45
18776,Steve Novak,PF,33,MIL,8,0,2.8,0.3,0.9,0.286,...,0.0,0.000,Milwaukee Bucks,42,40,0.512,9.0,103.6,103.8,-0.45
18777,Terrence Jones,PF,25,MIL,54,12,23.5,4.3,9.1,0.470,...,0.0,0.000,Milwaukee Bucks,42,40,0.512,9.0,103.6,103.8,-0.45
18778,Thon Maker,C,19,MIL,57,34,9.9,1.5,3.2,0.459,...,0.0,0.000,Milwaukee Bucks,42,40,0.512,9.0,103.6,103.8,-0.45


In [104]:
all_players.to_csv("all_player_mvp_stats.csv")