In [1]:
# import modules

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# load dataset
df = pd.read_csv('gamelogs_df.csv')

### Gathering Info on Scraped Data

In [5]:
# check rows and columns
df.shape

(1610, 31)

In [6]:
# print the first 5 rows
df.head()

Unnamed: 0.1,Unnamed: 0,Rk,G,Date,Age,Tm,Unnamed: 5,Opp,Unnamed: 7,GS,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,+/-
0,0,1,,2023-10-25,23-352,BOS,@,NYK,W (+4),Did Not Play,...,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play
1,1,2,,2023-10-27,23-354,BOS,,MIA,W (+8),Did Not Play,...,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play
2,2,3,1.0,2023-10-30,23-357,BOS,@,WAS,W (+19),0,...,0,2,1,0,0,1,1,2,1.0,-17
3,3,4,2.0,2023-11-01,23-359,BOS,,IND,W (+51),0,...,3,3,1,1,0,0,2,11,11.6,+3
4,4,5,,2023-11-04,23-362,BOS,@,BRK,W (+10),Did Not Play,...,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play,Did Not Play


In [7]:
# print the datatypes of each column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1610 entries, 0 to 1609
Data columns (total 31 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  1610 non-null   int64 
 1   Rk          1610 non-null   object
 2   G           1045 non-null   object
 3   Date        1610 non-null   object
 4   Age         1610 non-null   object
 5   Tm          1610 non-null   object
 6   Unnamed: 5  764 non-null    object
 7   Opp         1610 non-null   object
 8   Unnamed: 7  1536 non-null   object
 9   GS          1610 non-null   object
 10  MP          1610 non-null   object
 11  FG          1610 non-null   object
 12  FGA         1610 non-null   object
 13  FG%         1549 non-null   object
 14  3P          1610 non-null   object
 15  3PA         1610 non-null   object
 16  3P%         1412 non-null   object
 17  FT          1610 non-null   object
 18  FTA         1610 non-null   object
 19  FT%         1105 non-null   object
 20  ORB     

In [8]:
# check for missing values
df.isnull().sum()

Unnamed: 0      0
Rk              0
G             565
Date            0
Age             0
Tm              0
Unnamed: 5    846
Opp             0
Unnamed: 7     74
GS              0
MP              0
FG              0
FGA             0
FG%            61
3P              0
3PA             0
3P%           198
FT              0
FTA             0
FT%           505
ORB             0
DRB             0
TRB             0
AST             0
STL             0
BLK             0
TOV             0
PF              0
PTS             0
GmSc            0
+/-             0
dtype: int64

# Data Cleaning

#### Exploring Null Values

In [12]:
# printing values, including null, for 'Unnamed: 5'
df['Unnamed: 5'].value_counts(dropna=False)

NaN    846
@      764
Name: Unnamed: 5, dtype: int64

In [13]:
# printing values, including null, for 'Unnamed: 7'
df['Unnamed: 7'].value_counts(dropna=False)

W (+10)    103
W (+17)     86
NaN         74
W (+9)      71
W (+4)      55
          ... 
L (-39)      1
L (-60)      1
L (-14)      1
L (-18)      1
L (-40)      1
Name: Unnamed: 7, Length: 76, dtype: int64

In [14]:
# printing values, including null, for 'FG%'
df['FG%'].value_counts(dropna=False)

Inactive        345
Did Not Play    203
.500            117
FG%              74
.000             70
               ... 
.182              1
.091              1
.654              1
.370              1
.615              1
Name: FG%, Length: 114, dtype: int64

In [15]:
# printing values, including null, for '3P%'
df['3P%'].value_counts(dropna=False)

Inactive         345
Did Not Play     203
NaN              198
.000             171
.500             122
3P%               74
.333              74
1.000             54
.250              44
.400              34
.667              33
.200              32
.600              29
.750              17
.286              16
Did Not Dress     15
.167              14
.222              13
.429              12
.375              12
.143              10
.556              10
.444               8
.300               7
.625               6
.364               5
.571               5
.800               4
.833               4
.125               3
.182               3
.545               3
.455               3
.417               3
.714               3
.615               2
.273               2
Not With Team      2
.857               2
.636               2
.111               2
.308               1
.077               1
.875               1
.769               1
.091               1
.467               1
.385         

In [16]:
# printing values, including null, for 'FT%'
df['FT%'].value_counts(dropna=False)

NaN              505
Inactive         345
1.000            223
Did Not Play     203
FT%               74
.500              66
.750              39
.000              23
.667              23
Did Not Dress     15
.833              12
.800              11
.875               9
.600               8
.571               6
.889               6
.778               6
.333               5
.900               5
.857               4
.250               3
.700               3
Not With Team      2
.929               2
.923               2
.625               2
.714               2
.400               1
.789               1
.909               1
.818               1
.846               1
.429               1
Name: FT%, dtype: int64

In [17]:
df[df['Date']] != 'Date'

KeyError: "['2023-10-25', '2023-10-27', '2023-10-30', '2023-11-01', '2023-11-04', '2023-11-06', '2023-11-08', '2023-11-10', '2023-11-11', '2023-11-13', '2023-11-15', '2023-11-17', '2023-11-19', '2023-11-20', '2023-11-22', '2023-11-24', '2023-11-26', '2023-11-28', '2023-12-01', '2023-12-04', '2023-12-08', '2023-12-12', '2023-12-14', '2023-12-15', '2023-12-17', '2023-12-19', '2023-12-20', '2023-12-23', '2023-12-25', '2023-12-28', '2023-12-29', '2023-12-31', '2024-01-02', '2024-01-05', '2024-01-06', '2024-01-08', '2024-01-10', '2024-01-11', '2024-01-13', '2024-01-15', '2024-01-17', '2024-01-19', '2024-01-21', '2024-01-22', '2024-01-25', '2024-01-27', '2024-01-29', '2024-01-30', '2024-02-01', '2024-02-04', '2024-02-07', '2024-02-08', '2024-02-10', '2024-02-13', '2024-02-15', '2024-02-23', '2024-02-25', '2024-02-27', '2024-03-01', '2024-03-02', '2024-03-04', '2024-03-06', '2024-03-08', '2024-03-09', '2024-03-11', '2024-03-13', '2024-03-14', '2024-03-16', '2024-03-18', '2024-03-20', '2024-03-22', '2024-03-23', '2024-03-25', '2024-03-27', '2024-03-29', '2024-04-01', '2024-04-03', '2024-04-05', '2024-04-07', '2024-04-09', '2024-04-11', '2024-04-12', '2024-04-14', '2024-02-09', '2024-02-11', '2024-02-14', '2024-02-22', '2024-02-24', '2024-03-03', '2024-03-05', '2024-03-07', '2024-03-12', '2024-03-17', '2024-03-28', '2024-03-30', '2023-10-26', '2023-10-28', '2023-10-29', '2023-11-02', '2023-11-12', '2023-11-14', '2023-11-21', '2023-11-25', '2023-11-27', '2023-11-29', '2023-12-06', '2023-12-11', '2023-12-13', '2023-12-16', '2023-12-18', '2023-12-22', '2023-12-27', '2023-12-30', '2024-01-12', '2024-01-16', '2024-01-20', '2024-02-03', '2024-02-05', '2024-02-12', '2024-02-26', '2024-02-28', '2024-03-10', '2024-04-06', '2024-04-10', '2023-11-03', '2023-11-05', '2023-11-18', '2023-12-02', '2023-12-21', '2023-12-26', '2024-01-03', '2024-01-07', '2024-01-09', '2024-01-18', '2024-01-24', '2024-01-26', '2024-01-28', '2024-02-02', '2024-02-06'] not in index"

In [None]:
# rename headers
column_titles = ['Rank',
                 'Season Game',
                 'Date',
                 'Age',
                 'Team',
                 '@',
                 'Opponent',
                 'W/L Margin',
                 'Games Started',
                 'Field Goals',
                 'Field Goal Attempts',
                 'Field Goal %',
                 '3-Point Field Goals',
                 '3-Point Field Goal %',
                 'Free Throws',
                 'Free Throw Attempts',
                 'Free Throw %',
                 'Offensive Rebounds',
                 'Defensive Rebounds',
                 'Total Rebounds',
                 'Assists',
                 'Steals',
                 'Blocks',
                 'Turnovers',
                 'Personal Fouls',
                 'Points',
                 'Game Score',
                 'Plus/Minus'    
]