In [1]:
import pandas as pd 
import numpy as np

In [2]:
# Loading in the data, this needs to be changed to the official data once collection has finished.

official_data = pd.read_csv('Nothing But Net - Real Half Time Data 04-10-24.csv')

In [3]:
official_data.head()

Unnamed: 0,TEAM_ABBREVIATION,TEAM_NAME,GAME_DATE,MATCHUP,WL,MIN,PTS,FGM,FGA,FG_PCT,...,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS,2PA,2PM,2P_PCT
0,DEN,Denver Nuggets,04/10/2024,DEN vs. BOS,L,120,63,24,52,0.42,...,28,18,8,2,12,11,7,32,16,0.5
1,BOS,Boston Celtic,04/10/2024,BOS vs. DEN,W,120,56,19,55,0.345,...,24,14,5,4,9,8,-7,53,19,0.358
2,LAL,Los Angeles Lakers,04/10/2024,LAL vs. MIN,L,120,49,19,43,0.442,...,19,14,4,6,8,10,-9,26,14,0.538
3,MIN,Minnesota Timberwolves,04/10/2024,MIN @ LAL,W,120,58,22,42,0.524,...,21,13,5,1,7,7,9,24,14,0.583
4,GSW,Golden State Warriors,05/10/2024,GSW vs. LAC,W,120,53,19,51,0.373,...,23,13,9,3,9,16,4,26,14,0.538


In [4]:
# Initial Feature Enginnering:
# Adding the following columns:

# 1. Points per minute:
official_data['PPM'] = round(official_data['PTS'] / 24, 2)

# 2. Estimated Possessions:
official_data['POSSESSIONS'] = official_data['FGA'] - official_data['OREB'] + official_data['STL']
# 3. Points per possession:
official_data['PTSperP'] = round(official_data['PTS'] / official_data['POSSESSIONS'], 3)

# 4. Effective Field Goal Percentage:
official_data['EFGPCT'] = round((official_data['FGM'] + 0.5 * official_data['FG3M']) / official_data['FGA'] * 100, 3)

# 5. Pace:
official_data['PACE'] = round((official_data['POSSESSIONS'] / 24), 3)

# 6. Home or Away:
official_data['HOME'] = np.where(official_data['MATCHUP'].str.contains(' vs. '), 1, 0)

# 7. Scoring Opportunities
official_data['Scoring_Opportunities'] = official_data['FGA'] + official_data['FG3A'] + official_data['FTA']

# 8. Defensive Index
official_data['Defensive_Index'] = official_data['DREB'] + official_data['STL'] + official_data['BLK']

# 9. Turnover to Assist Ratio
official_data['TO_to_AST_Ratio'] = official_data['TOV'] / (official_data['AST'] + 1e-10)

In [5]:
official_data.dtypes

TEAM_ABBREVIATION         object
TEAM_NAME                 object
GAME_DATE                 object
MATCHUP                   object
WL                        object
MIN                        int64
PTS                        int64
FGM                        int64
FGA                        int64
FG_PCT                   float64
FG3M                       int64
FG3A                       int64
FG3_PCT                  float64
FTM                        int64
FTA                        int64
FT_PCT                   float64
OREB                       int64
DREB                       int64
REB                        int64
AST                        int64
STL                        int64
BLK                        int64
TOV                        int64
PF                         int64
PLUS_MINUS                 int64
2PA                        int64
2PM                        int64
2P_PCT                   float64
PPM                      float64
POSSESSIONS                int64
PTSperP   

In [6]:
nan_counts = official_data.isnull().sum()
print(nan_counts)

TEAM_ABBREVIATION        0
TEAM_NAME                0
GAME_DATE                0
MATCHUP                  0
WL                       0
MIN                      0
PTS                      0
FGM                      0
FGA                      0
FG_PCT                   0
FG3M                     0
FG3A                     0
FG3_PCT                  0
FTM                      0
FTA                      0
FT_PCT                   0
OREB                     0
DREB                     0
REB                      0
AST                      0
STL                      0
BLK                      0
TOV                      0
PF                       0
PLUS_MINUS               0
2PA                      0
2PM                      0
2P_PCT                   0
PPM                      0
POSSESSIONS              0
PTSperP                  0
EFGPCT                   0
PACE                     0
HOME                     0
Scoring_Opportunities    0
Defensive_Index          0
TO_to_AST_Ratio          0
d

In [7]:
# To ensure that there is no information that links to particulars teams, and to ensure that the models are only utilsing numerical data, the following columns will be dropped:

official_data.drop(['MIN','TEAM_ABBREVIATION','GAME_DATE','TEAM_NAME', 'MATCHUP'], axis=1, inplace=True)

In [8]:
official_data.head()

Unnamed: 0,WL,PTS,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,...,2P_PCT,PPM,POSSESSIONS,PTSperP,EFGPCT,PACE,HOME,Scoring_Opportunities,Defensive_Index,TO_to_AST_Ratio
0,L,63,24,52,0.42,8,20,0.4,7,10,...,0.5,2.62,53,1.189,53.846,2.208,1,82,31,0.666667
1,W,56,19,55,0.345,0,2,0.286,8,11,...,0.358,2.33,49,1.143,34.545,2.042,1,68,22,0.642857
2,L,49,19,43,0.442,5,17,0.294,6,6,...,0.538,2.04,45,1.089,50.0,1.875,1,66,27,0.571429
3,W,58,22,42,0.524,8,18,0.444,6,15,...,0.583,2.42,44,1.318,61.905,1.833,0,75,24,0.538462
4,W,53,19,51,0.373,5,25,0.2,10,13,...,0.538,2.21,53,1.0,42.157,2.208,1,89,28,0.692308


In [9]:
official_data.replace({'W':1 , 'L':0}, inplace=True)

In [10]:
# As no NULL values are present, we can proceed with the rest of the data preprocessing,
# starting with normalizing the data to ensure that all features are on the same scale (0-1):

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

cols_to_scale = ['PTS', 'FGM', 'FGA', 'FG3M', 'FG3A', 'FTM', 'FTA', 'OREB', 'DREB', 
                 'REB', 'AST', 'TOV', 'STL', 'BLK', 'PF', 'PLUS_MINUS', 'PPM', 'POSSESSIONS',
                 'PTSperP', 'EFGPCT', 'PACE', 'Scoring_Opportunities', 'Defensive_Index', '2PA','2PM']

# Apply the scaler to the selected columns

official_data[cols_to_scale] = scaler.fit_transform(official_data[cols_to_scale])

# Display the first few rows to verify scaling
official_data.head()

Unnamed: 0,WL,PTS,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,...,2P_PCT,PPM,POSSESSIONS,PTSperP,EFGPCT,PACE,HOME,Scoring_Opportunities,Defensive_Index,TO_to_AST_Ratio
0,0,0.636364,0.666667,0.8,0.42,0.444444,0.5625,0.4,0.318182,0.357143,...,0.5,0.633188,0.805556,0.319452,0.403715,0.805333,1,0.666667,0.633333,0.666667
1,1,0.509091,0.428571,0.885714,0.345,0.0,0.0,0.286,0.363636,0.392857,...,0.358,0.50655,0.694444,0.295203,0.1153,0.694667,1,0.333333,0.333333,0.642857
2,0,0.381818,0.428571,0.542857,0.442,0.277778,0.46875,0.294,0.272727,0.214286,...,0.538,0.379913,0.583333,0.266737,0.346244,0.583333,1,0.285714,0.5,0.571429
3,1,0.545455,0.571429,0.514286,0.524,0.444444,0.5,0.444,0.272727,0.535714,...,0.583,0.545852,0.555556,0.387454,0.52414,0.555333,0,0.5,0.4,0.538462
4,1,0.454545,0.428571,0.771429,0.373,0.277778,0.71875,0.2,0.454545,0.464286,...,0.538,0.454148,0.805556,0.219821,0.229046,0.805333,1,0.833333,0.533333,0.692308


In [11]:
official_data.to_csv('Normalised_NBA_Real_HalfTime_Data.csv', index=False)