# Player Production Linear Regression Model


1. Collecting Data: Player Game logs from Basketball Reference. [link](https://www.basketball-reference.com/players/d/doncilu01/gamelog/2024)


2. Cleaning Data from the Player's season


In [None]:
import pandas as pd

data_path = "./luka_23-24.csv"
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,Rk,G,Date,Age,Tm,Unnamed: 5,Opp,Unnamed: 7,GS,MP,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,+/-
0,1,1.0,2023-10-25,24-239,DAL,@,SAS,W (+7),1,34:14,...,12,13,10,2,0,4,1,33,28.4,4
1,2,2.0,2023-10-27,24-241,DAL,,BRK,W (+5),1,36:02,...,9,10,7,0,0,0,1,49,45.0,9
2,3,3.0,2023-10-30,24-244,DAL,@,MEM,W (+15),1,40:00,...,12,12,12,1,1,6,4,35,28.9,6
3,4,4.0,2023-11-01,24-246,DAL,,CHI,W (+9),1,40:03,...,7,7,10,2,1,6,2,18,13.0,11
4,5,5.0,2023-11-03,24-248,DAL,@,DEN,L (-11),1,38:16,...,7,10,8,0,1,9,2,34,21.9,-4


In [73]:
# Check for data types
print(df.info())

# Checking for missing values
print(df.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82 entries, 0 to 81
Data columns (total 30 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Rk          82 non-null     int64  
 1   G           70 non-null     float64
 2   Date        82 non-null     object 
 3   Age         82 non-null     object 
 4   Tm          82 non-null     object 
 5   Unnamed: 5  41 non-null     object 
 6   Opp         82 non-null     object 
 7   Unnamed: 7  82 non-null     object 
 8   GS          82 non-null     object 
 9   MP          82 non-null     object 
 10  FG          82 non-null     object 
 11  FGA         82 non-null     object 
 12  FG%         82 non-null     object 
 13  3P          82 non-null     object 
 14  3PA         82 non-null     object 
 15  3P%         82 non-null     object 
 16  FT          82 non-null     object 
 17  FTA         82 non-null     object 
 18  FT%         82 non-null     object 
 19  ORB         82 non-null     obj

In [None]:
# Drop first column and second column
df = df.drop([df.columns[0], df.columns[1]], axis=1)
df.head()

Unnamed: 0,Date,Age,Tm,Unnamed: 5,Opp,Unnamed: 7,GS,MP,FG,FGA,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,+/-
0,2023-10-25,24-239,DAL,@,SAS,W (+7),1,34:14,13,25,...,12,13,10,2,0,4,1,33,28.4,4
1,2023-10-27,24-241,DAL,,BRK,W (+5),1,36:02,16,25,...,9,10,7,0,0,0,1,49,45.0,9
2,2023-10-30,24-244,DAL,@,MEM,W (+15),1,40:00,11,22,...,12,12,12,1,1,6,4,35,28.9,6
3,2023-11-01,24-246,DAL,,CHI,W (+9),1,40:03,5,16,...,7,7,10,2,1,6,2,18,13.0,11
4,2023-11-03,24-248,DAL,@,DEN,L (-11),1,38:16,11,24,...,7,10,8,0,1,9,2,34,21.9,-4


In [75]:
# df.set_index(df.columns[0], inplace=True)
# df.head()

In [None]:
# rename two columns and fill na to home
df.rename(columns={"Unnamed: 5": "H/A"}, inplace=True)
df["H/A"].fillna("home", inplace=True)

df.rename(columns={"Unnamed: 7": "W/L"}, inplace=True)
df.head()

Unnamed: 0,Date,Age,Tm,H/A,Opp,W/L,GS,MP,FG,FGA,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,+/-
0,2023-10-25,24-239,DAL,@,SAS,W (+7),1,34:14,13,25,...,12,13,10,2,0,4,1,33,28.4,4
1,2023-10-27,24-241,DAL,home,BRK,W (+5),1,36:02,16,25,...,9,10,7,0,0,0,1,49,45.0,9
2,2023-10-30,24-244,DAL,@,MEM,W (+15),1,40:00,11,22,...,12,12,12,1,1,6,4,35,28.9,6
3,2023-11-01,24-246,DAL,home,CHI,W (+9),1,40:03,5,16,...,7,7,10,2,1,6,2,18,13.0,11
4,2023-11-03,24-248,DAL,@,DEN,L (-11),1,38:16,11,24,...,7,10,8,0,1,9,2,34,21.9,-4


In [None]:
# df.columns

# na_rows = df[df.isna().any(axis=1)]
# print(na_rows)

In [78]:
df

Unnamed: 0,Date,Age,Tm,H/A,Opp,W/L,GS,MP,FG,FGA,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,+/-
0,2023-10-25,24-239,DAL,@,SAS,W (+7),1,34:14,13,25,...,12,13,10,2,0,4,1,33,28.4,+4
1,2023-10-27,24-241,DAL,home,BRK,W (+5),1,36:02,16,25,...,9,10,7,0,0,0,1,49,45.0,+9
2,2023-10-30,24-244,DAL,@,MEM,W (+15),1,40:00,11,22,...,12,12,12,1,1,6,4,35,28.9,+6
3,2023-11-01,24-246,DAL,home,CHI,W (+9),1,40:03,5,16,...,7,7,10,2,1,6,2,18,13.0,+11
4,2023-11-03,24-248,DAL,@,DEN,L (-11),1,38:16,11,24,...,7,10,8,0,1,9,2,34,21.9,-4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77,2024-04-07,25-039,DAL,home,HOU,W (+11),1,45:25,12,21,...,7,9,12,0,2,7,5,37,31.4,+20
78,2024-04-09,25-041,DAL,@,CHO,W (+26),1,34:45,13,25,...,10,12,10,0,1,4,2,39,33.2,+22
79,2024-04-10,25-042,DAL,@,MIA,W (+19),1,35:59,9,23,...,8,9,9,0,0,3,4,29,20.9,+7
80,2024-04-12,25-044,DAL,home,DET,L (-18),Inactive,Inactive,Inactive,Inactive,...,Inactive,Inactive,Inactive,Inactive,Inactive,Inactive,Inactive,Inactive,Inactive,Inactive


In [None]:
df.drop([17, 27, 31, 35, 38, 39, 40, 47, 66, 76, 80, 81], inplace=True)
print(df)

          Date     Age   Tm   H/A  Opp      W/L GS     MP  FG FGA  ... DRB  \
0   2023-10-25  24-239  DAL     @  SAS   W (+7)  1  34:14  13  25  ...  12   
1   2023-10-27  24-241  DAL  home  BRK   W (+5)  1  36:02  16  25  ...   9   
2   2023-10-30  24-244  DAL     @  MEM  W (+15)  1  40:00  11  22  ...  12   
3   2023-11-01  24-246  DAL  home  CHI   W (+9)  1  40:03   5  16  ...   7   
4   2023-11-03  24-248  DAL     @  DEN  L (-11)  1  38:16  11  24  ...   7   
..         ...     ...  ...   ...  ...      ... ..    ...  ..  ..  ...  ..   
74  2024-04-02  25-034  DAL     @  GSW   L (-4)  1  39:12  11  22  ...  11   
75  2024-04-04  25-036  DAL  home  ATL  W (+14)  1  37:02   8  25  ...  12   
77  2024-04-07  25-039  DAL  home  HOU  W (+11)  1  45:25  12  21  ...   7   
78  2024-04-09  25-041  DAL     @  CHO  W (+26)  1  34:45  13  25  ...  10   
79  2024-04-10  25-042  DAL     @  MIA  W (+19)  1  35:59   9  23  ...   8   

   TRB AST STL BLK TOV PF PTS  GmSc  +/-  
0   13  10   2   0  

In [None]:
inactive_exists = df["GS"].isin(["Inactive"]).any()
print(inactive_exists)

False


In [None]:
print(df[df["GS"] != "1"]["GS"])

Series([], Name: GS, dtype: object)


In [82]:
df.columns

Index(['Date', 'Age', 'Tm', 'H/A', 'Opp', 'W/L', 'GS', 'MP', 'FG', 'FGA',
       'FG%', '3P', '3PA', '3P%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB',
       'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'GmSc', '+/-'],
      dtype='object')

In [None]:
# Basic Stats MP, FG, FGA, FG%, 3P, 3PA, 3P%, FT, FTA, FT%, ORB, DRB, TRB, AST, STL, BLK, TOV, PF, PTS
basic = df[
    [
        "MP",
        "FG",
        "FGA",
        "FG%",
        "3P",
        "3PA",
        "3P%",
        "FT",
        "FTA",
        "FT%",
        "ORB",
        "DRB",
        "TRB",
        "AST",
        "STL",
        "BLK",
        "TOV",
        "PF",
        "PTS",
    ]
]
basic.columns
basic.head()

Unnamed: 0,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,34:14,13,25,0.52,3,11,0.273,4,7,0.571,1,12,13,10,2,0,4,1,33
1,36:02,16,25,0.64,9,14,0.643,8,10,0.8,1,9,10,7,0,0,0,1,49
2,40:00,11,22,0.5,6,12,0.5,7,10,0.7,0,12,12,12,1,1,6,4,35
3,40:03,5,16,0.313,1,8,0.125,7,9,0.778,0,7,7,10,2,1,6,2,18
4,38:16,11,24,0.458,6,11,0.545,6,7,0.857,3,7,10,8,0,1,9,2,34


In [None]:
# Convert min:sec to float (total minutes)
basic["MP"] = df["MP"].apply(lambda x: int(x.split(":")[0]) + int(x.split(":")[1]) / 60)

print(basic["MP"])

0     34.233333
1     36.033333
2     40.000000
3     40.050000
4     38.266667
        ...    
74    39.200000
75    37.033333
77    45.416667
78    34.750000
79    35.983333
Name: MP, Length: 70, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  basic["MP"] = df["MP"].apply(


In [85]:
basic = basic[
    [
        "MP",
        "FG",
        "FGA",
        "FG%",
        "3P",
        "3PA",
        "3P%",
        "FT",
        "FTA",
        "FT%",
        "ORB",
        "DRB",
        "TRB",
        "AST",
        "STL",
        "BLK",
        "TOV",
        "PF",
        "PTS",
    ]
].astype(float)
basic.dtypes

MP     float64
FG     float64
FGA    float64
FG%    float64
3P     float64
3PA    float64
3P%    float64
FT     float64
FTA    float64
FT%    float64
ORB    float64
DRB    float64
TRB    float64
AST    float64
STL    float64
BLK    float64
TOV    float64
PF     float64
PTS    float64
dtype: object

# Building Linear Regression Model


In [None]:
# df = full stats
# basic = basic stats

# Predictor variable: minutes played
X = basic[["MP"]]

# Target variable: points scored
y = basic["PTS"]

In [None]:
# Split the Data
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Split the data: 80% for training, 20% for testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Initialize and train the model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R^2): {r2}")

# Interpret the slope and intercept
slope = model.coef_[0]
intercept = model.intercept_

print(f"Intercept: {intercept}")
print(f"Slope: {slope}")

Mean Absolute Error (MAE): 5.1536622311688784
Mean Squared Error (MSE): 44.204086807434976
R-squared (R^2): 0.1945708827500925
Intercept: 11.170133178618048
Slope: 0.6022842299653495
