# Scikit-Learn Multiple Least-Squares Regression Example

In [1]:
# Note that the imports differ from the main scikit-learn demo
import pandas as pd
from sklearn import linear_model
import statsmodels.api as sm
from statsmodels.tools.eval_measures import rmse

In [2]:
df = pd.read_csv("Michigan_GameLogs_W24.csv")
df.head()

Unnamed: 0,id,date,year,opponent,home_away,result,points_scored,points_against,pass_cmp,pass_att,...,pass_yrds,pass_td,pass_1st_down,rush_att,rush_yrds,rush_td,rush_1st_down,total_offense,fumbles,ints
0,1,2011-09-03,2011,Western Michigan,Home,W,34,10,9,13,...,98,0,5,26,190,3,9,288,0,0
1,2,2011-09-10,2011,Notre Dame,Home,W,35,31,11,24,...,338,4,10,26,114,1,5,452,0,3
2,3,2011-09-17,2011,Eastern Michigan,Home,W,31,3,7,18,...,95,2,5,50,376,2,19,471,0,1
3,4,2011-09-24,2011,San Diego State,Home,W,28,7,8,17,...,93,0,3,45,320,4,14,413,2,2
4,5,2011-10-01,2011,Minnesota,Home,W,58,0,18,25,...,217,3,10,48,363,3,19,580,0,0


In [3]:
# x is our explanatory variables
x = df[['pass_pct', 'pass_yrds', 'pass_td', 'rush_att', 'rush_yrds', 'rush_td', 'total_offense']]
# y is our response variable
y = df['points_scored']

In [4]:
regr = linear_model.LinearRegression()
regr.fit(x, y)

LinearRegression()

In [5]:
# OLS (Ordinary Least Squares) is a common technique for estimating coefficients
x = sm.add_constant(x)
model = sm.OLS(y, x).fit()
model.summary()

  x = pd.concat(x[::order], 1)


0,1,2,3
Dep. Variable:,points_scored,R-squared:,0.884
Model:,OLS,Adj. R-squared:,0.88
Method:,Least Squares,F-statistic:,200.7
Date:,"Wed, 20 Mar 2024",Prob (F-statistic):,3.13e-71
Time:,17:50:50,Log-Likelihood:,-496.25
No. Observations:,165,AIC:,1006.0
Df Residuals:,158,BIC:,1028.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-4.1077,2.991,-1.373,0.172,-10.015,1.800
pass_pct,0.0965,0.041,2.339,0.021,0.015,0.178
pass_yrds,0.0030,0.004,0.721,0.472,-0.005,0.011
pass_td,5.3847,0.410,13.129,0.000,4.575,6.195
rush_att,0.1006,0.062,1.630,0.105,-0.021,0.223
rush_yrds,0.0084,0.005,1.812,0.072,-0.001,0.018
rush_td,5.2398,0.350,14.979,0.000,4.549,5.931
total_offense,0.0115,0.004,2.718,0.007,0.003,0.020

0,1,2,3
Omnibus:,12.346,Durbin-Watson:,1.908
Prob(Omnibus):,0.002,Jarque-Bera (JB):,13.395
Skew:,0.697,Prob(JB):,0.00123
Kurtosis:,3.067,Cond. No.,2.27e+16


**Model Notes**
1. The coef column shows the coefficients for each of our explanatory variables as well as our y-intercept (const)
2. The R-squared statistic explains that 0.XXX of the variation in Y is explained by X
* For example, if R-squared is 0.90, then 90% of the variation in points scored is explained by the combination of explanatory variables that we have chosen.

In [6]:
# .predict() will create a prediction for points scored for each of our rows in our df (each game in this example)
predictions = model.predict(x)
predictions

0      26.103439
1      36.864217
2      34.782679
3      33.637633
4      49.905643
         ...    
160    26.201290
161    29.233579
162    20.305532
163    32.368757
164    33.805455
Length: 165, dtype: float64

In [7]:
# RMSE is our prefered form of error
# In context, an rmse of 4.75 indicates that our predicted points scored differs by 4.75 points 
# from the actually observed points scored
rmse = rmse(y, predictions)
rmse

4.897053228716542

In [8]:
df['Predicted Points Scored'] = predictions
df['Points Scored Diff'] = df['points_scored'] - df['Predicted Points Scored']
df_subset = df[['date', 'year', 'opponent', 'result', 'points_scored', 'Predicted Points Scored', 'Points Scored Diff']]
df_subset

Unnamed: 0,date,year,opponent,result,points_scored,Predicted Points Scored,Points Scored Diff
0,2011-09-03,2011,Western Michigan,W,34,26.103439,7.896561
1,2011-09-10,2011,Notre Dame,W,35,36.864217,-1.864217
2,2011-09-17,2011,Eastern Michigan,W,31,34.782679,-3.782679
3,2011-09-24,2011,San Diego State,W,28,33.637633,-5.637633
4,2011-10-01,2011,Minnesota,W,58,49.905643,8.094357
...,...,...,...,...,...,...,...
160,2023-11-18,2023,Maryland,W,31,26.201290,4.798710
161,2023-11-25,2023,Ohio State,W,30,29.233579,0.766421
162,2023-12-02,2023,Iowa,W,26,20.305532,5.694468
163,2024-01-01,2024,Alabama,W,27,32.368757,-5.368757
