# Multiple linear regression 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import f_regression
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

sns.set()

In [2]:
# Load the dataset from disk and display its summary statistics
csv_path = './data/multiplelinearregression.csv'
data = pd.read_csv(csv_path)
data.describe()

Unnamed: 0,SAT,GPA,"Rand 1,2,3"
count,84.0,84.0,84.0
mean,1845.27381,3.330238,2.059524
std,104.530661,0.271617,0.855192
min,1634.0,2.4,1.0
25%,1772.0,3.19,1.0
50%,1846.0,3.38,2.0
75%,1934.0,3.5025,3.0
max,2050.0,3.81,3.0


In [3]:
# Predictors (two columns) and target (GPA)
feature_cols = ['SAT', 'Rand 1,2,3']
x = data[feature_cols]
y = data['GPA']

In [4]:
# Fit a linear regression of GPA (y) on SAT and 'Rand 1,2,3' (x)
reg = LinearRegression()
reg.fit(X=x, y=y)

LinearRegression()

In [5]:
# Fitted slope coefficients, in the column order of x (SAT, then 'Rand 1,2,3')
reg.coef_

array([ 0.00165354, -0.00826982])

In [6]:
# Fitted intercept (bias) term of the model
reg.intercept_

0.29603261264909486

In [7]:
# R-squared, evaluated on the same data the model was fitted on
reg.score(X=x, y=y)

0.40668119528142843

### Adjusted R-squared

$R^2_{adj.} = 1 - (1-R^2)\cdot\frac{n-1}{n-p-1}$

where $n$ is the number of observations and $p$ is the number of predictors.

In [8]:
# Adjusted R-squared: R-squared penalised for the number of predictors used
r2 = reg.score(x, y)
n, p = x.shape  # n = rows (observations), p = columns (predictors)
adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
adj_r2

0.39203134825134023

In [9]:
# Feature selection (F-regression): identify insignificant vars (p-value > 0.05).
# f_regression (imported in the top import cell) fits a simple (univariate) linear
# regression for each feature and returns its F-statistic and p-value.
f_statistics, p_values = f_regression(x, y)
# p-value for 'Rand 1,2,3' is too high to be a significant feature.
# A displayed 0.0 only means the p-value rounds to zero at 3 decimals.
p_values.round(3)

array([0.   , 0.676])

In [10]:
# Summary table: one row per feature with its coefficient and univariate p-value
reg_summary = pd.DataFrame({
    'Features': x.columns.values,
    'Coefficients': reg.coef_,
    'p-values': p_values.round(3),
})
reg_summary

Unnamed: 0,Features,Coefficients,p-values
0,SAT,0.001654,0.0
1,"Rand 1,2,3",-0.00827,0.676


In [11]:
# Standardization: fix order of magnitude difference between features.
# StandardScaler is imported in the top import cell.
scaler = StandardScaler()
# fit_transform learns the per-column mean/std from x and applies them in one step;
# the fitted `scaler` is kept so new data can be scaled with the same statistics.
x_scaled = scaler.fit_transform(x)

In [12]:
# Regression with scaled features
# NOTE: this rebinds `reg` and `reg_summary`, replacing the unscaled model and its
# summary table from the earlier cells; re-running those cells afterwards will show
# the scaled model instead.
reg = LinearRegression()
reg.fit(x_scaled, y)
# Standardized coefficients = weights. Rows are derived from x.columns and reg.coef_
# so the table stays correct if features are added to or removed from x.
reg_summary = pd.DataFrame({
    'Features': ['Bias or Intercept', *x.columns],
    'Weights': [reg.intercept_, *reg.coef_],
})
# when applying scaling, we don't really need to remove insignificant vars since their weight will be close to 0
reg_summary

Unnamed: 0,Features,Weights
0,Bias or Intercept,3.330238
1,SAT,0.171814
2,"Rand 1,2,3",-0.00703


In [13]:
# Predictions with the standardized-coefficient model
new_data = pd.DataFrame({'SAT': [1700, 1800], 'Rand 1,2,3': [2, 1]})
# New observations must go through the same fitted scaler as the training features
new_data_scaled = scaler.transform(new_data)
reg.predict(new_data_scaled)

array([3.09051403, 3.26413803])