In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from sklearn.linear_model import LinearRegression

In [7]:
# Load the dataset (columns: SAT, GPA, "Rand 1,2,3") into a DataFrame and display it.
df = pd.read_csv('data/multiple_linear_regression.csv')
df

Unnamed: 0,SAT,GPA,"Rand 1,2,3"
0,1714,2.40,1
1,1664,2.52,3
2,1760,2.54,3
3,1685,2.74,3
4,1693,2.83,2
...,...,...,...
79,1936,3.71,3
80,1810,3.71,1
81,1987,3.73,3
82,1962,3.76,1


In [8]:
# Summary statistics per column; note how different the scales of SAT and "Rand 1,2,3" are.
df.describe()

Unnamed: 0,SAT,GPA,"Rand 1,2,3"
count,84.0,84.0,84.0
mean,1845.27381,3.330238,2.059524
std,104.530661,0.271617,0.855192
min,1634.0,2.4,1.0
25%,1772.0,3.19,1.0
50%,1846.0,3.38,2.0
75%,1934.0,3.5025,3.0
max,2050.0,3.81,3.0


### Creating multiple linear regression

In [9]:
# Features (inputs of the regression): the SAT column plus the "Rand 1,2,3" column.
x = df[['SAT', 'Rand 1,2,3']]
# Target (the value the regression should predict).
y = df['GPA']

### Standardization

In [10]:
from sklearn.preprocessing import StandardScaler

In [11]:
# Scaler that standardizes each feature column (subtract the mean, divide by the standard deviation).
scaler = StandardScaler()

In [12]:
# Compute and store the per-column mean and standard deviation of x; x itself is not modified.
scaler.fit(x)

StandardScaler()

In [13]:
# Apply the fitted scaling to x; the result is a NumPy array, not a DataFrame.
x_transf = scaler.transform(x)

In [14]:
# Inspect the standardized features (columns in the same order as x: SAT, "Rand 1,2,3").
x_transf

array([[-1.26338288, -1.24637147],
       [-1.74458431,  1.10632974],
       [-0.82067757,  1.10632974],
       [-1.54247971,  1.10632974],
       [-1.46548748, -0.07002087],
       [-1.68684014, -1.24637147],
       [-0.78218146, -0.07002087],
       [-0.78218146, -1.24637147],
       [-0.51270866, -0.07002087],
       [ 0.04548499,  1.10632974],
       [-1.06127829,  1.10632974],
       [-0.67631715, -0.07002087],
       [-1.06127829, -1.24637147],
       [-1.28263094,  1.10632974],
       [-0.6955652 , -0.07002087],
       [ 0.25721362, -0.07002087],
       [-0.86879772,  1.10632974],
       [-1.64834403, -0.07002087],
       [-0.03150724,  1.10632974],
       [-0.57045283,  1.10632974],
       [-0.81105355,  1.10632974],
       [-1.18639066,  1.10632974],
       [-1.75420834,  1.10632974],
       [-1.52323165, -1.24637147],
       [ 1.23886453, -1.24637147],
       [-0.18549169, -1.24637147],
       [-0.5608288 , -1.24637147],
       [-0.23361183,  1.10632974],
       [ 1.68156984,

### Regression model with scaled features

In [15]:
# Ordinary least squares linear regression (fits an intercept by default).
model = LinearRegression()

In [16]:
# Fit on the standardized features so the resulting weights are on a comparable scale.
model.fit(x_transf,y)

LinearRegression()

In [17]:
# One weight per feature, in the column order of x: [SAT, "Rand 1,2,3"].
model.coef_

array([ 0.17181389, -0.00703007])

In [18]:
# Intercept (bias). With standardized (zero-mean) features it equals the mean of y.
model.intercept_

3.330238095238095

### Summary table

In [23]:
# Summary table: one row per model term ("Bias" first, then every feature) with its fitted weight.
# The feature labels are taken from x.columns and the weights from the whole coef_ vector,
# so the two always line up (coef_ follows the column order of the data the model was fit on)
# and the table stays correct if the feature list in x changes.
model_summary = pd.DataFrame({
    'Features': ['Bias', *x.columns],
    'Weights': np.concatenate(([model.intercept_], model.coef_)),
})
# "Weights" means the same as coefficients.
# "Bias" means the same as intercept (a number that adjusts the regression by a constant).
# If we have to adjust the regression by a number, then the regression is biased by that number.

In [24]:
# Display the summary table of fitted weights.
model_summary

Unnamed: 0,Features,Weights
0,Bias,3.330238
1,SAT,0.171814
2,"Rand 1,2,3",-0.00703


In [25]:
# The larger the absolute value of a weight, the bigger the impact of that feature on the regression

### Interpretation 

The closer a weight is to 0, the smaller its impact.

The larger a weight's absolute value, the bigger its impact.

#### Because the features are standardized, their weights are directly comparable, so we don't have to decide up front whether every feature is important. As the summary table shows, Rand 1,2,3 is penalized with a very low weight (about -0.007). We can leave it in the model and it will make almost no difference to the predictions: its weight is very close to 0, and anything multiplied by a number close to 0 contributes almost nothing.