# Multiple Regression

#### Import Libraries

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from sklearn.linear_model import LinearRegression

#### Load the Data

In [2]:
data = pd.read_csv('real_estate_price_size_year.csv')
data.describe()

Unnamed: 0,price,size,year
count,100.0,100.0,100.0
mean,292289.47016,853.0242,2012.6
std,77051.727525,297.941951,4.729021
min,154282.128,479.75,2006.0
25%,234280.148,643.33,2009.0
50%,280590.716,696.405,2015.0
75%,335723.696,1029.3225,2018.0
max,500681.128,1842.51,2018.0


#### Declare dependent and independent variables

In [3]:
x = data[['size','year']]
y = data['price']

#### Standardization

In [4]:
from sklearn.preprocessing import StandardScaler

In [5]:
scaler = StandardScaler()

In [6]:
scaler.fit(x)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [7]:
x_scaled = scaler.transform(x)

In [8]:
x_scaled

array([[-0.70816415,  0.51006137],
       [-0.66387316, -0.76509206],
       [-1.23371919,  1.14763808],
       [ 2.19844528,  0.51006137],
       [ 1.42498884, -0.76509206],
       [-0.937209  , -1.40266877],
       [-0.95171405,  0.51006137],
       [-0.78328682, -1.40266877],
       [-0.57603328,  1.14763808],
       [-0.53467702, -0.76509206],
       [ 0.69939906, -0.76509206],
       [ 3.33780001, -0.76509206],
       [-0.53467702,  0.51006137],
       [ 0.52699137,  1.14763808],
       [ 1.51100715, -1.40266877],
       [ 1.77668568, -1.40266877],
       [-0.54810263,  1.14763808],
       [-0.77276222, -1.40266877],
       [-0.58004747, -1.40266877],
       [ 0.58943055,  1.14763808],
       [-0.78365788,  0.51006137],
       [-1.02322731,  0.51006137],
       [ 1.19557293,  0.51006137],
       [-1.12884431,  0.51006137],
       [-1.10378093, -0.76509206],
       [ 0.84424715,  1.14763808],
       [-0.95171405,  1.14763808],
       [ 1.62279723,  0.51006137],
       [-0.58004747,

#### Multiple Regression

In [9]:
reg = LinearRegression()
reg.fit(x_scaled,y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [10]:
reg.coef_

array([67501.57614152, 13724.39708231])

In [11]:
reg.intercept_

292289.4701599997

#### Summary Table

In [12]:
reg_summary = pd.DataFrame([['Bias'],['price'],['size']], columns=['Features'])
reg_summary['Weights'] = reg.intercept_, reg.coef_[0], reg.coef_[1]

In [13]:
reg_summary

Unnamed: 0,Features,Weights
0,Bias,292289.47016
1,price,67501.576142
2,size,13724.397082


#### Calculating the R-squared

In [14]:
reg.score(x,y)

-1302733.899052204

#### Formula for Adjusted $R^2$

$R^2_{adj.} = 1 - (1-R^2)*\frac{n-1}{n-p-1}$

where, n = number of observations,
       p = number of predictors (features)

In [15]:
x.shape

(100, 2)

In [16]:
r2 = reg.score(x,y)
n = x.shape[0]
p = x.shape[1]

adjusted_r2 = 1-(1-r2)*(n-1)/(n-p-1)
adjusted_r2

-1329594.4124347237

Since Adjusted $R^2$ is less than $R^2$, one or more predictors has little to no explanatory power 

#### Feature Selection

In [17]:
from sklearn.feature_selection import f_regression

In [18]:
f_regression(x,y)

(array([285.92105192,   0.85525799]), array([8.12763222e-31, 3.57340758e-01]))

In [19]:
p_values = f_regression(x,y)[1]
p_values

array([8.12763222e-31, 3.57340758e-01])

In [20]:
p_values.round(3)

array([0.   , 0.357])

Note: Both the above values are significant (<0.05)The above p-values are univariate values reached from simple linear models. They do not reflect interconnection of the features in out multiple linear regression

#### Summary Table

In [21]:
reg_summary = pd.DataFrame(data = x.columns.values, columns=['Features'])
reg_summary ['Coefficients'] = reg.coef_
reg_summary ['p-values'] = p_values.round(3)

In [22]:
reg_summary

Unnamed: 0,Features,Coefficients,p-values
0,size,67501.576142,0.0
1,year,13724.397082,0.357
