# Multiple Linear Regression Exercise

##  Import the relevant libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from sklearn.linear_model import LinearRegression

## Load the data


In [3]:
data=pd.read_csv("real_estate_price_size_year.csv")

In [4]:
data

Unnamed: 0,price,size,year
0,234314.144,643.09,2015
1,228581.528,656.22,2009
2,281626.336,487.29,2018
3,401255.608,1504.75,2015
4,458674.256,1275.46,2009
...,...,...,...
95,252460.400,549.80,2009
96,310522.592,1037.44,2009
97,383635.568,1504.75,2006
98,225145.248,648.29,2015


In [5]:
data.describe()

Unnamed: 0,price,size,year
count,100.0,100.0,100.0
mean,292289.47016,853.0242,2012.6
std,77051.727525,297.941951,4.729021
min,154282.128,479.75,2006.0
25%,234280.148,643.33,2009.0
50%,280590.716,696.405,2015.0
75%,335723.696,1029.3225,2018.0
max,500681.128,1842.51,2018.0


## Create the multiple linear regression

### Declare the dependent and independent variables

In [7]:
x= data[['size','year']]
y = data['price']

## Regression itself

In [8]:
reg= LinearRegression()
reg.fit(x,y)

In [10]:
reg.coef_

array([ 227.70085401, 2916.78532684])

In [11]:
reg.intercept_

-5772267.017463278

## Calculate the R-squared

In [12]:
reg.score(x,y)

0.7764803683276796

## Formula for adjusted R^2
$R^2_{adj.}=1-(1-R^2)*\frac{n-1}{n-p-1}$

In [13]:
x.shape

(100, 2)

In [14]:
r2=reg.score(x,y)
n=x.shape[0]
p=x.shape[1]
adjusted_r2=1-(1-r2)*(n-1)/(n-p-1)
adjusted_r2

0.7718717161282503

## Feature selection

In [15]:
from sklearn.feature_selection import f_regression 

In [16]:
f_regression(x,y)

(array([285.92105192,   0.85525799]), array([8.12763222e-31, 3.57340758e-01]))

In [20]:
p_values=f_regression(x,y)[1]
p_values

array([8.12763222e-31, 3.57340758e-01])

In [21]:
p_values.round(3)

array([0.   , 0.357])

## Create a summary table 

In [24]:
reg_summary=pd.DataFrame(data=['size','year'],columns=['Feature'])
reg_summary

Unnamed: 0,Feature
0,size
1,year


In [25]:
reg_summary ['coefficients']=reg.coef_
reg_summary['p-values']=p_values.round(3)
reg_summary

Unnamed: 0,Feature,coefficients,p-values
0,size,227.700854,0.0
1,year,2916.785327,0.357


## Standardization

In [26]:
from sklearn.preprocessing import StandardScaler

In [27]:
scaler=StandardScaler()
scaler.fit(x,y)

In [28]:
x_scaled=scaler.transform(x)

In [29]:
x_scaled

array([[-0.70816415,  0.51006137],
       [-0.66387316, -0.76509206],
       [-1.23371919,  1.14763808],
       [ 2.19844528,  0.51006137],
       [ 1.42498884, -0.76509206],
       [-0.937209  , -1.40266877],
       [-0.95171405,  0.51006137],
       [-0.78328682, -1.40266877],
       [-0.57603328,  1.14763808],
       [-0.53467702, -0.76509206],
       [ 0.69939906, -0.76509206],
       [ 3.33780001, -0.76509206],
       [-0.53467702,  0.51006137],
       [ 0.52699137,  1.14763808],
       [ 1.51100715, -1.40266877],
       [ 1.77668568, -1.40266877],
       [-0.54810263,  1.14763808],
       [-0.77276222, -1.40266877],
       [-0.58004747, -1.40266877],
       [ 0.58943055,  1.14763808],
       [-0.78365788,  0.51006137],
       [-1.02322731,  0.51006137],
       [ 1.19557293,  0.51006137],
       [-1.12884431,  0.51006137],
       [-1.10378093, -0.76509206],
       [ 0.84424715,  1.14763808],
       [-0.95171405,  1.14763808],
       [ 1.62279723,  0.51006137],
       [-0.58004747,

## Regression with scaled features

In [30]:
reg=LinearRegression()
reg.fit(x_scaled,y)

In [31]:
reg.coef_

array([67501.57614152, 13724.39708231])

In [32]:
reg.intercept_

292289.4701599997

## Creating a summary table

In [35]:
reg_summary=pd.DataFrame([['Intercept'],["size"],['year']],columns=['Features'])
reg_summary['weights']=reg.intercept_,reg.coef_[0],reg.coef_[1]
reg_summary

Unnamed: 0,Features,weights
0,Intercept,292289.47016
1,size,67501.576142
2,year,13724.397082


### Making predictions with the standardize coefficients(weights)

In [36]:
new_data=pd.DataFrame(data=[[500,2009],[600,2010]],columns=['size','year'])
new_data

Unnamed: 0,size,year
0,500,2009
1,600,2010


In [37]:
reg.predict(new_data)



array([61615391.27927811, 68379273.29051262])

In [38]:
new_data_scaled=scaler.transform(new_data)
new_data_scaled

array([[-1.19084496, -0.76509206],
       [-0.85351824, -0.55256648]])

In [39]:
reg.predict(new_data_scaled)

array([201405.13115808, 227092.00188566])

### What if we removed the 'year' variable

In [40]:
reg_simple=LinearRegression()
x_simple_matrix = x_scaled[:,0].reshape(-1,1)
reg_simple.fit(x_simple_matrix,y)

In [41]:
reg_simple.predict(new_data_scaled[:,0].reshape(-1,1))

array([213501.97309853, 235819.84735799])