In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from sklearn.feature_selection import f_regression
from sklearn.linear_model import LinearRegression

In [2]:
# Load the real-estate dataset; the rendered frame below has the columns
# price, size and year.
data = pd.read_csv('data/real_estate_price_size_year.csv')
# Bare expression so the notebook renders the frame as this cell's output.
data

Unnamed: 0,price,size,year
0,234314.144,643.09,2015
1,228581.528,656.22,2009
2,281626.336,487.29,2018
3,401255.608,1504.75,2015
4,458674.256,1275.46,2009
...,...,...,...
95,252460.400,549.80,2009
96,310522.592,1037.44,2009
97,383635.568,1504.75,2006
98,225145.248,648.29,2015


In [3]:
# Target: the "price" column. Features: every remaining column of `data`.
y = data.loc[:, "price"]
x1 = data.drop(columns=["price"])

In [6]:
# Fit a linear regression of the target on all features in x1.
# fit() hands back the estimator itself, so it can be bound directly;
# the trailing bare name keeps the estimator repr as the cell output.
model = LinearRegression().fit(x1, y)
model

LinearRegression()

In [8]:
# R-squared of the fitted model, evaluated on the same data it was fitted on.
model.score(X=x1, y=y)

0.7764803683276791

In [11]:
# Let's use the handy function we created
def adj_r2(x, y, estimator=None):
    """Return the adjusted R-squared of a fitted regressor on ``(x, y)``.

    The adjustment penalises R-squared for the number of predictors::

        adj_R2 = 1 - (1 - R2) * (n - 1) / (n - p - 1)

    where ``n`` is the number of observations (``x.shape[0]``) and ``p`` the
    number of predictors (``x.shape[1]``).

    Parameters
    ----------
    x : object exposing a two-element ``.shape``
        Feature matrix; rows are observations, columns are predictors.
    y : array-like
        Target values, forwarded unchanged to ``estimator.score``.
    estimator : object with a ``score(x, y)`` method, optional
        Fitted regressor whose R-squared is adjusted. When omitted, the
        notebook-level ``model`` is used, which preserves the original
        two-argument call ``adj_r2(x, y)``.

    Returns
    -------
    float
        The adjusted R-squared.

    Raises
    ------
    ValueError
        If ``n - p - 1 <= 0``; the statistic is undefined when there are not
        more observations than predictors plus one.
    """
    n = x.shape[0]
    p = x.shape[1]
    residual_dof = n - p - 1
    if residual_dof <= 0:
        # Without this guard the formula divides by zero or silently returns
        # a meaningless value for a negative denominator.
        raise ValueError(
            "adjusted R-squared is undefined when n - p - 1 <= 0 "
            f"(n={n}, p={p})"
        )

    # Only fall back to the global `model` when no estimator was supplied.
    fitted = model if estimator is None else estimator
    r2 = fitted.score(x, y)
    return 1 - (1 - r2) * (n - 1) / residual_dof

In [12]:
# Adjusted R-squared for the same features/target the model was fitted on.
adj_r2(x=x1, y=y)

0.7718717161282498

In [13]:
# F-test of each feature in x1 against the target. The result is a pair
# (F-statistics, p-values) with one entry per feature column.
# `f_regression` is imported with the other imports at the top of the notebook.
f_regression(x1,y)

(array([285.92105192,   0.85525799]), array([8.12763222e-31, 3.57340758e-01]))

In [15]:
# Keep only the p-values (second element of the tuple f_regression returns)
# and display them rounded to three decimals.
p_values = f_regression(X=x1, y=y)[1]
np.round(p_values, 3)

array([0.   , 0.357])

In [18]:
# One row per feature: its name, its coefficient in the fitted model, and its
# rounded p-value.
# NOTE(review): the coefficients come from the model fitted on all features
# together, while the p-values come from f_regression, which (per its docs)
# tests each feature separately - keep that in mind when reading the table.
model_summary = pd.DataFrame(
    {
        "Features": x1.columns.values,
        "Coefficient": model.coef_,
        "P-Value": p_values.round(3),
    }
)

In [19]:
# Render the per-feature coefficient / p-value table as this cell's output.
model_summary

Unnamed: 0,Features,Coefficient,P-Value
0,size,227.700854,0.0
1,year,2916.785327,0.357


It seems that 'year' is not even significant (its p-value of 0.357 is well above the usual 0.05 threshold), therefore we should remove it from the model.