$$
y = m_1 x_1 + m_2 x_2 + m_3 x_3 + \dots + m_n x_n + b
$$

In [141]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [143]:
# Load the home-price table from a CSV file (path is relative to the notebook's working directory).
df = pd.read_csv('data/homeprices.csv')

In [144]:
# Preview the first rows to check that the columns were parsed as expected.
df.head()

Unnamed: 0,area,bedrooms,age,price
0,2600,3.0,20,550000
1,3000,4.0,15,565000
2,3200,,18,610000
3,3600,3.0,30,595000
4,4000,5.0,8,760000


In [145]:
# List the column labels available for modelling.
df.columns

Index(['area', 'bedrooms', 'age', 'price'], dtype='object')

In [146]:
# Preview the last rows of the table.
df.tail()

Unnamed: 0,area,bedrooms,age,price
1,3000,4.0,15,565000
2,3200,,18,610000
3,3600,3.0,30,595000
4,4000,5.0,8,760000
5,4100,6.0,8,810000


In [147]:
# Show dtypes and non-null counts; a non-null count below the row count marks a column with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   area      6 non-null      int64  
 1   bedrooms  5 non-null      float64
 2   age       6 non-null      int64  
 3   price     6 non-null      int64  
dtypes: float64(1), int64(3)
memory usage: 324.0 bytes


In [148]:
# Count the missing entries in each column.
df.isna().sum()

area        0
bedrooms    1
age         0
price       0
dtype: int64

In [149]:
# Summary statistics for the numeric columns (count, mean, spread, quartiles).
df.describe()

Unnamed: 0,area,bedrooms,age,price
count,6.0,5.0,6.0,6.0
mean,3416.666667,4.2,16.5,648333.333333
std,587.934237,1.30384,8.288546,109117.673484
min,2600.0,3.0,8.0,550000.0
25%,3050.0,3.0,9.75,572500.0
50%,3400.0,4.0,16.5,602500.0
75%,3900.0,5.0,19.5,722500.0
max,4100.0,6.0,30.0,810000.0


In [152]:
# Mean bedroom count; pandas skips the missing entry when averaging.
bedrooms_mean = df.bedrooms.mean()
print(bedrooms_mean)

4.2


In [153]:
# Inspect the bedrooms column before imputation.
df['bedrooms']

0    3.0
1    4.0
2    NaN
3    3.0
4    5.0
5    6.0
Name: bedrooms, dtype: float64

In [154]:
# Impute the missing bedroom count with the column median.
# NOTE(review): the median is taken over every row, including the rows that are
# later held out as the test set, so the test data influences the imputed value.
bedrooms_median = df['bedrooms'].median()
df['bedrooms'] = df['bedrooms'].fillna(bedrooms_median)

In [155]:
# Re-inspect the bedrooms column to confirm the gap was filled.
df['bedrooms']

0    3.0
1    4.0
2    4.0
3    3.0
4    5.0
5    6.0
Name: bedrooms, dtype: float64

In [158]:
# Feature matrix: every column except the target.
X = df.drop('price', axis='columns')

In [159]:
# Show the feature matrix as plain text.
print(X)

   area  bedrooms  age
0  2600       3.0   20
1  3000       4.0   15
2  3200       4.0   18
3  3600       3.0   30
4  4000       5.0    8
5  4100       6.0    8


In [161]:
# Target vector: the price column.
y = df.loc[:, 'price']
print(y)

0    550000
1    565000
2    610000
3    595000
4    760000
5    810000
Name: price, dtype: int64


In [162]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
# NOTE(review): the frame has only 6 rows, so this leaves 4 rows for training and
# 2 for testing - far too few for the evaluation below to be meaningful.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [163]:
# Training features selected by the split.
x_train

Unnamed: 0,area,bedrooms,age
1,3000,4.0,15
3,3600,3.0,30
0,2600,3.0,20
4,4000,5.0,8


In [164]:
# Held-out features used for evaluation.
x_test

Unnamed: 0,area,bedrooms,age
5,4100,6.0,8
2,3200,4.0,18


In [165]:
# Training targets, index-aligned with x_train.
y_train

1    565000
3    595000
0    550000
4    760000
Name: price, dtype: int64

In [166]:
# Held-out targets, index-aligned with x_test.
y_test

5    810000
2    610000
Name: price, dtype: int64

In [167]:
# Linear regression estimator with scikit-learn's default settings.
reg = LinearRegression()

In [168]:
# Fit the regression on the training rows.
# NOTE(review): 4 training rows against 3 coefficients plus an intercept leaves no
# residual degrees of freedom, so the fit presumably interpolates the training
# points exactly - confirm and treat the coefficients with caution.
reg.fit(X=x_train, y=y_train)

In [169]:
# Predict prices for the held-out rows.
y_pred = reg.predict(x_test)

In [171]:
# Show the raw predictions for the held-out rows.
print(y_pred)

[608500. 554875.]


In [172]:
# Side-by-side view of actual vs. predicted prices, keeping the held-out rows' index.
y_test.to_frame(name='y_test').assign(y_pred=y_pred)

Unnamed: 0,y_test,y_pred
5,810000,608500.0
2,610000,554875.0


In [177]:
# Ad-hoc prediction for a 3000 sq-unit, 3-bedroom, 40-year-old home.
# NOTE(review): age 40 lies outside the ages present in the data (8 to 30), so
# this is an extrapolation.
candidate = pd.DataFrame({'area': [3000], 'bedrooms': [3], 'age': [40]})
print(reg.predict(candidate))

[262000.]


In [178]:
# Ad-hoc prediction for a 2500 sq-unit, 4-bedroom, 5-year-old home.
# NOTE(review): area 2500 and age 5 are both below the minimums present in the
# data (2600 and 8), so this is an extrapolation.
candidate = pd.DataFrame({'area': [2500], 'bedrooms': [4], 'age': [5]})
print(reg.predict(candidate))

[638125.]


In [182]:
# Evaluate on the held-out rows: compute each metric once, then report.
# NOTE(review): only 2 test rows back these numbers; a negative R2 means the model
# does worse on them than simply predicting the mean of y_test.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)

print('MSE = ', mse)
print('R2 = ', r2)
print('RMSE = ', rmse)

MSE =  21820507812.499905
R2 =  -1.1820507812499903
RMSE =  147717.66249335217
