In [132]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error

In [133]:
df = pd.read_csv('boston.csv')
df.head() # read the first 5 rows in the data

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [134]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    int64  
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    int64  
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  MEDV     506 non-null    float64
dtypes: float64(12), int64(2)
memory usage: 55.5 KB


In [135]:
df.isnull().sum() # data have no null values

CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
MEDV       0
dtype: int64

In [136]:
df.duplicated().sum() # data have no duplicated values

np.int64(0)

In [137]:
df.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063,22.532806
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95,17.025
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36,21.2
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


In [138]:
# remove all outliers
columns_outliers = df.select_dtypes(include=['number']).columns

for col in columns_outliers:
    Q1= df[col].quantile(0.25)
    Q3= df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]
df.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
count,214.0,214.0,214.0,214.0,214.0,214.0,214.0,214.0,214.0,214.0,214.0,214.0,214.0,214.0
mean,0.228482,7.023364,8.62757,0.0,0.499,6.214463,60.05,4.445154,4.518692,310.471963,18.678972,391.507757,10.567804,22.17757
std,0.283614,12.29859,5.237659,0.0,0.057924,0.378653,26.419783,1.760778,1.635041,65.158144,1.671545,6.677139,4.116063,4.097183
min,0.00632,0.0,1.25,0.0,0.409,5.39,2.9,1.6686,1.0,188.0,14.7,372.08,2.98,11.9
25%,0.069365,0.0,5.19,0.0,0.447,5.93675,40.175,2.89835,4.0,273.75,17.8,389.3925,7.3475,19.4
50%,0.131375,0.0,7.38,0.0,0.499,6.152,62.5,4.207,4.0,304.0,18.7,394.225,9.84,22.0
75%,0.253167,12.5,9.9825,0.0,0.538,6.4555,82.875,5.601375,5.0,342.5,19.7,396.9,13.135,24.3
max,2.24236,45.0,25.65,0.0,0.624,7.412,100.0,9.2229,8.0,437.0,21.2,396.9,21.32,33.0


In [139]:
cor= df.corr(numeric_only=True)['MEDV'].sort_values(ascending=False)
print(cor)

MEDV       1.000000
RM         0.684333
ZN         0.307642
DIS        0.154968
RAD        0.120954
B         -0.007701
TAX       -0.267205
CRIM      -0.272352
INDUS     -0.346934
NOX       -0.392227
AGE       -0.413385
PTRATIO   -0.430696
LSTAT     -0.661716
CHAS            NaN
Name: MEDV, dtype: float64


In [140]:
# linear regression 

X = df[['RM','ZN','DIS','RAD']]
y = df['MEDV']
X_train,X_temp,y_train,y_temp = train_test_split(X,y,test_size=0.2,random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

model = LinearRegression()
model.fit(X_train,y_train)
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
print(f"MSE Linear Regression MSE: {mse:.4f}")

MSE Linear Regression MSE: 4.6769


In [141]:
polyreg = make_pipeline(PolynomialFeatures(degree=4), LinearRegression())
polyreg.fit(X_train, y_train)

y_pred_poly = polyreg.predict(X_test)
mse_poly = mean_squared_error(y_test, y_pred_poly)
print(f'MSE polynomial regression: {mse_poly:.2f}')

MSE polynomial regression: 17.23


In [142]:
# multiple linear regression

X_multi = np.hstack([X, X**2])
X_train, X_test, y_train, y_test = train_test_split(X_multi, y, test_size=0.2, random_state=42)

multi_reg = LinearRegression()
multi_reg.fit(X_train, y_train)
y_pred_multi = multi_reg.predict(X_test)
mse_multi = mean_squared_error(y_test, y_pred_multi)
print(f"MSE Multiple Linear Regression MSE: {mse_multi:.4f}")

MSE Multiple Linear Regression MSE: 6.1711
