# Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import warnings
warnings. simplefilter(action='ignore', category=FutureWarning)
%matplotlib inline

# Data Collection

## Import Boston Data

Content
The following describes the dataset columns:

Detailed info: <br>
https://www.kaggle.com/datasets/avish5787/boston-data-set <br>

CRIM - per capita crime rate by town <br>
ZN - proportion of residential land zoned for lots over 25,000 sq.ft. <br>
INDUS - proportion of non-retail business acres per town. <br>
CHAS - Charles River dummy variable (1 if tract bounds river; 0 otherwise) <br>
NOX - nitric oxides concentration (parts per 10 million) <br>
RM - average number of rooms per dwelling <br>
AGE - proportion of owner-occupied units built prior to 1940 <br>
DIS - weighted distances to five Boston employment centres <br>
RAD - index of accessibility to radial highways <br>
TAX - full-value property-tax rate per $10,000 <br>
PTRATIO - pupil-teacher ratio by town <br>
B - 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town <br>
LSTAT - percent lower status of the population <br>
MEDV - Median value of owner-occupied homes in USD1000's <br>

In [None]:
# Load the Boston housing data; the CSV uses ';' as its field separator.
boston_df = pd.read_csv("../00_general_data/housing_boston.csv", sep=';')

In [None]:
# Preview the first rows to sanity-check that the columns were parsed correctly.
boston_df.head()

# Data Preprocessing

In [None]:
# Report the dataset dimensions as a (rows, columns) tuple.
# The original run reported 506 rows and 14 columns.
print(f"Shape of dataset: {boston_df.shape}")

In [None]:
# Count missing values per column (the original run found none).
boston_df.isna().sum()

In [None]:
# Summarize column dtypes and non-null counts.
boston_df.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for each numeric column.
boston_df.describe()

In [None]:
# List the column labels of the dataset.
boston_df.columns

In [None]:
# Number of columns, read from the second entry of the shape tuple.
boston_df.shape[1]

# Exploratory Data Analysis

## Create Pair Plot

In [None]:
# Pair plot of every numeric column against every other (features and target alike).
sns.pairplot(boston_df)

In [None]:
# Pair plot restricted to two candidate predictors and the target column.
pairplot_columns = ['LSTAT', 'RM', 'PRICE']
sns.pairplot(boston_df[pairplot_columns])

## Heat Map

In [None]:
# Heatmap of the pairwise correlation coefficients between all columns.
correlation_matrix = boston_df.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='Greens')

## Visualize Null Values

In [None]:
# Visual check for missing values: the heatmap is drawn from the boolean null mask,
# so a uniform colour means no cell is null.
missing_mask = boston_df.isna()
sns.heatmap(missing_mask, yticklabels=False, cbar=False, cmap='viridis')

In [None]:
# Correlation heatmap limited to the features chosen for the model plus the target.
selected_columns = ['LSTAT', 'RM', 'PTRATIO', 'PRICE']
selected_corr = boston_df[selected_columns].corr()
plt.figure(figsize=(8, 6))
sns.heatmap(selected_corr, annot=True, cmap='Greens')

# Train a Linear Regression Model

## Split between Dependent and Independent Variables

In [None]:
# Features (independent variables) and target (dependent variable).
# NOTE(review): the column description in this notebook names the target MEDV,
# while the code reads 'PRICE' — confirm the CSV header actually contains 'PRICE'.
feature_columns = ['LSTAT', 'RM', 'PTRATIO']
target_column = 'PRICE'
X = boston_df[feature_columns]
y = boston_df[target_column]

# Train and Test Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing; random_state=101 seeds the shuffle
# so the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=101,
)

# Create and Train the Model

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Linear regression estimator with scikit-learn's default settings.
lm = LinearRegression()

In [None]:
# Fit the model on the training split only; the test split stays unseen.
lm.fit(X_train, y_train)

# Model Evaluation

In [None]:
# Intercept term of the fitted regression.
print(lm.intercept_)

In [None]:
# One row per feature, holding that feature's fitted regression coefficient.
coeff_df = pd.DataFrame(data=lm.coef_, index=X.columns, columns=['Coefficient'])
coeff_df

# Predictions from Model

In [None]:
# Preview the held-out feature rows that will be used for prediction.
X_test.head()

## Train Accuracy (R² Score)

In [None]:
# LinearRegression.score returns the coefficient of determination (R^2), not a
# classification accuracy; it is scaled by 100 here to read as a percentage.
train_score = lm.score(X_train, y_train)*100

In [None]:
# train_score comes from lm.score, i.e. R^2 (coefficient of determination) times 100,
# so label it as such rather than "accuracy", which is a classification metric.
print("Training R^2 score (%): {}".format(train_score))

## Test Accuracy (R² Score)

In [None]:
# R^2 (coefficient of determination) on the held-out test split, scaled to a percentage.
test_score = lm.score(X_test, y_test)*100

In [None]:
# test_score comes from lm.score, i.e. R^2 (coefficient of determination) times 100,
# so label it as such rather than "accuracy", which is a classification metric.
print("Test R^2 score (%): {}".format(test_score))

## Predict X_test

In [None]:
# Predict the target for the held-out test features.
y_pred = lm.predict(X_test)

In [None]:
# Display the predicted values.
y_pred

# Check Scatter Plot between y_test and y_pred

## Scatter Plot Using Matplotlib

In [None]:
# Actual vs. predicted target values on the test split.
fig, ax = plt.subplots(figsize=(8, 6))
ax.set_xlabel('y_test value')
ax.set_ylabel('y_pred value')
ax.set_title('y_test value and y_pred value')

ax.scatter(x=y_test, y=y_pred)

## Scatter Plot and Regression Plot Using Seaborn

In [None]:
# Draw the regression plot first: sns.regplot relabels an axis with the name of
# any pandas Series it receives (y_test carries its column name), which would
# overwrite an x-label set beforehand. The custom labels are applied afterwards.
fig, ax = plt.subplots(figsize=(8, 6))
sns.regplot(x=y_test, y=y_pred, ax=ax)
ax.set_xlabel('y_test value')
ax.set_ylabel('y_pred value')
ax.set_title('y_test value and y_pred value')
plt.show()

# Residual Histogram

In [None]:
# Histogram of the residuals (actual minus predicted) on the test split.
# y_test - y_pred is a Series that keeps the target column's name, so the axes are
# labelled explicitly to avoid presenting the residuals as the target itself.
residuals = y_test - y_pred
residual_grid = sns.displot(residuals, bins=50, height=5)
residual_grid.set_axis_labels('Residual (y_test - y_pred)', 'Count')

# Regression Evaluation Metrics

In [None]:
from sklearn import metrics

## Regression Evaluation Metrics


Here are three common evaluation metrics for regression problems:

**Mean Absolute Error** (MAE) is the mean of the absolute value of the errors:

$$\frac 1n\sum_{i=1}^n|y_i-\hat{y}_i|$$

**Mean Squared Error** (MSE) is the mean of the squared errors:

$$\frac 1n\sum_{i=1}^n(y_i-\hat{y}_i)^2$$

**Root Mean Squared Error** (RMSE) is the square root of the mean of the squared errors:

$$\sqrt{\frac 1n\sum_{i=1}^n(y_i-\hat{y}_i)^2}$$

Comparing these metrics:

- **MAE** is the easiest to understand, because it's the average absolute error.
- **MSE** is more popular than MAE, because MSE "punishes" larger errors, which tends to be useful in the real world.
- **RMSE** is even more popular than MSE, because RMSE is interpretable in the "y" units.

All of these are **loss functions**, because we want to minimize them.

In [None]:
# Error metrics on the test split; RMSE is derived from the MSE computed once.
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', rmse)

In [None]:
# statsmodels' OLS does not add an intercept on its own, so prepend a constant
# column. Without it the fit is forced through the origin, and its coefficients
# and R^2 are not comparable with the scikit-learn model (which fits an intercept).
mod = sm.OLS(y, sm.add_constant(X)).fit()

In [None]:
# Print the statsmodels regression report for the fitted OLS model.
print(mod.summary())

References:

https://www.accelebrate.com/blog/interpreting-results-from-linear-regression-is-the-data-appropriate

https://www.statsmodels.org/dev/examples/notebooks/generated/ols.html

https://medium.com/analytics-vidhya/boston-house-price-prediction-using-machine-learning-ad3750a866cd

https://medium.com/analytics-vidhya/ordinary-least-squared-ols-regression-90942a2fdad5