In [1]:
# The first step in building an ML algorithm is deciding what you want to predict. When looking at a
# DataFrame, the idea is to choose one column as the target column. The target column, by definition,
# is what the algorithm will be trained to predict.

# The median value of a home is a desirable target column since real-estate agents, buyers, and sellers
# often want to know how much a house is worth. People usually determine this information based on
# the size of the house, the location, the number of bedrooms, and many other factors.

# It’s often helpful to simplify a problem. What if we take just one column, such as the number of
# bedrooms, and use it to predict the median house value?
# It’s clear that the more bedrooms a house has, the more valuable it will be. As the number of bedrooms
# goes up, so does the house value. A standard way to represent this positive association is with a
# straight line.

In [2]:

# Dimensionality is an important concept in ML.
# In math, it's common to work with two dimensions, x and y
# In physics, it's common to work with three dimensions, x y and z-axes
# In ML, the number of dimensions is often the number of predictor columns.

# There is no need to limit ourselves to one dimension with linear regression. Additional dimensions—
# in this case, additional columns—will give us more information about the median house value and
# make our model more valuable.

# In one-dimensional linear regression,
# the slope-intercept equation is "y = mx + b", where y is the target column, x is the input, m is the slope, and b is the y-intercept.
# This equation is now extended to an arbitrary number of dimensions using "Y = MX + B", where Y, M, and X are vectors of arbitrary length.
# Instead of the slope, M is referred to as the weight.


In [3]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error, mean_squared_error
from sklearn.model_selection import train_test_split

In [4]:
# load data
housing_df = pd.read_csv('./datasets/HousingData.csv')
housing_df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,,36.2


In [5]:
# drop null values
housing_df = housing_df.dropna()

In [6]:
# X for the predictor columns and y for the target column
X = housing_df.iloc[:,:-1]
y = housing_df.iloc[:, -1] # The target column is MEDV

In [7]:
# Before building the regression model, we are going to use train_test_split() to split X and y, the predictor and target columns, into training and test sets.
# The model will be built using the training set. Let’s split the data in the following step.


X_train, X_test, y_train, y_test = train_test_split(
  X,
  y,
  test_size=0.2, # test_size=0.2 reflects the percentage of rows held back for the test set.
  random_state=0
)

In [8]:
# build the actual linear regression model
reg = LinearRegression()

# fit the model to the data
reg.fit(X_train, y_train)

# At this point, reg is an ML model with specified weights. There is one weight for each X
# column. These weights are multiplied by the entry in each row to get as close as possible to the
# target column, y, which is the median house value.

In [9]:
# Predict on the test data
y_pred = reg.predict(X_test)

# We can now test the prediction by comparing the predicted y values ( y_pred) to the actual y values ( y_test)
rmse = root_mean_squared_error(y_test, y_pred)
print(f'RMSE: {rmse}')

RMSE: 5.371207757773577


In [10]:
# On average, the ML model predicts values approximately
# 5.37 units away from the target value, which is not bad in terms of accuracy given the range of column
# values of 45 and standard deviation of 9.1 (see df['MEDV'].describe()). Since the median
# value (from 1980) is in the thousands, the predictions are about 5.37 thousand off. Lower errors are
# always better, so we will see if can improve the error going forward.

In [11]:
# Will the result be different each time we test?
def regression_model(model):
  # Create training and test sets
  X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2)

  # Create the regressor: reg_all
  reg_all = model

  # Fit the regressor to the training data
  reg_all.fit(X_train, y_train)

  # Predict on the test data: y_pred
  y_pred = reg_all.predict(X_test)

  # Compute and print RMSE
  rmse = root_mean_squared_error(y_test, y_pred)
  print(f'RMSE: {rmse}')

regression_model(LinearRegression())

# The scores are different because we are splitting the data into a different training set and test set
# each time, and the model is based on different training sets. Furthermore, it’s being scored against a
# different test set.
# In order for ML scores to be meaningful, we want to minimize fluctuation, and ensure that our results
# are representative of reality.

RMSE: 4.26883402767759
