# Import Software Packages

In [1]:
import os  # Interact with the operating system
import sys  # Read system parameters.
from time import time  # Calculate training time.

import matplotlib as mpl  # Create 2D charts.
import matplotlib.cm as cm  # Create Color maps
import matplotlib.pyplot as plt  # Create figures
import numpy as np  # Work with multi-dimensional arrays and matrices.
import pandas as pd  # Manipulate and analyze data.
import seaborn as sb  # Perform data visualization.
import sklearn  # Perform data mining and analysis.
from sklearn import datasets

# Report the interpreter and library versions this notebook was run with.
print("Libraries used in this project:")

# Each entry pairs an output template with the version string to insert.
version_lines = (
    ("- Python {}", sys.version),
    ("- NumPy {}", np.__version__),
    ("- pandas {}", pd.__version__),
    ("- Matplotlib {}", mpl.__version__),
    ("- Seaborn {}", sb.__version__),
    ("- scikit-learn {}\n", sklearn.__version__),
)
for template, version in version_lines:
    print(template.format(version))

Libraries used in this project:
- Python 3.8.18 | packaged by conda-forge | (default, Dec 23 2023, 17:23:49) 
[Clang 15.0.7 ]
- NumPy 1.24.3
- pandas 2.0.3
- Matplotlib 3.7.2
- Seaborn 0.9.0
- scikit-learn 1.3.2



# Load Dataset

In [2]:
# Locate the project's data directory and show what it contains.
PROJECT_ROOT_DIR = "."
DATA_PATH = os.path.join(PROJECT_ROOT_DIR, "data/")
data_files = os.listdir(DATA_PATH)
print("Data files in this project:", data_files)

# Read the raw dataset from its CSV file and report how many records it holds.
raw_housing_data_file = os.path.join(DATA_PATH, "BostonHousing.csv")
raw_housing_data = pd.read_csv(raw_housing_data_file)
record_count = len(raw_housing_data)
print("Loaded {} records from {}.\n".format(record_count, raw_housing_data_file))

Data files in this project: ['.DS_Store', 'BostonHousing.csv', '.ipynb_checkpoints']
Loaded 506 records from ./data/BostonHousing.csv.



In [3]:
# Preview the first ten records of the raw dataset.
raw_housing_data.head(n=10)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,Target
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2
5,0.02985,0.0,2.18,0,0.458,6.43,58.7,6.0622,3,222,18.7,394.12,5.21,28.7
6,0.08829,12.5,7.87,0,0.524,6.012,66.6,5.5605,5,311,15.2,395.6,12.43,22.9
7,0.14455,12.5,7.87,0,0.524,6.172,96.1,5.9505,5,311,15.2,396.9,19.15,27.1
8,0.21124,12.5,7.87,0,0.524,5.631,100.0,6.0821,5,311,15.2,386.63,29.93,16.5
9,0.17004,12.5,7.87,0,0.524,6.004,85.9,6.5921,5,311,15.2,386.71,17.1,18.9


# Split the Data

Similar to the project on regularized linear regression, 'Target' serves as the dependent variable (the value to be predicted), so it is excluded from the training features and placed into a separate DataFrame of labels.

Models will be trained utilizing a holdout set instead of employing cross-validation.

In [4]:
from sklearn.model_selection import train_test_split

# Name of the column holding the value to predict.
label_columns = ["Target"]

# Features are every column from CRIM through LSTAT; labels are the Target column.
housing_features = raw_housing_data.loc[:, "CRIM":"LSTAT"]
housing_labels = raw_housing_data[label_columns]

# A fixed random_state keeps the holdout split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    housing_features, housing_labels, random_state=2
)

Let's examine the quantity of rows and columns present in the original dataset compared to those in the training and testing subsets.

In [5]:
# Show (rows, columns) for the full dataset, then for each piece of the split.
print(f"Original set:        {raw_housing_data.shape}")
print("------------------------------")

for caption, subset in (
    ("Training features:   ", X_train),
    ("Test features:       ", X_test),
    ("Training labels:     ", y_train),
    ("Test labels:         ", y_test),
):
    print(f"{caption}{subset.shape}")

Original set:        (506, 14)
------------------------------
Training features:   (379, 13)
Test features:       (127, 13)
Training labels:     (379, 1)
Test labels:         (127, 1)


# Split the Datasets

The dependent variable, which is 'Target' and is the value to be predicted, will be extracted from the training data and placed into a separate DataFrame designated for labels, just as was done with the regularized linear regression.

The next steps are to divide the training and test datasets along with their corresponding labels and inspect the number of rows and columns in the original dataset compared to those in the training and testing datasets.

In [6]:
from sklearn.model_selection import train_test_split

# NOTE: this cell repeats the split and shape report from the "Split the Data"
# section; with the same random_state it reproduces the same partition.

# Name of the column that holds the labels.
label_columns = ["Target"]

# Features are the CRIM..LSTAT columns; labels are the 'Target' column.
all_features = raw_housing_data.loc[:, "CRIM":"LSTAT"]
all_labels = raw_housing_data[label_columns]

# Divide features and labels into training and test subsets.
X_train, X_test, y_train, y_test = train_test_split(
    all_features, all_labels, random_state=2
)

# Compare the shape of the original dataset with the training and test subsets.
print(f"Original set:        {raw_housing_data.shape}")
print("------------------------------")
for caption, subset in (
    ("Training features:   ", X_train),
    ("Test features:       ", X_test),
    ("Training labels:     ", y_train),
    ("Test labels:         ", y_test),
):
    print(f"{caption}{subset.shape}")

Original set:        (506, 14)
------------------------------
Training features:   (379, 13)
Test features:       (127, 13)
Training labels:     (379, 1)
Test labels:         (127, 1)


# Observe Correlations to 'Target' 

Let's examine the relationships between numeric features and the 'Target' feature.



In [7]:
# Correlation of every column with 'Target', listed from most positive to most negative.
target_correlations = raw_housing_data.corr()["Target"]

print("Correlations with median house value")
print(target_correlations.sort_values(ascending=False))

Correlations with median house value
Target     1.000000
RM         0.695360
ZN         0.360445
B          0.333461
DIS        0.249929
CHAS       0.175260
AGE       -0.376955
RAD       -0.381626
CRIM      -0.388305
NOX       -0.427321
TAX       -0.468536
INDUS     -0.483725
PTRATIO   -0.507787
LSTAT     -0.737663
Name: Target, dtype: float64


# Drop the Weakest Correlation 

The next step is to eliminate the feature whose correlation with 'Target' is weakest, i.e. closest to zero.

Since the **CHAS** correlation is weak and the feature in question is categorical rather than numeric (unlike the other features), it will be excluded from the training process.


In [8]:
def drop_weakest_correlation(dataset, columns=("CHAS",)):
    """Return ``dataset`` without the weakly correlated column(s).

    The column list before and after the drop is printed so the effect is
    visible in the notebook output.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Feature frame to remove columns from. It is not modified; a new
        frame is returned.
    columns : str or iterable of str, optional
        Column name(s) to drop. Defaults to ``("CHAS",)``, the column this
        notebook excludes from training.

    Returns
    -------
    pandas.DataFrame
        A frame with the requested columns removed.

    Raises
    ------
    KeyError
        If any requested column is not present in ``dataset``.
    """
    # Accept a single column name without splitting it into characters.
    if isinstance(columns, str):
        columns = [columns]

    print("Columns before drop:\n\n{}\n".format(list(dataset.columns)))

    dataset = dataset.drop(columns=list(columns))

    print("Columns after drop:\n\n{}\n".format(list(dataset.columns)))
    return dataset


# Drop the column from copies of both splits so train and test keep matching columns.
X_train = drop_weakest_correlation(X_train.copy())
X_test = drop_weakest_correlation(X_test.copy())

Columns before drop:

['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']

Columns after drop:

['CRIM', 'ZN', 'INDUS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']

Columns before drop:

['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']

Columns after drop:

['CRIM', 'ZN', 'INDUS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']



# Standardize the Features

The data is processed in the same manner as previously: each feature is standardized with the z-score, $z = (x - \mu) / \sigma$, where $\mu$ and $\sigma$ are the feature's mean and standard deviation.

In [9]:
def standardize_features(X):
    """Return a z-scored copy of ``X``.

    Every column has its own mean subtracted and is then divided by its own
    standard deviation (as computed by pandas' ``std``). The input frame is
    left unmodified.
    """
    column_means = X.mean()
    column_stds = X.std()

    # z-score formula, applied to all columns at once.
    return (X - column_means) / column_stds


# Capture the training-set statistics BEFORE X_train is overwritten below.
train_feature_means = X_train.mean()
train_feature_stds = X_train.std()

# Scale the test set with the training mean/std rather than its own, so both
# sets share one scale and no test-set statistics enter preprocessing.
X_test = (X_test - train_feature_means) / train_feature_stds
X_train = standardize_features(X_train)

print("The features have been standardized.")

The features have been standardized.


**Spotlight**

Scaling features, for example through standardization, is crucial when applying an iterative cost-minimization method such as gradient descent: with all features on a comparable scale, the model reaches the cost minimum more quickly, which reduces training time.

# Train Model and Calculate Cost

In [10]:
from sklearn.metrics import mean_squared_error as mse

**Calculate Mean Squared Error**

In [11]:
def calculate_mse(true_target_values, estimated_target_values):
    """Return the mean squared error between true and estimated target values.

    Thin wrapper that delegates to ``mse``, the alias under which this
    notebook imports scikit-learn's ``mean_squared_error``.
    """
    cost = mse(true_target_values, estimated_target_values)
    return cost

**Train and Cost the Model**

This function is nearly identical to the **train_and_score_model()** function as previously used in the regularized regression; the only distinctions are that the training for each model will be timed, the variance score is omitted, and the name is appropriately modified.

In [12]:
def train_and_cost_model(model):
    """Fit ``model`` on the training split, time the fit, and print its test cost.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test`` variables.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.

    Returns
    -------
    None
        The fit duration (milliseconds) and the mean squared error on the
        test split are printed rather than returned.
    """
    # perf_counter is a monotonic, high-resolution clock intended for
    # measuring short durations; wall-clock time() can jump if the system
    # clock is adjusted.
    from time import perf_counter

    start = perf_counter()
    # np.ravel flattens the single-column label frame into a 1-D array.
    model.fit(X_train, np.ravel(y_train))
    train_time = (perf_counter() - start) * 1000  # seconds -> milliseconds

    model_predictions = model.predict(X_test)
    cost = calculate_mse(y_test, model_predictions)

    print("Linear regression model took {:.2f} milliseconds to fit.".format(train_time))
    print("Cost (mean squared error): {:.2f}".format(cost))


print("The function to train the model and calculate its cost has been defined.")

The function to train the model and calculate its cost has been defined.
