In [46]:
import pandas as pd

import mglearn
import random
import numpy as np

In [47]:
def _take_rows(data, row_indices):
    """Select rows of ``data`` by position.

    Objects exposing ``.iloc`` (pandas DataFrame/Series) are indexed
    positionally through it; anything else is viewed as a NumPy array first,
    so plain Python lists work as well.
    """
    if hasattr(data, "iloc"):
        return data.iloc[row_indices]
    return np.asarray(data)[row_indices]


def train_test_split(X, y, test_size=0.25, random_state=None):
    """Randomly partition ``X`` and ``y`` into training and test subsets.

    Parameters
    ----------
    X : array-like or pandas object
        Samples, one per entry along the first axis.
    y : array-like or pandas object
        Targets; must have the same length as ``X``.
    test_size : float or int, default 0.25
        A float in [0, 1] is the fraction of samples placed in the test set
        (the resulting count is truncated toward zero). An int is the
        absolute number of test samples and must lie in [0, len(X)].
    random_state : int or None, default None
        Seed for a private generator, so seeding a split does not reseed
        NumPy's global random state. The resulting permutation is the same
        one ``np.random.seed(random_state)`` followed by
        ``np.random.shuffle`` would produce. With ``None`` the shuffle draws
        from NumPy's global random state.

    Returns
    -------
    X_train, X_test, y_train, y_test
        Row subsets of ``X`` and ``y`` (same rows selected from both).

    Raises
    ------
    ValueError
        If ``X`` and ``y`` differ in length or ``test_size`` is out of range.
    """
    n_samples = len(X)
    if len(y) != n_samples:
        raise ValueError(
            "X and y must have the same number of samples "
            "(got %d and %d)." % (n_samples, len(y))
        )

    # Resolve the number of test samples from a fraction or an absolute count
    if isinstance(test_size, (float, np.floating)):
        if not 0.0 <= test_size <= 1.0:
            raise ValueError("A float test_size must lie between 0 and 1.")
        n_test = int(test_size * n_samples)
    else:
        n_test = int(test_size)
        # A negative count would silently slice from the wrong end
        if not 0 <= n_test <= n_samples:
            raise ValueError(
                "An integer test_size must lie between 0 and the number of samples."
            )

    # A dedicated RandomState keeps the caller's global random stream untouched
    if random_state is None:
        rng = np.random
    else:
        rng = np.random.RandomState(random_state)

    # Shuffle the row positions, then cut the permutation into test / train
    indices = np.arange(n_samples)
    rng.shuffle(indices)
    test_indices = indices[:n_test]
    train_indices = indices[n_test:]

    X_train, X_test = _take_rows(X, train_indices), _take_rows(X, test_indices)
    y_train, y_test = _take_rows(y, train_indices), _take_rows(y, test_indices)

    return X_train, X_test, y_train, y_test


In [48]:
def fit_linear_regression(X, y):
    """Fit ordinary least squares with an intercept term.

    Parameters
    ----------
    X : array-like
        Feature matrix with one row per sample (a 1-D input is treated as a
        single feature column).
    y : array-like
        Target values, one per sample.

    Returns
    -------
    numpy.ndarray
        Coefficient vector whose first entry is the intercept, followed by
        one weight per feature column.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` contain a different number of samples.
    """
    # Add a column of ones to X for the intercept term
    X_ext = np.column_stack((np.ones(len(X)), X))

    # Checked explicitly so a mismatch is reported as ValueError, as before
    if len(y) != len(X_ext):
        raise ValueError(
            "X and y must have the same number of samples "
            "(got %d and %d)." % (len(X_ext), len(y))
        )

    # Solve the least-squares problem on the design matrix itself. Forming
    # X^T X (the normal equations) squares its condition number and loses
    # precision when features are nearly collinear; lstsq avoids that and
    # still returns the minimum-norm solution for rank-deficient input.
    coefficients, _residuals, _rank, _singular_values = np.linalg.lstsq(
        X_ext, y, rcond=None
    )

    return coefficients

def predict(X, coefficients):
    """Evaluate a fitted linear model on ``X``.

    ``coefficients`` is expected to hold the intercept first, followed by one
    weight per feature column of ``X``. Returns the product of the
    intercept-augmented feature matrix with ``coefficients``.
    """
    # Leading column of ones pairs with the intercept entry of the coefficients
    intercept_column = np.ones(len(X))
    design_matrix = np.column_stack((intercept_column, X))

    # Predictions are the design matrix applied to the coefficient vector
    return design_matrix.dot(coefficients)
    

In [49]:
def mean_squared_error(y_true, y_pred):
    """Return the mean of the squared differences between targets and predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, same shape as ``y_true``.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        The mean squared error taken over the first axis: a scalar for 1-D
        input, one value per column for 2-D input.

    Raises
    ------
    ValueError
        If the inputs differ in length or shape, or are empty.
    """
    # Ensure that the input arrays have the same length
    if len(y_true) != len(y_pred):
        raise ValueError("Input arrays must have the same length.")

    # An empty input has no mean; report it instead of dividing by zero
    if len(y_true) == 0:
        raise ValueError("Cannot compute the mean squared error of empty input.")

    # Work in floating point so integer inputs (notably unsigned or narrow
    # dtypes) cannot wrap around when subtracted or squared
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    # Equal lengths but different shapes would broadcast into a wrong result
    if actual.shape != predicted.shape:
        raise ValueError("Input arrays must have the same shape.")

    # Vectorized squared error, averaged over the samples
    return np.mean(np.square(actual - predicted), axis=0)

In [50]:
# Extended Boston Housing data as a feature matrix and a target vector
X, y = mglearn.datasets.load_extended_boston()

# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Estimate the regression coefficients from the training portion only
coefficients = fit_linear_regression(X_train, y_train)

# Training set: predictions, then their Mean Squared Error (MSE)
y_train_pred = predict(X_train, coefficients)
mse_train = mean_squared_error(y_train, y_train_pred)

# Test set: predictions, then their Mean Squared Error (MSE)
y_pred = predict(X_test, coefficients)
mse_test = mean_squared_error(y_test, y_pred)


In [51]:
# Report the training error first, then the held-out error, one per line
for _label, _mse in (
    ("Mean Squared Error on Training Set:", mse_train),
    ("Mean Squared Error on Test Set:", mse_test),
):
    print(_label, _mse)

Mean Squared Error on Training Set: 5.119969179921958
Mean Squared Error on Test Set: 14.329434192452062
