In [5]:
import csv
import numpy as np
import matplotlib.pyplot as plt

A = []

with open("housing.xls", 'r') as csvfile:
    csvreader = csv.reader(csvfile)
    for row in csvreader:
        A.append(row)

A = np.array([list(map(float, row[0].split())) for row in A])

X = A[:, :13]
Y = A[:, 13]

# Linear Regression

In [16]:
import csv
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score

# Load the dataset
A = []

with open("housing.xls", 'r') as csvfile:
    csvreader = csv.reader(csvfile)
    for row in csvreader:
        A.append(row)

A = np.array([list(map(float, row[0].split())) for row in A])

X = A[:, :13]
Y = A[:, 13]

# Define a function to train and evaluate the model on random entries
def evaluate_random_entries(X, Y, num_entries=10, random_seed=42):
    # Seed for reproducibility
    np.random.seed(random_seed)
    
    # Step 1: Split the dataset into training and testing sets
    from sklearn.model_selection import train_test_split
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=random_seed)

    # Step 2: Calculate the coefficients using the closed-form solution (normal equation)
    ones_column = np.ones((X_train.shape[0], 1))
    X_train = np.concatenate((ones_column, X_train), axis=1)
    coefficients = np.linalg.inv(X_train.T @ X_train) @ X_train.T @ Y_train

    # Step 3: Randomly select entries from the testing set
    random_indices = np.random.choice(X_test.shape[0], num_entries, replace=False)
    selected_X = X_test[random_indices]
    selected_Y = Y_test[random_indices]

    # Step 4: Make predictions on the selected entries using the trained model
    ones_column = np.ones((selected_X.shape[0], 1))
    selected_X = np.concatenate((ones_column, selected_X), axis=1)
    predictions = selected_X @ coefficients

    # Step 5: Evaluate the model's performance on the selected entries
    rmse = np.sqrt(mean_squared_error(selected_Y, predictions))
    r2 = r2_score(selected_Y, predictions)

    return rmse, r2

# Evaluate the model on 10 randomly chosen entries (you can change the number)
rmse, r2 = evaluate_random_entries(X, Y, num_entries=10)

print("Randomly Chosen Entries Evaluation:")
print("RMSE:", rmse)
print("R-squared (R2):", r2)

Randomly Chosen Entries Evaluation:
RMSE: 3.7625171274024587
R-squared (R2): 0.8253575417005796


# LASSO

In [17]:
import csv
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Load your data as you've done
# Assuming you already have X (feature matrix) and Y (target values)

# Step 1: Add a column of ones to X for the intercept term
ones_column = np.ones((X.shape[0], 1))
X = np.concatenate((ones_column, X), axis=1)

# Step 2: Initialize coefficients with small random values (including intercept)
np.random.seed(0)
l_coefficients = np.random.randn(X.shape[1])

# Hyperparameter for Lasso regularization (lambda)
alpha = 0.0001

# Number of iterations, learning rate, and scaling factor
num_iterations = 1000000
learning_rate = 0.000001
scaling_factor = 1 / len(Y)

# Step 3: Define the Lasso cost function
def lasso_cost(X, Y, coefficients, alpha):
    predictions = X.dot(coefficients)
    error = Y - predictions
    l1_penalty = alpha * np.sum(np.abs(coefficients[1:]))  # Exclude intercept from regularization
    cost = np.mean(error**2) + l1_penalty
    return cost

# Step 4: Implement gradient descent
for _ in range(num_iterations):
    predictions = X.dot(l_coefficients)
    error = Y - predictions
    
    # Compute the gradient of the cost function with respect to coefficients
    gradient = -2 * X.T.dot(error) * scaling_factor
    
    # Update coefficients with Lasso regularization
    l_coefficients[1:] -= learning_rate * (gradient[1:] + alpha * np.sign(l_coefficients[1:]))
    l_coefficients[0] -= learning_rate * gradient[0]

# Print the coefficients
print("Intercept:", l_coefficients[0])
print("Slopes:", l_coefficients[1:])

# Now, let's evaluate the model's performance on random entries from the dataset

# Step 5: Split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Step 6: Randomly select a few entries from the testing set
random_indices = np.random.choice(X_test.shape[0], 10, replace=False)
selected_X = X_test[random_indices]
selected_Y = Y_test[random_indices]

# Step 7: Make predictions on the selected entries using the trained model
selected_predictions = selected_X.dot(l_coefficients)

# Step 8: Evaluate the model's performance on these selected entries
rmse = np.sqrt(mean_squared_error(selected_Y, selected_predictions))
r2 = r2_score(selected_Y, selected_predictions)

# Print the evaluation results
print("Randomly Chosen Entries Evaluation:")
print("RMSE:", rmse)
print("R-squared (R2):", r2)

Intercept: 2.0992748928062577
Slopes: [-0.09074206  0.0660451  -0.01754142  2.08527767 -0.85121259  4.36797018
  0.01980169 -0.77717413  0.1656636  -0.00922756 -0.13456081  0.01710921
 -0.54337423]
Randomly Chosen Entries Evaluation:
RMSE: 3.645828361371956
R-squared (R2): 0.6104044704028384


# Ridge Regression

In [18]:
import csv
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Load your data as you've done
# Assuming you already have X (feature matrix) and Y (target values)

# Step 1: Add a column of ones to X for the intercept term
ones_column = np.ones((X.shape[0], 1))
X = np.concatenate((ones_column, X), axis=1)

# Step 2: Initialize coefficients with small random values (including intercept)
np.random.seed(0)
r_coefficients = np.random.randn(X.shape[1])

# Hyperparameter for Ridge regularization (lambda)
alpha = 0.001

# Number of iterations, learning rate, and scaling factor
num_iterations = 1000000
learning_rate = 0.000001
scaling_factor = 1 / len(Y)

# Step 3: Define the Ridge cost function
def ridge_cost(X, Y, coefficients, alpha):
    predictions = X.dot(coefficients)
    error = Y - predictions
    l2_penalty = alpha * np.sum(coefficients[1:]**2)  # Exclude intercept from regularization
    cost = np.mean(error**2) + l2_penalty
    return cost

# Step 4: Implement gradient descent
for _ in range(num_iterations):
    predictions = X.dot(r_coefficients)
    error = Y - predictions
    
    # Compute the gradient of the cost function with respect to coefficients
    gradient = -2 * X.T.dot(error) * scaling_factor
    
    # Update coefficients with Ridge regularization
    r_coefficients[1:] -= learning_rate * (gradient[1:] + 2 * alpha * r_coefficients[1:])
    r_coefficients[0] -= learning_rate * gradient[0]

# Print the coefficients
print("Intercept:", r_coefficients[0])
print("Slopes:", r_coefficients[1:])

# Now, let's evaluate the model's performance on random entries from the dataset

# Step 5: Split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Step 6: Randomly select a few entries from the testing set
random_indices = np.random.choice(X_test.shape[0], 10, replace=False)
selected_X = X_test[random_indices]
selected_Y = Y_test[random_indices]

# Step 7: Make predictions on the selected entries using the trained model
selected_predictions = selected_X.dot(r_coefficients)

# Step 8: Evaluate the model's performance on these selected entries
rmse = np.sqrt(mean_squared_error(selected_Y, selected_predictions))
r2 = r2_score(selected_Y, selected_predictions)

# Print the evaluation results
print("Randomly Chosen Entries Evaluation:")
print("RMSE:", rmse)
print("R-squared (R2):", r2)

Intercept: 2.1576501409990083
Slopes: [ 0.79250438 -0.09371877  0.06813992 -0.0046147  -0.41352885  1.10003442
  4.06416296  0.02610843 -0.6989206   0.18739186 -0.01073717 -0.13891769
  0.0177176  -0.58065627]
Randomly Chosen Entries Evaluation:
RMSE: 2.619446056963898
R-squared (R2): 0.5896233465703522
