In [1]:
import pandas as pd
import numpy as np

# Absolute location of the filtered Boston housing CSV that this notebook reads.
file_path = '/content/Boston-filtered.csv'

# Load the whole table into a DataFrame and print its (truncated) text form.
df = pd.read_csv(filepath_or_buffer=file_path)
print(df)

        CRIM   ZN   INDUS   CHAS    NOX     RM   AGE     DIS  RAD  TAX  \
0    0.00632  18.0    2.31     0  0.538  6.575  65.2  4.0900    1  296   
1    0.02731   0.0    7.07     0  0.469  6.421  78.9  4.9671    2  242   
2    0.02729   0.0    7.07     0  0.469  7.185  61.1  4.9671    2  242   
3    0.03237   0.0    2.18     0  0.458  6.998  45.8  6.0622    3  222   
4    0.06905   0.0    2.18     0  0.458  7.147  54.2  6.0622    3  222   
..       ...   ...     ...   ...    ...    ...   ...     ...  ...  ...   
501  0.06263   0.0   11.93     0  0.573  6.593  69.1  2.4786    1  273   
502  0.04527   0.0   11.93     0  0.573  6.120  76.7  2.2875    1  273   
503  0.06076   0.0   11.93     0  0.573  6.976  91.0  2.1675    1  273   
504  0.10959   0.0   11.93     0  0.573  6.794  89.3  2.3889    1  273   
505  0.04741   0.0   11.93     0  0.573  6.030  80.8  2.5050    1  273   

     PTRATIO  LSTAT  MEDV  
0       15.3   4.98  24.0  
1       17.8   9.14  21.6  
2       17.8   4.03  34.7  

In [5]:
from sklearn.model_selection import train_test_split

# Column that the cells below read as the regression target (y); all other
# columns are treated as candidate features.
target_col = 'MEDV'

def add_bias_term(df):
    """Return a new frame equal to ``df`` plus a constant ``'bias'`` column of ones.

    The caller's frame is left untouched. An existing ``'bias'`` column, if
    any, is overwritten with ones in the returned frame.
    """
    return df.assign(bias=1)

def mean_squared_error(y_true, y_pred):
    """Return the mean of the squared differences between targets and predictions.

    Both inputs are converted to flat float arrays and compared position by
    position, so lists, NumPy arrays and pandas Series are all accepted, and
    the index labels of a Series never influence which values are paired.

    Args:
        y_true: Observed target values (1-D array-like).
        y_pred: Predicted values, one per entry of ``y_true``.

    Returns:
        The mean squared error as a NumPy float.

    Raises:
        ValueError: If the inputs are empty or do not have the same length.
    """
    actual = np.asarray(y_true, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()

    if actual.size == 0:
        raise ValueError("mean_squared_error requires at least one observation")
    # Guard explicitly: NumPy would otherwise broadcast a length-1 input silently.
    if actual.size != predicted.size:
        raise ValueError(
            f"y_true and y_pred must have the same length "
            f"(got {actual.size} and {predicted.size})"
        )

    return np.mean((actual - predicted) ** 2)

def linear_regression(X, y):
    """Fit ordinary least-squares weights so that ``X @ weights`` approximates ``y``.

    The system is solved with ``np.linalg.lstsq`` rather than by explicitly
    inverting ``X.T @ X``. That avoids squaring the condition number of the
    problem and still returns a (minimum-norm) solution when columns of ``X``
    are collinear or constant, where the explicit inverse would raise
    ``LinAlgError`` or return numerically meaningless weights.

    Args:
        X: Design matrix of shape (n_samples, n_features). Include a column
            of ones if an intercept is wanted.
        y: Target values of shape (n_samples,) or (n_samples, n_targets).

    Returns:
        NumPy array of weights, shape (n_features,) for 1-D ``y`` or
        (n_features, n_targets) for 2-D ``y``.
    """
    design = np.asarray(X, dtype=float)
    targets = np.asarray(y, dtype=float)
    # rcond=None selects NumPy's machine-precision-based cutoff for small singular values.
    weights, _residuals, _rank, _singular_values = np.linalg.lstsq(design, targets, rcond=None)
    return weights

def generate_splits(data, test_size=1/3, n_splits=20, random_seed=42):
    """Build ``n_splits`` train/test partitions of ``data``.

    Split number ``i`` (counting from 0) is produced by ``train_test_split``
    with ``random_state=random_seed + i``, so every split uses its own seed
    while the full set of splits is reproducible.

    Returns:
        A pair ``(train_splits, test_splits)`` of equally long lists, where
        the i-th entries belong to the same partition.
    """
    partitions = [
        train_test_split(data, test_size=test_size, random_state=random_seed + offset)
        for offset in range(n_splits)
    ]
    train_splits = [partition[0] for partition in partitions]
    test_splits = [partition[1] for partition in partitions]
    return train_splits, test_splits

In [6]:
# a. Naive regression
# Baseline: for every split, predict the training-set mean of the target for
# every row and measure the resulting MSE on both partitions.

train_splits, test_splits = generate_splits(df)

train_mse_list, test_mse_list = [], []

for train, test in zip(train_splits, test_splits):
    y_train = train[target_col]
    y_test = test[target_col]

    # The only "parameter" of this model is the mean target of the training rows.
    constant_value = y_train.mean()

    # One copy of the constant per observation serves as the prediction vector.
    train_mse = mean_squared_error(y_train, [constant_value] * len(y_train))
    train_mse_list.append(train_mse)

    test_mse = mean_squared_error(y_test, [constant_value] * len(y_test))
    test_mse_list.append(test_mse)

# Average the per-split errors over all splits.
average_train_mse = sum(train_mse_list) / len(train_mse_list)
average_test_mse = sum(test_mse_list) / len(test_mse_list)

print(f'Average Training MSE: {average_train_mse}')
print(f'Average Test MSE: {average_test_mse}')

Average Training MSE: 82.83301494245788
Average Test MSE: 87.82042310294798


<small>
b. Naive regression
</small>

<p>
<small>
The constant function in part "a" predicts the mean MEDV value of the training set for every observation, ignoring all features such as crime rate or number of rooms. This naive approach treats all variability in housing prices as unexplained noise, so it cannot capture any pattern in the data. It serves as a baseline: if a more complex model cannot outperform this constant prediction, that model is probably not making effective use of the available features to predict MEDV.
</small>
</p>


In [11]:
# c. Single attribute regression
# For each feature on its own, fit a line (feature + intercept) on every
# training split and record the train/test MSE.

# One list of per-split MSEs for every column except the target.
train_mse_results = {col: [] for col in df.columns if col != target_col}
test_mse_results = {col: [] for col in df.columns if col != target_col}

for attribute in train_mse_results:
    for train, test in zip(train_splits, test_splits):
        # Targets for this split.
        y_train = train[target_col]
        y_test = test[target_col]

        # Design matrices: the single feature plus the constant column added by the helper.
        X_train = add_bias_term(train[[attribute]])
        X_test = add_bias_term(test[[attribute]])

        # Plain NumPy arrays for the matrix algebra.
        X_train_np = X_train.to_numpy()
        y_train_np = y_train.to_numpy()

        weights = linear_regression(X_train_np, y_train_np)

        # Predictions on both partitions come from the weights fitted on the training rows.
        train_predictions = X_train_np @ weights
        test_predictions = X_test.to_numpy() @ weights

        train_mse = mean_squared_error(y_train, train_predictions)
        train_mse_results[attribute].append(train_mse)

        test_mse = mean_squared_error(y_test, test_predictions)
        test_mse_results[attribute].append(test_mse)

# Average MSE across all splits, keyed by attribute.
average_train_mse = {
    attr: sum(mse_list) / len(mse_list)
    for attr, mse_list in train_mse_results.items()
}
average_test_mse = {
    attr: sum(mse_list) / len(mse_list)
    for attr, mse_list in test_mse_results.items()
}

# Print the training table first, then the test table, in the same layout.
for heading, averages in (
    ("Average Training MSE for each attribute:", average_train_mse),
    ("\nAverage Test MSE for each attribute:", average_test_mse),
):
    print(heading)
    for attr, mse in averages.items():
        print(f"{attr}: {mse:.2f}")

Average Training MSE for each attribute:
CRIM: 70.15
 ZN : 72.87
INDUS : 63.32
CHAS: 80.51
NOX: 67.65
RM: 42.87
AGE: 71.18
DIS: 77.66
RAD: 70.55
TAX: 64.38
PTRATIO: 62.06
LSTAT: 37.92

Average Test MSE for each attribute:
CRIM: 76.35
 ZN : 75.05
INDUS : 67.63
CHAS: 85.25
NOX: 72.12
RM: 45.44
AGE: 75.25
DIS: 82.43
RAD: 75.60
TAX: 69.23
PTRATIO: 64.15
LSTAT: 40.03


In [10]:
# d. Linear regression (general)
# Fit one model per split using every non-target column plus a constant column.

train_mse_all_attributes = []
test_mse_all_attributes = []

for train, test in zip(train_splits, test_splits):
    # Targets for this split.
    y_train, y_test = train[target_col], test[target_col]

    # Feature matrices: all columns except the target, with the bias column appended.
    X_train = add_bias_term(train.drop(columns=[target_col]))
    X_test = add_bias_term(test.drop(columns=[target_col]))

    # Plain NumPy arrays for the matrix algebra.
    X_train_np, y_train_np = X_train.to_numpy(), y_train.to_numpy()

    # Weights come from the training rows only.
    weights = linear_regression(X_train_np, y_train_np)

    train_predictions = X_train_np @ weights
    test_predictions = X_test.to_numpy() @ weights

    # Record this split's errors.
    train_mse = mean_squared_error(y_train, train_predictions)
    train_mse_all_attributes.append(train_mse)

    test_mse = mean_squared_error(y_test, test_predictions)
    test_mse_all_attributes.append(test_mse)

# Average MSE across all splits.
average_train_mse_all_attributes = (
    sum(train_mse_all_attributes) / len(train_mse_all_attributes)
)
average_test_mse_all_attributes = (
    sum(test_mse_all_attributes) / len(test_mse_all_attributes)
)

# Report both averages rounded to two decimals.
print(f"Average Training MSE: {average_train_mse_all_attributes:.2f}")
print(f"Average Test MSE: {average_test_mse_all_attributes:.2f}")

Average Training MSE: 21.94
Average Test MSE: 24.63
