In [2]:
from google.colab import files
import pandas as pd

df = pd.read_csv("House_Price_Prediction.csv")

In [3]:
df = pd.DataFrame(df)
df.drop(['date'], axis=1, inplace=True)
print(df.isnull().sum())

price                0
bedrooms             0
grade                0
has_basement         0
living_in_m2         0
renovated            0
nice_view            0
perfect_condition    0
real_bathrooms       0
has_lavatory         0
single_floor         0
month                0
quartile_zone        0
dtype: int64


In [4]:
bool_cols = ['has_basement', 'renovated', 'nice_view', 'perfect_condition', 'has_lavatory', 'single_floor']
df[bool_cols] = df[bool_cols].astype(float)

In [5]:
scaling_columns = ['bedrooms', 'grade', 'living_in_m2', 'real_bathrooms', 'month', 'quartile_zone']
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
df[scaling_columns] = scaler.fit_transform(df[scaling_columns])

In [6]:
from scipy.stats import spearmanr
target = 'price'

# Calculate Spearman correlation
correlations = {
    feature: spearmanr(df[feature], df[target]).correlation
    for feature in df.columns if feature != target
}

# Rank features based on absolute correlation
ranked_features = sorted(correlations, key=lambda x: abs(correlations[x]), reverse=True)

In [7]:
import numpy as np

class LinearRegressionClosedForm:

    def __init__(self):
        self.weights = None

    def fit(self, X, y):
        # Add a bias term (column of ones) to X
        X_bias = np.c_[np.ones(X.shape[0]), X]
        # Compute weights using the Normal Equation
        self.weights = np.linalg.inv(X_bias.T @ X_bias) @ X_bias.T @ y

    def predict(self, X):
        # Add a bias term (column of ones) to X
        X_bias = np.c_[np.ones(X.shape[0]), X]
        # Compute predictions
        return X_bias @ self.weights

In [8]:
import numpy as np

def train_test_split(X, y, test_size, random_seed=None):

    X = np.array(X)
    y = np.array(y)
    assert 0.0 < test_size < 1.0, "test_size must be a float between 0 and 1."
    assert len(X) == len(y), "X and y must have the same number of samples."

    if random_seed is not None:
        np.random.seed(random_seed)

    n = len(X)
    n_test = int(n * test_size)  # Number of test samples

    # Shuffle indices
    indices = np.arange(n)
    np.random.shuffle(indices)

    # Split indices for test and train
    train_indices = indices[n_test:]
    test_indices = indices[:n_test]

    # Split the data
    X_train, X_test = X[train_indices], X[test_indices]
    y_train, y_test = y[train_indices], y[test_indices]

    return X_train, X_test, y_train, y_test

In [9]:
def mean_squared_error(y_true, y_pred):
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    assert len(y_true) == len(y_pred), "y_true and y_pred must have the same number of elements."
    mse = np.mean((y_true - y_pred) ** 2)
    return mse

In [10]:
results = []

for i in range(1, len(ranked_features) + 1):
    subset = ranked_features[:i]
    X = df[subset]
    y = df[target]

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_seed=42)

    # Train model
    model = LinearRegressionClosedForm()
    model.fit(X_train, y_train)

    # Calculate errors
    train_error = mean_squared_error(y_train, model.predict(X_train))
    test_error = mean_squared_error(y_test, model.predict(X_test))

    results.append({
        "Feature Count": i,
        "Selected Features": subset,
        "Train Error": train_error,
        "Test Error": test_error
    })

In [11]:
print(results)

[{'Feature Count': 1, 'Selected Features': ['quartile_zone'], 'Train Error': 23428021954.723625, 'Test Error': 24594566240.336384}, {'Feature Count': 2, 'Selected Features': ['quartile_zone', 'grade'], 'Train Error': 15633815403.03254, 'Test Error': 15301687295.413305}, {'Feature Count': 3, 'Selected Features': ['quartile_zone', 'grade', 'living_in_m2'], 'Train Error': 12820384654.05958, 'Test Error': 12021119095.033855}, {'Feature Count': 4, 'Selected Features': ['quartile_zone', 'grade', 'living_in_m2', 'real_bathrooms'], 'Train Error': 12799884980.197325, 'Test Error': 12045644912.36101}, {'Feature Count': 5, 'Selected Features': ['quartile_zone', 'grade', 'living_in_m2', 'real_bathrooms', 'single_floor'], 'Train Error': 12789181251.435232, 'Test Error': 12034765065.39437}, {'Feature Count': 6, 'Selected Features': ['quartile_zone', 'grade', 'living_in_m2', 'real_bathrooms', 'single_floor', 'bedrooms'], 'Train Error': 12769062673.1985, 'Test Error': 11984733731.83403}, {'Feature Cou