In [1]:
from google.colab import files
import pandas as pd

# Read the housing dataset; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_csv(filepath_or_buffer="House_Price_Prediction.csv")

In [2]:
# `df` is already a DataFrame, so it does not need re-wrapping.
# Drop the 'date' column without mutating in place; errors="ignore" keeps this
# cell re-runnable (an in-place drop raises KeyError on the second run because
# the column is already gone).
df = df.drop(columns=['date'], errors='ignore')

# Per-column count of missing values.
print(df.isnull().sum())

price                0
bedrooms             0
grade                0
has_basement         0
living_in_m2         0
renovated            0
nice_view            0
perfect_condition    0
real_bathrooms       0
has_lavatory         0
single_floor         0
month                0
quartile_zone        0
dtype: int64


In [3]:
# Flag columns that are converted to floating point so every feature is numeric.
bool_cols = [
    'has_basement',
    'renovated',
    'nice_view',
    'perfect_condition',
    'has_lavatory',
    'single_floor',
]
flags_as_float = df[bool_cols].astype("float64")
df[bool_cols] = flags_as_float

In [4]:
from sklearn.preprocessing import MinMaxScaler

# Columns rescaled with MinMaxScaler (default settings).
scaling_columns = [
    'bedrooms',
    'grade',
    'living_in_m2',
    'real_bathrooms',
    'month',
    'quartile_zone',
]

# NOTE(review): the scaler is fitted on the whole frame, i.e. before any
# train/test split that happens later in the notebook.
scaler = MinMaxScaler()
scaler.fit(df[scaling_columns])
df[scaling_columns] = scaler.transform(df[scaling_columns])

In [6]:
# Target: the 'price' column. Features: every other column of `df`.
y = df["price"]
x = df.drop(columns=["price"])

In [14]:
import numpy as np

def train_test_split_manual(X, y, test_size, random_seed=None):
    """Randomly partition ``X`` and ``y`` into train and test subsets.

    Both inputs are converted to NumPy arrays, so the returned pieces are
    plain arrays (pandas column/index labels are not carried through).

    Parameters
    ----------
    X : array-like
        Samples along the first axis.
    y : array-like
        Targets; must have the same length as ``X``.
    test_size : float
        Fraction strictly between 0 and 1. The test set receives
        ``int(len(X) * test_size)`` rows (rounded down), the train set the rest.
    random_seed : int or None, optional
        Seed for the shuffle. When given, a private ``RandomState`` is used so
        the process-wide NumPy RNG is left untouched while the resulting split
        is the same as seeding the global RNG with that value. When ``None``,
        the shuffle draws from the global ``np.random`` state.

    Returns
    -------
    X_train, X_test, y_train, y_test : numpy.ndarray

    Raises
    ------
    AssertionError
        If ``test_size`` is outside (0, 1) or ``X`` and ``y`` differ in length.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    assert 0.0 < test_size < 1.0, "test_size must be a float between 0 and 1."
    assert len(X) == len(y), "X and y must have the same number of samples."

    n = len(X)
    n_test = int(n * test_size)  # Number of test samples (floor)

    # A dedicated generator avoids reseeding the global RNG as a side effect.
    rng = np.random if random_seed is None else np.random.RandomState(random_seed)

    # Shuffle row positions, then take the first n_test for testing.
    indices = np.arange(n)
    rng.shuffle(indices)
    test_indices = indices[:n_test]
    train_indices = indices[n_test:]

    # Fancy indexing returns new arrays, so the caller's data is not aliased.
    X_train, X_test = X[train_indices], X[test_indices]
    y_train, y_test = y[train_indices], y[test_indices]

    return X_train, X_test, y_train, y_test

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split_manual(
    x,
    y,
    test_size=0.2,
    random_seed=42,
)

In [15]:
import numpy as np

class LinearRegressionClosedForm:
    """Ordinary least-squares linear regression with an intercept.

    The coefficients minimise ``||A w - y||^2`` where ``A`` is the feature
    matrix with a leading column of ones. The problem is solved with
    ``np.linalg.lstsq`` rather than by explicitly inverting ``A.T @ A``, so
    collinear or duplicated feature columns yield the minimum-norm solution
    instead of a ``LinAlgError`` or numerically meaningless weights.

    Attributes
    ----------
    weights : numpy.ndarray or None
        ``[intercept, w_1, ..., w_d]`` after ``fit``; ``None`` before.
    """

    def __init__(self):
        # Populated by fit(); None marks an unfitted model.
        self.weights = None

    @staticmethod
    def _design_matrix(X):
        """Return ``X`` as a float matrix with a column of ones prepended."""
        X = np.asarray(X, dtype=float)
        if X.ndim == 1:
            # Treat a 1-D input as a single feature column.
            X = X.reshape(-1, 1)
        return np.column_stack([np.ones(X.shape[0]), X])

    def fit(self, X, y):
        """Estimate the weights from samples ``X`` and targets ``y``.

        Parameters
        ----------
        X : array-like
            One row per sample (a 1-D input is treated as one feature).
        y : array-like
            Targets, one per row of ``X``.
        """
        design = self._design_matrix(X)
        targets = np.asarray(y, dtype=float)
        # rcond=None selects NumPy's machine-precision based rank cutoff.
        self.weights = np.linalg.lstsq(design, targets, rcond=None)[0]

    def predict(self, X):
        """Return predictions for ``X`` using the fitted weights.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.weights is None:
            raise RuntimeError("LinearRegressionClosedForm must be fitted before calling predict().")
        return self._design_matrix(X) @ self.weights

In [12]:
from sklearn.metrics import mean_squared_error

def greedy_forward_selection(x_train, x_test, y_train, y_test, feature_names):
    """Greedy forward feature selection for ``LinearRegressionClosedForm``.

    Starting from an empty selection, each round refits the model once per
    remaining candidate (current selection plus that candidate), scores it
    with ``mean_squared_error`` on ``x_test``/``y_test``, and keeps the
    candidate with the lowest MSE, provided that MSE is strictly lower than
    the best seen so far. The search ends when no candidate improves the
    score or no candidates remain.

    Note: the same ``x_test``/``y_test`` data both drives the selection and
    produces the returned score, so that score is not an independent estimate
    of generalisation error.

    Parameters
    ----------
    x_train, x_test : array-like
        Feature matrices; columns are addressed by position.
    y_train, y_test : array-like
        Targets matching the rows of ``x_train`` / ``x_test``.
    feature_names : iterable
        One hashable name per column, in column order (any iterable, e.g. a
        list or a pandas ``Index``).

    Returns
    -------
    selected_features : list
        Chosen names in the order they were added.
    best_mse : float
        MSE of the final selection (``inf`` if nothing was selected).
    """
    feature_names = list(feature_names)
    # Name -> column position, resolved once; setdefault keeps the first
    # occurrence of a repeated name, matching list.index().
    column_of = {}
    for position, name in enumerate(feature_names):
        column_of.setdefault(name, position)

    # Positional column indexing below needs arrays, not DataFrames.
    x_train = np.asarray(x_train)
    x_test = np.asarray(x_test)

    selected_features = []
    remaining_features = list(feature_names)
    best_mse = float("inf")
    model = LinearRegressionClosedForm()

    while remaining_features:
        best_feature = None
        # Separate flag: a legitimate feature name may itself be falsy (0, "").
        improved = False
        for feature in remaining_features:
            columns = [column_of[name] for name in selected_features + [feature]]
            model.fit(x_train[:, columns], y_train)
            predictions = model.predict(x_test[:, columns])
            mse = mean_squared_error(y_test, predictions)
            if mse < best_mse:
                best_mse = mse
                best_feature = feature
                improved = True
        if not improved:
            break
        selected_features.append(best_feature)
        remaining_features.remove(best_feature)

    return selected_features, best_mse

In [13]:
# Forward selection over every feature column of `x`, then report the outcome.
selected_features, best_mse = greedy_forward_selection(
    x_train,
    x_test,
    y_train,
    y_test,
    x.columns.tolist(),
)
print("Selected Features:", selected_features)
print("Best MSE:", best_mse)

Selected Features: ['quartile_zone', 'living_in_m2', 'grade', 'nice_view', 'perfect_condition', 'renovated', 'has_basement', 'has_lavatory', 'bedrooms', 'single_floor', 'month']
Best MSE: 10853780660.519274


In [9]:
def greedy_backward_selection(x_train, x_test, y_train, y_test, feature_names=None):
    """Greedy backward feature elimination for ``LinearRegressionClosedForm``.

    The model is first scored with every feature to establish a baseline.
    Each round then refits it once per currently selected feature with that
    feature left out, scores it with ``mean_squared_error`` on
    ``x_test``/``y_test``, and drops the feature whose removal gives the
    lowest MSE, provided that MSE is strictly lower than the best seen so far.
    The search ends when no removal improves the score or no features remain.

    Note: the same ``x_test``/``y_test`` data both drives the selection and
    produces the returned score, so that score is not an independent estimate
    of generalisation error.

    Parameters
    ----------
    x_train, x_test : pandas.DataFrame or array-like
        Feature matrices. Inputs exposing ``.columns`` are sliced by label;
        anything else is converted to a NumPy array and sliced by position.
    y_train, y_test : array-like
        Targets matching the rows of ``x_train`` / ``x_test``.
    feature_names : iterable, optional
        Hashable feature names in column order. Defaults to
        ``x_train.columns`` when available, otherwise to the integer column
        positions ``0 .. n_features - 1``. For DataFrame inputs the names
        must be column labels.

    Returns
    -------
    selected_features : list
        Names of the features that were kept.
    best_mse : float
        MSE of the returned feature set.
    """
    if feature_names is not None:
        feature_names = list(feature_names)
    elif hasattr(x_train, "columns"):
        feature_names = list(x_train.columns)
    else:
        # Unlabelled array input: fall back to column positions as names.
        feature_names = list(range(np.asarray(x_train).shape[1]))

    # Name -> column position for array inputs (first occurrence wins).
    column_of = {}
    for position, name in enumerate(feature_names):
        column_of.setdefault(name, position)

    def take_columns(data, features):
        """Return the requested feature columns of ``data``."""
        if hasattr(data, "columns"):
            return data[features]
        return np.asarray(data)[:, [column_of[name] for name in features]]

    model = LinearRegressionClosedForm()

    def score(features):
        """Fit on the training split and return the MSE on the test split."""
        model.fit(take_columns(x_train, features), y_train)
        predictions = model.predict(take_columns(x_test, features))
        return mean_squared_error(y_test, predictions)

    selected_features = list(feature_names)  # Start with all features
    # Baseline: without it the first round would always discard a feature,
    # even when the full model is the best one.
    best_mse = score(selected_features)

    while len(selected_features) > 0:
        worst_feature = None
        # Separate flag: a legitimate feature name may itself be falsy (0, "").
        improved = False
        for feature in selected_features:
            # Test removing one feature at a time
            features_to_test = [f for f in selected_features if f != feature]
            mse = score(features_to_test)
            # Remember the removal that lowers the MSE the most
            if mse < best_mse:
                best_mse = mse
                worst_feature = feature
                improved = True
        if not improved:
            break
        selected_features.remove(worst_feature)

    return selected_features, best_mse

In [10]:
# The manual split returns plain NumPy arrays, so re-attach the column labels;
# the selection then works with, and reports, feature names.
feature_columns = list(x.columns)
selected_features, best_mse = greedy_backward_selection(
    pd.DataFrame(x_train, columns=feature_columns),
    pd.DataFrame(x_test, columns=feature_columns),
    y_train,
    y_test,
)
print("Selected Features:", selected_features)
print("Best MSE:", best_mse)

Selected Features: ['bedrooms', 'grade', 'has_basement', 'living_in_m2', 'renovated', 'nice_view', 'perfect_condition', 'has_lavatory', 'single_floor', 'month', 'quartile_zone']
Best MSE: 10853780660.519266
