In [15]:
# ----------------------------------------------------------------------------
# Title: Assignment 7.2
# Author: Surenther Selvaraj
# Date: 23 Oct 2025
# Modified By: Surenther Selvaraj
# Description: Dimensionality Reduction and Feature Selection
# Data: https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data?select=train.csv
# ----------------------------------------------------------------------------

In [29]:
# --- Importing Libraries ---
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import VarianceThreshold
warnings.filterwarnings('ignore')

In [None]:
# --- Import the housing data as a data frame and ensure that the data is loaded properly. --- #

# Path to the training CSV, relative to the notebook's working directory.
file_path = 'train.csv'

try:
    # Only the read itself is guarded, so the handler below cannot mask
    # unrelated errors raised by the inspection code.
    df = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"--- ERROR: File not found at '{file_path}' ---")
    # Re-raise: every later cell needs `df`, so stopping here is clearer than
    # a NameError several cells further down.
    raise
else:
    print("--- Data loaded successfully! ---")

    # --- Ensure the data is loaded properly ---
    # 1. Print the first 5 rows to inspect the data
    print("\n--- First 5 Rows of the Dataset ---")
    print(df.head())

    # 2. Print the shape (rows, columns)
    print(f"\nDataset shape: {df.shape[0]} rows and {df.shape[1]} columns")

--- Data loaded successfully! ---

--- First 5 Rows of the Dataset ---
   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12  

In [None]:
# --- Drop the "Id" column and any features that are missing more than 40% of their values. --- #

# --- Drop the 'Id' column ---
# Reassign rather than mutate in place, and tolerate an already-missing 'Id',
# so that re-running this cell does not raise KeyError.
df = df.drop(columns='Id', errors='ignore')
print("--- 'Id' column dropped. ---")

# --- Drop columns with more than 40% missing values ---
# Percentage of missing values for each column
missing_percentage = (df.isnull().sum() / len(df)) * 100

# Columns strictly above this percentage of missing values are removed
threshold = 40.0

# Identify columns to drop
columns_to_drop = missing_percentage[missing_percentage > threshold].index

# Drop the identified columns (returns a new frame; no in-place mutation)
df = df.drop(columns=columns_to_drop)

print(f"\n--- Columns with more than {threshold}% missing values dropped. ---")
# len() is used because the truth value of a pandas Index is ambiguous
if len(columns_to_drop) > 0:
    print("Dropped columns:", list(columns_to_drop))
else:
    print("No columns met the threshold for dropping.")

# --- Verify the changes ---
print(f"\nNew dataset shape: {df.shape[0]} rows and {df.shape[1]} columns")

--- 'Id' column dropped. ---

--- Columns with more than 40.0% missing values dropped. ---
Dropped columns: ['Alley', 'MasVnrType', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']

New dataset shape: 1460 rows and 74 columns


In [None]:
# --- For numerical columns, fill in any missing data with the median value. --- #

# --- Pick out the int64 / float64 columns ---
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
print(f"--- Identified {len(numerical_cols)} numerical columns for imputation. ---")

# --- Replace NaNs in each numerical column with that column's median ---
numeric_block = df[numerical_cols]
column_medians = numeric_block.median()  # one median per column
df[numerical_cols] = numeric_block.fillna(column_medians)

print("--- Missing values in numerical columns filled with median. ---")

--- Identified 37 numerical columns for imputation. ---
--- Missing values in numerical columns filled with median. ---


In [None]:
# --- For categorical columns, fill in any missing data with the most common value (mode). --- #

# --- Select categorical columns ---
categorical_cols = df.select_dtypes(include=['object']).columns
print(f"--- Identified {len(categorical_cols)} categorical columns for imputation. ---")

# --- Fill missing values with the mode for each column ---
for col in categorical_cols:
    # .mode() returns a Series; [0] selects the first one (in case of ties).
    most_common_value = df[col].mode()[0]
    # Assign the filled column back to the frame. Calling
    # `df[col].fillna(..., inplace=True)` operates on a temporary object:
    # it is deprecated chained assignment in pandas 2.x and leaves `df`
    # unchanged when Copy-on-Write is enabled.
    df[col] = df[col].fillna(most_common_value)

print("--- Missing values in categorical columns filled with mode. ---")

--- Identified 37 categorical columns for imputation. ---
--- Missing values in categorical columns filled with mode. ---


In [None]:
# --- Convert categorical columns to dummy variables --- #

# --- Remember the shape before encoding so the two can be compared ---
original_shape = df.shape
rows_before, cols_before = original_shape
print(f"Original shape (before dummies): {rows_before} rows, {cols_before} columns")

# --- One-hot encode ---
# No `columns=` argument is given, so get_dummies picks the columns to encode itself.
# drop_first=True keeps k-1 indicator columns for a k-level category.
# dtype=int makes the indicator columns integer 0/1.
df = pd.get_dummies(data=df, drop_first=True, dtype=int)

# --- Report the shape after encoding ---
new_shape = df.shape
rows_after, cols_after = new_shape
print(f"New shape (after dummies): {rows_after} rows, {cols_after} columns")

Original shape (before dummies): 1460 rows, 74 columns
New shape (after dummies): 1460 rows, 230 columns


In [None]:
# --- Split the data into a training and test set, where the SalePrice column is the target. --- #

# --- Separate the target (y) from the features (X) ---
target_column = 'SalePrice'
y = df[target_column]               # target only
X = df.drop(columns=target_column)  # every other column

print("--- Features (X) and Target (y) successfully separated. ---")

# --- Hold out 20% of the rows for testing; random_state=42 fixes the shuffle ---
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print("--- Data successfully split into training and test sets. ---")

# --- Report the shape of each piece ---
for label, part in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{label} shape: {part.shape}")

--- Features (X) and Target (y) successfully separated. ---
--- Data successfully split into training and test sets. ---
X_train shape: (1168, 229)
X_test shape: (292, 229)
y_train shape: (1168,)
y_test shape: (292,)


In [None]:
# --- Run a linear regression and report the R2-value and RMSE on the test set. --- #

# --- Build the estimator and fit it on the training split (fit returns the estimator) ---
model = LinearRegression().fit(X_train, y_train)

print("--- Model trained successfully. ---")

# --- Predict the held-out targets ---
y_pred = model.predict(X_test)

# --- Score the predictions ---
# R2: proportion of target variance explained by the model.
r2 = r2_score(y_test, y_pred)

# RMSE: square root of the mean squared error, in the target's units (dollars).
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(
    "\n--- Model Performance on Test Set ---",
    f"R-squared (R2): {r2:.4f}",
    f"Root Mean Squared Error (RMSE): ${rmse:,.2f}",
    sep="\n",
)

--- Model trained successfully. ---

--- Model Performance on Test Set ---
R-squared (R2): 0.6478
Root Mean Squared Error (RMSE): $51,973.14


In [None]:
# --- Fit and transform the training features with a PCA so that 90% of the variance is retained. --- #

# --- Standardize: statistics come from the training split only ---
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
# The test split is transformed with the scaler fitted above (no refit).
X_test_scaled = scaler.transform(X_test)

print("--- Training and test features standardized. ---")


# --- PCA: a float n_components asks for enough components to reach that variance share ---
pca = PCA(n_components=0.90, random_state=42)
X_train_pca = pca.fit_transform(X_train_scaled)

print("--- PCA fitted on training data. ---")


# --- Project the test split with the already-fitted PCA ---
X_test_pca = pca.transform(X_test_scaled)


# --- Summarize the reduction ---
original_features = X_train_scaled.shape[1]
retained_components = pca.n_components_

print(
    "\n--- PCA Transformation Complete ---",
    f"Original number of features: {original_features}",
    f"Components retained to explain 90% variance: {retained_components}",
    f"New training data shape: {X_train_pca.shape}",
    f"New test data shape: {X_test_pca.shape}",
    sep="\n",
)

--- Training and test features standardized. ---
--- PCA fitted on training data. ---

--- PCA Transformation Complete ---
Original number of features: 229
Components retained to explain 90% variance: 127
New training data shape: (1168, 127)
New test data shape: (292, 127)


How many features are in the PCA-transformed matrix?

Based on the above output, there are 127 features in the PCA-transformed matrix.

The output line "Components retained to explain 90% variance: 127" shows this directly.

This means PCA reduced the dimensionality from 229 original features to 127 principal components while retaining at least 90% of the variance in the standardized training features.

In [None]:
# --- Transform but DO NOT fit the test features with the same PCA. --- #

# --- Project the scaled test features using the PCA fitted on the training data ---
X_test_pca = pca.transform(X_test_scaled)

print("--- Test features successfully transformed using the fitted PCA. ---")

# --- Compare shapes: both matrices should have the same number of columns ---
test_shape = X_test_pca.shape
train_shape = X_train_pca.shape
print(
    f"New test data shape: {test_shape}",
    f"Training data shape (for comparison): {train_shape}",
    sep="\n",
)

--- Test features successfully transformed using the fitted PCA. ---
New test data shape: (292, 127)
Training data shape (for comparison): (1168, 127)


In [None]:
# --- Repeat step 7 with your PCA transformed data. --- #

# --- Fit a fresh linear regression on the principal components (fit returns the estimator) ---
model_pca = LinearRegression().fit(X_train_pca, y_train)

print("--- Model trained successfully on PCA data. ---")

# --- Predict from the PCA-transformed test features ---
y_pred_pca = model_pca.predict(X_test_pca)

# --- Score the predictions ---
r2_pca = r2_score(y_test, y_pred_pca)

# RMSE is the square root of the mean squared error.
mse_pca = mean_squared_error(y_test, y_pred_pca)
rmse_pca = np.sqrt(mse_pca)

print(
    "\n--- PCA Model Performance on Test Set ---",
    f"R-squared (R2): {r2_pca:.4f}",
    f"Root Mean Squared Error (RMSE): ${rmse_pca:,.2f}",
    sep="\n",
)

--- Model trained successfully on PCA data. ---

--- PCA Model Performance on Test Set ---
R-squared (R2): 0.8409
Root Mean Squared Error (RMSE): $34,938.50


In [None]:
# --- Take your original training features (from step 6) and apply a min-max scaler to them. --- #

# --- Create the scaler and learn the per-feature range from the training split ---
min_max_scaler = MinMaxScaler()
X_train_minmax = min_max_scaler.fit_transform(X_train)

# Re-attach the original column names and row index to the scaled array.
X_train_minmax_df = pd.DataFrame(
    data=X_train_minmax,
    index=X_train.index,
    columns=X_train.columns,
)

print("--- Training features successfully scaled with MinMaxScaler. ---")

# --- Scale the test split with the scaler fitted above (transform only, no refit) ---
X_test_minmax = min_max_scaler.transform(X_test)

# Re-attach the original column names and row index to the scaled array.
X_test_minmax_df = pd.DataFrame(
    data=X_test_minmax,
    index=X_test.index,
    columns=X_test.columns,
)

print("--- Test features successfully transformed with the same scaler. ---")

# --- Sanity check: overall min/max before and after scaling ---
raw_train_values = X_train.to_numpy()
print("\n--- Verification ---")
print(f"Original X_train min: {raw_train_values.min():.2f}, max: {raw_train_values.max():.2f}")
print(f"Scaled X_train_minmax min: {X_train_minmax.min():.2f}, max: {X_train_minmax.max():.2f}")

--- Training features successfully scaled with MinMaxScaler. ---
--- Test features successfully transformed with the same scaler. ---

--- Verification ---
Original X_train min: 0.00, max: 215245.00
Scaled X_train_minmax min: 0.00, max: 1.00


In [None]:
# --- Find the min-max scaled features in your training set that have a variance above 0.1 --- #

# --- Build the selector and fit it on the scaled training frame (fit returns the selector) ---
var_selector = VarianceThreshold(threshold=0.1).fit(X_train_minmax_df)

# --- Split the column names into kept / dropped ---
# get_support() yields one boolean per feature: True means the feature is kept.
features_to_keep_mask = var_selector.get_support()
all_feature_names = X_train_minmax_df.columns

features_kept = all_feature_names[features_to_keep_mask].tolist()
# Inverting the mask gives the features that were removed.
features_dropped = all_feature_names[~features_to_keep_mask].tolist()

# --- 3. Report the Findings ---
print(
    "--- Variance Threshold (threshold=0.1) Results ---",
    f"Original number of features: {X_train_minmax_df.shape[1]}",
    f"Features kept (variance > 0.1): {len(features_kept)}",
    f"Features dropped (variance <= 0.1): {len(features_dropped)}",
    sep="\n",
)

print("\n--- List of Features DROPPED ---")
if not features_dropped:
    print("No features were dropped.")
else:
    for dropped_name in features_dropped:
        print(dropped_name)

--- Variance Threshold (threshold=0.1) Results ---
Original number of features: 229
Features kept (variance > 0.1): 40
Features dropped (variance <= 0.1): 189

--- List of Features DROPPED ---
MSSubClass
LotFrontage
LotArea
OverallQual
OverallCond
YearBuilt
MasVnrArea
BsmtFinSF1
BsmtFinSF2
BsmtUnfSF
TotalBsmtSF
1stFlrSF
2ndFlrSF
LowQualFinSF
GrLivArea
BsmtFullBath
BsmtHalfBath
FullBath
HalfBath
BedroomAbvGr
KitchenAbvGr
TotRmsAbvGrd
Fireplaces
GarageYrBlt
GarageCars
GarageArea
WoodDeckSF
OpenPorchSF
EnclosedPorch
3SsnPorch
ScreenPorch
PoolArea
MiscVal
MoSold
MSZoning_FV
MSZoning_RH
Street_Pave
LotShape_IR2
LotShape_IR3
LandContour_HLS
LandContour_Low
LandContour_Lvl
Utilities_NoSeWa
LotConfig_CulDSac
LotConfig_FR2
LotConfig_FR3
LandSlope_Mod
LandSlope_Sev
Neighborhood_Blueste
Neighborhood_BrDale
Neighborhood_BrkSide
Neighborhood_ClearCr
Neighborhood_CollgCr
Neighborhood_Crawfor
Neighborhood_Edwards
Neighborhood_Gilbert
Neighborhood_IDOTRR
Neighborhood_MeadowV
Neighborhood_Mitchel
Neigh

In [31]:
# --- Transform but DO NOT fit the test features with the same steps applied in steps 11 and 12. --- #

# --- Step 11 on the test split: scale with the already-fitted MinMaxScaler ---
X_test_minmax = min_max_scaler.transform(X_test)

# --- Restore column names and row index on the scaled array ---
X_test_minmax_df = pd.DataFrame(
    data=X_test_minmax,
    index=X_test.index,
    columns=X_test.columns,
)

# --- Step 12 on the test split: keep only the columns chosen by the fitted selector ---
X_test_vt = var_selector.transform(X_test_minmax_df)

# --- Show how the shape changed ---
shape_before = X_test.shape
shape_after = X_test_vt.shape
print(f"Original X_test shape: {shape_before}")
print(f"New X_test shape after scaling and thresholding: {shape_after}")

Original X_test shape: (292, 229)
New X_test shape after scaling and thresholding: (292, 40)


In [32]:
# --- Repeat step 7 with the high variance data. --- #

# --- Reduce the scaled training frame to the columns chosen by the fitted selector ---
X_train_vt = var_selector.transform(X_train_minmax_df)

print(f"--- Final training data created with shape: {X_train_vt.shape} ---")


# --- Fit a fresh linear regression on the reduced training data (fit returns the estimator) ---
model_vt = LinearRegression().fit(X_train_vt, y_train)
print("--- Model trained successfully on high-variance data. ---")


# --- Predict from the reduced test features ---
y_pred_vt = model_vt.predict(X_test_vt)


# --- Score the predictions ---
r2_vt = r2_score(y_test, y_pred_vt)

# RMSE is the square root of the mean squared error.
mse_vt = mean_squared_error(y_test, y_pred_vt)
rmse_vt = np.sqrt(mse_vt)

print(
    "\n--- High-Variance Model Performance on Test Set ---",
    f"R-squared (R2): {r2_vt:.4f}",
    f"Root Mean Squared Error (RMSE): ${rmse_vt:,.2f}",
    sep="\n",
)

--- Final training data created with shape: (1168, 40) ---
--- Model trained successfully on high-variance data. ---

--- High-Variance Model Performance on Test Set ---
R-squared (R2): 0.6481
Root Mean Squared Error (RMSE): $51,952.01


### Conclusion
This notebook compared three methods for predicting house prices. First, a baseline linear regression model was trained using all 229 preprocessed features, but it performed poorly, explaining only about 65% of the price variance on the test set (R²: 0.6478) with a root-mean-squared error (RMSE) of nearly $52,000.

The second method standardized the features and used PCA to reduce the 229 features to 127 principal components. This model was far superior, explaining 84% of the variance (R²: 0.8409) and reducing the RMSE to about $35,000.

The third method applied min-max scaling and a Variance Threshold of 0.1, which kept only the 40 "highest variance" features. This model performed almost identically to the baseline (R²: 0.6481), showing no improvement.

From these results we can see that simply having more features (229) isn't always better. The baseline model was likely "noisy." PCA produced a much more accurate and compact model by compressing the features and filtering out that noise. The Variance Threshold method, while simple, failed to select a useful set of features (it discarded many of the original numeric features, such as OverallQual and GrLivArea), so on this train/test split PCA was the clearly superior strategy for this dataset.