In [1]:
# ----------------------------------------------------------------------------
# Title: Assignment 7.2
# Author: Surenther Selvaraj
# Date: 23 Oct 2025
# Modified By: Surenther Selvaraj
# Description: Dimensionality Reduction and Feature Selection
# Data: https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data?select=train.csv
# ----------------------------------------------------------------------------

In [18]:
# --- Importing Libraries ---
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
warnings.filterwarnings('ignore')

In [None]:
# --- Import the housing data ---

# Path of the training CSV, relative to the notebook's working directory
file_path = 'train.csv'

# Guard only the read: it is the single statement that can raise FileNotFoundError.
try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"--- ERROR: File not found at '{file_path}' ---")
    # Re-raise so "Run All" stops at the real cause. Swallowing the error would
    # leave `df` undefined and every later cell would fail with a NameError.
    raise

print("--- Data loaded successfully! ---")

# --- Ensure the data is loaded properly ---
# 1. Print the first 5 rows to inspect the data
print("\n--- First 5 Rows of the Dataset ---")
print(df.head())

# 2. Print the shape (rows, columns)
print(f"\nDataset shape: {df.shape[0]} rows and {df.shape[1]} columns")

--- Data loaded successfully! ---

--- First 5 Rows of the Dataset ---
   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12  

In [4]:
# --- Drop the 'Id' column ---
# errors='ignore' keeps this cell re-runnable: executing it a second time no
# longer raises KeyError because 'Id' has already been removed.
# Rebinding `df` (instead of inplace=True) avoids mutating the frame in place.
df = df.drop(columns=['Id'], errors='ignore')
print("--- 'Id' column dropped. ---")

# --- Drop columns with more than 40% missing values ---
# Percentage of missing values for each column
missing_percentage = (df.isnull().sum() / len(df)) * 100

# Columns whose missing share is strictly above this percentage are removed
threshold = 40.0

# Index of column labels that exceed the threshold
columns_to_drop = missing_percentage[missing_percentage > threshold].index

# Drop the identified columns (a no-op when the index is empty)
df = df.drop(columns=columns_to_drop)

print(f"\n--- Columns with more than {threshold}% missing values dropped. ---")
if len(columns_to_drop) > 0:
    print("Dropped columns:", list(columns_to_drop))
else:
    print("No columns met the threshold for dropping.")

# --- Verify the changes ---
print(f"\nNew dataset shape: {df.shape[0]} rows and {df.shape[1]} columns")

--- 'Id' column dropped. ---

--- Columns with more than 40.0% missing values dropped. ---
Dropped columns: ['Alley', 'MasVnrType', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']

New dataset shape: 1460 rows and 74 columns


In [6]:
# --- Select numerical columns ---
# Only int64 / float64 columns are considered numerical here.
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
print(f"--- Identified {len(numerical_cols)} numerical columns for imputation. ---")

# --- Fill missing values with the median for each column ---
# NOTE(review): the medians are computed over the whole frame, i.e. before the
# train/test split performed later in the notebook.
numeric_part = df[numerical_cols]
df[numerical_cols] = numeric_part.fillna(numeric_part.median())

print("--- Missing values in numerical columns filled with median. ---")

--- Identified 37 numerical columns for imputation. ---
--- Missing values in numerical columns filled with median. ---


In [10]:
# --- Select categorical columns ---
categorical_cols = df.select_dtypes(include=['object']).columns
print(f"--- Identified {len(categorical_cols)} categorical columns for imputation. ---")

# --- Fill missing values with the mode for each column ---
for col in categorical_cols:
    # .mode() returns a Series of the most frequent value(s), NaN excluded.
    modes = df[col].mode()
    if modes.empty:
        # Column holds no non-missing value, so there is nothing to impute with.
        continue
    # Assign the result back to the frame. `df[col].fillna(..., inplace=True)`
    # operates on a column selection (chained assignment): it is deprecated in
    # pandas 2.x and leaves `df` unchanged when Copy-on-Write is enabled.
    df[col] = df[col].fillna(modes.iloc[0])

print("--- Missing values in categorical columns filled with mode. ---")

--- Identified 37 categorical columns for imputation. ---
--- Missing values in categorical columns filled with mode. ---


In [12]:
# --- Store the original shape for comparison ---
original_shape = df.shape
rows_before, cols_before = original_shape
print(f"Original shape (before dummies): {rows_before} rows, {cols_before} columns")

# --- Convert categorical columns to dummy variables ---
# With no `columns` argument, get_dummies encodes every column with object,
# string, or category dtype and passes the remaining columns through unchanged.
#   drop_first -> k-1 indicator columns for k categories (avoids the dummy variable trap)
#   dtype      -> indicator columns are stored as integer 0/1
dummy_options = {"drop_first": True, "dtype": int}
df = pd.get_dummies(df, **dummy_options)

# --- Verify the changes ---
new_shape = df.shape
rows_after, cols_after = new_shape
print(f"New shape (after dummies): {rows_after} rows, {cols_after} columns")

Original shape (before dummies): 1460 rows, 230 columns
New shape (after dummies): 1460 rows, 230 columns


In [16]:
# --- Define Features (X) and Target (y) ---
target_column = 'SalePrice'

# X: every column except the target; y: the target column alone
X = df.drop(columns=[target_column])
y = df[target_column]

print("--- Features (X) and Target (y) successfully separated. ---")

# --- Split the data ---
# 20% of the rows are held out for testing; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print("--- Data successfully split into training and test sets. ---")

# --- Verify the shapes of the new sets ---
for label, part in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{label} shape: {part.shape}")

--- Features (X) and Target (y) successfully separated. ---
--- Data successfully split into training and test sets. ---
X_train shape: (1168, 229)
X_test shape: (292, 229)
y_train shape: (1168,)
y_test shape: (292,)


In [17]:
# --- Create and Train the Model ---
# fit() returns the estimator itself, so construction and training are chained.
model = LinearRegression().fit(X_train, y_train)

print("--- Model trained successfully. ---")

# --- Make Predictions on the Test Set ---
y_pred = model.predict(X_test)

# --- Calculate and Report Metrics ---
# R-squared: proportion of the target's variance explained by the predictions.
r2 = r2_score(y_test, y_pred)

# RMSE: square root of the mean squared error, expressed in the target's units.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print("\n--- Model Performance on Test Set ---")
print(f"R-squared (R2): {r2:.4f}")
print(f"Root Mean Squared Error (RMSE): ${rmse:,.2f}")

--- Model trained successfully. ---

--- Model Performance on Test Set ---
R-squared (R2): 0.6478
Root Mean Squared Error (RMSE): $51,973.14


In [19]:
# --- Standardize the Features ---
# The scaler learns its statistics from the training rows only; the same
# fitted scaler is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("--- Training and test features standardized. ---")


# --- Initialize and Fit PCA ---
# A float n_components in (0, 1) asks PCA to keep as many components as are
# needed to reach that share of explained variance (here 90%).
pca = PCA(n_components=0.90, random_state=42)

# Fit on the scaled training data and project it in one step
X_train_pca = pca.fit_transform(X_train_scaled)

print("--- PCA fitted on training data. ---")


# --- Transform the Test Data ---
# Project the test rows with the components learned from the training rows
X_test_pca = pca.transform(X_test_scaled)


# --- Report the Results ---
original_features = X_train_scaled.shape[1]
retained_components = pca.n_components_

summary_lines = [
    "\n--- PCA Transformation Complete ---",
    f"Original number of features: {original_features}",
    f"Components retained to explain 90% variance: {retained_components}",
    f"New training data shape: {X_train_pca.shape}",
    f"New test data shape: {X_test_pca.shape}",
]
print("\n".join(summary_lines))

--- Training and test features standardized. ---
--- PCA fitted on training data. ---

--- PCA Transformation Complete ---
Original number of features: 229
Components retained to explain 90% variance: 127
New training data shape: (1168, 127)
New test data shape: (292, 127)


Based on the above output, there are 127 features in the PCA-transformed matrix.

The output line "Components retained to explain 90% variance: 127" shows this directly.

This means PCA reduced the dimensionality from 229 original features to 127 principal components while retaining at least 90% of the variance in the standardized training data.

In [20]:
# --- 1. Transform the Test Data ---
# Project the scaled test rows with the PCA object fitted on the training data.
X_test_pca = pca.transform(X_test_scaled)

print("--- Test features successfully transformed using the fitted PCA. ---")

# --- 2. Verify the shape ---
# Both matrices should report the same number of columns (principal components).
shape_report = (
    ("New test data shape", X_test_pca),
    ("Training data shape (for comparison)", X_train_pca),
)
for caption, matrix in shape_report:
    print(f"{caption}: {matrix.shape}")

--- Test features successfully transformed using the fitted PCA. ---
New test data shape: (292, 127)
Training data shape (for comparison): (1168, 127)
