In [1]:
import pandas as pd

# Location of the training split inside the Kaggle input mount.
TRAIN_CSV_PATH = "/kaggle/input/datathon2025/train.csv"

# Raw training table; later cells derive their frames from this one.
df = pd.read_csv(TRAIN_CSV_PATH)


In [4]:
import numpy as np
from sklearn.preprocessing import StandardScaler

# How many leading principal components to display at the end of this cell.
N_COMPONENTS_SHOWN = 35

# Keep only the numeric columns; 'Id' is dropped because it is just a row identifier.
# NOTE(review): nothing else is dropped, so a numeric target column (e.g.
# 'SalePrice') takes part in the PCA alongside the features -- confirm this
# is intended before using the components as model inputs.
numerical_df = df.select_dtypes(include=[np.number]).drop(columns=["Id"])

# Impute missing values with the mean of their own column.
numerical_df = numerical_df.fillna(numerical_df.mean())

# Standardize every column to zero mean and unit variance.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(numerical_df)

# Covariance matrix of the standardized data; rowvar=False treats each
# column as a variable and each row as an observation.
# np.cov normalises by n-1 while StandardScaler scales with the population
# std (n), so the diagonal is n/(n-1) rather than exactly 1. This rescales
# all eigenvalues by the same factor and leaves eigenvectors and variance
# ratios unchanged.
cov_matrix = np.cov(scaled_data, rowvar=False)

# eigh is the solver for symmetric matrices; it returns eigenvalues in
# ascending order with the matching eigenvectors as columns.
eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

# Reorder so the component with the largest variance comes first.
sorted_indices = np.argsort(eigenvalues)[::-1]
eigenvalues = eigenvalues[sorted_indices]
eigenvectors = eigenvectors[:, sorted_indices]

# Show the leading N_COMPONENTS_SHOWN eigenvalues and eigenvectors for interpretation.
eigenvalues[:N_COMPONENTS_SHOWN], eigenvectors[:, :N_COMPONENTS_SHOWN]


(array([7.9091255 , 3.20874356, 2.57725775, 2.02717987, 1.49148846,
        1.19454129, 1.15265956, 1.13406704, 1.11011297, 1.09568502,
        1.05118621, 1.02259199, 1.0120192 , 0.95633808, 0.91882154,
        0.89970425, 0.84539057, 0.81313263, 0.78590357, 0.73936518,
        0.67437586, 0.62474974, 0.58736304, 0.54073096, 0.43020133,
        0.40171418, 0.32250384, 0.29125728, 0.2635927 , 0.24015599,
        0.19520206, 0.15170252, 0.14073883, 0.12215726, 0.09359999]),
 array([[-0.01793272, -0.16539772,  0.2244277 , ...,  0.00079744,
          0.08720987, -0.0427887 ],
        [ 0.15132268,  0.00949372, -0.27242654, ...,  0.00164002,
          0.06488731,  0.0118214 ],
        [ 0.10245635,  0.02990583, -0.27845216, ...,  0.03728009,
         -0.02094389,  0.00382695],
        ...,
        [ 0.02052032, -0.02895691,  0.0048399 , ...,  0.02255729,
          0.00877003, -0.00521361],
        [-0.01246075,  0.03646642, -0.00154245, ...,  0.00997944,
          0.00287591,  0.00499417],

In [5]:
# How many of the strongest contributors to the first component to list.
N_TOP_FEATURES = 35

# First principal component: the eigenvector in column 0 (the eigenvectors
# were sorted so that the largest eigenvalue comes first).
first_pc = eigenvectors[:, 0]

# Rank features by the absolute size of their loading on that component,
# largest first, and keep the leading N_TOP_FEATURES positions.
top_features_indices = np.argsort(np.abs(first_pc))[::-1][:N_TOP_FEATURES]

# Map the positions back to column names of the frame the PCA was fitted on.
top_features = numerical_df.columns[top_features_indices]

top_features


Index(['SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea',
       'TotalBsmtSF', 'FullBath', '1stFlrSF', 'TotRmsAbvGrd', 'YearBuilt',
       'GarageYrBlt', 'YearRemodAdd', 'MasVnrArea', 'Fireplaces',
       'LotFrontage', 'OpenPorchSF', 'BsmtFinSF1', 'WoodDeckSF', '2ndFlrSF',
       'HalfBath', 'BsmtUnfSF', 'LotArea', 'BedroomAbvGr', 'BsmtFullBath',
       'OverallCond', 'EnclosedPorch', 'PoolArea', 'ScreenPorch',
       'KitchenAbvGr', 'MoSold', 'MSSubClass', '3SsnPorch', 'YrSold',
       'BsmtHalfBath', 'BsmtFinSF2'],
      dtype='object')

In [6]:
# Share of the total variance carried by each component, and its running total.
explained_variance_ratio = eigenvalues / eigenvalues.sum()
cumulative_variance = explained_variance_ratio.cumsum()

# argmax on a boolean mask gives the position of its first True, i.e. the
# first component at which the running total reaches 95%; adding 1 turns
# that zero-based position into a component count.
meets_target = cumulative_variance >= 0.95
num_components_95 = np.argmax(meets_target) + 1

num_components_95


26