In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import os


In [2]:
# Read the sample input CSV (path is relative to the working directory)
# and preview its first five rows as the cell output
df = pd.read_csv(filepath_or_buffer="sample_csv/sample_input.csv")
df.head(n=5)


Unnamed: 0,ID,LowVar_Col,MedVar_Col,HighVar_Col,Corr_Col1,Corr_Col2,Corr_Col3,Imp_Feat1,Imp_Feat2_Weak,Imp_Feat3_VeryWeak,Cat_Feat,Target_Reg,Target_Class
0,0,1,,44.268225,0.0,0.008619,1.045502,0.487332,0.087403,0.003338,A,92.911141,0.0
1,1,1,,84.232685,0.010101,0.031015,1.068211,0.383663,0.01624,0.008825,C,178.085463,1.0
2,2,1,,5.850934,0.020202,0.025686,1.036346,0.034567,0.029623,0.003629,B,14.70702,0.0
3,3,1,,48.530212,0.030303,0.077049,0.988921,0.653782,0.096128,0.006716,A,91.956562,0.0
4,4,1,,77.579016,0.040404,0.0526,1.022043,0.051603,0.062867,0.007309,C,153.472354,1.0


In [3]:
# Select only numeric columns; non-numeric ones (e.g. Cat_Feat) are left out
numeric_df = df.select_dtypes(include=['number'])

# Drop columns that have no observed values at all: their mean is NaN, so the
# mean-fill below would leave them entirely NaN and the PCA fit further down
# would fail on NaN input
numeric_df = numeric_df.dropna(axis=1, how='all')

# Fill the remaining missing values with column means
numeric_df = numeric_df.fillna(numeric_df.mean())

# NOTE(review): ID and the Target_* columns are numeric and therefore remain
# part of the PCA input — confirm that this is intended


In [4]:
# Learn the per-column scaling parameters from the imputed numeric frame,
# then apply them to that same frame to obtain the standardized matrix
scaler = StandardScaler().fit(numeric_df)
scaled_data = scaler.transform(numeric_df)


In [5]:
# Fit PCA on the standardized matrix and project it in one step; the
# fractional n_components asks for enough components to retain 95% variance
pca = PCA(n_components=0.95)
pca_result = pca.fit_transform(scaled_data)

# Label the retained components PCA_1 .. PCA_k and wrap them in a DataFrame
pca_columns = ["PCA_" + str(k) for k in range(1, pca_result.shape[1] + 1)]
pca_df = pd.DataFrame(data=pca_result, columns=pca_columns)
pca_df.head()


Unnamed: 0,PCA_1,PCA_2,PCA_3,PCA_4,PCA_5,PCA_6
0,-3.055441,-1.809021,0.282044,0.026464,0.08645,1.354701
1,-3.905348,1.566057,1.149079,-0.447772,0.326796,-1.057213
2,-2.656521,-3.212297,0.207397,-1.275141,-1.09227,-0.727068
3,-2.702501,-1.561189,0.839634,0.659381,1.099572,0.98252
4,-3.451543,0.751091,2.03376,-0.168003,-0.693082,0.175847


In [6]:
# Names of the numeric columns that were fed into the scaler / PCA
print(" Original numeric columns:", numeric_df.columns.tolist(), sep="\n")

# Number of components PCA kept, followed by each component's share of variance
print("\n Number of PCA components retained:", pca.n_components_)
print("\n Explained variance ratio:", pca.explained_variance_ratio_, sep="\n")


 Original numeric columns:
['ID', 'LowVar_Col', 'MedVar_Col', 'HighVar_Col', 'Corr_Col1', 'Corr_Col2', 'Corr_Col3', 'Imp_Feat1', 'Imp_Feat2_Weak', 'Imp_Feat3_VeryWeak', 'Target_Reg', 'Target_Class']

 Number of PCA components retained: 6

 Explained variance ratio:
[0.37893709 0.24409212 0.11366305 0.08720504 0.0751438  0.07089964]


In [7]:
# Make sure the destination directory is present (no error if it already is)
os.makedirs(name="notebook/output", exist_ok=True)

# Write the PCA components to CSV without the row index, then report the path
pca_df.to_csv(path_or_buf="notebook/output/reduced_data.csv", index=False)
print("\n PCA-transformed data saved to: notebook/output/reduced_data.csv")



 PCA-transformed data saved to: notebook/output/reduced_data.csv
