In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
import plotly.express as px

In [2]:
# Load the training CSV into a DataFrame; the relative path is resolved
# against the kernel's current working directory.
df = pd.read_csv(filepath_or_buffer='../data/train_wbcd.csv')

In [3]:
# Per-column completeness check: the fraction of missing entries is compared
# with zero, so True means the column has no NaNs at all.
# The Series is deliberately left wrapped in a one-element list, as before.
[df.isna().sum().div(len(df)).eq(0)]

[Patient_ID     True
 Diagnosis      True
 f1             True
 f2             True
 f3             True
 f4             True
 f5             True
 f6             True
 f7             True
 f8             True
 f9             True
 f10            True
 f11            True
 f12            True
 f13            True
 f14            True
 f15            True
 f16            True
 f17            True
 f18            True
 f19            True
 f20            True
 f21           False
 f22            True
 f23            True
 f24            True
 f25            True
 f26            True
 f27            True
 f28            True
 f29            True
 f30            True
 dtype: bool]

In [4]:
# Build the feature matrix by removing the Diagnosis and Patient_ID columns
# (presumably the target label and a row identifier -- confirm against the data dictionary).
X = df.drop(columns=['Diagnosis', 'Patient_ID'])

In [5]:
# Fill missing feature values with the per-column mean (the NaN check above
# flags f21 as the only incomplete column).
# SimpleImputer returns a bare NumPy array, so the result is wrapped back into
# a DataFrame. Both the column names AND the row index are carried over, so
# the rows stay aligned with `df` even if its index is not the default 0..n-1.
imputer = SimpleImputer(strategy='mean')
X = pd.DataFrame(
    imputer.fit_transform(X),
    columns=X.columns,
    index=X.index,
)

In [6]:
# Display the feature matrix after mean imputation.
X

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,...,f21,f22,f23,f24,f25,f26,f27,f28,f29,f30
0,14.02,15.66,89.59,606.5,0.07966,0.05581,0.02087,0.02652,0.1589,0.05586,...,14.91,19.31,96.53,688.9,0.10340,0.10170,0.06260,0.08216,0.2136,0.06710
1,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,22.54,16.67,152.20,1575.0,0.13740,0.20500,0.40000,0.16250,0.2364,0.07678
2,12.89,15.70,84.08,516.6,0.07818,0.09580,0.11150,0.03390,0.1432,0.05935,...,13.90,19.69,92.12,595.6,0.09926,0.23170,0.33440,0.10170,0.1999,0.07127
3,10.26,12.22,65.75,321.6,0.09996,0.07542,0.01923,0.01968,0.1800,0.06569,...,11.38,15.65,73.23,394.5,0.13430,0.16500,0.08615,0.06696,0.2937,0.07722
4,13.16,20.54,84.06,538.7,0.07335,0.05275,0.01800,0.01256,0.1713,0.05888,...,14.50,28.46,95.29,648.3,0.11180,0.16460,0.07698,0.04195,0.2687,0.07429
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,25.22,24.91,171.50,1878.0,0.10630,0.26650,0.33390,0.18450,0.1829,0.06782,...,30.00,33.62,211.70,2562.0,0.15730,0.60760,0.64760,0.28670,0.2355,0.10510
96,21.09,26.57,142.70,1311.0,0.11410,0.28320,0.24870,0.14960,0.2395,0.07398,...,26.68,33.48,176.50,2089.0,0.14910,0.75840,0.67800,0.29030,0.4098,0.12840
97,11.85,17.46,75.54,432.7,0.08372,0.05642,0.02688,0.02280,0.1875,0.05715,...,13.06,25.75,84.35,517.8,0.13690,0.17580,0.13160,0.09140,0.3101,0.07007
98,11.20,29.37,70.67,386.0,0.07449,0.03558,0.00000,0.00000,0.1060,0.05502,...,11.92,38.30,75.19,439.6,0.09267,0.05494,0.00000,0.00000,0.1566,0.05905


In [7]:
# Standardise every feature before PCA: PCA is driven by variance, so
# unscaled features with large numeric ranges would dominate the components.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Project the standardised features onto the first three principal components.
dim_red = PCA(n_components=3)
X_reduced = dim_red.fit_transform(X_scaled)

In [8]:
# Wrap the projected coordinates in a DataFrame with one named column per
# retained component, so later cells can refer to them by name.
pca_df = pd.DataFrame(X_reduced, columns=['PC1', 'PC2', 'PC3'])

In [9]:
# Display the samples expressed in principal-component coordinates.
pca_df

Unnamed: 0,PC1,PC2,PC3
0,-3.308300,-2.243232,-0.682648
1,3.834441,-1.120468,0.512224
2,-1.396165,0.071686,0.791181
3,-3.568554,0.847258,-1.870195
4,-2.872167,-1.157155,1.228936
...,...,...,...
95,10.425409,-1.847684,0.687194
96,8.852651,0.602500,-1.728561
97,-2.929528,-0.524457,-0.333030
98,-5.273610,-2.745657,4.437311


In [10]:
# Interactive 3-D scatter of the samples, one axis per principal component.
px.scatter_3d(pca_df, x='PC1', y='PC2', z='PC3')

In [11]:
# Fraction of the total variance captured by each of the three fitted components.
dim_red.explained_variance_ratio_

array([0.50165299, 0.16554235, 0.09865804])

**Compute Explained Variance Ratio**:
   - The fraction of the total variance explained by a principal component is its eigenvalue divided by the sum of all eigenvalues.
   - The explained variance ratio for principal component $i$ is calculated as:
   $$
   \text{Explained Variance Ratio}_i = \frac{\lambda_i}{\sum_{j=1}^{p} \lambda_j}
   $$
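   where $\lambda_i$ is the eigenvalue of the $i$-th principal component, and $p$ is the total number of principal components (all of them, not only the 3 retained above). Here the three retained components together explain about 77% of the total variance ($0.5017 + 0.1655 + 0.0987 \approx 0.766$).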

In [13]:
# Pairwise Pearson correlation between the component scores, rounded to two
# decimals so floating-point noise around zero does not clutter the table.
pca_df.corr(method='pearson').round(decimals=2)

Unnamed: 0,PC1,PC2,PC3
PC1,1.0,0.0,0.0
PC2,0.0,1.0,0.0
PC3,0.0,0.0,1.0


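The correlation matrix shows that the three principal components are pairwise uncorrelated (every off-diagonal entry is 0). This is expected: PCA constructs its components to be orthogonal, so their scores are uncorrelated by design.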