## Here I'm going to do a simple experiment to check the difference between PCs across full data and test + train data

In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [2]:
# load the data set
fgl = pd.read_csv("https://www.dropbox.com/scl/fi/4m46hl6wul60pjhvmgqkl/fgl.csv?rlkey=0q33wenv2tqxpy3ldypptqeyj&dl=1")

In [3]:
print(fgl)

       RI     Na    Mg    Al     Si     K    Ca    Ba   Fe  type
0    3.01  13.64  4.49  1.10  71.78  0.06  8.75  0.00  0.0  WinF
1   -0.39  13.89  3.60  1.36  72.73  0.48  7.83  0.00  0.0  WinF
2   -1.82  13.53  3.55  1.54  72.99  0.39  7.78  0.00  0.0  WinF
3   -0.34  13.21  3.69  1.29  72.61  0.57  8.22  0.00  0.0  WinF
4   -0.58  13.27  3.62  1.24  73.08  0.55  8.07  0.00  0.0  WinF
..    ...    ...   ...   ...    ...   ...   ...   ...  ...   ...
209 -1.77  14.14  0.00  2.88  72.61  0.08  9.18  1.06  0.0  Head
210 -1.15  14.92  0.00  1.99  73.06  0.00  8.40  1.59  0.0  Head
211  2.65  14.36  0.00  2.02  73.42  0.00  8.44  1.64  0.0  Head
212 -1.49  14.38  0.00  1.94  73.61  0.00  8.48  1.57  0.0  Head
213 -0.89  14.23  0.00  2.08  73.36  0.00  8.62  1.67  0.0  Head

[214 rows x 10 columns]


#### We have 214 observations. Let's assume the first 114 obs are training data and the rest 100 are testing data set. And remove the last column, we just focus on the first 9 variables

In [4]:
# Split the data into test and train
train = fgl.iloc[0:114, 0:9] 
print(len(train))
test = fgl.iloc[-100:, 0:9]
print(len(test))

114
100


In [5]:
# Perform PCA on the train and test sets
scaler = StandardScaler()
PC_train = PCA().fit_transform(scaler.fit_transform(train))
PC_test = PCA().fit_transform(scaler.transform(test))

In [6]:
# Combine PC_test and PC_train
PC_combined = np.vstack((PC_test, PC_train))
len(PC_combined)

214

In [7]:
# Perform PCA on the entire dataset
PC_all = PCA().fit_transform(scaler.fit_transform(fgl.iloc[:, 0:9]))
len(PC_all)

214

In [8]:
# Extract the first column from PC_all_original and PC_combined
compare = np.column_stack((PC_all[:, 0], PC_combined[:, 0]))

In [9]:
compare

array([[ 1.15113957e+00,  3.21043495e-01],
       [-5.74137172e-01, -3.87678594e-02],
       [-9.40159722e-01,  1.92703864e-01],
       [-1.42083283e-01,  7.25113658e-01],
       [-3.51092291e-01,  6.29691875e-01],
       [-2.90266583e-01,  5.57657527e-01],
       [-2.52671444e-01,  2.66491391e-01],
       [-1.20299466e-01,  6.23814572e-01],
       [-2.08160308e-02,  2.10263856e-01],
       [-2.35222894e-03,  5.60337250e-01],
       [-3.82514937e-01, -7.31097072e-02],
       [-1.33538675e-02,  3.80003343e-01],
       [-3.09724390e-01,  1.72776537e-01],
       [ 1.56521013e-01, -2.26443859e-01],
       [-8.99566628e-02,  1.98866111e-02],
       [-1.15255023e-01, -5.97840787e-01],
       [ 1.73458320e-01, -1.02692263e+00],
       [ 1.45812372e+00, -1.63207703e+00],
       [ 5.47316267e-01,  1.83112350e-01],
       [-3.08817022e-01,  2.10680357e-01],
       [ 2.05673879e-01, -7.02161116e-02],
       [ 1.21605345e+00,  1.45214707e-01],
       [ 4.11679297e-02,  1.90833901e-02],
       [-1.