# Nutrition project: Principal component analyses (PCA)

## 1. Install and import the necessary packages and libraries

I already have the most recent versions of **pandas, numpy, seaborn, matplotlib and scikit learn** installed, but you can install them using pip (see pypi.org) or conda install in Anaconda prompt (see anaconda.org). If you get the ImportError: cannot import name 'html5lib' from 'pip._vendor', you can install html5lib in Anaconda prompt (conda install -c anaconda html5lib).

Currently installed versions: 
<br>Pandas 1.4.4
<br>numpy 1.21.5
<br>seaborn 0.12.2
<br>matplotlib 3.5.1
<br>scikit learn 1.1.1

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

## 2. Read csv file into Pandas dataframe

In [2]:
# Load the transformed nutrition dataset into a pandas dataframe
df2 = pd.read_csv('main_transformed.csv')

# Lift the pandas display limits so that neither the rows nor the columns
# are truncated when a dataframe is shown
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

# Preview the first rows of the dataframe
df2.head()

Unnamed: 0.1,Unnamed: 0,id,age,sex,income,items_home,cold_flu,antibiotics,ill_7days,ill_now,exercise_stren,exercise_mod,fruit_juice,fruit,veg,red_meat,chicken,pork,fish,eggs,bread,pap_samp,rice_pasta,dairy,soft_drinks,take_away,smoke,alcohol,avg_systbp,avg_diabp,body_fat,muscle_mass,bmi,visceral_fat,metabolic_age,bfat_rarm,bfat_rleg,bfat_trunk,ecw_percentage,fat_rarm,fat_rleg,fat_trunk,muscle_rarm,muscle_rleg,muscle_trunk,left_grip,right_grip,sitting_height,standing_height,age_log,age_log_capped,exercise,exercise_capped,fruitveg_index,carbs_index,protein_index,junkfood_index,pork_log,pork_log_capped,fish_log,fish_log_capped,avg_systbp2,avg_systbp_capped,avg_diabp2,avg_diabp_capped,muscle_mass_capped,bmi_log,bmi_log_capped,visceral_fat_log,ecw_percentage_capped,avg_grip,avg_grip_capped,sitting_height_capped,standing_height_capped,age_scaled,cold_flu_scaled,antibiotics_scaled,fruit_juice_scaled,fruit_scaled,veg_scaled,red_meat_scaled,chicken_scaled,pork_scaled,fish_scaled,eggs_scaled,bread_scaled,pap_samp_scaled,rice_pasta_scaled,dairy_scaled,soft_drinks_scaled,take_away_scaled,smoke_scaled,alcohol_scaled,body_fat_scaled,muscle_mass_scaled,bmi_scaled,visceral_fat_scaled,metabolic_age_scaled,bfat_rarm_scaled,bfat_rleg_scaled,bfat_trunk_scaled,ecw_percentage_scaled,fat_rarm_scaled,fat_rleg_scaled,fat_trunk_scaled,muscle_rarm_scaled,muscle_rleg_scaled,muscle_trunk_scaled,sitting_height_scaled,standing_height_scaled,age_log_capped_scaled,exercise_scaled,exercise_capped_scaled,fruitveg_index_scaled,carbs_index_scaled,protein_index_scaled,junkfood_index_scaled,pork_log_capped_scaled,fish_log_capped_scaled,avg_systbp2_scaled,avg_systbp_capped_scaled,avg_diabp2_scaled,avg_diabp_capped_scaled,muscle_mass_capped_scaled,bmi_log_capped_scaled,visceral_fat_log_scaled,ecw_percentage_capped_scaled,avg_grip_scaled,avg_grip_capped_scaled,sitting_height_capped_scaled,standing_height_capped_scaled,infections_index,infections_index2,infections_index_log,infections_index_scaled,infections
_index_log_scaled
0,0,1,20.8,1,2,7,1,0,False,False,0,1,2,5,8,3,3,0,0,1,6,4,4,8,6,3,False,True,119.0,71.5,3.2,51.7,18.1,1,12,7.0,2.0,3.0,38.4,0.2,0.2,0.9,3.0,9.8,26.6,50,50,131.5,176.4,3.034953,3.034953,1,1.0,15,14,7,9,0.0,0.0,0.0,0.0,119.0,119.0,71.5,71.5,51.7,2.895912,2.895912,0.0,38.4,50.0,50.0,131.5,176.4,0.258929,0.1,0.0,0.25,0.625,1.0,0.375,0.375,0.0,0.0,0.125,0.75,0.5,0.5,1.0,0.75,0.375,0.0,1.0,0.0,0.498851,0.111913,0.0,0.0,0.031311,0.0,0.0,0.313433,0.0,0.0,0.0,0.545455,0.60241,0.465753,0.746388,0.581395,0.368216,0.047619,0.051477,0.714286,0.583333,0.285714,0.5625,0.0,0.0,0.587629,0.494248,0.428571,0.436718,0.515411,0.189773,0.0,0.216739,0.72449,0.72449,0.626139,0.581395,0.1,1.1,0.09531,0.027778,0.062455
1,1,2,20.6,0,2,9,2,0,False,False,0,7,0,0,5,0,5,0,0,6,8,6,6,4,2,5,False,True,119.0,74.0,20.3,36.2,17.4,1,12,18.1,29.2,14.3,40.6,0.4,2.6,3.7,1.7,6.0,21.0,23,26,129.9,165.8,3.025291,3.025291,7,7.0,5,20,11,7,0.0,0.0,0.0,0.0,119.0,119.0,74.0,74.0,36.2,2.85647,2.85647,0.0,40.6,24.5,24.5,129.9,165.8,0.241071,0.2,0.0,0.0,0.0,0.625,0.0,0.625,0.0,0.0,0.75,1.0,0.75,0.75,0.5,0.25,0.625,0.0,1.0,0.372549,0.142529,0.086643,0.0,0.0,0.248532,0.509363,0.261574,0.477612,0.052632,0.235294,0.124444,0.151515,0.144578,0.210046,0.720706,0.375969,0.344522,0.333333,0.360341,0.238095,0.833333,0.47619,0.4375,0.0,0.0,0.587629,0.494248,0.468254,0.485051,0.14726,0.14993,0.0,0.404041,0.204082,0.204082,0.588279,0.375969,0.2,1.2,0.182322,0.055556,0.119472
2,2,3,19.9,0,2,4,1,1,True,True,0,7,0,2,4,1,1,0,4,0,8,0,8,3,1,2,False,False,101.5,63.0,20.0,33.3,18.5,1,12,22.5,30.1,11.8,40.9,0.5,2.6,2.7,1.5,5.7,19.1,22,24,123.2,154.0,2.99072,2.99072,7,7.0,6,16,6,3,0.0,0.0,1.609438,0.752575,101.5,101.5,63.0,63.0,33.3,2.917771,2.917771,0.0,40.9,23.0,23.0,123.2,154.0,0.178571,0.1,0.1,0.0,0.25,0.5,0.125,0.125,0.0,0.5,0.0,1.0,0.0,1.0,0.375,0.125,0.25,0.0,0.0,0.366013,0.075862,0.126354,0.0,0.0,0.334638,0.526217,0.203704,0.5,0.078947,0.235294,0.08,0.090909,0.108434,0.123288,0.613162,0.147287,0.259744,0.333333,0.360341,0.285714,0.666667,0.238095,0.1875,0.0,1.0,0.407216,0.270092,0.293651,0.272385,0.078381,0.211854,0.0,0.429582,0.173469,0.173469,0.429743,0.147287,2.2,3.2,1.163151,0.611111,0.762194
3,3,4,23.1,0,2,9,1,0,True,False,3,5,0,8,2,0,4,0,0,3,1,0,5,4,1,0,False,False,117.0,78.0,39.6,42.9,29.6,6,38,45.4,42.6,36.4,44.8,1.7,5.7,14.8,1.9,7.3,24.6,31,35,124.5,159.0,3.139833,3.139833,11,11.0,10,6,7,1,0.0,0.0,0.0,0.0,117.0,117.0,78.0,78.0,42.9,3.387774,3.387774,1.791759,44.8,33.0,33.0,124.5,159.0,0.464286,0.1,0.0,0.0,1.0,0.25,0.0,0.5,0.0,0.0,0.375,0.125,0.0,0.625,0.5,0.125,0.0,0.0,0.0,0.793028,0.296552,0.527076,0.357143,0.83871,0.782779,0.7603,0.773148,0.791045,0.394737,0.539216,0.617778,0.212121,0.301205,0.374429,0.634029,0.244186,0.625411,0.52381,0.56625,0.47619,0.25,0.285714,0.0625,0.0,0.0,0.56701,0.46863,0.531746,0.562385,0.306397,0.686639,0.661642,0.761616,0.377551,0.377551,0.460504,0.244186,1.1,2.1,0.741937,0.305556,0.48618
4,4,5,22.5,0,1,9,1,1,False,False,0,5,4,7,3,4,5,0,2,2,8,5,2,0,5,6,False,False,118.5,83.0,24.6,42.0,22.3,1,15,23.0,35.1,17.5,40.8,0.7,3.8,5.5,2.1,6.6,24.8,24,30,127.5,162.5,3.113515,3.113515,5,5.0,14,15,13,11,0.0,0.0,1.098612,0.752575,118.5,118.5,83.0,83.0,42.0,3.104587,3.104587,0.0,40.8,27.0,27.0,127.5,162.5,0.410714,0.1,0.1,0.5,0.875,0.375,0.5,0.625,0.0,0.25,0.25,1.0,0.625,0.25,0.0,0.625,0.75,0.0,0.0,0.466231,0.275862,0.263538,0.0,0.096774,0.344423,0.61985,0.335648,0.492537,0.131579,0.352941,0.204444,0.272727,0.216867,0.383562,0.682183,0.312016,0.560873,0.238095,0.257386,0.666667,0.625,0.571429,0.6875,0.0,1.0,0.582474,0.487844,0.611111,0.659051,0.28502,0.400571,0.0,0.421068,0.255102,0.255102,0.53149,0.312016,0.2,1.2,0.182322,0.055556,0.119472


## 3. Principal component analysis (PCA)

The body composition/bio-impedance measures still contain too many variables to be useful as predictors in the downstream analysis, so I will try to reduce the number of variables using PCA. Although I have already reduced the number of diet variables through the calculation of food group indexes (see nutrition-data-wrangling), I also want to evaluate PCA as a means of reducing the number of diet variables.

### 3.1 Body composition/ bio-impedance

The body composition variables initially consisted of multiple highly correlated variables. I already (a) dropped the most highly correlated body composition variables and (b) determined that we need to include standing_height, sitting_height and avg_grip in this group because of their high correlations (see nutrition-data-wrangling). 

Here, I will use PCA to reduce the number of dimensions further for this to be a useful measure of health. 

In [3]:
# Body-composition columns that go into the PCA: the standardized, minimally
# transformed variables (see pca-compare in the compare repository for a
# detailed explanation of this choice)
bodycomp_columns = [
    'body_fat_scaled',
    'muscle_mass_scaled',
    'bmi_scaled',
    'visceral_fat_scaled',
    'metabolic_age_scaled',
    'bfat_rarm_scaled',
    'bfat_rleg_scaled',
    'bfat_trunk_scaled',
    'ecw_percentage_scaled',
    'fat_rarm_scaled',
    'fat_rleg_scaled',
    'fat_trunk_scaled',
    'muscle_rarm_scaled',
    'muscle_rleg_scaled',
    'muscle_trunk_scaled',
    'avg_grip_scaled',
    'sitting_height_scaled',
    'standing_height_scaled',
]
df2_bodycomp = df2.loc[:, bodycomp_columns]

In [4]:
# Conduct the PCA: fit on the body-composition variables and project every
# row onto the principal components
pca_bodycomp = PCA(n_components=2)  # Specify the number of principal components you want
principalComponents = pca_bodycomp.fit_transform(df2_bodycomp)

# Create the df that will contain all the principal component values.
# The row index of the input is carried over explicitly: the scores are later
# concatenated column-wise onto df2, and pd.concat(axis=1) aligns on the index,
# so a fresh 0..n-1 index would misalign rows whenever df2 is not default-indexed.
principal_bodycomp_Df = pd.DataFrame(
    data=principalComponents,
    index=df2_bodycomp.index,
    columns=['pc1_bodycomp', 'pc2_bodycomp'],
)

# Check the results
print(principal_bodycomp_Df.head())

   pc1_bodycomp  pc2_bodycomp
0     -1.106312      0.227749
1     -0.318955     -0.635933
2     -0.252522     -0.825377
3      1.085679     -0.022105
4     -0.062037     -0.399174


In [5]:
# Report the share of the total variance captured by each principal component
bodycomp_variance_ratio = pca_bodycomp.explained_variance_ratio_
print('Explained variation per principal component: {}'.format(bodycomp_variance_ratio))

Explained variation per principal component: [0.5657224 0.3067538]


In [6]:
# Calculate the total variance explained.
# The sum is taken from the fitted model rather than from numbers copied out of
# an earlier cell's output, so it stays correct if the data or the number of
# components changes.
print('Total explained variance:')
print(pca_bodycomp.explained_variance_ratio_.sum())

# At the time of writing, the first two PCs explained 87.25% of the variance
# A third PC for the standardized, minimally transformed variables only explained an additional 3.3% (results not shown) so we stuck with 2 PCs

Total explained variance:
0.8724761999999999


In [8]:
# Get the loadings on each of the principal components
# Define feature_names from the dataframe the PCA was fitted on, so the row
# labels of the loading matrix always match the fitted columns (and their order)
# instead of relying on a second, hand-maintained copy of the column list
feature_names = list(df2_bodycomp.columns)

# Get loadings: the principal axes (components_) scaled by the square root of
# the variance explained by each component
loadings = pca_bodycomp.components_.T * np.sqrt(pca_bodycomp.explained_variance_)
loading_matrix = pd.DataFrame(loadings, columns=['pc1_bodycomp', 'pc2_bodycomp'], index=feature_names)
print(loading_matrix.sort_values(by=['pc1_bodycomp'], ascending=False))

# PC1 can be referred to as the fat component, with higher values on this component indicating body types that are higher in body fat & metabolic age and lower in strength, muscle and height
# PC2 can be referred to as the muscular component, with higher values indicating more strength, height and a stronger grip, with lower body fat

                        pc1_bodycomp  pc2_bodycomp
metabolic_age_scaled        0.268125      0.122024
bfat_rarm_scaled            0.230172     -0.043413
bfat_rleg_scaled            0.223073     -0.087473
body_fat_scaled             0.214328     -0.020238
bfat_trunk_scaled           0.189577      0.042052
fat_rleg_scaled             0.186335     -0.002375
fat_trunk_scaled            0.184052      0.093446
bmi_scaled                  0.141523      0.076207
fat_rarm_scaled             0.140274      0.040363
visceral_fat_scaled         0.125543      0.096235
ecw_percentage_scaled       0.121946     -0.020997
muscle_trunk_scaled         0.002112      0.169459
sitting_height_scaled      -0.003007      0.029785
muscle_mass_scaled         -0.025591      0.189866
muscle_rleg_scaled         -0.052942      0.197714
muscle_rarm_scaled         -0.067115      0.199110
standing_height_scaled     -0.074692      0.125828
avg_grip_scaled            -0.098255      0.119604


In [9]:
# Add the PCs to the original dataframe (concatenate).
# Any PC columns left over from a previous run of this cell are dropped first,
# so re-running the cell replaces them instead of appending duplicate
# 'pc1_bodycomp' / 'pc2_bodycomp' columns to df2.
df2 = pd.concat(
    [
        df2.drop(columns=principal_bodycomp_Df.columns, errors='ignore'),
        principal_bodycomp_Df,
    ],
    axis=1,
)
print('shape', df2.shape)
df2.tail()


shape (270, 138)


Unnamed: 0.1,Unnamed: 0,id,age,sex,income,items_home,cold_flu,antibiotics,ill_7days,ill_now,exercise_stren,exercise_mod,fruit_juice,fruit,veg,red_meat,chicken,pork,fish,eggs,bread,pap_samp,rice_pasta,dairy,soft_drinks,take_away,smoke,alcohol,avg_systbp,avg_diabp,body_fat,muscle_mass,bmi,visceral_fat,metabolic_age,bfat_rarm,bfat_rleg,bfat_trunk,ecw_percentage,fat_rarm,fat_rleg,fat_trunk,muscle_rarm,muscle_rleg,muscle_trunk,left_grip,right_grip,sitting_height,standing_height,age_log,age_log_capped,exercise,exercise_capped,fruitveg_index,carbs_index,protein_index,junkfood_index,pork_log,pork_log_capped,fish_log,fish_log_capped,avg_systbp2,avg_systbp_capped,avg_diabp2,avg_diabp_capped,muscle_mass_capped,bmi_log,bmi_log_capped,visceral_fat_log,ecw_percentage_capped,avg_grip,avg_grip_capped,sitting_height_capped,standing_height_capped,age_scaled,cold_flu_scaled,antibiotics_scaled,fruit_juice_scaled,fruit_scaled,veg_scaled,red_meat_scaled,chicken_scaled,pork_scaled,fish_scaled,eggs_scaled,bread_scaled,pap_samp_scaled,rice_pasta_scaled,dairy_scaled,soft_drinks_scaled,take_away_scaled,smoke_scaled,alcohol_scaled,body_fat_scaled,muscle_mass_scaled,bmi_scaled,visceral_fat_scaled,metabolic_age_scaled,bfat_rarm_scaled,bfat_rleg_scaled,bfat_trunk_scaled,ecw_percentage_scaled,fat_rarm_scaled,fat_rleg_scaled,fat_trunk_scaled,muscle_rarm_scaled,muscle_rleg_scaled,muscle_trunk_scaled,sitting_height_scaled,standing_height_scaled,age_log_capped_scaled,exercise_scaled,exercise_capped_scaled,fruitveg_index_scaled,carbs_index_scaled,protein_index_scaled,junkfood_index_scaled,pork_log_capped_scaled,fish_log_capped_scaled,avg_systbp2_scaled,avg_systbp_capped_scaled,avg_diabp2_scaled,avg_diabp_capped_scaled,muscle_mass_capped_scaled,bmi_log_capped_scaled,visceral_fat_log_scaled,ecw_percentage_capped_scaled,avg_grip_scaled,avg_grip_capped_scaled,sitting_height_capped_scaled,standing_height_capped_scaled,infections_index,infections_index2,infections_index_log,infections_index_scaled,infections
_index_log_scaled,pc1_bodycomp,pc2_bodycomp
265,265,283,22.8,1,3,9,3,3,False,False,0,2,4,0,0,0,2,0,0,6,3,6,0,6,4,1,False,False,145.0,70.5,14.9,52.9,22.1,2,15,12.2,11.6,17.7,40.1,0.5,1.3,6.2,3.2,9.7,27.5,50,44,128.0,172.0,3.126761,3.126761,2,2.0,4,9,8,5,0.0,0.0,0.0,0.0,145.0,145.0,70.5,70.5,52.9,3.095578,3.095578,0.693147,40.1,47.0,47.0,128.0,172.0,0.4375,0.3,0.3,0.5,0.0,0.0,0.0,0.25,0.0,0.0,0.75,0.375,0.75,0.0,0.75,0.5,0.125,0.0,0.0,0.254902,0.526437,0.256318,0.071429,0.096774,0.133072,0.179775,0.340278,0.440299,0.078947,0.107843,0.235556,0.606061,0.590361,0.506849,0.690209,0.496124,0.593355,0.095238,0.102954,0.190476,0.375,0.333333,0.3125,0.0,0.0,0.85567,0.827281,0.412698,0.417385,0.543914,0.39147,0.255958,0.361472,0.663265,0.663265,0.543321,0.496124,0.6,1.6,0.470004,0.166667,0.307986,-0.586217,0.318281
266,266,284,19.5,0,1,8,2,6,False,False,0,0,0,1,2,0,2,0,0,4,5,0,2,3,1,1,False,False,111.0,75.0,25.1,54.4,24.1,5,34,22.5,17.2,29.8,42.5,1.0,2.3,12.5,3.1,10.3,28.0,22,29,134.0,178.0,2.970414,2.970414,0,0.0,3,7,6,2,0.0,0.0,0.0,0.0,111.0,111.0,75.0,75.0,54.4,3.182212,3.182212,1.609438,42.5,25.5,25.5,134.0,178.0,0.142857,0.2,0.6,0.0,0.125,0.25,0.0,0.25,0.0,0.0,0.5,0.625,0.0,0.25,0.375,0.125,0.125,0.0,0.0,0.477124,0.56092,0.32852,0.285714,0.709677,0.334638,0.284644,0.62037,0.619403,0.210526,0.205882,0.515556,0.575758,0.662651,0.52968,0.786517,0.612403,0.209949,0.0,0.0,0.142857,0.291667,0.238095,0.125,0.0,0.0,0.505155,0.391777,0.484127,0.504385,0.579541,0.478985,0.594316,0.565801,0.22449,0.22449,0.685294,0.612403,0.8,1.8,0.587787,0.222222,0.385167,0.207533,0.531205
267,267,285,18.8,0,1,9,0,0,False,False,0,4,0,0,1,0,1,1,0,3,3,0,1,5,2,3,False,True,106.0,70.5,23.2,38.3,19.3,1,12,27.0,32.5,17.0,40.6,0.6,3.1,4.9,1.6,6.1,22.9,30,31,125.0,165.0,2.933857,2.933857,4,4.0,1,4,5,5,0.693147,0.693147,0.0,0.0,106.0,106.0,70.5,70.5,38.3,2.960105,2.960105,0.0,40.6,30.5,30.5,125.0,165.0,0.080357,0.0,0.0,0.0,0.0,0.125,0.0,0.125,0.2,0.0,0.375,0.375,0.0,0.125,0.625,0.25,0.375,0.0,1.0,0.43573,0.190805,0.155235,0.0,0.0,0.422701,0.571161,0.324074,0.477612,0.105263,0.284314,0.177778,0.121212,0.156627,0.296804,0.642055,0.360465,0.1203,0.190476,0.205909,0.047619,0.166667,0.190476,0.3125,0.921034,0.0,0.453608,0.327732,0.412698,0.417385,0.197139,0.254619,0.0,0.404041,0.326531,0.326531,0.472335,0.360465,0.0,1.0,0.0,0.0,0.0,-0.156658,-0.570512
268,268,286,20.5,0,1,9,4,0,False,False,0,0,2,1,0,0,3,1,0,1,3,0,2,1,4,3,False,True,104.0,73.5,15.9,47.8,22.0,2,13,16.6,8.0,20.2,40.4,0.6,0.9,6.4,2.6,9.6,24.1,46,49,126.0,165.0,3.020425,3.020425,0,0.0,3,5,5,7,0.693147,0.693147,0.0,0.0,104.0,104.0,73.5,73.5,47.8,3.091042,3.091042,0.693147,40.4,47.5,47.5,126.0,165.0,0.232143,0.4,0.0,0.25,0.125,0.0,0.0,0.375,0.2,0.0,0.125,0.375,0.0,0.25,0.125,0.5,0.375,0.0,1.0,0.276688,0.409195,0.252708,0.071429,0.032258,0.219178,0.11236,0.398148,0.462687,0.105263,0.068627,0.244444,0.424242,0.578313,0.351598,0.658106,0.360465,0.332589,0.0,0.0,0.142857,0.208333,0.190476,0.4375,0.921034,0.0,0.43299,0.302114,0.460317,0.475385,0.42278,0.386889,0.255958,0.387013,0.673469,0.673469,0.495997,0.360465,0.4,1.4,0.336472,0.111111,0.220485,-0.542786,0.094884
269,269,290,20.79145,1,1,8,0,0,False,False,0,3,0,1,3,5,6,3,0,0,4,0,6,7,7,8,False,False,109.0,80.0,12.0,46.5,19.3,1,12,10.6,6.2,15.6,40.8,0.3,0.6,4.6,2.7,9.1,23.6,40,41,126.0,170.0,3.034542,3.034542,3,3.0,4,10,14,15,1.386294,0.752575,0.0,0.0,109.0,109.0,80.0,80.0,46.5,2.960105,2.960105,0.0,40.8,40.5,40.5,126.0,170.0,0.258165,0.0,0.0,0.0,0.125,0.375,0.625,0.75,0.6,0.0,0.0,0.5,0.0,0.75,0.875,0.875,1.0,0.0,0.0,0.191721,0.37931,0.155235,0.0,0.0,0.101761,0.078652,0.291667,0.492537,0.026316,0.039216,0.164444,0.454545,0.518072,0.328767,0.658106,0.457364,0.367208,0.142857,0.154432,0.190476,0.416667,0.619048,0.9375,1.0,0.0,0.484536,0.366159,0.563492,0.601051,0.391903,0.254619,0.0,0.421068,0.530612,0.530612,0.495997,0.457364,0.0,1.0,0.0,0.0,0.0,-0.733382,0.00152


### 3.2 Diet

In [10]:
# Two candidate sets of diet variables are compared (see pca-compare in the
# compare repository for a detailed explanation):
#   - standardized, minimally transformed [preferred option]
#   - unstandardized, fully transformed [option that explained the most variance for diet]

# Option 1: Minimally transformed + scaled variables
diet1_columns = [
    'fruit_juice_scaled',
    'fruit_scaled',
    'veg_scaled',
    'red_meat_scaled',
    'chicken_scaled',
    'pork_scaled',
    'fish_scaled',
    'eggs_scaled',
    'bread_scaled',
    'pap_samp_scaled',
    'rice_pasta_scaled',
    'dairy_scaled',
    'soft_drinks_scaled',
    'take_away_scaled',
]
df2_diet1 = df2.loc[:, diet1_columns]

# Option 2: Fully transformed (not scaled)
diet2_columns = [
    'fruit_juice',
    'fruit',
    'veg',
    'red_meat',
    'chicken',
    'pork_log_capped',
    'fish_log_capped',
    'eggs',
    'bread',
    'pap_samp',
    'rice_pasta',
    'dairy',
    'soft_drinks',
    'take_away',
]
df2_diet2 = df2.loc[:, diet2_columns]

In [12]:
def _fit_diet_pca(diet_frame, score_columns):
    """Fit a PCA with one component per name in `score_columns` on `diet_frame`.

    Returns a tuple of (fitted PCA model, array of component scores,
    dataframe of the scores labelled with `score_columns`).
    """
    model = PCA(n_components=len(score_columns))
    scores = model.fit_transform(diet_frame)
    scores_frame = pd.DataFrame(data=scores, columns=score_columns)
    return model, scores, scores_frame


# Conduct the PCA (option1) and check the results
pca_diet1, principalComponents1, principal_diet_Df1 = _fit_diet_pca(
    df2_diet1, ['pc1_diet1', 'pc2_diet1', 'pc3_diet1', 'pc4_diet1']
)
print('Option1')
print(principal_diet_Df1.head())

#-------------------------------------------------------
# Conduct the PCA (option2) and check the results
pca_diet2, principalComponents2, principal_diet_Df2 = _fit_diet_pca(
    df2_diet2, ['pc1_diet2', 'pc2_diet2', 'pc3_diet2', 'pc4_diet2']
)
print('Option2')
print(principal_diet_Df2.head())

Option1
   pc1_diet1  pc2_diet1  pc3_diet1  pc4_diet1
0   0.506757  -0.322999  -0.182482  -0.217031
1   0.307246  -0.094914   0.644987   0.134811
2  -0.152738  -0.128810   0.613115  -0.329105
3  -0.549241  -0.500114  -0.095544  -0.299933
4   0.412846   0.093609  -0.503231   0.203955
Option2
   pc1_diet2  pc2_diet2  pc3_diet2  pc4_diet2
0   4.237939  -2.640248  -1.283815  -1.777284
1   2.545485  -0.731722   5.286394   1.028436
2  -1.462814  -0.646256   4.731678  -2.598356
3  -4.266221  -4.016122  -0.723556  -2.423082
4   3.412352   0.926528  -3.966591   1.619811


In [14]:
# Report, for each diet option, the share of variance captured per principal component
for option_label, fitted_pca in (('Option1', pca_diet1), ('Option2', pca_diet2)):
    print(option_label)
    print('Explained variation per principal component: {}'.format(fitted_pca.explained_variance_ratio_))

Option1
Explained variation per principal component: [0.18560463 0.15090166 0.11525822 0.08610095]
Option2
Explained variation per principal component: [0.19543547 0.15977634 0.12252068 0.09185746]


In [15]:
# Calculate the total variance explained.
# The sums come from the fitted models rather than from numbers copied out of
# an earlier cell's output, so they cannot go stale when the data changes.
print('Total explained variance of the standardized data (minimally transformed):')
print(pca_diet1.explained_variance_ratio_.sum())

# Option 2 is the unscaled, fully transformed variable set (log/capped pork and fish)
print('Total explained variance of the unstandardized data (fully transformed):')
print(pca_diet2.explained_variance_ratio_.sum())

# PCA is not the right approach for the diet data. Even with four principal components only ~54-57% of the variance is explained
# I will stick with the food indexes (i.e. protein_index etc.) I calculated in the nutrition-data-wrangling

Total explained variance of the standardized data (minimally transformed):
0.5378654599999999
Total explained variance of the unstandardized data (minimally transformed):
0.56958995


## 4. Saving the dataset

In [15]:
# Save without the row index: writing it adds yet another unnamed column every
# time the file is read back in (the input file already carries
# 'Unnamed: 0' and 'Unnamed: 0.1' from earlier save/load round-trips)
df2.to_csv('main_transformed2_b.csv', index=False)