# Nutrition project: Principal component analyses (PCA)

## 1. Install and import the necessary packages and libraries

I already have the most recent versions of **pandas, numpy and scikit-learn** installed, but you can install them using pip (see pypi.org) or `conda install` in the Anaconda prompt (see anaconda.org). If you get the error `ImportError: cannot import name 'html5lib' from 'pip._vendor'`, install html5lib from the Anaconda prompt (`conda install -c anaconda html5lib`).

Currently installed versions: 
<br>Pandas 1.4.4
<br>numpy 1.21.5
<br>scikit learn 1.1.1

In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA

## 2. Read csv file into Pandas dataframe

In [2]:
# Lift pandas' display limits so neither rows nor columns are truncated
# when a frame is shown below.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

# Load the transformed nutrition dataset into a pandas dataframe
df2 = pd.read_csv('nutrition_transformed.csv')

# Quick sanity check: dimensions first, then the first few records
print(df2.shape)
df2.head()

(217, 101)


Unnamed: 0.1,Unnamed: 0,id,age,sex,income,items_home,cold_flu,antibiotics,ill_7days,ill_now,exercise_stren,exercise_mod,fruit_juice,fruit,veg,red_meat,chicken,pork,fish,eggs,bread,pap_samp,rice_pasta,dairy,soft_drinks,take_away,smoke,alcohol,avg_systbp,avg_diabp,body_fat,muscle_mass,bmi,visceral_fat,metabolic_age,bfat_rarm,bfat_rleg,bfat_trunk,ecw_percentage,fat_rarm,fat_rleg,fat_trunk,muscle_rarm,muscle_rleg,muscle_trunk,left_grip,right_grip,sitting_height,standing_height,rs174547,rs9939609,exercise,fruitveg_index,carbs_index,protein_index,junkfood_index,avg_grip,age_scaled,cold_flu_scaled,antibiotics_scaled,fruit_juice_scaled,fruit_scaled,veg_scaled,red_meat_scaled,chicken_scaled,pork_scaled,fish_scaled,eggs_scaled,bread_scaled,pap_samp_scaled,rice_pasta_scaled,dairy_scaled,soft_drinks_scaled,take_away_scaled,avg_systbp_scaled,avg_diabp_scaled,body_fat_scaled,muscle_mass_scaled,bmi_scaled,visceral_fat_scaled,metabolic_age_scaled,bfat_rarm_scaled,bfat_rleg_scaled,bfat_trunk_scaled,ecw_percentage_scaled,fat_rarm_scaled,fat_rleg_scaled,fat_trunk_scaled,muscle_rarm_scaled,muscle_rleg_scaled,muscle_trunk_scaled,sitting_height_scaled,standing_height_scaled,exercise_scaled,fruitveg_index_scaled,carbs_index_scaled,protein_index_scaled,junkfood_index_scaled,avg_grip_scaled,infections_index,infections_index_scaled
0,0,1,20.8,1,2,7,1,0,False,False,0,1,2,5,8,3,3,0,0,1,6,4,4,8,6,3,False,True,119.0,71.5,3.2,51.7,18.1,1,12,7.0,2.0,3.0,38.4,0.2,0.2,0.9,3.0,9.8,26.6,50,50,131.5,176.4,1,2,1,15,14,7,9,50.0,0.266055,0.1,0.0,0.25,0.625,1.0,0.375,0.375,0.0,0.0,0.125,0.75,0.5,0.5,1.0,0.75,0.375,0.587629,0.534351,0.0,0.498851,0.127572,0.0,0.0,0.032193,0.0,0.0,0.313433,0.0,0.0,0.0,0.545455,0.60241,0.465753,0.746388,0.581395,0.047619,0.714286,0.583333,0.285714,0.5625,0.825581,0.1,0.027778
1,1,2,20.6,0,2,9,2,0,False,False,0,7,0,0,5,0,5,0,0,6,8,6,6,4,2,5,False,True,119.0,74.0,20.3,36.2,17.4,1,12,18.1,29.2,14.3,40.6,0.4,2.6,3.7,1.7,6.0,21.0,23,26,129.9,165.8,1,2,7,5,20,11,7,24.5,0.247706,0.2,0.0,0.0,0.0,0.625,0.0,0.625,0.0,0.0,0.75,1.0,0.75,0.75,0.5,0.25,0.625,0.587629,0.572519,0.372549,0.142529,0.098765,0.0,0.0,0.255533,0.509363,0.261574,0.477612,0.068966,0.235294,0.124444,0.151515,0.144578,0.210046,0.720706,0.375969,0.333333,0.238095,0.833333,0.47619,0.4375,0.232558,0.2,0.055556
2,2,3,19.9,0,2,4,1,1,True,True,0,7,0,2,4,1,1,0,4,0,8,0,8,3,1,2,False,False,101.5,63.0,20.0,33.3,18.5,1,12,22.5,30.1,11.8,40.9,0.5,2.6,2.7,1.5,5.7,19.1,22,24,123.2,154.0,2,2,7,6,16,6,3,23.0,0.183486,0.1,0.1,0.0,0.25,0.5,0.125,0.125,0.0,0.5,0.0,1.0,0.0,1.0,0.375,0.125,0.25,0.407216,0.40458,0.366013,0.075862,0.144033,0.0,0.0,0.344064,0.526217,0.203704,0.5,0.103448,0.235294,0.08,0.090909,0.108434,0.123288,0.613162,0.147287,0.333333,0.285714,0.666667,0.238095,0.1875,0.197674,2.2,0.611111
3,3,4,23.1,0,2,9,1,0,True,False,3,5,0,8,2,0,4,0,0,3,1,0,5,4,1,0,False,False,117.0,78.0,39.6,42.9,29.6,6,38,45.4,42.6,36.4,44.8,1.7,5.7,14.8,1.9,7.3,24.6,31,35,124.5,159.0,1,2,11,10,6,7,1,33.0,0.477064,0.1,0.0,0.0,1.0,0.25,0.0,0.5,0.0,0.0,0.375,0.125,0.0,0.625,0.5,0.125,0.0,0.56701,0.633588,0.793028,0.296552,0.600823,0.357143,0.83871,0.804829,0.7603,0.773148,0.791045,0.517241,0.539216,0.617778,0.212121,0.301205,0.374429,0.634029,0.244186,0.52381,0.47619,0.25,0.285714,0.0625,0.430233,1.1,0.305556
4,4,5,22.5,0,1,9,1,1,False,False,0,5,4,7,3,4,5,0,2,2,8,5,2,0,5,6,False,False,118.5,83.0,24.6,42.0,22.3,1,15,23.0,35.1,17.5,40.8,0.7,3.8,5.5,2.1,6.6,24.8,24,30,127.5,162.5,1,2,5,14,15,13,11,27.0,0.422018,0.1,0.1,0.5,0.875,0.375,0.5,0.625,0.0,0.25,0.25,1.0,0.625,0.25,0.0,0.625,0.75,0.582474,0.709924,0.466231,0.275862,0.300412,0.0,0.096774,0.354125,0.61985,0.335648,0.492537,0.172414,0.352941,0.204444,0.272727,0.216867,0.383562,0.682183,0.312016,0.238095,0.666667,0.625,0.571429,0.6875,0.290698,0.2,0.055556


In [3]:
# Remove the 'Unnamed: 0' column (presumably a row index saved by an earlier
# to_csv call - it carries no information for the analysis).
# errors='ignore' keeps this cell safe to re-run: without it, a second
# execution raises KeyError because the column has already been removed.
df2 = df2.drop(columns=['Unnamed: 0'], errors='ignore')
df2.head()

Unnamed: 0,id,age,sex,income,items_home,cold_flu,antibiotics,ill_7days,ill_now,exercise_stren,exercise_mod,fruit_juice,fruit,veg,red_meat,chicken,pork,fish,eggs,bread,pap_samp,rice_pasta,dairy,soft_drinks,take_away,smoke,alcohol,avg_systbp,avg_diabp,body_fat,muscle_mass,bmi,visceral_fat,metabolic_age,bfat_rarm,bfat_rleg,bfat_trunk,ecw_percentage,fat_rarm,fat_rleg,fat_trunk,muscle_rarm,muscle_rleg,muscle_trunk,left_grip,right_grip,sitting_height,standing_height,rs174547,rs9939609,exercise,fruitveg_index,carbs_index,protein_index,junkfood_index,avg_grip,age_scaled,cold_flu_scaled,antibiotics_scaled,fruit_juice_scaled,fruit_scaled,veg_scaled,red_meat_scaled,chicken_scaled,pork_scaled,fish_scaled,eggs_scaled,bread_scaled,pap_samp_scaled,rice_pasta_scaled,dairy_scaled,soft_drinks_scaled,take_away_scaled,avg_systbp_scaled,avg_diabp_scaled,body_fat_scaled,muscle_mass_scaled,bmi_scaled,visceral_fat_scaled,metabolic_age_scaled,bfat_rarm_scaled,bfat_rleg_scaled,bfat_trunk_scaled,ecw_percentage_scaled,fat_rarm_scaled,fat_rleg_scaled,fat_trunk_scaled,muscle_rarm_scaled,muscle_rleg_scaled,muscle_trunk_scaled,sitting_height_scaled,standing_height_scaled,exercise_scaled,fruitveg_index_scaled,carbs_index_scaled,protein_index_scaled,junkfood_index_scaled,avg_grip_scaled,infections_index,infections_index_scaled
0,1,20.8,1,2,7,1,0,False,False,0,1,2,5,8,3,3,0,0,1,6,4,4,8,6,3,False,True,119.0,71.5,3.2,51.7,18.1,1,12,7.0,2.0,3.0,38.4,0.2,0.2,0.9,3.0,9.8,26.6,50,50,131.5,176.4,1,2,1,15,14,7,9,50.0,0.266055,0.1,0.0,0.25,0.625,1.0,0.375,0.375,0.0,0.0,0.125,0.75,0.5,0.5,1.0,0.75,0.375,0.587629,0.534351,0.0,0.498851,0.127572,0.0,0.0,0.032193,0.0,0.0,0.313433,0.0,0.0,0.0,0.545455,0.60241,0.465753,0.746388,0.581395,0.047619,0.714286,0.583333,0.285714,0.5625,0.825581,0.1,0.027778
1,2,20.6,0,2,9,2,0,False,False,0,7,0,0,5,0,5,0,0,6,8,6,6,4,2,5,False,True,119.0,74.0,20.3,36.2,17.4,1,12,18.1,29.2,14.3,40.6,0.4,2.6,3.7,1.7,6.0,21.0,23,26,129.9,165.8,1,2,7,5,20,11,7,24.5,0.247706,0.2,0.0,0.0,0.0,0.625,0.0,0.625,0.0,0.0,0.75,1.0,0.75,0.75,0.5,0.25,0.625,0.587629,0.572519,0.372549,0.142529,0.098765,0.0,0.0,0.255533,0.509363,0.261574,0.477612,0.068966,0.235294,0.124444,0.151515,0.144578,0.210046,0.720706,0.375969,0.333333,0.238095,0.833333,0.47619,0.4375,0.232558,0.2,0.055556
2,3,19.9,0,2,4,1,1,True,True,0,7,0,2,4,1,1,0,4,0,8,0,8,3,1,2,False,False,101.5,63.0,20.0,33.3,18.5,1,12,22.5,30.1,11.8,40.9,0.5,2.6,2.7,1.5,5.7,19.1,22,24,123.2,154.0,2,2,7,6,16,6,3,23.0,0.183486,0.1,0.1,0.0,0.25,0.5,0.125,0.125,0.0,0.5,0.0,1.0,0.0,1.0,0.375,0.125,0.25,0.407216,0.40458,0.366013,0.075862,0.144033,0.0,0.0,0.344064,0.526217,0.203704,0.5,0.103448,0.235294,0.08,0.090909,0.108434,0.123288,0.613162,0.147287,0.333333,0.285714,0.666667,0.238095,0.1875,0.197674,2.2,0.611111
3,4,23.1,0,2,9,1,0,True,False,3,5,0,8,2,0,4,0,0,3,1,0,5,4,1,0,False,False,117.0,78.0,39.6,42.9,29.6,6,38,45.4,42.6,36.4,44.8,1.7,5.7,14.8,1.9,7.3,24.6,31,35,124.5,159.0,1,2,11,10,6,7,1,33.0,0.477064,0.1,0.0,0.0,1.0,0.25,0.0,0.5,0.0,0.0,0.375,0.125,0.0,0.625,0.5,0.125,0.0,0.56701,0.633588,0.793028,0.296552,0.600823,0.357143,0.83871,0.804829,0.7603,0.773148,0.791045,0.517241,0.539216,0.617778,0.212121,0.301205,0.374429,0.634029,0.244186,0.52381,0.47619,0.25,0.285714,0.0625,0.430233,1.1,0.305556
4,5,22.5,0,1,9,1,1,False,False,0,5,4,7,3,4,5,0,2,2,8,5,2,0,5,6,False,False,118.5,83.0,24.6,42.0,22.3,1,15,23.0,35.1,17.5,40.8,0.7,3.8,5.5,2.1,6.6,24.8,24,30,127.5,162.5,1,2,5,14,15,13,11,27.0,0.422018,0.1,0.1,0.5,0.875,0.375,0.5,0.625,0.0,0.25,0.25,1.0,0.625,0.25,0.0,0.625,0.75,0.582474,0.709924,0.466231,0.275862,0.300412,0.0,0.096774,0.354125,0.61985,0.335648,0.492537,0.172414,0.352941,0.204444,0.272727,0.216867,0.383562,0.682183,0.312016,0.238095,0.666667,0.625,0.571429,0.6875,0.290698,0.2,0.055556


## 3. Principal component analysis (PCA)

The body composition/bioimpedance measures still contain too many variables to be useful as predictors in the downstream analysis, so I will try to reduce the number of variables using PCA. Although I have already reduced the number of diet variables by calculating food group indexes (see nutrition-data-wrangling), I also want to evaluate PCA as a means of reducing the number of diet variables.

### 3.1 Body composition/bioimpedance

The body composition variables initially consisted of multiple highly correlated variables. I already (a) dropped the most highly correlated body composition variables and (b) determined that we need to include standing_height, sitting_height and avg_grip in this group because of their high correlations (see nutrition-data-wrangling). 

Here, I will use PCA to reduce the number of dimensions further, so that body composition can serve as a useful measure of health.

In [4]:
# Body-composition inputs for the PCA: the standardized, minimally transformed
# variables (see pca-compare in the compare repository for a detailed
# explanation of this choice).
bodycomp_columns = [
    'body_fat_scaled',
    'muscle_mass_scaled',
    'bmi_scaled',
    'visceral_fat_scaled',
    'metabolic_age_scaled',
    'bfat_rarm_scaled',
    'bfat_rleg_scaled',
    'bfat_trunk_scaled',
    'ecw_percentage_scaled',
    'fat_rarm_scaled',
    'fat_rleg_scaled',
    'fat_trunk_scaled',
    'muscle_rarm_scaled',
    'muscle_rleg_scaled',
    'muscle_trunk_scaled',
    'avg_grip_scaled',
    'sitting_height_scaled',
    'standing_height_scaled',
]
df2_bodycomp = df2[bodycomp_columns]

In [5]:
# Conduct the PCA on the body-composition variables.
# The number of components is tied to the list of output column names so the
# two cannot get out of step.
bodycomp_pc_names = ['pc1_bodycomp', 'pc2_bodycomp']
pca_bodycomp = PCA(n_components=len(bodycomp_pc_names))
principalComponents = pca_bodycomp.fit_transform(df2_bodycomp)

# Create the df that will contain all the principal component values.
# Reuse the index of the frame the PCA was fitted on: the scores are later
# concatenated column-wise onto df2, and pd.concat aligns on the index, so a
# fresh RangeIndex would misalign rows (or add NaN rows) if df2 ever carried
# a non-default index, e.g. after filtering.
principal_bodycomp_Df = pd.DataFrame(
    data=principalComponents,
    index=df2_bodycomp.index,
    columns=bodycomp_pc_names,
)

# Check the results
print(principal_bodycomp_Df.head())

   pc1_bodycomp  pc2_bodycomp
0     -1.126766      0.329169
1     -0.382697     -0.608950
2     -0.317581     -0.800923
3      1.100858     -0.047855
4     -0.096137     -0.377779


In [6]:
# Proportion of the total variance captured by each body-composition component
explained_ratio_bodycomp = pca_bodycomp.explained_variance_ratio_
print('Explained variation per principal component: {}'.format(explained_ratio_bodycomp))

Explained variation per principal component: [0.56057493 0.31045483]


In [7]:
# Calculate the total variance explained.
# Summed directly from the fitted model instead of from numbers copied by hand
# out of the previous cell's output, so the total cannot go stale if the data
# or the number of components changes.
print('Total explained variance:')
print(pca_bodycomp.explained_variance_ratio_.sum())

#The first two PCs explained 87.1% of the variance
#A third PC for the standardized, minimally transformed variables only explained an additional 3.3% (results not shown) so we will use 2PCs

Total explained variance:
0.8710297600000001


In [8]:
# Get the loadings on each of the principal components.
# Feature names are taken from the frame the PCA was fitted on rather than
# from a second hand-typed list: the loading rows follow the fitted column
# order, so a duplicated list that drifted would mislabel them.
feature_names = list(df2_bodycomp.columns)

# Loadings: each principal axis (a row of components_) scaled by the square
# root of the variance that component explains.
loadings = pca_bodycomp.components_.T * np.sqrt(pca_bodycomp.explained_variance_)
loading_matrix = pd.DataFrame(
    loadings,
    columns=['pc1_bodycomp', 'pc2_bodycomp'],
    index=feature_names,
)
print(loading_matrix.sort_values(by=['pc1_bodycomp'], ascending=False))

# PC1 can be referred to as the fat component, with higher values on this component indicating body types that are higher in body fat & metabolic age and lower in strength, muscle and height
# PC2 can be referred to as the muscular component, with higher values indicating more strength, height and a stronger grip, with lower body fat

                        pc1_bodycomp  pc2_bodycomp
metabolic_age_scaled        0.273252      0.112116
bfat_rarm_scaled            0.232205     -0.055926
bfat_rleg_scaled            0.215600     -0.098019
body_fat_scaled             0.212856     -0.029861
bfat_trunk_scaled           0.193802      0.034284
fat_trunk_scaled            0.189878      0.085810
fat_rleg_scaled             0.185065     -0.010018
fat_rarm_scaled             0.184285      0.043247
bmi_scaled                  0.168174      0.078898
visceral_fat_scaled         0.131910      0.090745
ecw_percentage_scaled       0.120551     -0.027622
muscle_trunk_scaled         0.012943      0.172699
sitting_height_scaled      -0.005481      0.031680
muscle_mass_scaled         -0.012195      0.194111
muscle_rleg_scaled         -0.038202      0.202670
muscle_rarm_scaled         -0.051449      0.205630
standing_height_scaled     -0.067368      0.134332
avg_grip_scaled            -0.091289      0.147289


In [9]:
# Add the PCs to the original dataframe (concatenate column-wise).
# Any PC columns left over from a previous run of this cell are dropped first,
# so re-running it does not append duplicate pc1_bodycomp / pc2_bodycomp columns.
df2 = pd.concat(
    [
        df2.drop(columns=principal_bodycomp_Df.columns, errors='ignore'),
        principal_bodycomp_Df,
    ],
    axis=1,
)
print('shape', df2.shape)
df2.tail()

shape (217, 102)


Unnamed: 0,id,age,sex,income,items_home,cold_flu,antibiotics,ill_7days,ill_now,exercise_stren,exercise_mod,fruit_juice,fruit,veg,red_meat,chicken,pork,fish,eggs,bread,pap_samp,rice_pasta,dairy,soft_drinks,take_away,smoke,alcohol,avg_systbp,avg_diabp,body_fat,muscle_mass,bmi,visceral_fat,metabolic_age,bfat_rarm,bfat_rleg,bfat_trunk,ecw_percentage,fat_rarm,fat_rleg,fat_trunk,muscle_rarm,muscle_rleg,muscle_trunk,left_grip,right_grip,sitting_height,standing_height,rs174547,rs9939609,exercise,fruitveg_index,carbs_index,protein_index,junkfood_index,avg_grip,age_scaled,cold_flu_scaled,antibiotics_scaled,fruit_juice_scaled,fruit_scaled,veg_scaled,red_meat_scaled,chicken_scaled,pork_scaled,fish_scaled,eggs_scaled,bread_scaled,pap_samp_scaled,rice_pasta_scaled,dairy_scaled,soft_drinks_scaled,take_away_scaled,avg_systbp_scaled,avg_diabp_scaled,body_fat_scaled,muscle_mass_scaled,bmi_scaled,visceral_fat_scaled,metabolic_age_scaled,bfat_rarm_scaled,bfat_rleg_scaled,bfat_trunk_scaled,ecw_percentage_scaled,fat_rarm_scaled,fat_rleg_scaled,fat_trunk_scaled,muscle_rarm_scaled,muscle_rleg_scaled,muscle_trunk_scaled,sitting_height_scaled,standing_height_scaled,exercise_scaled,fruitveg_index_scaled,carbs_index_scaled,protein_index_scaled,junkfood_index_scaled,avg_grip_scaled,infections_index,infections_index_scaled,pc1_bodycomp,pc2_bodycomp
212,283,22.8,1,3,9,3,3,False,False,0,2,4,0,0,0,2,0,0,6,3,6,0,6,4,1,False,False,145.0,70.5,14.9,52.9,22.1,2,15,12.2,11.6,17.7,40.1,0.5,1.3,6.2,3.2,9.7,27.5,50,44,128.0,172.0,1,3,2,4,9,8,5,47.0,0.449541,0.3,0.3,0.5,0.0,0.0,0.0,0.25,0.0,0.0,0.75,0.375,0.75,0.0,0.75,0.5,0.125,0.85567,0.519084,0.254902,0.526437,0.292181,0.071429,0.096774,0.136821,0.179775,0.340278,0.440299,0.103448,0.107843,0.235556,0.606061,0.590361,0.506849,0.690209,0.496124,0.095238,0.190476,0.375,0.333333,0.3125,0.755814,0.6,0.166667,-0.590205,0.389889
213,284,19.5,0,1,8,2,6,False,False,0,0,0,1,2,0,2,0,0,4,5,0,2,3,1,1,False,False,111.0,75.0,25.1,54.4,24.1,5,34,22.5,17.2,29.8,42.5,1.0,2.3,12.5,3.1,10.3,28.0,22,29,134.0,178.0,1,2,0,3,7,6,2,25.5,0.146789,0.2,0.6,0.0,0.125,0.25,0.0,0.25,0.0,0.0,0.5,0.625,0.0,0.25,0.375,0.125,0.125,0.505155,0.587786,0.477124,0.56092,0.374486,0.285714,0.709677,0.344064,0.284644,0.62037,0.619403,0.275862,0.205882,0.515556,0.575758,0.662651,0.52968,0.786517,0.612403,0.0,0.142857,0.291667,0.238095,0.125,0.255814,0.8,0.222222,0.227934,0.522392
214,285,18.8,0,1,9,0,0,False,False,0,4,0,0,1,0,1,1,0,3,3,0,1,5,2,3,False,True,106.0,70.5,23.2,38.3,19.3,1,12,27.0,32.5,17.0,40.6,0.6,3.1,4.9,1.6,6.1,22.9,30,31,125.0,165.0,1,2,4,1,4,5,5,30.5,0.082569,0.0,0.0,0.0,0.0,0.125,0.0,0.125,0.2,0.0,0.375,0.375,0.0,0.125,0.625,0.25,0.375,0.453608,0.519084,0.43573,0.190805,0.176955,0.0,0.0,0.434608,0.571161,0.324074,0.477612,0.137931,0.284314,0.177778,0.121212,0.156627,0.296804,0.642055,0.360465,0.190476,0.047619,0.166667,0.190476,0.3125,0.372093,0.0,0.0,-0.207928,-0.540128
215,286,20.5,0,1,9,4,0,False,False,0,0,2,1,0,0,3,1,0,1,3,0,2,1,4,3,False,True,104.0,73.5,15.9,47.8,22.0,2,13,16.6,8.0,20.2,40.4,0.6,0.9,6.4,2.6,9.6,24.1,46,49,126.0,165.0,1,3,0,3,5,5,7,47.5,0.238532,0.4,0.0,0.25,0.125,0.0,0.0,0.375,0.2,0.0,0.125,0.375,0.0,0.25,0.125,0.5,0.375,0.43299,0.564885,0.276688,0.409195,0.288066,0.071429,0.032258,0.225352,0.11236,0.398148,0.462687,0.137931,0.068627,0.244444,0.424242,0.578313,0.351598,0.658106,0.360465,0.0,0.142857,0.208333,0.190476,0.4375,0.767442,0.4,0.111111,-0.553219,0.168071
216,290,20.747685,1,1,8,0,0,False,False,0,3,0,1,3,5,6,3,0,0,4,0,6,7,7,8,False,False,109.0,80.0,12.0,46.5,19.3,1,12,10.6,6.2,15.6,40.8,0.3,0.6,4.6,2.7,9.1,23.6,40,41,126.0,170.0,1,2,3,4,10,14,15,40.5,0.261256,0.0,0.0,0.0,0.125,0.375,0.625,0.75,0.6,0.0,0.0,0.5,0.0,0.75,0.875,0.875,1.0,0.484536,0.664122,0.191721,0.37931,0.176955,0.0,0.0,0.104628,0.078652,0.291667,0.492537,0.034483,0.039216,0.164444,0.454545,0.518072,0.328767,0.658106,0.457364,0.142857,0.190476,0.416667,0.619048,0.9375,0.604651,0.0,0.0,-0.761171,0.069846


### 3.2 Diet

In [10]:
# Diet inputs for the PCA.
# Two variants were considered: standardized, minimally transformed [preferred option]
# and unstandardized, fully transformed [option that explained the most variance for diet]
# - see pca-compare in the compare repository for a detailed explanation.
# The minimally transformed + scaled variables are selected here.
diet_columns = [
    'fruit_juice_scaled',
    'fruit_scaled',
    'veg_scaled',
    'red_meat_scaled',
    'chicken_scaled',
    'pork_scaled',
    'fish_scaled',
    'eggs_scaled',
    'bread_scaled',
    'pap_samp_scaled',
    'rice_pasta_scaled',
    'dairy_scaled',
    'soft_drinks_scaled',
    'take_away_scaled',
]
df2_diet = df2[diet_columns]

In [11]:
# Conduct the PCA (option1): one output column per requested component
diet_pc_names = ['pc1_diet', 'pc2_diet', 'pc3_diet', 'pc4_diet']
pca_diet = PCA(n_components=len(diet_pc_names))
principalComponents1 = pca_diet.fit_transform(df2_diet)

# Collect the component scores in a dataframe
principal_diet_Df1 = pd.DataFrame(principalComponents1, columns=diet_pc_names)

# Check the results
print(principal_diet_Df1.head())

   pc1_diet  pc2_diet  pc3_diet  pc4_diet
0  0.466198 -0.351016 -0.160803 -0.242920
1  0.296830 -0.124553  0.634409  0.217376
2 -0.146761 -0.112919  0.633558 -0.264708
3 -0.569061 -0.493586 -0.030137 -0.246972
4  0.391785  0.044189 -0.539200  0.103205


In [12]:
# Proportion of the total variance captured by each diet component
explained_ratio_diet = pca_diet.explained_variance_ratio_
print('Explained variation per principal component: {}'.format(explained_ratio_diet))

Explained variation per principal component: [0.18727942 0.1513243  0.11319585 0.08243915]


In [13]:
# Calculate the total variance explained.
# Summed directly from the fitted model instead of from numbers copied by hand
# out of the previous cell's output, so the total cannot go stale if the data
# or the number of components changes.
print('Total explained variance:')
print(pca_diet.explained_variance_ratio_.sum())

#PCA is not the right approach for the diet data. Even with four principal components only 53% of the variance is explained
#I will rather use the food indexes (i.e. protein_index etc.) I calculated in the nutrition-data-wrangling

Total explained variance:
0.53423872


## 4. Saving the dataset

In [14]:
# Save the dataframe, now including the body-composition PCs, to a new CSV.
# NOTE: to_csv writes the row index by default, so the file gains an unnamed
# leading column when it is read back.
output_csv = 'nutrition_transformed2.csv'
df2.to_csv(output_csv)