In [3]:
import pandas as pd
import numpy as np
import math
from matplotlib import pyplot as plt
import scipy

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import *
from sklearn.feature_selection import VarianceThreshold

import seaborn as sns


from joblib import parallel_backend
import os
from itertools import groupby

pd.set_option('display.max_columns', None)

In [4]:
path = '/home/r3406/SharedFiles/Data/PublicData/KNHANES'

In [1]:
nutri_frontmask = [
    'L', 'LS', 'LK', 'LF',
    'DA', 'DQ',
    'Y',
    'N', 'NF',
]

In [2]:

nutri_useable_feats = [
    'N_WAT_C', 'NF_CAROT', 'N_FM_WT', 'NF_VA', 'NF_K', 'N_KINDG1', 'N_KINDG2', 
    'NF_RETIN', 'NF_NA', 'NF_PHOS', 'NF_CA', 'NF_INTK', 'NF_CHO', 'NF_EN', 
    'NF_WATER', 'NF_FE', 'NF_PROT', 'NF_FAT', 'NF_B1', 'N_DIET', 'NF_VITC', 
    'N_MEAL', 'N_MEAL_T', 'N_FCODE', 'N_MTYPE', 'N_MEAL_P', 'N_DCODE', 'NF_NIAC', 
    'NF_B2', 'N_DNAME', 'N_AP', 'N_FNAME', 'N_DAY', 'N_DUSUAL', 'N_CD_VOL', 
    'N_DIET_WHY', 'N_FNAME3', 'N_FCODE3', 'NF_INTK3', 'N_TD_VOL', 'N_FNAME2', 
    'N_FCODE2']

nutri_cat_feats = ['N_KINDG1', 'N_KINDG2', 'N_DIET', 'N_MEAL', 'N_MEAL_T',
    'N_FCODE', 'N_MTYPE', 'N_MEAL_P', 'N_DCODE', 'N_DNAME', 'N_AP', 'N_FNAME', 
    'N_DAY', 'N_DUSUAL', 'N_DIET_WHY', 'N_FNAME3', 'N_FCODE3', 'N_FNAME2', 'N_FCODE2']

nutri_noncat_feats = ['N_WAT_C', 'NF_CAROT', 'N_FM_WT', 'NF_VA', 'NF_K', 
    'NF_RETIN', 'NF_NA', 'NF_PHOS', 'NF_CA', 'NF_INTK', 'NF_CHO', 'NF_EN', 
    'NF_WATER', 'NF_FE', 'NF_PROT', 'NF_FAT', 'NF_B1', 'NF_VITC', 'NF_NIAC', 'NF_INTK3',
    'N_TD_VOL', 'NF_B2', 'N_CD_VOL']
    


In [4]:
len(nutri_useable_feats) == len(nutri_cat_feats) + len(nutri_noncat_feats)

True

In [6]:
len(nutri_noncat_feats)

23

In [3]:
basic_feats = [ 
    'age', 'sex',
]

health_status_feats = [
    'DC1_dg',
    'DC11_dg',
    'DC12_dg',
    'DC2_dg',
    'DC3_dg',
    'DC4_dg',
    'DC5_dg',
    'DC6_dg',
    'DC7_dg',
    'DE1_dg',
    'DF2_dg',
    'DF2_dg',
    'DI1_dg',
    'DI3_dg',
    'DI4_dg',
    'DI5_dg',
    'DI6_dg',
    'DJ2_dg',
    'DJ4_dg',
    'DM1_dg',
    'DM2_dg',
    'DM3_dg',
    'DN1_dg',
    'DK4_dg',
]

white_list_prefix = [
    'D', 'H', 
]


# Nutri Feats for All Years (2005~2018)

In [7]:
''' 'knhanes_05_18_healthy_nutri_ltmr03_feats.csv'
05_18: from 2005 to 2018 inclusive
healthy: healthy status only (excluded all rows with any unhealthy status (morbidity == 1))
nutri_ltmr03: nutrition features with missing rate less than 30%
'''

" 'knhanes_05_18_healthy_nutri_ltmr03_feats.csv'\n05_18: from 2005 to 2018 inclusive\nhealthy: healthy status only (excluded all rows with any unhealthy status (morbidity == 1))\nnutri_ltmr03: nutrition features with missing rate less than 30%\n"

In [14]:
fname = 'knhanes_05_18_healthy_nutri_ltmr03_feats.csv'
fpath = os.path.join('jar', fname)
df0 = pd.read_csv(fpath)
print(df0.shape)
df0.head()

(3680213, 32)


Unnamed: 0.1,Unnamed: 0,N_WAT_C,NF_CAROT,N_FM_WT,NF_VA,NF_K,NF_RETIN,NF_NA,NF_PHOS,NF_CA,NF_INTK,NF_CHO,NF_EN,NF_WATER,NF_FE,NF_PROT,NF_FAT,NF_B1,N_DIET,NF_VITC,N_MEAL,N_MTYPE,N_MEAL_P,NF_NIAC,NF_B2,N_AP,N_CD_VOL,N_DIET_WHY,N_TD_VOL,age,sex,NF_INTK3
0,4246,10.0,3.159956,158.7,27.912942,25.104093,26.332965,26.684071,28.439602,7.548783,17.55531,0.491549,24.226327,13.377146,0.245774,2.071527,1.439535,0.008778,2.0,0.0,1.0,1.0,1.0,0.105332,0.049155,2.0,2260.0,8.0,250.0,55.0,1.0,
1,4247,10.0,0.0,33.84,0.0,6.074729,0.0,0.347384,1.452425,0.512466,3.743363,0.173318,0.893357,3.467851,0.032942,0.05091,0.031444,0.0,2.0,2.712066,1.0,1.0,1.0,0.0,0.0,1.0,2260.0,8.0,250.0,55.0,1.0,
2,4248,10.0,0.0,20.4,0.0,14.984071,0.0,0.067699,3.700885,0.225664,2.256637,0.676991,2.843363,1.423938,0.042876,0.121858,0.0,0.003385,2.0,0.631858,1.0,1.0,1.0,0.009027,0.007221,1.0,2260.0,8.0,250.0,55.0,1.0,
3,4249,10.0,27.433628,32.0,4.566372,6.584071,0.0,0.035398,1.238938,2.867257,3.539823,0.230088,0.920354,3.224779,0.035398,0.053097,0.010619,0.002124,2.0,0.743363,1.0,1.0,1.0,0.021239,0.003186,1.0,2260.0,8.0,250.0,55.0,1.0,
4,4250,10.0,13.185841,80.0,2.212389,25.929204,0.0,0.088496,3.893805,1.150442,8.849558,0.522124,2.123894,8.132743,0.035398,0.123894,0.00885,0.009735,2.0,0.707965,1.0,1.0,1.0,0.053097,0.00708,1.0,2260.0,8.0,250.0,55.0,1.0,


In [15]:
# drop id col
df0.drop(df0.columns[[0]], axis=1, inplace=True)
df0.columns

Index(['N_WAT_C', 'NF_CAROT', 'N_FM_WT', 'NF_VA', 'NF_K', 'NF_RETIN', 'NF_NA',
       'NF_PHOS', 'NF_CA', 'NF_INTK', 'NF_CHO', 'NF_EN', 'NF_WATER', 'NF_FE',
       'NF_PROT', 'NF_FAT', 'NF_B1', 'N_DIET', 'NF_VITC', 'N_MEAL', 'N_MTYPE',
       'N_MEAL_P', 'NF_NIAC', 'NF_B2', 'N_AP', 'N_CD_VOL', 'N_DIET_WHY',
       'N_TD_VOL', 'age', 'sex', 'NF_INTK3'],
      dtype='object')

In [16]:
# drop na rows 
print(df0.shape)
df1 = df0.dropna(axis=0, how='any')
del df0
print(df1.shape)

(3680213, 31)
(3057515, 31)


In [70]:
avail_nutri_feats = [c for c in nutri_useable_feats if c in df1]
print('len nutri_useable_feats:', len(avail_nutri_feats))

len nutri_useable_feats: 29


In [139]:
# get categorical and noncategorical nutri_feats variables
# remember to add basic feats in final dfs
df2 = df1[avail_nutri_feats]
cats = df2.columns[df2.nunique() < 100]
noncats = [col for col in df2.columns if col not in cats]
print('len df2.columns == len cats + len noncats: ', df2.shape[1] == len(cats) + len(noncats))
print('df2', df2.shape)
print('cats', len(cats))
print('noncats', len(noncats))

len df2.columns == len cats + len noncats:  True
df2 (3057515, 29)
cats 6
noncats 23


In [126]:
# create df_noncats for regular PCA
df_nutri_feats_noncats = df1[noncats + basic_feats] # add basic feats to noncat nutri_feats
df_nutri_feats_noncats.shape

(3057515, 25)

In [127]:
# separate sex dfs
df_nutri_feats_noncats_m = df_nutri_feats_noncats[df_nutri_feats_noncats.sex == 1].drop(['sex'], axis=1)
df_nutri_feats_noncats_f = df_nutri_feats_noncats[df_nutri_feats_noncats.sex == 2].drop(['sex'], axis=1)
df_nutri_feats_noncats_allSex = df_nutri_feats_noncats.drop(['sex'], axis=1)
print('m: {}, f: {}, all: {}'.format(
    df_nutri_feats_noncats_m.shape, df_nutri_feats_noncats_f.shape, df_nutri_feats_noncats_allSex.shape))

m: (1347610, 24), f: (1709905, 24), all: (3057515, 24)


In [128]:
df_nutri_feats_lowcat_allSex.columns

Index(['N_WAT_C', 'NF_CAROT', 'N_FM_WT', 'NF_VA', 'NF_K', 'NF_RETIN', 'NF_NA',
       'NF_PHOS', 'NF_CA', 'NF_INTK', 'NF_CHO', 'NF_EN', 'NF_WATER', 'NF_FE',
       'NF_PROT', 'NF_FAT', 'N_DIET', 'NF_VITC', 'N_MEAL', 'N_MTYPE',
       'N_MEAL_P', 'NF_NIAC', 'N_AP', 'N_CD_VOL', 'N_DIET_WHY', 'NF_INTK3',
       'N_TD_VOL', 'age'],
      dtype='object')

In [129]:
df_nutri_feats_lowcatSex.describe()

Unnamed: 0,N_WAT_C,NF_CAROT,N_FM_WT,NF_VA,NF_K,NF_RETIN,NF_NA,NF_PHOS,NF_CA,NF_INTK,NF_CHO,NF_EN,NF_WATER,NF_FE,NF_PROT,NF_FAT,N_DIET,NF_VITC,N_MEAL,N_MTYPE,N_MEAL_P,NF_NIAC,N_AP,N_CD_VOL,N_DIET_WHY,NF_INTK3,N_TD_VOL,age,sex
count,3057515.0,3057515.0,3057515.0,3057515.0,3057515.0,3057515.0,3057515.0,3057515.0,3057515.0,3057515.0,3057515.0,3057515.0,3057515.0,3057515.0,3057515.0,3057515.0,3057515.0,3057515.0,3057515.0,3057515.0,3057515.0,3057515.0,3057515.0,3057515.0,3057515.0,3057515.0,3057515.0,3057515.0,3057515.0
mean,5.923511,49.58094,60.38283,9.883721,39.9914,1.442875,60.45349,15.85907,6.508502,18.30889,4.433946,27.7268,11.84502,0.201113,1.024766,0.5824488,1.768582,1.093082,2.269913,4.502366,5.342044,0.2219352,1.118574,624.3137,6.530257,18.73442,184.7086,46.90799,1.559247
std,27.98615,533.1131,216.8534,97.91261,116.7223,40.83364,273.4423,46.27758,28.86803,58.12423,16.02105,87.02289,48.63987,1.110699,4.040378,3.23143,0.4892705,7.538035,0.8920488,4.822349,14.55032,0.8888365,0.3232871,1048.708,2.644151,61.60497,218.7131,13.55401,0.4964775
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,19.0,1.0
25%,3.0,0.0,1.1,0.0,0.4814945,0.0,0.01902009,0.1301775,0.069,0.3673818,0.01737508,0.4076377,0.02232692,0.002781818,0.005025,0.0005903188,2.0,0.0,2.0,1.0,1.0,0.0008479046,1.0,100.0,8.0,0.4,50.0,36.0,1.0
50%,5.0,0.0,5.67,0.0,5.463426,0.0,0.3477083,1.66243,0.6531747,2.118519,0.196875,2.373377,0.6137156,0.024,0.07180117,0.01913676,2.0,0.0,2.0,3.0,1.0,0.015,1.0,281.8,8.0,2.405532,101.7,47.0,2.0
75%,6.5,1.048443,36.0,0.6938081,33.77861,0.0,15.94915,11.78091,4.0,12.0,1.372655,13.225,5.979238,0.1419082,0.5216441,0.207532,2.0,0.1722612,3.0,5.0,7.0,0.1284,1.0,700.0,8.0,12.2625,250.0,58.0,2.0
max,999.0,391999.7,79122.5,65323.77,31204.53,15998.8,141438.5,20665.73,5372.208,9801.0,1702.627,13515.23,9134.532,1029.6,2113.719,967.1158,9.0,2743.255,5.0,99.0,99.0,534.3231,2.0,111548.5,9.0,14851.2,25120.2,70.0,2.0


In [16]:
break

SyntaxError: 'break' outside loop (668683560.py, line 1)

In [130]:
# test
target = 'age'
df = df_nutri_feats_lowcat_allSex

In [131]:
def getCumVarExp(n_components, explained_variance):
    n_comp = n_components
    tot = sum(explained_variance)
    var_exp = [v/tot for v in sorted(explained_variance, reverse=True)] 
    cum_var_exp = np.cumsum(var_exp)

    return cum_var_exp

def getCompVar(cum_var_exp, n_components, th):
    if th == 1.0:
        return n_components
    comp_var = np.argmax(cum_var_exp > th) + 1

    return comp_var

def cumVarExpVisualizer(n_components, explained_variance):
    tot = sum(explained_variance)
    var_exp = [(i/tot)*100 for i in sorted(explained_variance, reverse=True)] 
    cum_var_exp = np.cumsum(var_exp)

    # plot out explained variances superimposed
    plt.figure(figsize=(10, 5))
    plt.step(range(1, n_components+1), cum_var_exp, where='mid',label='cumulative explained variance')
    plt.title('Cumulative Explained Variance as a Function of the Number of Components')
    plt.ylabel('Cumulative Explained variance')
    plt.xlabel('Principal components')
    for vr in (0.95, 0.90, 0.85, 0.80, 0.75):
        plt.axhline(y = vr, linestyle='--', label = '{}% Explained Variance'.format(vr))
    plt.legend(loc='best')
    plt.show()

def getPCA(th, X_train, X_test):
    pca = PCA(th)
    pca.fit(X_train)
    X_train = pca.transform(X_train)
    X_test = pca.transform(X_test)

    return pca, X_train, X_test

def getScore(X_train, y_train, X_test, y_test):
    # LogReg Pred Scores
    lr = LinearRegression(n_jobs=num_cores)
    lr.fit(X_train, y_train)

    y_pred = lr.predict(X_test)

    # evaluate the model and collect the score
    # report the model performance
    r2 = r2_score(y_test, y_pred)
    rmse = mean_squared_error(y_test, y_pred, squared=False)

    # cnf_matrix = confusion_matrix(y_test, y_pred)
    # print(cnf_matrix)
    # target_names = [str(n) for n in range(19, 70+1)]
    # report = classification_report(y_test, y_pred, target_names=target_names)
    # print(report)

    return round(r2,4), round(rmse,4)

def getRVoutcomes(retained_ths, initial_feature_names, X_train, X_test, y_train, y_test):
    r2_ls, rmse_ls, n_components_ls, n_unique_components_ls, components_ls = [],[],[],[],[]
    for retained_th in retained_ths:
        pca, X_train_new, X_test_new = getPCA(retained_th, X_train, X_test)
        n_components = pca.n_components_
        explained_variance = pca.explained_variance_

        # show visualization
        if retained_th == 0.75: 
            cumVarExpVisualizer(n_components, explained_variance)

        # print('n_components', n_components)
        r2, rmse = getScore(X_train_new, y_train, X_test_new, y_test)

        # get the index of the most important feature on EACH component i.e. largest absolute value
        '''
        DO NOT USE THIS METHOD of getting next argmax when encountering duplicates.
        Rather, lessen the total number of final PCs by removing duplicates; they are normal. 
        [ https://stackoverflow.com/questions/67769996/why-pca-output-some-components-duplicately ]
        '''
        # # get the next argmax if already in list # in noncomprehnsion form for clarity
        # most_important_ids = []
        # for component in pca.components_:
        #     for id in reversed(range(n_components)): # descending sort for max
        #         _argmax = np.argsort(np.abs(component), axis=-1)[id] 
        #         if _argmax not in most_important_ids :
        #             most_important_ids.append(_argmax) 
        #             break # break out of inner loop

        # get pos of max of absoluted components
        most_important_ids = [np.abs(component).argmax() for component in pca.components_]

        # get feature names
        most_important_names = [initial_feature_names[id] for id in most_important_ids]
        
        # get num components excluding duplicates
        n_unique_components = len(np.unique(np.array(most_important_names)))

        # append as dict entry
        components = {'PC_{}'.format(i+1):name for i, name in \
            enumerate(most_important_names)} #groupby takes unique_in_order
        
        for val, val_ls in zip(
            [r2, rmse, n_components, n_unique_components, components], 
            [r2_ls, rmse_ls, n_components_ls, n_unique_components_ls, components_ls]):
                val_ls.append(val)

        if n_components == 1:
            break # quit after adding metrics of first n_comp=1 

    return r2_ls, rmse_ls, n_components_ls, n_unique_components_ls, components_ls


In [132]:
def getPCAdf(df, target='age'):
    print('input shape', df.shape)

    y = np.ravel(df[[target]])#.values
    X_df = df.drop([target], axis=1)

    print('OG num features:', X_df.shape[1])
    X = X_df.values
    X_train, X_test, y_train, y_test = train_test_split(X,y ,
                                    random_state=2018, 
                                    test_size=0.2, 
                                    shuffle=True)
    print('Shapes of X_train,', 'X_test,', 'y_train,', 'y_test:')
    print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

    # standardize
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # pca for all retained_ths
    initial_feature_names = X_df.columns
    retained_ths = [None, 'mle', 0.99] + list(np.arange(0.95, 0, -0.05))
    r2_ls, rmse_ls, n_components_ls, n_unique_components_ls, components_ls = getRVoutcomes(
        retained_ths, initial_feature_names, X_train, X_test, y_train, y_test) 

    # create final dataframe
    pca_df = pd.DataFrame(list(zip(retained_ths, n_components_ls, n_unique_components_ls, r2_ls, rmse_ls)), 
        columns=['Variance_Retained', 'N_Components', 'N_Unique_Components', 'R2', 'RMSE'])
    out_df = pca_df.join(pd.DataFrame(components_ls))
    
    return out_df.round(4)
        

In [133]:
num_cores = os.cpu_count()
with parallel_backend('threading', n_jobs=num_cores):
    pc_df = getPCAdf(df_nutri_feats_lowcat_allSex, )

pc_df.head(20)

input shape (3057515, 28)
OG num features: 27
Shapes of X_train, X_test, y_train, y_test:
(2446012, 27) (611503, 27) (2446012,) (611503,)


Unnamed: 0,Variance_Retained,N_Components,N_Unique_Components,R2,RMSE,PC_1,PC_2,PC_3,PC_4,PC_5,PC_6,PC_7,PC_8,PC_9,PC_10,PC_11,PC_12,PC_13,PC_14,PC_15,PC_16,PC_17,PC_18,PC_19,PC_20,PC_21,PC_22,PC_23,PC_24,PC_25,PC_26,PC_27
0,,27,23,0.0466,13.24,NF_PHOS,NF_CAROT,N_DIET,NF_RETIN,N_MTYPE,NF_CHO,N_CD_VOL,N_TD_VOL,N_WAT_C,NF_RETIN,N_MEAL_P,N_MEAL,NF_NA,NF_FE,NF_VITC,N_MTYPE,NF_CA,NF_FAT,N_FM_WT,NF_K,NF_NIAC,NF_INTK3,NF_PHOS,N_DIET,NF_VA,NF_EN,NF_INTK
1,mle,26,22,0.0466,13.24,NF_PHOS,NF_CAROT,N_DIET,NF_RETIN,N_MTYPE,NF_CHO,N_CD_VOL,N_TD_VOL,N_WAT_C,NF_RETIN,N_MEAL_P,N_MEAL,NF_NA,NF_FE,NF_VITC,N_MTYPE,NF_CA,NF_FAT,N_FM_WT,NF_K,NF_NIAC,NF_INTK3,NF_PHOS,N_DIET,NF_VA,NF_EN,
2,0.99,22,20,0.0452,13.2499,NF_PHOS,NF_CAROT,N_DIET,NF_RETIN,N_MTYPE,NF_CHO,N_CD_VOL,N_TD_VOL,N_WAT_C,NF_RETIN,N_MEAL_P,N_MEAL,NF_NA,NF_FE,NF_VITC,N_MTYPE,NF_CA,NF_FAT,N_FM_WT,NF_K,NF_NIAC,NF_INTK3,,,,,
3,0.95,18,16,0.0446,13.2541,NF_PHOS,NF_CAROT,N_DIET,NF_RETIN,N_MTYPE,NF_CHO,N_CD_VOL,N_TD_VOL,N_WAT_C,NF_RETIN,N_MEAL_P,N_MEAL,NF_NA,NF_FE,NF_VITC,N_MTYPE,NF_CA,NF_FAT,,,,,,,,,
4,0.9,16,14,0.0425,13.2684,NF_PHOS,NF_CAROT,N_DIET,NF_RETIN,N_MTYPE,NF_CHO,N_CD_VOL,N_TD_VOL,N_WAT_C,NF_RETIN,N_MEAL_P,N_MEAL,NF_NA,NF_FE,NF_VITC,N_MTYPE,,,,,,,,,,,
5,0.85,14,13,0.0419,13.273,NF_PHOS,NF_CAROT,N_DIET,NF_RETIN,N_MTYPE,NF_CHO,N_CD_VOL,N_TD_VOL,N_WAT_C,NF_RETIN,N_MEAL_P,N_MEAL,NF_NA,NF_FE,,,,,,,,,,,,,
6,0.8,12,11,0.0408,13.2807,NF_PHOS,NF_CAROT,N_DIET,NF_RETIN,N_MTYPE,NF_CHO,N_CD_VOL,N_TD_VOL,N_WAT_C,NF_RETIN,N_MEAL_P,N_MEAL,,,,,,,,,,,,,,,
7,0.75,11,10,0.0406,13.2821,NF_PHOS,NF_CAROT,N_DIET,NF_RETIN,N_MTYPE,NF_CHO,N_CD_VOL,N_TD_VOL,N_WAT_C,NF_RETIN,N_MEAL_P,,,,,,,,,,,,,,,,
8,0.7,9,9,0.036,13.3136,NF_PHOS,NF_CAROT,N_DIET,NF_RETIN,N_MTYPE,NF_CHO,N_CD_VOL,N_TD_VOL,N_WAT_C,,,,,,,,,,,,,,,,,,
9,0.65,8,8,0.0359,13.3145,NF_PHOS,NF_CAROT,N_DIET,NF_RETIN,N_MTYPE,NF_CHO,N_CD_VOL,N_TD_VOL,,,,,,,,,,,,,,,,,,,


In [134]:
break

SyntaxError: 'break' outside loop (668683560.py, line 1)

# FAMD (Factorial Analysis of Mixed Data)
* https://towardsdatascience.com/famd-how-to-generalize-pca-to-categorical-and-numerical-data-2ddbeb2b9210

In [140]:
# create df cats and non cats for FAMD PCA
df_nutri_feats_allTypes = df1[noncats + cats + basic_feats] # add basic feats as well
df_nutri_feats_allTypes.shape

ValueError: operands could not be broadcast together with shapes (23,) (6,) 

In [136]:
# check variance threshold
variance = VarianceThreshold(threshold=(0.9 * (1-0.9)))
variance.fit(df_nutri_feats_allTypes)
arr_vari = variance.get_support()

df_vari = df_nutri_feats_allTypes[arr_vari].shape[1]

print(df_nutri_feats_allTypes.shape[1], df_vari.shape[1])

NameError: name 'df_nutri_feats_allTypes' is not defined