# Mini Project - USDA Food Composition

[Data](https://drive.google.com/open?id=1ZSeFZs4PijD1dF6fFw_mYUAUsPawaEmU)


# Step 1: Data Familiarization

Read the documentation to familiarize yourself with the content:

https://fdc.nal.usda.gov/data-documentation.html

* What is this data about?
* What kind of information does it contain? 
* What is the meaning of this information? 
* How is the data structured? 
* What fields will I need to use? 
* What is their meaning? 
* How are the variables distributed?


In [None]:
import pandas as pd
import altair as alt
import numpy as np
# Turn off Altair's max-rows check so the charts below can be built from the full dataset.
alt.data_transformers.disable_max_rows()
# Colab only: uncomment to mount Google Drive before reading the CSV.
# from google.colab import drive
# drive.mount('/content/drive', force_remount=True)

In [None]:
# Load the USDA table (comma-separated, which is read_csv's default delimiter).
data_df = pd.read_csv('usda-grp-nomiss.csv', sep=',')
data_df.head()

In [None]:
# Columns pulled from data_df for the numeric analysis.
# NOTE(review): 'GmWt_Desc1' / 'GmWt_Desc2' look like text descriptions; if so they
# become all-zero after the numeric coercion in the next cell — confirm they belong here.
cols = [
    'Protein', 'Lipid_Tot', 'Ash', 'Carbohydrt', 'Fiber_TD',
    'Sugar_Tot', 'Calcium', 'Iron', 'Magnesium', 'Phosphorus', 'Potassium',
    'Sodium', 'Zinc', 'Copper', 'Manganese', 'Selenium', 'Vit_C', 'Thiamin',
    'Riboflavin', 'Niacin', 'Panto_Acid', 'Vit_B6', 'Folate_Tot.',
    'Folic_Acid', 'Food_Folate', 'Folate_DFE', 'Choline_Tot', 'Vit_B12',
    'Vit_A_IU', 'Vit_A_RAE', 'Retinol', 'Alpha_Carot', 'Beta_Carot',
    'Beta_Crypt', 'Lycopene', 'Lut.Zea', 'Vit_E', 'Vit_D', 'Vit_D.1',
    'Vit_K', 'FA_Sat', 'FA_Mono', 'FA_Poly', 'Cholestrl', 'GmWt_1',
    'GmWt_Desc1', 'GmWt_2', 'GmWt_Desc2', 'Refuse_Pct',
]


In [None]:
# Coerce every selected column to numeric (unparseable values become NaN),
# then treat missing values as 0.
processed = (
    data_df[cols]
    .apply(pd.to_numeric, errors="coerce")
    .fillna(0)
)

# Step 2: Answer Data Questions

# Set 2

Repeat the same 4 steps outlined above, focusing on each of the following subsets of variables:

* All the macronutrients
* All the vitamins
* All the minerals


## Analysis for All Vitamins :

In [None]:
# Columns treated as the "vitamin" subset.
# NOTE(review): 'Panto_Acid' is in cols but not listed here — confirm the omission is intended.
vit_cols = [
    'Folate_Tot.', 'Folic_Acid', 'Food_Folate', 'Folate_DFE', 'Choline_Tot',
    'Vit_C', 'Vit_B6', 'Vit_B12', 'Vit_A_IU', 'Vit_A_RAE',
    'Vit_E', 'Vit_D', 'Vit_D.1', 'Vit_K',
    'Retinol', 'Alpha_Carot', 'Beta_Carot', 'Beta_Crypt', 'Lycopene', 'Lut.Zea',
    'Niacin', 'Riboflavin', 'Thiamin',
]
vit_cols

### Clustering:

In [None]:
# Run PCA for dimensionality reduction

from sklearn.decomposition import PCA

# Fixed seed: PCA's default solver choice can be a randomized SVD, so without
# a seed the 2-D projection may differ slightly between runs.
pca = PCA(n_components=2, random_state=42)

# Feature matrix restricted to the vitamin columns.
X = processed[vit_cols].values

# Project onto the first two principal components.
reduced_data = pca.fit_transform(X)

In [None]:
# Wrap the two PCA components in a frame and show them as a scatter plot.
rd = pd.DataFrame(reduced_data, columns=['x', 'y'])
(
    alt.Chart(rd)
    .mark_point()
    .encode(x=alt.X('x:Q'), y=alt.Y('y:Q'))
)

In [None]:
#Run Tsne :
from sklearn.manifold import TSNE

In [None]:
# t-SNE is stochastic: fix the seed so the embedding, and every cluster label
# later derived from it, is the same on each Restart & Run All.
result_tsne = TSNE(n_components=2, random_state=42).fit_transform(X)

tsne_df = pd.DataFrame(data=result_tsne, columns=['x','y'])

alt.Chart(tsne_df).mark_point().encode(
    x='x:Q',
    y='y:Q',
)

In [None]:
# Use the elbow method to decide the number of clusters
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

Sum_of_squared_distances = []
K = range(1,8)
for k in K:
    # Seeded so the inertia curve is reproducible.
    km = KMeans(n_clusters=k, random_state=42)
    # Fit on the two PCA coordinates only: rd later gains a "clusterlabel"
    # column, and re-running this cell must not feed those labels back in.
    km = km.fit(rd[['x', 'y']])
    Sum_of_squared_distances.append(km.inertia_)

plt.plot(K, Sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k')
plt.show()

In [None]:
# Running K-means on PCA reduced data.
# NOTE(review): the elbow scan only covers k = 1..7, yet 10 clusters are used here — confirm intended.
# Seeded for reproducible labels; fitted on the x/y coordinates only so a re-run
# after "clusterlabel" has been added to rd does not cluster on the old labels.
kmeans = KMeans(n_clusters=10, random_state=42)
model = kmeans.fit(rd[['x', 'y']])
labels = kmeans.predict(rd[['x', 'y']])

centers = model.cluster_centers_


In [None]:
# Add the cluster column to the PCA frame.
# Plain assignment (instead of insert with allow_duplicates=True) keeps the cell
# idempotent: a re-run overwrites the column rather than adding a duplicate.
rd["clusterlabel"] = labels
rd.head()

In [None]:
# Scatter of the PCA-reduced vitamin data, coloured by K-means cluster
points = (
    alt.Chart(rd)
    .mark_point()
    .encode(
        x=alt.X('x:Q'),
        y=alt.Y('y:Q'),
        color=alt.Color('clusterlabel:N'),
    )
)

points

In [None]:
# Running K-means on t-SNE reduced data.
# Seeded for reproducible labels; fitted on the x/y coordinates only so a re-run
# after "clusterlabel" has been added to tsne_df does not cluster on the old labels.
kmeans = KMeans(n_clusters=12, random_state=42)
model = kmeans.fit(tsne_df[['x', 'y']])
labels_tsne = kmeans.predict(tsne_df[['x', 'y']])

centers_tsne = model.cluster_centers_

In [None]:
# Add the cluster column to the t-SNE frame.
# Plain assignment (instead of insert with allow_duplicates=True) keeps the cell
# idempotent: a re-run overwrites the column rather than adding a duplicate.
tsne_df["clusterlabel"] = labels_tsne
tsne_df.head()

In [None]:
# Scatter of the t-SNE-reduced vitamin data, coloured by K-means cluster
points = (
    alt.Chart(tsne_df)
    .mark_point()
    .encode(
        x=alt.X('x:Q'),
        y=alt.Y('y:Q'),
        color=alt.Color('clusterlabel:N'),
    )
)

points

### Cluster vs Labels:

##### Question: Do foods belonging to the same food group fall into the same clusters?


In [None]:
# Attach the t-SNE cluster labels (vitamin features) to a copy of the raw table,
# so clusters can be compared against food groups.
data_vitamins = data_df.copy()
data_vitamins.insert(
    loc=2,
    column="clusterlabel",
    value=labels_tsne,
    allow_duplicates=True,
)

In [None]:
# Per food group: how many items land in each cluster.
foodgroupcluster_vitamin = (
    data_vitamins
    .groupby('FdGrp_Desc')['clusterlabel']
    .value_counts()
)
foodgroupcluster_vitamin

In [None]:
# Normalized stacked bars: share of each cluster within every food group.
vit = (
    alt.Chart(data_vitamins)
    .mark_bar()
    .encode(
        x=alt.X('FdGrp_Desc', sort='-y'),
        y=alt.Y('count(clusterlabel)', stack="normalize"),
        color=alt.Color('clusterlabel:N'),
    )
)
vit

### Interpretation:

##### Are there foods that belong to the same food group but fall into different clusters (that is, same food group but different food composition)?

In [None]:
# Numeric view of the vitamin columns: unparseable values -> NaN -> 0.
data_vitamins_processed = (
    data_vitamins[vit_cols]
    .apply(pd.to_numeric, errors="coerce")
    .fillna(0)
)
data_vitamins_processed.head()

In [None]:
from sklearn import preprocessing

# Min-max scale each vitamin column, then restore the column names.
x = data_vitamins_processed.to_numpy()
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)

vit_normalized_df = pd.DataFrame(x_scaled, columns=data_vitamins_processed.columns)

In [None]:
# Expose the row index as an explicit 'id' column ahead of the pdp plot.
wide_form = vit_normalized_df.rename_axis('id').reset_index()
wide_form.head()


In [None]:
# Attach the t-SNE cluster labels. The original insert(..., allow_duplicates=True)
# added a second "clusterlabel" column each time the cell was re-run; overwrite
# the column when it is already present instead.
if "clusterlabel" in wide_form.columns:
    wide_form["clusterlabel"] = labels_tsne
else:
    wide_form.insert(2, "clusterlabel", labels_tsne)
wide_form.columns

In [None]:
# Collapse to one row per cluster (column-wise median), then reshape to long
# form: one row per (cluster, metric) pair.
wide_form = (
    wide_form
    .groupby('clusterlabel')
    .median()
    .reset_index()
)
to_plot = wide_form.melt(
    id_vars=['id', 'clusterlabel'],
    var_name='metrics',
    value_name='values',
)

to_plot

In [None]:
# Per-cluster median of every (scaled) vitamin, one line per cluster
(
    alt.Chart(to_plot)
    .mark_line()
    .encode(
        x=alt.X('metrics:N'),
        y=alt.Y('values:Q'),
        color=alt.Color('clusterlabel:N'),
    )
    .properties(width=1000, height=600)
)

In [None]:
# Working copy of the raw table for the per-food-group pdp plots
# (cluster-wise and food-group-wise spread of nutrients).
pdp_data = data_df.copy(deep=True)
pdp_data.head()

In [None]:
# Normalize data for pdp
# NOTE(review): 'GmWt_Desc1' / 'GmWt_Desc2' look like text columns; if so they
# end up all-zero after coercion — confirm they belong in this list.
cols_to_norm = [
    'Water', 'Energ_Kcal', 'Protein', 'Lipid_Tot', 'Ash', 'Carbohydrt', 'Fiber_TD',
    'Sugar_Tot', 'Calcium', 'Iron', 'Magnesium', 'Phosphorus', 'Potassium',
    'Sodium', 'Zinc', 'Copper', 'Manganese', 'Selenium', 'Vit_C', 'Thiamin',
    'Riboflavin', 'Niacin', 'Panto_Acid', 'Vit_B6', 'Folate_Tot.',
    'Folic_Acid', 'Food_Folate', 'Folate_DFE', 'Choline_Tot', 'Vit_B12',
    'Vit_A_IU', 'Vit_A_RAE', 'Retinol', 'Alpha_Carot', 'Beta_Carot',
    'Beta_Crypt', 'Lycopene', 'Lut.Zea', 'Vit_E', 'Vit_D', 'Vit_D.1',
    'Vit_K', 'FA_Sat', 'FA_Mono', 'FA_Poly', 'Cholestrl', 'GmWt_1',
    'GmWt_Desc1', 'GmWt_2', 'GmWt_Desc2', 'Refuse_Pct',
]

# Always slice from data_df. The original re-sliced pdp_data in place, so once a
# later cell had narrowed pdp_data to vit_cols, re-running this cell raised a
# KeyError for the columns that were no longer there.
pdp_data = data_df[cols_to_norm].apply(pd.to_numeric, errors="coerce")
pdp_data = pdp_data.replace(np.nan, 0)

# Min-max scale every column and rebuild a labelled frame.
x = pdp_data[cols_to_norm].values
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
pdp_data_normalized = pd.DataFrame(x_scaled)
pdp_data_normalized_df = pd.DataFrame(x_scaled, columns=pdp_data.columns)

# Re-attach the descriptive columns from the raw table (aligned on the row index).
pdp_data_normalized_df['FdGrp_Desc'] = data_df['FdGrp_Desc']
pdp_data_normalized_df['Shrt_Desc'] = data_df['Shrt_Desc']
pdp_data_normalized_df.head()

In [None]:
# Normalize data for pdp - vitamins only

# Restrict to the vitamin columns; coerce to numeric and zero-fill.
pdp_data = (
    pdp_data[vit_cols]
    .apply(pd.to_numeric, errors="coerce")
    .fillna(0)
)

# Min-max scale each vitamin column.
x = pdp_data[vit_cols].to_numpy()
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
pdp_data_normalized = pd.DataFrame(x_scaled)

# Labelled frame plus the food-group description from the raw table.
pdp_data_normalized_df = pd.DataFrame(x_scaled, columns=pdp_data.columns).assign(
    FdGrp_Desc=data_df['FdGrp_Desc']
)
pdp_data_normalized_df.head()

In [None]:
# Attach the t-SNE cluster labels. The original insert(..., allow_duplicates=True)
# added a second "clusterlabel" column each time the cell was re-run; overwrite
# the column when it is already present instead.
if "clusterlabel" in pdp_data_normalized_df.columns:
    pdp_data_normalized_df["clusterlabel"] = labels_tsne
else:
    pdp_data_normalized_df.insert(2, "clusterlabel", labels_tsne)
pdp_data_normalized_df.head()

In [None]:

# Give every row an explicit 'id', then go from wide to long form:
# one row per (item, metric) with cluster and food group kept as identifiers.
wide_form = pdp_data_normalized_df.rename_axis('id').reset_index()
wide_form.head()
to_plot = wide_form.melt(
    id_vars=['id', 'clusterlabel', 'FdGrp_Desc'],
    var_name='metrics',
    value_name='values',
)

to_plot.shape

###### Are there any foods that tend to be different from all the others (that is, they are outliers)? What is unique about them? What makes them unique? For the clusters and outliers identified above, what aspects of their food composition make them different from the others and similar to the foods that belong to the same cluster (that is, what makes a cluster (or outlier) unique in terms of its data values and distribution, and how does it compare to the other clusters)?


In [None]:
# Vitamin profile of every item: one facet row per food group, lines coloured by cluster

(
    alt.Chart(to_plot)
    .mark_line(opacity=0.2, strokeWidth=.5)
    .encode(
        x=alt.X('metrics:N'),
        y=alt.Y('values:Q'),
        detail=alt.Detail('id:N'),
        row=alt.Row('FdGrp_Desc:O'),
        color=alt.Color('clusterlabel:O', scale=alt.Scale(scheme='rainbow')),
    )
    .properties(width=1000, height=100)
)

In [None]:
# Vitamin profile of every item: one facet row per cluster, lines coloured by food group

(
    alt.Chart(to_plot)
    .mark_line(opacity=0.7, strokeWidth=.5)
    .encode(
        x=alt.X('metrics:N'),
        y=alt.Y('values:Q'),
        detail=alt.Detail('id:N'),
        row=alt.Row('clusterlabel:O'),
        color=alt.Color('FdGrp_Desc:O', scale=alt.Scale(scheme='rainbow')),
    )
    .properties(width=1000, height=100)
)

## Analysis for Minerals :

In [None]:
# Columns treated as the "minerals" subset.
minerals_cols = [
    'Calcium', 'Iron', 'Magnesium', 'Phosphorus', 'Potassium',
    'Sodium', 'Zinc', 'Copper', 'Manganese', 'Selenium',
    'Ash',
]
minerals_cols

In [None]:
# Feature matrix restricted to the mineral columns.
X_minerals = processed[minerals_cols].values

# t-SNE is stochastic: fix the seed so the embedding, and every cluster label
# later derived from it, is the same on each Restart & Run All.
result_tsne_mineral = TSNE(n_components=2, random_state=42).fit_transform(X_minerals)

tsne_df_mineral = pd.DataFrame(data=result_tsne_mineral, columns=['x','y'])

alt.Chart(tsne_df_mineral).mark_point().encode(
    x='x:Q',
    y='y:Q',
)

### Clustering:

In [None]:
# K-means on the mineral t-SNE embedding.
# Seeded for reproducible labels; fitted on the x/y coordinates only so a re-run
# after "clusterlabel" has been added to tsne_df_mineral does not cluster on the old labels.
kmeans = KMeans(n_clusters=12, random_state=42)
model = kmeans.fit(tsne_df_mineral[['x', 'y']])
labels_tsne_mineral = kmeans.predict(tsne_df_mineral[['x', 'y']])

centers_tsne_mineral = model.cluster_centers_

In [None]:
# Add the cluster column to the mineral t-SNE frame.
# Plain assignment (instead of insert with allow_duplicates=True) keeps the cell
# idempotent: a re-run overwrites the column rather than adding a duplicate.
tsne_df_mineral["clusterlabel"] = labels_tsne_mineral

# Scatter of the embedding, coloured by cluster.
points = alt.Chart(tsne_df_mineral).mark_point().encode(
    x='x:Q',
    y='y:Q',
    color='clusterlabel:N'
)

points

### Clusters vs labels

In [None]:
# Attach the t-SNE cluster labels (mineral features) to a copy of the raw table,
# so clusters can be compared against food groups.
data_minerals = data_df.copy()
data_minerals.insert(
    loc=2,
    column="clusterlabel",
    value=labels_tsne_mineral,
    allow_duplicates=True,
)
data_minerals.columns

In [None]:
# Normalized stacked bars: share of each cluster within every food group.
minerals = (
    alt.Chart(data_minerals)
    .mark_bar()
    .encode(
        x=alt.X('FdGrp_Desc', sort='-y'),
        y=alt.Y('count(clusterlabel)', stack="normalize"),
        color=alt.Color('clusterlabel:N'),
    )
)
minerals

### Interpretation

In [None]:
# Numeric view of the mineral columns: unparseable values -> NaN -> 0.
data_minerals_processed = (
    data_minerals[minerals_cols]
    .apply(pd.to_numeric, errors="coerce")
    .fillna(0)
)
data_minerals_processed.head()

In [None]:
# Min-max scale each mineral column before the pdp plot, then restore the column names.
x_min = data_minerals_processed.to_numpy()
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled_min = min_max_scaler.fit_transform(x_min)

min_normalized_df = pd.DataFrame(x_scaled_min, columns=data_minerals_processed.columns)
min_normalized_df.head()

In [None]:
# Explicit 'id' column from the row index, plus the mineral cluster labels.
wide_form_min = min_normalized_df.rename_axis('id').reset_index()
wide_form_min.insert(
    loc=2,
    column="clusterlabel",
    value=labels_tsne_mineral,
    allow_duplicates=True,
)
wide_form_min.columns

In [None]:
# Collapse to one row per cluster (column-wise median), then reshape to long form.
wide_form_min = (
    wide_form_min
    .groupby('clusterlabel')
    .median()
    .reset_index()
)
to_plot_min = wide_form_min.melt(
    id_vars=['id', 'clusterlabel'],
    var_name='metrics',
    value_name='values',
)

to_plot_min

In [None]:
# Per-cluster median of every (scaled) mineral, one line per cluster
(
    alt.Chart(to_plot_min)
    .mark_line()
    .encode(
        x=alt.X('metrics:N'),
        y=alt.Y('values:Q'),
        color=alt.Color('clusterlabel:N'),
    )
    .properties(width=800, height=600)
)

In [None]:
# Working copy of the raw table for the per-food-group pdp plots
# (cluster-wise and food-group-wise spread of nutrients).
pdp_data_mineral = data_df.copy(deep=True)
pdp_data_mineral.head()

In [None]:
# Normalize data for pdp
# NOTE(review): 'GmWt_Desc1' / 'GmWt_Desc2' look like text columns; if so they
# end up all-zero after coercion — confirm they belong in this list.
cols_to_norm = [
    'Water', 'Energ_Kcal', 'Protein', 'Lipid_Tot', 'Ash', 'Carbohydrt', 'Fiber_TD',
    'Sugar_Tot', 'Calcium', 'Iron', 'Magnesium', 'Phosphorus', 'Potassium',
    'Sodium', 'Zinc', 'Copper', 'Manganese', 'Selenium', 'Vit_C', 'Thiamin',
    'Riboflavin', 'Niacin', 'Panto_Acid', 'Vit_B6', 'Folate_Tot.',
    'Folic_Acid', 'Food_Folate', 'Folate_DFE', 'Choline_Tot', 'Vit_B12',
    'Vit_A_IU', 'Vit_A_RAE', 'Retinol', 'Alpha_Carot', 'Beta_Carot',
    'Beta_Crypt', 'Lycopene', 'Lut.Zea', 'Vit_E', 'Vit_D', 'Vit_D.1',
    'Vit_K', 'FA_Sat', 'FA_Mono', 'FA_Poly', 'Cholestrl', 'GmWt_1',
    'GmWt_Desc1', 'GmWt_2', 'GmWt_Desc2', 'Refuse_Pct',
]

# Always slice from data_df. The original re-sliced pdp_data_mineral in place, so
# once a later cell had narrowed it to minerals_cols, re-running this cell raised
# a KeyError for the columns that were no longer there.
pdp_data_mineral = data_df[cols_to_norm].apply(pd.to_numeric, errors="coerce")
pdp_data_mineral = pdp_data_mineral.replace(np.nan, 0)

# Min-max scale every column and rebuild a labelled frame.
x_mineral = pdp_data_mineral[cols_to_norm].values
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled_min = min_max_scaler.fit_transform(x_mineral)
pdp_data_normalized = pd.DataFrame(x_scaled_min)
pdp_data_normalized_df = pd.DataFrame(x_scaled_min, columns=pdp_data_mineral.columns)

# Re-attach the descriptive columns from the raw table (aligned on the row index).
pdp_data_normalized_df['FdGrp_Desc'] = data_df['FdGrp_Desc']
pdp_data_normalized_df['Shrt_Desc'] = data_df['Shrt_Desc']
pdp_data_normalized_df.head()

In [None]:
# Normalize data for pdp - minerals only

# Restrict to the mineral columns; coerce to numeric and zero-fill.
pdp_data_mineral = (
    pdp_data_mineral[minerals_cols]
    .apply(pd.to_numeric, errors="coerce")
    .fillna(0)
)

# Min-max scale each mineral column.
x_mineral = pdp_data_mineral[minerals_cols].to_numpy()
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled_min = min_max_scaler.fit_transform(x_mineral)
pdp_data_normalized = pd.DataFrame(x_scaled_min)

# Labelled frame plus the food-group description from the raw table.
pdp_data_normalized_df = pd.DataFrame(
    x_scaled_min, columns=pdp_data_mineral.columns
).assign(FdGrp_Desc=data_df['FdGrp_Desc'])
pdp_data_normalized_df.head()

In [None]:
# Attach the mineral cluster labels. The original insert(..., allow_duplicates=True)
# added a second "clusterlabel" column each time the cell was re-run; overwrite
# the column when it is already present instead.
if "clusterlabel" in pdp_data_normalized_df.columns:
    pdp_data_normalized_df["clusterlabel"] = labels_tsne_mineral
else:
    pdp_data_normalized_df.insert(2, "clusterlabel", labels_tsne_mineral)
pdp_data_normalized_df.head()

# Expose the row index as 'id' so each item can be drawn as its own line.
wide_form = pdp_data_normalized_df.reset_index().rename(columns={'index':'id'})
wide_form.head()
# Convert from wide form to long form directly.
to_plot = pd.melt(wide_form,id_vars=['id','clusterlabel','FdGrp_Desc'],var_name='metrics', value_name='values')

to_plot.shape

In [None]:
# Mineral profile of every item: one facet row per food group, lines coloured by cluster

(
    alt.Chart(to_plot)
    .mark_line(opacity=0.2, strokeWidth=.5)
    .encode(
        x=alt.X('metrics:N'),
        y=alt.Y('values:Q'),
        detail=alt.Detail('id:N'),
        row=alt.Row('FdGrp_Desc:O'),
        color=alt.Color('clusterlabel:O', scale=alt.Scale(scheme='sinebow')),
    )
    .properties(width=1000, height=100)
)

In [None]:
# Mineral profile of every item: one facet row per cluster, lines coloured by food group

(
    alt.Chart(to_plot)
    .mark_line(opacity=0.7, strokeWidth=.5)
    .encode(
        x=alt.X('metrics:N'),
        y=alt.Y('values:Q'),
        detail=alt.Detail('id:N'),
        row=alt.Row('clusterlabel:O'),
        color=alt.Color('FdGrp_Desc:O', scale=alt.Scale(scheme='sinebow')),
    )
    .properties(width=1000, height=100)
)

## Analysis for all Macronutrients :

In [None]:
# Columns treated as the "macronutrients" subset.
macronutrients = [
    'Protein', 'Lipid_Tot', 'Carbohydrt', 'Fiber_TD', 'Sugar_Tot',
    'FA_Sat', 'FA_Mono', 'FA_Poly', 'Cholestrl',
]


In [None]:
# Coerce the macronutrient columns under their own name. The original reassigned
# the shared `processed` frame here, which dropped the vitamin and mineral columns
# and made the earlier cells that read processed[...] fail when re-run.
processed_macro = data_df[macronutrients].apply(pd.to_numeric, errors="coerce")
processed_macro = processed_macro.replace(np.nan, 0)
X_macronutrients = processed_macro[macronutrients].values

# t-SNE is stochastic: fix the seed so the embedding, and every cluster label
# later derived from it, is the same on each Restart & Run All.
result_tsne_macro = TSNE(n_components=2, random_state=42).fit_transform(X_macronutrients)

tsne_df_macro = pd.DataFrame(data=result_tsne_macro, columns=['x','y'])

# The chart must be the cell's last expression to render; previously a trailing
# `X_macronutrients` line displayed the raw array and discarded the chart.
alt.Chart(tsne_df_macro).mark_point().encode(
    x='x:Q',
    y='y:Q',
)

In [None]:
# K-means on the macronutrient t-SNE embedding.
# Seeded for reproducible labels; fitted on the x/y coordinates only so a re-run
# after "clusterlabel" has been added to tsne_df_macro does not cluster on the old labels.
kmeans = KMeans(n_clusters=12, random_state=42)
model = kmeans.fit(tsne_df_macro[['x', 'y']])
labels_tsne_macro = kmeans.predict(tsne_df_macro[['x', 'y']])

centers_tsne_macro = model.cluster_centers_

In [None]:
# Add the cluster column to the macronutrient t-SNE frame.
# Plain assignment (instead of insert with allow_duplicates=True) keeps the cell
# idempotent: a re-run overwrites the column rather than adding a duplicate.
tsne_df_macro["clusterlabel"] = labels_tsne_macro

# Scatter of the embedding, coloured by cluster.
points = alt.Chart(tsne_df_macro).mark_point().encode(
    x='x:Q',
    y='y:Q',
    color='clusterlabel:N'
)

points

### Clusters vs labels

In [None]:
# Attach the t-SNE cluster labels (macronutrient features) to a copy of the raw
# table, so clusters can be compared against food groups.
data_macro = data_df.copy()
data_macro.insert(
    loc=2,
    column="clusterlabel",
    value=labels_tsne_macro,
    allow_duplicates=True,
)
# for step 3


In [None]:
# Normalized stacked bars: share of each cluster within every food group.
all_macro = (
    alt.Chart(data_macro)
    .mark_bar()
    .encode(
        x=alt.X('FdGrp_Desc', sort='-y'),
        y=alt.Y('count(clusterlabel)', stack="normalize"),
        color=alt.Color('clusterlabel:N'),
    )
)
all_macro

### Interpretation

In [None]:
# Numeric view of the macronutrient columns: unparseable values -> NaN -> 0.
data_macro_processed = (
    data_macro[macronutrients]
    .apply(pd.to_numeric, errors="coerce")
    .fillna(0)
)
data_macro_processed.head()

In [None]:
# Min-max scale each macronutrient column before the pdp plot, then restore the column names.
x_macro = data_macro_processed.to_numpy()
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled_macro = min_max_scaler.fit_transform(x_macro)

macro_normalized_df = pd.DataFrame(x_scaled_macro, columns=data_macro_processed.columns)
macro_normalized_df.head()

In [None]:
# Explicit 'id' column from the row index, plus the macronutrient cluster labels.
wide_form_macro = macro_normalized_df.rename_axis('id').reset_index()
wide_form_macro.insert(
    loc=2,
    column="clusterlabel",
    value=labels_tsne_macro,
    allow_duplicates=True,
)

# Collapse to one row per cluster (column-wise median), then reshape to long form.
wide_form_macro = (
    wide_form_macro
    .groupby('clusterlabel')
    .median()
    .reset_index()
)
to_plot_macro = wide_form_macro.melt(
    id_vars=['id', 'clusterlabel'],
    var_name='metrics',
    value_name='values',
)

to_plot_macro

In [None]:
# Per-cluster median of every (scaled) macronutrient, one line per cluster
(
    alt.Chart(to_plot_macro)
    .mark_line()
    .encode(
        x=alt.X('metrics:N'),
        y=alt.Y('values:Q'),
        color=alt.Color('clusterlabel:N'),
    )
    .properties(width=800, height=600)
)

In [None]:
# Working copy of the raw table for the macronutrient pdp plots.
pdp_data_macro = data_df.copy(deep=True)

In [None]:
# Plotting the PDP by food group - cluster-wise and food-group-wise spread of nutrients

# Normalize data for pdp
cols_to_norm = [
    'Water', 'Energ_Kcal', 'Protein', 'Lipid_Tot', 'Ash', 'Carbohydrt', 'Fiber_TD',
    'Sugar_Tot', 'Calcium', 'Iron', 'Magnesium', 'Phosphorus', 'Potassium',
    'Sodium', 'Zinc', 'Copper', 'Manganese', 'Selenium', 'Vit_C', 'Thiamin',
    'Riboflavin', 'Niacin', 'Panto_Acid', 'Vit_B6', 'Folate_Tot.',
    'Folic_Acid', 'Food_Folate', 'Folate_DFE', 'Choline_Tot', 'Vit_B12',
    'Vit_A_IU', 'Vit_A_RAE', 'Retinol', 'Alpha_Carot', 'Beta_Carot',
    'Beta_Crypt', 'Lycopene', 'Lut.Zea', 'Vit_E', 'Vit_D', 'Vit_D.1',
    'Vit_K', 'FA_Sat', 'FA_Mono', 'FA_Poly', 'Cholestrl', 'Refuse_Pct',
]

# Always slice from data_df. The original re-sliced pdp_data_macro in place, so
# once a later cell had narrowed it to the macronutrient columns, re-running this
# cell raised a KeyError for the columns that were no longer there.
pdp_data_macro = data_df[cols_to_norm].apply(pd.to_numeric, errors="coerce")
pdp_data_macro = pdp_data_macro.replace(np.nan, 0)

# Min-max scale every column and rebuild a labelled frame.
x = pdp_data_macro[cols_to_norm].values
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
pdp_data_normalized = pd.DataFrame(x_scaled)
pdp_data_normalized_df = pd.DataFrame(x_scaled, columns=pdp_data_macro.columns)

# Re-attach the food-group description from the raw table (aligned on the row index).
pdp_data_normalized_df['FdGrp_Desc'] = data_df['FdGrp_Desc']
pdp_data_normalized_df.head()

In [None]:
# Normalize data for pdp - macronutrients only
cols_to_norm = [
    'Protein', 'Lipid_Tot', 'Carbohydrt', 'Fiber_TD', 'Sugar_Tot',
    'FA_Sat', 'FA_Mono', 'FA_Poly', 'Cholestrl',
]

# Restrict to those columns; coerce to numeric and zero-fill.
pdp_data_macro = (
    pdp_data_macro[cols_to_norm]
    .apply(pd.to_numeric, errors="coerce")
    .fillna(0)
)

# Min-max scale each macronutrient column.
x = pdp_data_macro[cols_to_norm].to_numpy()
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
pdp_data_normalized = pd.DataFrame(x_scaled)

# Labelled frame plus the food-group description from the raw table.
pdp_data_normalized_df = pd.DataFrame(
    x_scaled, columns=pdp_data_macro.columns
).assign(FdGrp_Desc=data_df['FdGrp_Desc'])
pdp_data_normalized_df.head()

In [None]:
# Attach the macronutrient cluster labels. The original insert(..., allow_duplicates=True)
# added a second "clusterlabel" column each time the cell was re-run; overwrite
# the column when it is already present instead.
if "clusterlabel" in pdp_data_normalized_df.columns:
    pdp_data_normalized_df["clusterlabel"] = labels_tsne_macro
else:
    pdp_data_normalized_df.insert(2, "clusterlabel", labels_tsne_macro)
pdp_data_normalized_df.head()

# Expose the row index as 'id' so each item can be drawn as its own line.
wide_form = pdp_data_normalized_df.reset_index().rename(columns={'index':'id'})
wide_form.head()
# Convert from wide form to long form directly.
to_plot = pd.melt(wide_form,id_vars=['id','clusterlabel','FdGrp_Desc'],var_name='metrics', value_name='values')

to_plot.shape


In [None]:
# Macronutrient profile of every item: one facet row per food group, lines coloured by cluster

(
    alt.Chart(to_plot)
    .mark_line(opacity=0.7, strokeWidth=.5)
    .encode(
        x=alt.X('metrics:N'),
        y=alt.Y('values:Q'),
        detail=alt.Detail('id:N'),
        row=alt.Row('FdGrp_Desc:O'),
        color=alt.Color('clusterlabel:O', scale=alt.Scale(scheme='rainbow')),
    )
    .properties(width=1000, height=100)
)

In [None]:
# Macronutrient profile of every item: one facet row per cluster, lines coloured by food group

(
    alt.Chart(to_plot)
    .mark_line(opacity=0.7, strokeWidth=.5)
    .encode(
        x=alt.X('metrics:N'),
        y=alt.Y('values:Q'),
        detail=alt.Detail('id:N'),
        row=alt.Row('clusterlabel:O'),
        color=alt.Color('FdGrp_Desc:O', scale=alt.Scale(scheme='rainbow')),
    )
    .properties(width=1000, height=100)
)

#### Set 3: