# Data Analysis

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import os

## Introduction

- CRISP-DM problem
- The dataset
- Business Questions

## Python ETL

### Loading data

In [None]:
# Directory that holds the extracted CSV export.
folder = 'FoodData_Central_foundation_food_csv_2025-04-24'


def _read_table(file_name):
    """Read one CSV from `folder`, loading every column as a string."""
    return pd.read_csv(os.path.join(folder, file_name), dtype=str)


# Foods and their categories
foods = _read_table('food.csv')
category = _read_table('food_category.csv')

# Nutrients and the food-to-nutrient table
nutrients = _read_table('nutrient.csv')
food_nutrients = _read_table('food_nutrient.csv')

# Conversion-factor tables
food_nutrient_conversion_factor = _read_table('food_nutrient_conversion_factor.csv')
calorie_factor = _read_table('food_calorie_conversion_factor.csv')

### Data Transformations

In [35]:
# Data preparation, step 1: replace the generic `id` / `name` / `description`
# column names with table-specific ones, matching the join keys used by the
# later merge cells.
nutrient_columns = {'id': 'nutrient_id', 'name': 'nutrient_name'}
category_columns = {'id': 'food_category_id', 'description': 'food_category'}
conversion_columns = {'id': 'food_nutrient_conversion_factor_id'}

nutrients = nutrients.rename(columns=nutrient_columns)
category = category.rename(columns=category_columns)
food_nutrient_conversion_factor = food_nutrient_conversion_factor.rename(columns=conversion_columns)

# Step 2: keep only the food columns used downstream.
kept_food_columns = ['fdc_id', 'description', 'food_category_id']
foods = foods[kept_food_columns]

# Step 3: convert the string columns that are needed as numbers.
# Unparseable amounts become NaN; unparseable calorie factors raise.
food_nutrients['amount'] = pd.to_numeric(food_nutrients['amount'], errors='coerce')

for factor_column in ('protein_value', 'carbohydrate_value', 'fat_value'):
    calorie_factor[factor_column] = calorie_factor[factor_column].astype(float)

In [36]:
# Attach the nutrient names (and the other `nutrients` columns) to every
# measurement row. The result gets its own name instead of overwriting
# `food_nutrients`: re-running this cell then never merges an already-merged
# frame, which would create `nutrient_name_x` / `nutrient_name_y` columns and
# make the pivot below fail with a KeyError.
food_nutrients_named = food_nutrients.merge(
    nutrients,
    on='nutrient_id',
    how='left',
)

# The 10 nutrient ids with the most non-null `fdc_id` entries
top_nutrients = (
    food_nutrients_named.groupby('nutrient_id')['fdc_id']
    .count()
    .sort_values(ascending=False)
    .head(10)
    .index
)

# Always keep these nutrients, even when they are not among the top 10.
# Names that do not exist in `nutrients` are simply skipped; when a name maps
# to several ids, only the first one is kept.
target_names = ['Carbohydrates', 'Carbohydrate, by difference', 'Carbohydrate, other', 'Total lipid (fat)', 'Protein']
forced_ids = (
    nutrients[nutrients['nutrient_name'].isin(target_names)]
    .drop_duplicates(subset='nutrient_name')['nutrient_id']
)
top_nutrients = top_nutrients.union(pd.Index(forced_ids), sort=False)

# Keep only the measurements of the selected nutrients (top 10 plus forced ones)
food_nutrients_top = food_nutrients_named[
    food_nutrients_named['nutrient_id'].isin(top_nutrients)
]

# Pivot to one row per food and one column per nutrient name; several
# measurements of the same nutrient for one food are averaged.
nutrients_pivot = food_nutrients_top.pivot_table(
    index='fdc_id',
    columns='nutrient_name',
    values='amount',
    aggfunc='mean'
).reset_index()

In [None]:
# Attach the columns of food_nutrient_conversion_factor to each calorie-factor
# row (presumably this is where `fdc_id`, needed for the later join into
# `foods`, comes from - confirm against the CSV schema).
#
# Columns that a previous run of this cell already merged in are dropped
# first, so re-running the cell yields the same frame instead of suffixed
# duplicates such as `fdc_id_x` / `fdc_id_y`.
# NOTE(review): assumes the two tables share no column other than the key.
linked_columns = food_nutrient_conversion_factor.columns.drop(
    'food_nutrient_conversion_factor_id'
)
calorie_factor = (
    calorie_factor
    .drop(columns=linked_columns, errors='ignore')
    .merge(
        food_nutrient_conversion_factor,
        on='food_nutrient_conversion_factor_id',
        how='left',
    )
)

#### Join relational tables

In [40]:
# Build the wide food table with three successive left joins:
#   1. category columns, matched on food_category_id
#   2. pivoted nutrient amounts, matched on fdc_id
#   3. calorie conversion factors, matched on fdc_id
calorie_columns = ['fdc_id', 'protein_value', 'carbohydrate_value', 'fat_value']

foods = (
    foods
    .merge(category, how='left', on='food_category_id')
    .merge(nutrients_pivot, how='left', on='fdc_id')
    .merge(calorie_factor[calorie_columns], how='left', on='fdc_id')
)

In [49]:
# Energy contributed by each macronutrient: conversion factor * amount
protein_energy = foods['protein_value'] * foods['Protein']
carbohydrate_energy = foods['carbohydrate_value'] * foods['Carbohydrate, by difference']
fat_energy = foods['fat_value'] * foods['Total lipid (fat)']

# Total energy is the sum of the three parts (NaN when any part is missing)
foods['calculated_energy'] = protein_energy + carbohydrate_energy + fat_energy

#### Final Adjustments

In [None]:
# Remove the helper columns that are no longer needed. Rebinding `foods`
# (instead of `inplace=True`) together with `errors='ignore'` lets this cell be
# re-run without a KeyError once the columns are already gone.
foods = foods.drop(columns=['food_category_id', 'code'], errors='ignore')

# Keep only the foods for which an energy value could be calculated
has_energy = foods['calculated_energy'].notna()
foods = foods.loc[has_energy]

In [60]:
# Write the cleaned table to disk. `to_csv` does not create missing
# directories, so make sure the output folder exists first.
output_path = 'cleaned_data/foods_with_nutrients.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
foods.to_csv(output_path, index=False)

---
## Exploratory Data Analysis

In [66]:
# Number of missing values in each column
foods.isnull().sum()

fdc_id                         0
description                    0
food_category                  0
Calcium, Ca                    4
Carbohydrate, by difference    0
Copper, Cu                     4
Iron, Fe                       4
Magnesium, Mg                  4
Manganese, Mn                  4
Phosphorus, P                  4
Potassium, K                   4
Protein                        0
Total lipid (fat)              0
Water                          0
Zinc, Zn                       4
protein_value                  0
carbohydrate_value             0
fat_value                      0
calculated_energy              0
dtype: int64

In [67]:
# Replace every remaining missing value with 0
foods = foods.fillna(value=0)

## To do

### Blog Post
- Communicate findings
- Title and image
