# Data Analysis

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import os

## Introduction

- CRISP-DM problem
- The dataset
- Business Questions

## Python ETL

### Loading data

In [76]:
# Directory containing the CSV files (resolved relative to the working directory)
folder = 'FoodData_Central_foundation_food_csv_2025-04-24'


def _load_table(filename):
    """Read one CSV file from `folder` with every column typed as string."""
    return pd.read_csv(os.path.join(folder, filename), dtype=str)


# Foods and the food-category table
foods = _load_table('food.csv')
category = _load_table('food_category.csv')

# Nutrient definitions and the per-food nutrient rows
nutrients = _load_table('nutrient.csv')
food_nutrients = _load_table('food_nutrient.csv')

# Conversion-factor tables
food_nutrient_conversion_factor = _load_table('food_nutrient_conversion_factor.csv')
calorie_factor = _load_table('food_calorie_conversion_factor.csv')
food_protein_conversion_factor = _load_table('food_protein_conversion_factor.csv')

# Attribute tables are currently not loaded:
# food_attribute = pd.read_csv(os.path.join(folder, 'food_attribute.csv'), dtype=str)
# food_attribute_type = pd.read_csv(os.path.join(folder, 'food_attribute_type.csv'), dtype=str)

### Data Transformations

In [81]:
# Data preparation: give generic columns explicit names and make `amount` numeric.

# 'id' / 'name' become 'nutrient_id' / 'nutrient_name'
nutrient_columns = {'id': 'nutrient_id', 'name': 'nutrient_name'}
nutrients = nutrients.rename(columns=nutrient_columns)

# 'id' / 'description' become 'food_category_id' / 'food_category'
category_columns = {'id': 'food_category_id', 'description': 'food_category'}
category = category.rename(columns=category_columns)

# Keep only the food identifier, its description and its category key
foods = foods.loc[:, ['fdc_id', 'description', 'food_category_id']]

# Convert amounts to numbers; values that cannot be parsed become NaN (errors='coerce')
food_nutrients['amount'] = pd.to_numeric(food_nutrients['amount'], errors='coerce')

In [82]:
# Attach the nutrient columns (including nutrient_name) to every food_nutrients row
food_nutrients = pd.merge(food_nutrients, nutrients, how='left', on='nutrient_id')

# Count rows per nutrient_id and keep the ids of the 10 most frequent nutrients
nutrient_counts = food_nutrients.groupby('nutrient_id')['fdc_id'].count()
top_nutrients = nutrient_counts.sort_values(ascending=False).head(10).index

# Nutrients that must be kept even when they are not among the 10 most frequent
target_names = ['Carbohydrates', 'Carbohydrate, by difference', 'Carbohydrate, other', 'Total lipid (fat)', 'Protein']
for name in target_names:
    name_matches = nutrients['nutrient_name'] == name
    matched_ids = nutrients.loc[name_matches, 'nutrient_id'].values
    if len(matched_ids) == 0:
        # No nutrient with this exact name: nothing to add
        continue
    if matched_ids[0] in top_nutrients:
        # Already selected
        continue
    # Only the first matching id is added
    top_nutrients = top_nutrients.append(pd.Index([matched_ids[0]]))

# Keep only rows for the selected nutrients (the 10 most frequent plus the forced ones)
is_selected = food_nutrients['nutrient_id'].isin(top_nutrients)
food_nutrients = food_nutrients.loc[is_selected]

# One row per food (fdc_id), one column per nutrient_name;
# several amounts for the same food/nutrient_name pair are averaged
nutrients_pivot = pd.pivot_table(
    food_nutrients,
    values='amount',
    index='fdc_id',
    columns='nutrient_name',
    aggfunc='mean',
)
nutrients_pivot = nutrients_pivot.reset_index()

#### Join relational tables

In [51]:
# Attach the category columns to each food; the left join keeps every food row
foods = pd.merge(foods, category, how='left', on='food_category_id')

# Attach the pivoted nutrient columns, matched on fdc_id; the left join keeps
# foods that have no row in nutrients_pivot
foods = pd.merge(foods, nutrients_pivot, how='left', on='fdc_id')

## To do

### Blog Post
- Communicate findings
- Title and image
