## 1. Import Libraries and Load Data

In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import ipywidgets as widgets
from IPython.display import display
import warnings
# NOTE(review): this silences every warning category from this point on,
# deprecation notices included.
warnings.filterwarnings(action='ignore')

# Pandas display settings: no cap on displayed columns, floats rendered with 2 decimals
display_options = {
    'display.max_columns': None,
    'display.float_format': '{:.2f}'.format,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

print("Libraries imported successfully!")

Libraries imported successfully!


In [2]:
# Load the dataset
# NOTE(review): the file name ends in '.csv.csv' — confirm it matches the file on disk.
df = pd.read_csv('../../data/processed/cleaned-diet-compositions.csv.csv')

# Derive an integer calendar year in 'Year_Int'.
# The sample showed '1961-01-01', so anything that is not already numeric is
# parsed as a date. Checking for "numeric" rather than for the 'object' dtype
# keeps this working when pandas loads text as a dedicated string dtype or when
# the column is already datetime.
if 'year' not in df.columns:
    raise KeyError("Expected a 'year' column in the dataset")

if pd.api.types.is_numeric_dtype(df['year']):
    df['Year_Int'] = df['year']
else:
    df['Year_Int'] = pd.to_datetime(df['year']).dt.year

# Display first few rows
print(f"Dataset Shape: {df.shape[0]:,} rows × {df.shape[1]} columns")
print("\n" + "="*60)
print("First 5 rows of the dataset:")
print("="*60)
df.head()

Dataset Shape: 8,154 rows × 13 columns

First 5 rows of the dataset:


Unnamed: 0,entity,year,cereals_and_grains,pulses,starchy_roots,sugar,oils_fats,meat,dairy_eggs,fruit_and_vegetables,other,alcoholic_beverages,Year_Int
0,Afghanistan,1961-01-01,2060,16.0,25,51,92.0,88.0,102.0,82.0,13,0.0,1961
1,Afghanistan,1962-01-01,2060,17.0,22,45,98.0,88.0,101.0,76.0,12,0.0,1962
2,Afghanistan,1963-01-01,2060,17.0,23,47,106.0,91.0,110.0,79.0,13,0.0,1963
3,Afghanistan,1964-01-01,2060,18.0,24,55,102.0,93.0,110.0,95.0,11,0.0,1964
4,Afghanistan,1965-01-01,2060,18.0,24,57,105.0,95.0,118.0,95.0,13,0.0,1965


## 2. Data Structure and Quality Assessment

In [3]:
# Basic information
print("="*60)
print("DATASET INFORMATION")
print("="*60)
print(f"• Countries: {df['entity'].nunique()}")
print(f"• Year Range: {df['Year_Int'].min()} - {df['Year_Int'].max()}")
print(f"• Food Groups: {[col for col in df.columns if col not in ['entity', 'year', 'Year_Int']]}")

# Missing values: count nulls per column once, then show only the columns that have any
print("\nMISSING VALUES")
missing_counts = df.isnull().sum()
print(missing_counts[missing_counts > 0])

DATASET INFORMATION
• Countries: 173
• Year Range: 1961 - 2013
• Food Groups: ['cereals_and_grains', 'pulses', 'starchy_roots', 'sugar', 'oils_fats', 'meat', 'dairy_eggs', 'fruit_and_vegetables', 'other', 'alcoholic_beverages']

MISSING VALUES
Series([], dtype: int64)


## 3. Global Dietary Trends

In [4]:
# Every column that is not an identifier/time column is treated as a food group
id_cols = ['entity', 'year', 'Year_Int']
food_cols = df.columns[~df.columns.isin(id_cols)].tolist()

# Per-year mean of each food group across all entities (plain, unweighted mean)
yearly_groups = df.groupby('Year_Int')
global_trends = yearly_groups[food_cols].mean().reset_index()

# Long format for plotting: one row per (year, food group) pair
global_melted = pd.melt(
    global_trends,
    id_vars='Year_Int',
    var_name='Food Group',
    value_name='Consumption',
)

# Line chart: one line per food group
fig = px.line(
    data_frame=global_melted,
    x='Year_Int',
    y='Consumption',
    color='Food Group',
    title='Global Average Consumption of Food Groups Over Time',
    labels={'Year_Int': 'Year', 'Consumption': 'Average Consumption (kcal/capita/day or similar)'},
).update_layout(height=600, hovermode='x unified')
fig.show()

In [5]:
# Stacked area view of the yearly means: one band per food group
fig = px.area(
    data_frame=global_melted,
    x='Year_Int',
    y='Consumption',
    color='Food Group',
    title='Evolution of Global Diet Composition',
    labels={'Year_Int': 'Year', 'Consumption': 'Consumption'},
).update_layout(height=600)
fig.show()

## 4. Country-Specific Analysis

In [6]:
# Interactive Country Explorer
countries = sorted(df['entity'].unique())

country_dropdown = widgets.Dropdown(
    options=countries,
    value='United States' if 'United States' in countries else countries[0],
    description='Country:',
    style={'description_width': 'initial'}
)

output = widgets.Output()


def _build_country_figure(country):
    """Build the two-panel dietary figure for one country.

    Left panel: one line per food group over time. Right panel: pie chart of
    the food-group values in the country's most recent year.

    Args:
        country: Value to match against the 'entity' column of ``df``.

    Returns:
        A plotly Figure, or None when ``df`` has no rows for ``country``.
    """
    # Sort chronologically so each line is drawn left to right even if the
    # source rows are not in year order; the stable sort keeps the original
    # relative order of any rows that share a year.
    country_data = df[df['entity'] == country].sort_values('Year_Int', kind='stable')
    if country_data.empty:
        return None

    latest_year = country_data['Year_Int'].max()

    # Left cell holds cartesian traces, right cell holds the pie ("domain" type)
    fig = make_subplots(rows=1, cols=2, specs=[[{"type": "xy"}, {"type": "domain"}]],
                        subplot_titles=(f'Dietary Trends in {country}',
                                        f'Diet Composition in {latest_year}'))

    # Line chart: read each food-group column directly instead of melting the
    # frame and filtering it once per food group
    for food in food_cols:
        fig.add_trace(go.Scatter(x=country_data['Year_Int'], y=country_data[food],
                                 name=food, mode='lines'), row=1, col=1)

    # Pie chart for the latest year (first matching row)
    latest_values = country_data.loc[country_data['Year_Int'] == latest_year, food_cols].iloc[0]
    fig.add_trace(go.Pie(labels=latest_values.index, values=latest_values.values,
                         name='Composition'), row=1, col=2)

    fig.update_layout(height=500, title_text=f'Dietary Analysis: {country}')
    return fig


def update_country_plot(change):
    """Redraw the country figure inside ``output``.

    Args:
        change: Mapping whose 'new' entry is the selected country. This is the
            change dict passed by ``Dropdown.observe``; the initial draw below
            passes a minimal dict with the same key.
    """
    with output:
        output.clear_output(wait=True)
        country = change['new']
        fig = _build_country_figure(country)
        if fig is None:
            print(f'No data available for {country}')
        else:
            fig.show()


country_dropdown.observe(update_country_plot, names='value')

display(widgets.VBox([widgets.HTML('<h3>🌍 Explore Country Diets</h3>'),
                      country_dropdown, output]))

# Trigger initial plot
update_country_plot({'new': country_dropdown.value})

VBox(children=(HTML(value='<h3>üåç Explore Country Diets</h3>'), Dropdown(description='Country:', index=164, opt‚Ä¶

## 5. Comparative Analysis

In [7]:
# Top consumers of specific food groups (Latest Year)
TOP_N = 15  # number of bars shown; also used in the chart title

latest_year = df['Year_Int'].max()
# Only entities that have a row in the dataset's final year appear in the ranking
latest_df = df[df['Year_Int'] == latest_year]

food_dropdown = widgets.Dropdown(
    options=food_cols,
    # Fall back to the first food group when 'meat' is not among the columns;
    # a value outside `options` would make the Dropdown constructor raise.
    value='meat' if 'meat' in food_cols else food_cols[0],
    description='Food Group:',
)

top_output = widgets.Output()


def update_top_consumers(change):
    """Redraw the ranking bar chart inside ``top_output``.

    Args:
        change: Mapping whose 'new' entry is the selected food-group column
            name. This is the change dict passed by ``Dropdown.observe``; the
            initial draw below passes a minimal dict with the same key.
    """
    with top_output:
        top_output.clear_output(wait=True)
        food = change['new']

        top_entities = latest_df.nlargest(TOP_N, food)

        fig = px.bar(top_entities, x=food, y='entity', orientation='h',
                     title=f'Top {TOP_N} Countries by {food} Consumption ({latest_year})',
                     color=food, color_continuous_scale='Viridis')
        # 'total ascending' puts the largest bar at the top of a horizontal chart
        fig.update_layout(yaxis={'categoryorder':'total ascending'}, height=600)
        fig.show()


food_dropdown.observe(update_top_consumers, names='value')

display(widgets.VBox([widgets.HTML('<h3>🏆 Top Consumers by Food Group</h3>'),
                      food_dropdown, top_output]))

update_top_consumers({'new': food_dropdown.value})

VBox(children=(HTML(value='<h3>üèÜ Top Consumers by Food Group</h3>'), Dropdown(description='Food Group:', index‚Ä¶

## 6. Correlation Analysis

In [8]:
# Correlation heatmap between food groups
# Pairwise correlation (pandas default method: Pearson) over all rows of df
corr_matrix = df[food_cols].corr()

# text_auto='.2f' prints each coefficient rounded to two decimals so the cell
# labels stay readable; zmin/zmax pin the diverging colour scale to [-1, 1] so
# that zero sits at its midpoint.
fig = px.imshow(corr_matrix, text_auto='.2f', aspect='auto', color_continuous_scale='RdBu_r',
                title='Correlation Matrix of Food Groups',
                zmin=-1, zmax=1)
fig.update_layout(height=600)
fig.show()

---
## Summary of Findings

### Key Observations
1. **Global Trends**: *TODO — confirm after running.* Expected pattern: an increase in meat and vegetable oil consumption globally.
2. **Dietary Shifts**: *TODO — confirm after running.* Expected pattern: a shift from traditional staples (cereals/roots) toward more diverse diets in developing nations.
3. **Regional Differences**: *TODO — confirm after running.* Expected pattern: distinct dietary profiles for Western, Asian, and African countries.

### Implications
- **Health**: High sugar/fat consumption correlates with NCDs (Non-Communicable Diseases).
- **Environment**: Shift towards meat-heavy diets has higher environmental impact.

### Next Steps
- Correlate with Obesity and Diabetes datasets.
- Analyze impact of GDP on diet composition.