In [49]:
import altair as alt
import pandas as pd
from vega_datasets import data

choco = pd.read_csv('./flavors_of_cacao.csv')

choco.head()

Unnamed: 0,Company \n(Maker-if known),Specific Bean Origin\nor Bar Name,REF,Review\nDate,Cocoa\nPercent,Company\nLocation,Rating,Bean\nType,Broad Bean\nOrigin
0,A. Morin,Agua Grande,1876,2016,63%,France,3.75,,Sao Tome
1,A. Morin,Kpime,1676,2015,70%,France,2.75,,Togo
2,A. Morin,Atsane,1676,2015,70%,France,3.0,,Togo
3,A. Morin,Akata,1680,2015,70%,France,3.5,,Togo
4,A. Morin,Quilla,1704,2015,70%,France,3.5,,Peru


In [50]:
# rename the columns
original_colnames = choco.columns
new_colnames = ['company', 'species', 'REF', 'review_year', 'cocoa_p',
                'company_location', 'rating', 'bean_typ', 'country']
choco = choco.rename(columns=dict(zip(original_colnames, new_colnames)))
## And modify data types
choco['cocoa_p'] = choco['cocoa_p'].str.replace('%','').astype(float)/100
choco["species"] = choco["species"].str.split(",").str[0]
choco.head()

Unnamed: 0,company,species,REF,review_year,cocoa_p,company_location,rating,bean_typ,country
0,A. Morin,Agua Grande,1876,2016,0.63,France,3.75,,Sao Tome
1,A. Morin,Kpime,1676,2015,0.7,France,2.75,,Togo
2,A. Morin,Atsane,1676,2015,0.7,France,3.0,,Togo
3,A. Morin,Akata,1680,2015,0.7,France,3.5,,Togo
4,A. Morin,Quilla,1704,2015,0.7,France,3.5,,Peru


## LO1: The user will recognize important cocoa production areas worldwide
This visualization aims at presenting the important cocoa production country worldwide. It uses the data of chocolate bars production, selects the species of cocoa beans and their origin country, and use it as the country of origin for cocoa production. It is plotted in format of a choropleth map on a world map, where deeper color represents greater cocoa production. It presents the geographical distribution of cocoa prodcution in a intuitive way, as deep color and large, connected area will give user an impression that the area on map is significant in cocoa production. The color scheme is selected so that it fits the impression of cocoa and chocolate. This learning objective can be evaluated by testing users how they recognize the geographical distribution of cocoa production areas.

In [51]:
# a simplified data frame of country of origin of cocoa
species = pd.DataFrame(choco['species'].value_counts().reset_index())
species.columns=['name', 'count']
species['id']=range(0, len(species))
species

Unnamed: 0,name,count,id
0,Madagascar,68,0
1,Ecuador,53,1
2,Peru,50,2
3,Dominican Republic,45,3
4,Porcelana,32,4
...,...,...,...
677,Medagla,1,677
678,Barba,1,678
679,Bellavista Coop,1,679
680,Palo Blanco w/ panela,1,680


Following is an alternative visualization of the map plot. It shows the country name and its production more precisely, but not as intuitive as the map plot.

In [52]:
countries = alt.topo_feature(data.world_110m.url, 'countries')

colors = alt.Chart(countries).mark_geoshape().encode(
    color=alt.Color('count:Q', scale=alt.Scale(scheme="browns"), legend=None,),
    tooltip=["name:N"]
).transform_lookup(
    lookup='id',
    from_=alt.LookupData(species, 'id', list(species.columns))
).properties(
    width=800,
    height=600
).project(
    type="equirectangular"
)
background = alt.Chart(countries).mark_geoshape(fill="linen").properties(
    width=800,
    height=600
).project(
    type="equirectangular"
)
background + colors

In [53]:
alt.Chart(species).mark_bar().encode(
    x=alt.X('name:N',sort=alt.EncodingSortField(
            field="count",order="descending"), title="Country of origin of cocoa"),
    y=alt.Y('count:Q'),
    color=alt.Color("count:Q", scale=alt.Scale(scheme="browns"))
).transform_window(
    rank='rank(count)',
    sort=[alt.SortField('count', order='descending')]
).transform_filter(
    (alt.datum.rank < 30)
)

In [54]:
cocoa = pd.read_csv('./Flavors_of_Cacao2022.csv')
cocoa.head()

Unnamed: 0,REF,Company (Manufacturer),Company Location,Review Date,Country of Bean Origin,Specific Bean Origin or Bar Name,Cocoa Percent,Ingredients,Most Memorable Characteristics,Rating
0,999,Bahen & Co.,Australia,2012,Blend,Houseblend,0.7,"2- B,S","chalky, fragrant, then off",2.5
1,999,Bahen & Co.,Australia,2012,Brazil,Bahia,0.7,"2- B,S","chalky, intense, chemical",2.5
2,999,Mast Brothers,U.S.A.,2012,Venezuela,Chuao,0.73,"2- B,S","gritty, sour, sweet",2.75
3,999,Mast Brothers,U.S.A.,2012,Blend,Madagascar,0.72,"2- B,S","gritty, overly tart, sour",2.5
4,995,Bahen & Co.,Australia,2012,Madagascar,Sambirano,0.7,"2- B,S","unrefined, flat, grassy",3.0


In [55]:
import pycountry

def lookup(name):
    try:
        return pycountry.countries.lookup(name).numeric
    except LookupError:
        return None


cocoa_species = pd.DataFrame(cocoa['Country of Bean Origin'].value_counts().reset_index())
cocoa_species = cocoa_species.replace('U.S.A.', 'USA')
cocoa_species.columns=['name', 'count']
cocoa_species['id']=range(0, len(cocoa_species))
cocoa_species['uid']=cocoa_species['name'].apply(lambda name : lookup(name))

cocoa_species

Unnamed: 0,name,count,id,uid
0,Venezuela,254,0,862
1,Peru,248,1,604
2,Dominican Republic,234,2,214
3,Ecuador,223,3,218
4,Madagascar,184,4,450
...,...,...,...,...
58,Principe,1,58,
59,Suriname,1,59,740
60,Martinique,1,60,474
61,Gabon,1,61,266


In [56]:
country = alt.topo_feature(data.world_110m.url, 'countries')
brush = alt.selection_interval(encodings=['y'])
opacityCond = alt.condition(brush,alt.value(1),alt.value(0.6))
colorCond = alt.condition(brush,alt.value(1),alt.value(0.6))
color2 = alt.Chart(country).mark_geoshape().encode(
    #color=alt.Color('count:Q', scale=alt.Scale(scheme="browns"), legend=None,),
    color = alt.condition(brush,
                      alt.Color('count:Q', scale=alt.Scale(scheme="browns"), legend=None),
                      alt.value('linen')),
    tooltip=[
            alt.Tooltip("name:N", title="Country"),
            alt.Tooltip("count:Q", title="Count"),
        ],
).transform_lookup(
    lookup='id',
    from_=alt.LookupData(cocoa_species, 'uid', list(cocoa_species.columns))
).project("naturalEarth1").properties(width=600, height=400)
background2 = alt.Chart(country).mark_geoshape(fill="linen").project("naturalEarth1").properties(width=600, height=400)


bars = alt.Chart(cocoa_species).mark_bar().add_selection(brush).encode(
    y=alt.Y('name:N',sort=alt.EncodingSortField(
            field="count",order="descending"), title="Country of origin of cocoa"),
    x=alt.X('count:Q'),
    color=alt.Color("count:Q", scale=alt.Scale(scheme="browns")),
    opacity=opacityCond, 
).transform_window(
    rank='rank(count)',
    sort=[alt.SortField('count', order='descending')]
).transform_filter(
    (alt.datum.rank < 30)
).properties(width=200, height=400)
background2+color2 | bars

In [57]:
import altair as alt
import janitor
import pandas as pd
from vega_datasets import data

CASES_WORLDWIDE = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/web-data/data/cases_country.csv"


def get_worldwide_cases(url: str = CASES_WORLDWIDE):
    cases = pd.read_csv(url)
    cleaned = cases.clean_names(
    ).rename_column(
        "long_", "lon"
    ).transform_column(
        "last_update", lambda x: pd.to_datetime(x).normalize()
    )
    return cleaned


world_source = get_worldwide_cases()

source = alt.topo_feature(data.world_110m.url, "countries")

background = alt.Chart(source).mark_geoshape(fill="white")

foreground = (
    alt.Chart(source).mark_geoshape(
        stroke="black", strokeWidth=0.15
    ).encode(
        color=alt.Color(
            "incident_rate:N", scale=alt.Scale(scheme="lightgreyred"), legend=None,
        ),
        tooltip=[
            alt.Tooltip("country_region:N", title="Country"),
            alt.Tooltip("incident_rate:Q", title="Cases pr. 100k"),
        ],
    ).transform_lookup(
        lookup="id",
        from_=alt.LookupData(world_source, "uid", ["incident_rate", "country_region"]),
    )
)

chart = (
    (background + foreground)
    .configure_view(strokeWidth=0)
    .properties(width=700, height=400)
    .project("naturalEarth1")
)

chart


## LO2: The user will summarize the preferance of chocolate consumption by country
This visualization aims at presenting the preferance of chocolate type by cocoa percent, grouped by country. It uses count of chocolate bar production to represents the preference, grouped by country and cocoa percent. As this visualization is targeted at informing user the preferance between chocolate types, and how the preferance varies between country, the final result is normalized to percentage of total production. The color of stacked bars implicitly indicates the type of chocolate, makes the visualization more intuitive. This learning objective can be evaluated by asking users what type of chocolate is favored by each country.

In [58]:
countries2 = pd.DataFrame(choco['company_location'].value_counts().reset_index())
countries2.columns=['name', 'count']
countries2 = countries2[countries2['count'] >= 40]
countries2

Unnamed: 0,name,count
0,U.S.A.,764
1,France,156
2,Canada,125
3,U.K.,96
4,Italy,63
5,Ecuador,54
6,Australia,49
7,Belgium,40


Following is an old version of this visualization. It uses scatter points to show the production of each type of chocolate by country, and its production amount. It can be seen that USA has much larger production than other countries, but I think comparation between chocolate types is more emphasized in this visualization than production amount, so I normalized it and changed to a bar chart.

In [59]:
# normalized to percentage
country_options = countries2['name'].tolist()
selections2 = alt.binding_select(options=country_options, name="Show: ")
buttons2 = alt.selection_single(fields=['company_location'], bind=selections2, 
                                init={'company_location': "U.S.A."},)
vis2 = alt.Chart(choco).mark_bar().transform_joinaggregate(
        groupby=["company_location"],
        location_count="count()"
    ).transform_filter(
        alt.datum.location_count>=40
    ).transform_bin(
        "cocoa_p_bin", field="cocoa_p", bin=alt.Bin(step=0.1)
    ).transform_joinaggregate(
        groupby=["company_location", "cocoa_p_bin"],
        cocoa_location_count="count()"
    ).encode(
        x=alt.X("location_count:Q", 
                stack="normalize",
                axis=alt.Axis(format='%'), 
                title="percentage"),
        y=alt.Y("company_location:N", title=None),
        color=alt.Color("cocoa_p:Q", bin=True, 
                        scale=alt.Scale(scheme=alt.SchemeParams(name="browns", extent=[0, 2])), 
                        title="Cocoa percent"),
        tooltip=[alt.Tooltip("location_count:Q", title="Total Count"), alt.Tooltip("cocoa_location_count:Q", title="Count at this cocoa percent")]
    ).properties(width=600)

vis2.add_selection(buttons2)

In [1]:
country_options = countries2['name'].tolist()
selections2 = alt.binding_select(options=country_options, name="Show: ")
buttons2 = alt.selection_single(fields=['company_location'], bind=selections2, 
                                init={'company_location': "U.S.A."},)
vis2 = alt.Chart(choco).mark_bar().transform_filter(
        buttons2
    ).encode(
        x=alt.X("cocoa_p:Q", bin=True,title="Cocoa percent"),
        y=alt.Y("count(cocoa_p):Q", title="Count"),
        color=alt.Color("cocoa_p:Q", bin=True, 
                        scale=alt.Scale(scheme=alt.SchemeParams(name="browns", extent=[0, 2])), 
                        title="Cocoa percent"),
    ).properties(width=600)
vis2.add_selection(buttons2)

NameError: name 'countries2' is not defined

In [61]:
import altair as alt
import pandas as pd
import numpy as np

rand = np.random.RandomState(42)

df = pd.DataFrame({
    'xval': range(100),
    'yval': rand.randn(100).cumsum()
})

slider = alt.binding_range(min=0, max=100, step=1, name='cutoff:')
selector = alt.selection_single(name="SelectorName", fields=['cutoff'],
                                bind=slider, init={'cutoff': 50})

alt.Chart(df).mark_point().encode(
    x='xval',
    y='yval',
    color=alt.condition(
        alt.datum.xval < selector.cutoff,
        alt.value('red'), alt.value('blue')
    )
).add_selection(
    selector
)

In [62]:
# relationship between production and cocoa percentage, group by country
alt.Chart(choco).mark_point().transform_joinaggregate(
    groupby=["company_location"],
    location_count="count()"
).transform_filter(
    alt.datum.location_count>=40
).encode(
    x=alt.X("cocoa_p:Q", bin=True),
    y="count()",
    color="company_location:N"
)

## LO3: The user will be able to infer the effect of chocolate on health
This visualization will compare the health data of 45 participants that accepts a clinical trial over a duration of four weeks. The participants are given 70% cocoa chocolate with acids (ursolic acids and oleanolic acid) from a Brazilian plant, Mansoa Hirsuta,and the goal of the test was to determine whether Mansoa Hirsuta's antioxidant, anti-inflammatory, antifungal, and antibiotic properties improve health.
The visualization is in format of bar chart, grouped by health parameters and put the before-after bars side to side, so that the change before and after the clinical trial can be identified clearly, and the effect of chocolate on health can be inferred based on the observation. This learning objective can be evaluated by asking users whether there is evidence that chocolate has an effect on health.

In [63]:
filepath = './BMI weight and waist circumference of participants.csv'
health = pd.read_csv(filepath)

health

Unnamed: 0,variable,group,trial,mean,standard deviation
0,Weight (kg),Test,Before,70.07,15.99
1,BMI (kg/m^2),Test,Before,25.42,4.62
2,Waist Circumference (cm),Test,Before,92.13,14.43
3,Weight (kg),Test,After,68.87,15.04
4,BMI (kg/m^2),Test,After,24.99,4.28
5,Waist Circumference (cm),Test,After,88.73,13.26
6,Weight (kg),Placebo,Before,73.73,17.7
7,BMI (kg/m^2),Placebo,Before,26.12,4.87
8,Waist Circumference (cm),Placebo,Before,93.8,13.45
9,Weight (kg),Placebo,After,75.4,18.23


In [64]:
weight = alt.Chart(health).mark_bar().transform_filter(
    alt.datum.variable=="Weight (kg)"
    #alt.datum.trial=="Before"
).encode(
    x=alt.X("trial:N", title=None),
    y=alt.Y("mean:Q"),
    color=alt.Color("group:N"),
    #column=alt.Column("trial", sort=["Before", "After"]),
    #column=alt.Column("variable", sort=["Weight (kg)", "BMI (kg/m^2)", "Waist Circumference (cm)"])
    column=alt.Column("group", sort=["Test", "Placebo", "Control"], title="Weight (kg)")
)
bmi = alt.Chart(health).mark_bar().transform_filter(
    alt.datum.variable=="BMI (kg/m^2)"
    #alt.datum.trial=="Before"
).encode(
    x=alt.X("trial:N", title=None),
    y=alt.Y("mean:Q"),
    color=alt.Color("group:N"),
    #column=alt.Column("trial", sort=["Before", "After"]),
    #column=alt.Column("variable", sort=["Weight (kg)", "BMI (kg/m^2)", "Waist Circumference (cm)"])
    column=alt.Column("group", sort=["Test", "Placebo", "Control"], title="BMI (kg/m^2)	")
)
waist = alt.Chart(health).mark_bar().transform_filter(
    alt.datum.variable=="Waist Circumference (cm)"
    #alt.datum.trial=="Before"
).encode(
    x=alt.X("trial:N", title=None),
    y=alt.Y("mean:Q"),
    color=alt.Color("group:N"),
    #column=alt.Column("trial", sort=["Before", "After"]),
    #column=alt.Column("variable", sort=["Weight (kg)", "BMI (kg/m^2)", "Waist Circumference (cm)"])
    column=alt.Column("group", sort=["Test", "Placebo", "Control"], title="Waist Circumference (cm)")
)
#weight | bmi | waist
options3 = ["Weight (kg)", "BMI (kg/m^2)", "Waist Circumference (cm)"]
buttons3 = alt.selection_single(
    fields=["variable"],
    bind=alt.binding_radio(options=options3, labels=["Weight", "BMI", "Waist Circumference"], name="Parameter: "),)
vis3 = alt.Chart(health).mark_bar().transform_filter(
    buttons3
).encode(
    x=alt.X("trial:N", title=None),
    y=alt.Y("mean:Q"),
    color=alt.Color("group:N"),
    #column=alt.Column("trial", sort=["Before", "After"]),
    #column=alt.Column("variable", sort=["Weight (kg)", "BMI (kg/m^2)", "Waist Circumference (cm)"])
    column=alt.Column("group", sort=["Test", "Placebo", "Control"], title="Waist Circumference (cm)")
)
vis3.add_selection(buttons3)

## LO4: The user will summary the nutrition facts of chocolate
This visualization uses the data of nutrient facts of chocolate bars produced by different manufactors, and use a boxplot to present the calories and fat contained every 100 grams. A boxplot presents maximum, minimum, average and quaters value, gives users a summary of nutrient facts of chocolates. This learning objective can be evaluated by how much energy they would intake if they eat a bar of chocolate.

In [65]:
nutrient = pd.read_csv("chocolates.csv")
nutrient.head()

Unnamed: 0,Name,MFR,Country,Type,Calories,CalFat,TotFat,SatFat,Chol,Na,Carbs,Fiber,Sugars,Protein
0,Noir 86% Cacao,Cote D'Or,Belgium,Dark,460.0,380.0,44.0,26.0,0.0,0.0,28.0,10.0,10.0,8.0
1,70% Cocoa,Cote D'Or,Belgium,Dark,600.0,432.0,48.0,30.0,10.0,0.0,42.0,10.0,28.0,8.0
2,Solid Dark Chocolate Bar,Godiva,Belgium,Dark,534.883721,302.325581,32.55814,20.930233,0.0,0.0,60.465116,9.302326,46.511628,6.976744
3,72% Extra Dark Chocolate,Godiva,Belgium,Dark,534.883721,348.837209,39.534884,20.930233,0.0,81.395349,41.860465,13.953488,27.906977,9.302326
4,Extra Dark Chocolate Bar,Guylian,Belgium,Dark,575.757576,333.333333,39.393939,24.242424,0.0,90.909091,48.484848,12.121212,33.333333,9.090909


In [66]:
cal = alt.Chart(nutrient).mark_boxplot().encode(
    x=alt.X("Type:N"),
    y=alt.Y("Calories:Q"),
    color=alt.Color("Type:N", scale=alt.Scale(scheme="browns"), sort=["Milk", "Dark"])
)
calfat = alt.Chart(nutrient).mark_boxplot().encode(
    x=alt.X("Type:N"),
    y=alt.Y("CalFat:Q", title="Calories from fat"),
    color=alt.Color("Type:N", scale=alt.Scale(scheme="browns"), sort=["Milk", "Dark"])
)
totfat = alt.Chart(nutrient).mark_boxplot().encode(
    x=alt.X("Type:N"),
    y=alt.Y("TotFat:Q", title="Total Fat(g)"),
    color=alt.Color("Type:N", scale=alt.Scale(scheme="browns"), sort=["Milk", "Dark"])
)
satfat = alt.Chart(nutrient).mark_boxplot().encode(
    x=alt.X("Type:N"),
    y=alt.Y("SatFat:Q", title="Saturated Fat(g)"),
    color=alt.Color("Type:N", scale=alt.Scale(scheme="browns"), sort=["Milk", "Dark"])
)
cal | calfat | totfat | satfat

In [69]:
import altair as alt

import pandas as pd

make = pd.DataFrame({'name': ['Honda', 'Ford', 'Dodge']})
fuel = pd.DataFrame({
    'Honda': [9, 8, 8, 7, 7],
    'Ford': [5, 4, 3, 2, 1],
    'Dodge': [6, 5, 5, 3, 4]
}).reset_index().melt(id_vars=['index'], var_name='name', value_name='fuel')

selection = alt.selection_multi(fields=['name'])
color = alt.condition(selection, alt.Color('name:N'), alt.value('lightgray'))
make_selector = alt.Chart(make).mark_rect().encode(y='name', color=color).add_selection(selection)
fuel_chart = alt.Chart(fuel).mark_line().encode(x='index', y=alt.Y('fuel', scale=alt.Scale(domain=[0, 10])), color='name').transform_filter(selection)

make_selector | fuel_chart

There are also scatter points version of this visualization, which enables users to see what brand of chocolate has most and least calories. Since it is not very related to contents of wikipedia article, it is not used in the article.

In [70]:
brands = nutrient['MFR'].drop_duplicates().tolist()
selection4 = alt.binding_select(options=brands, name="Show: ")
buttons4 = alt.selection_multi(fields=['MFR'], bind=selection4, )
                                #init={'MFR': "Bendicks"},)
dark_points = alt.Chart(nutrient).transform_filter(
    buttons4
).mark_point(color="brown", opacity=0.5).encode(
    x=alt.X("MFR:N"),
    y=alt.Y("Calories:Q", scale=alt.Scale(domain=[200, 800])),
    tooltip=['Name','MFR','Country','Calories']
).properties(width=600)
dark_texts = alt.Chart(nutrient).transform_filter(
    (alt.datum.Type == "Dark") & ((alt.datum.Calories > 650) | (alt.datum.Calories < 450))
).mark_text(dy=-10).encode(
    x=alt.X("MFR:N"),
    y=alt.Y("Calories:Q"),
    text=alt.Text("Name")
)
dark_points.add_selection(buttons4)

SchemaValidationError: Invalid specification

        altair.vegalite.v4.schema.core.SelectionDef->1->bind, validating 'anyOf'

        {'input': 'select', 'options': ["Cote D'Or", 'Godiva', 'Guylian', 'Nacional De Chocolates', 'Merci', 'Poulain', "President's Choice", 'Ritter Sport', 'Choceur', 'Lindt', 'Nestle', 'Toblerone', 'Bendicks', 'Cadbury', 'Celtic', "Green & Black's", "Sainsbury's", 'Thorntons', 'Chocolove', 'Dagoba', 'Dove', 'Ghiradelli', "Hershey's", 'Mars', 'Meijer', 'Scharffen Berger', 'Starbucks', "Trader Joe's", 'Milka', 'Jet'], 'name': 'Show: '} is not valid under any of the given schemas
        

In [77]:
types = nutrient["Type"].drop_duplicates().tolist()
selection4 = alt.binding_select(options=types, name="Chocolate type: ")
buttons4 = alt.selection_single(fields=['Type'], bind=selection4, )
milk_points = alt.Chart(nutrient).transform_filter(
    buttons4
).mark_point(opacity=0.5).encode(
    x=alt.X("MFR:N"),
    y=alt.Y("Calories:Q", scale=alt.Scale(domain=[200, 800])),
    color="Type",
    tooltip=['Name','MFR','Country','Calories']
).properties(width=600)
# milk_texts = alt.Chart(nutrient).transform_filter(
#     (alt.datum.Type == "Milk") & ((alt.datum.Calories > 600) | (alt.datum.Calories < 490))
# ).mark_text(dy=-10).encode(
#     x=alt.X("MFR:N"),
#     y=alt.Y("Calories:Q"),
#     text=alt.Text("Name")
# )
milk_points.add_selection(buttons4)

## Datasets source:
 1. Chocolate Bar Ratings. https://www.kaggle.com/datasets/rtatman/chocolate-bar-ratings
 2. Cocoa chocolate and health. https://www.kaggle.com/datasets/lameesmohammad/unique-70-cocoa-chocolate-improves-health
 3. chocolates.csv. https://github.com/schloerke/cranvasOLD/blob/master/files/data/chocolates.csv