# Aggregated heatmap

Corpus × value: occurrence counts of each value, summed per corpus (country) and shown as a heatmap.

In [1]:
import json
import re
from glob import glob

import pandas as pd

# import colorcet as cc
from bokeh.io import output_notebook
from bokeh.models import LabelSet, ColumnDataSource, LinearColorMapper
from bokeh.plotting import figure, show, output_file

from corpora import code2country, corpora, country2code
from util import story_tokenize, collect_tokens, fname2name, stats
from create import tokenize_values, load_source, calc_occurences
from palettes import pal
from stemmers import stemmers

In [2]:
# Name of the value dictionary handed to tokenize_values().
# Alternatives that have been tried:
#   "values-flat"
#   "Refined_dictionary"      (from https://osf.io/vt8nf)
#   "Provisional_dictionary"  (from https://osf.io/vt8nf)
#   "mft_original"            (from https://github.com/medianeuroscience/emfd)
#   "mfd2"                    (from https://github.com/medianeuroscience/emfd)
fname = "values-edited"

# Key into `stemmers` selecting the tokenizer/stemmer; it also appears in the
# generated URLs and in the output file name.
# Other keys that have been tried: "sb2", "ps", "dummy", "wnl".
tkn = "sb"

In [3]:
# Tokenize the value dictionary `fname` with the tokenizer selected by `tkn`
# and report how many values came back.
values, values_backref = tokenize_values(tkn, fname=fname)
print("Values count: {}".format(len(values)))

Values count: 31


In [4]:
# Load every corpus using the stemmer registered under `tkn`.
# NOTE(review): presumably `fulltexts` holds the raw texts and `tokenized` their
# stemmed tokens, both grouped by country — confirm in create.load_source.
active_stemmer = stemmers[tkn]
fulltexts, tokenized = load_source(active_stemmer, corpora)

In [5]:
# stats() prints the per-country summary itself (tales / symbols / tokens) and
# also returns the same three dicts; the trailing semicolon stops the notebook
# from echoing them a second time as the cell result.
stats(fulltexts, tokenized);

tales: {'Italy': 30, 'Germany': 30, 'Portugal': 30}
symbols: {'Italy': 234158, 'Germany': 306475, 'Portugal': 231149}
tokens: {'Italy': 45223, 'Germany': 59500, 'Portugal': 44887}


({'Italy': 30, 'Germany': 30, 'Portugal': 30},
 {'Italy': 234158, 'Germany': 306475, 'Portugal': 231149},
 {'Italy': 45223, 'Germany': 59500, 'Portugal': 44887})

In [6]:
# Count how often each value occurs in the tokenized corpora.
occurences, occurences_tv, occurences_backref = calc_occurences(values, tokenized)

# Quick check: (sum of all counts in `occurences`, number of keys in `occurences_backref`).
sum(occurences.values()), len(occurences_backref)

(5085, 29)

In [7]:
def _occurrence_row(text_path, value, count):
    """Build one table row for a single (text, value) occurrence count.

    `text_path` is the first element of an `occurences` key; its leading
    "/"-separated component is used as the country. The returned list follows
    the column order country, text, value, count, label, url.
    """
    country = text_path.split("/")[0]
    return [
        country,
        f"{fname2name(text_path)} [{country2code[country]}]",
        value,
        count,
        str(count),  # the count again, as text
        f"/index.html#/{tkn}/{text_path}.html",
    ]


# One row per entry of `occurences`; only key[0] (text path) and key[1] (value) are used.
data = [_occurrence_row(k[0], k[1], v) for k, v in occurences.items()]

# Column names go to the constructor instead of being assigned to `df.columns`
# afterwards: the assignment raises a length-mismatch error when `data` is empty,
# whereas this yields an empty frame with the expected columns.
df = pd.DataFrame(
    data, columns=["country", "text", "value", "count", "label", "url"]
)

df.head()

Unnamed: 0,country,text,value,count,label,url
0,Italy,Don Firriulieddu [it],brother,12,12,/index.html#/sb/Italy/LXXVI_DON_FIRRIULIEDDU_....
1,Italy,Don Firriulieddu [it],love,1,1,/index.html#/sb/Italy/LXXVI_DON_FIRRIULIEDDU_....
2,Italy,Don Firriulieddu [it],pieti,3,3,/index.html#/sb/Italy/LXXVI_DON_FIRRIULIEDDU_....
3,Italy,Don Firriulieddu [it],mother,6,6,/index.html#/sb/Italy/LXXVI_DON_FIRRIULIEDDU_....
4,Italy,Little Chick Pea [it],justic,1,1,/index.html#/sb/Italy/LXXVII_LITTLE_CHICK_PEA_...


In [8]:
# Sum of `count` per (country, label) group. `label` is the per-row count as a
# string, so the groups sort lexicographically ("1", "10", "11", ...).
# The column is selected explicitly: the positional argument in `.sum("count")`
# is `numeric_only`, not a column name, and only worked because the string is truthy.
df.groupby(["country", "label"])[["count"]].sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,count
country,label,Unnamed: 2_level_1
Germany,1,89
Germany,10,80
Germany,11,33
Germany,12,36
Germany,13,39
...,...,...
Portugal,6,60
Portugal,64,64
Portugal,7,42
Portugal,8,56


In [9]:
# Total count per (country, value) pair. The column is selected explicitly:
# the positional argument in `.sum("count")` is `numeric_only`, not a column
# name, and only worked because the string is truthy.
df_agg = df.groupby(["country", "value"])[["count"]].sum()

# Mirror the two index levels as ordinary columns (the plotting cell refers to
# "country" and "value" by field name) and add the count as text for the labels.
df_agg["country"] = df_agg.index.get_level_values("country")
df_agg["value"] = df_agg.index.get_level_values("value")
df_agg["label"] = df_agg["count"].astype(str)
df_agg

Unnamed: 0_level_0,Unnamed: 1_level_0,count,country,value,label
country,value,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Germany,brother,108,Germany,brother,108
Germany,cooper,93,Germany,cooper,93
Germany,correct,3,Germany,correct,3
Germany,curios,7,Germany,curios,7
Germany,dialogu,2,Germany,dialogu,2
...,...,...,...,...,...
Portugal,right,6,Portugal,right,6
Portugal,smart,8,Portugal,smart,8
Portugal,solidar,3,Portugal,solidar,3
Portugal,toler,6,Portugal,toler,6


In [10]:
# Total count per country. The column is selected explicitly: the positional
# argument in `.sum("count")` is `numeric_only`, not a column name.
df.groupby(["country"])[["count"]].sum()

Unnamed: 0_level_0,count
country,Unnamed: 1_level_1
Germany,1848
Italy,1803
Portugal,1434


In [11]:
# --- Axis categories ---------------------------------------------------------
# Rows: one per country, sorted so the row order is identical on every run
# (building the list from a bare set left the order arbitrary).
text_range = sorted(set(df_agg["country"]))
# Columns: keys of `occurences_backref`, ordered by the sum of their entries,
# largest first.
value_range = sorted(
    occurences_backref,
    key=lambda value: sum(occurences_backref[value].values()),
    reverse=True,
)

# Largest aggregated count; shared by all colour mappers so countries are comparable.
max_count = df_agg["count"].max()

# --- Figure ------------------------------------------------------------------
# output_notebook()
title = f"Value frequencies by country ({tkn})"
output_file(filename=f"values2countries-{tkn}.html", title=title)

p = figure(
    title=title,
    y_range=text_range,
    x_range=value_range,
    x_axis_location="above",
    width=27 * len(value_range),  # 27 px per value column
    height=200,
    toolbar_location=None,
    tools=[],
    tooltips=[("label", "@value/@country: @count")],
)

# Strip grid lines, axis lines and ticks so only the cells and labels remain.
p.grid.grid_line_color = None
p.axis.axis_line_color = None
p.axis.major_tick_line_color = None
# p.axis.major_label_text_font_size = "7px"
p.axis.major_label_standoff = 0
p.xaxis.major_label_orientation = 1.4

# --- Cells: one rect renderer per corpus, each with its own palette -----------
for c in corpora:
    # NOTE(review): low > high looks deliberate (walks the palette in reverse so
    # that larger counts map towards the palette's first colours) — confirm
    # against the palettes defined in `pal`.
    gmapper = LinearColorMapper(palette=pal[c], low=max_count, high=0)
    gsource = ColumnDataSource(data=df_agg[df_agg["country"] == c])
    p.rect(
        x="value",
        y="country",
        width=1,
        height=1,
        source=gsource,
        fill_color={"field": "count", "transform": gmapper},
        line_color=None,
    )

# --- Labels: the count as text, centred in every cell --------------------------
source = ColumnDataSource(data=df_agg)
labels = LabelSet(
    x="value",
    y="country",
    text="label",
    y_offset=-5,
    text_align="center",
    level="glyph",
    # render_mode="canvas",
    text_color="grey",
    text_font_size="1em",
    source=source,
)
p.add_layout(labels)

p.toolbar.logo = None
# p.toolbar_location = None

show(p)