# Vega/Vega-lite dataset generation

Main libraries and global infos

In [1]:
import json
from pathlib import Path

import pandas as pd
import numpy as np
import altair as alt

from IPython.display import display

import catk
import scopus.biblio_extractor as bex


def name_shortener(name, *, max_len=24, padding="…"):
    if len(name) <= max_len:
        return name
    first_alt = name.split(bex.ALT_SEP)[0]
    if len(first_alt) <= max_len:
        return f"{first_alt}{bex.ALT_SEP}{padding}"

    return f"{first_alt[:max_len]}{padding}"


assert name_shortener("heat/resistance") == "heat/resistance"
assert name_shortener("heat stress/heat tolerance/heat resistance") == "heat stress/…"
assert name_shortener("heavy heat important stress/heat tolerance/heat resistance") == "heavy heat important str…"


def df_shortener(df):
    res = df.copy()
    new_idx = pd.MultiIndex.from_tuples([tuple(name_shortener(x) for x in idx) for idx in df.index])
    new_col = pd.MultiIndex.from_tuples([tuple(name_shortener(x) for x in idx) for idx in df.columns])
    res.index = new_idx
    res.columns = new_col
    return res


def series_shortener(ser):
    res = ser.copy()
    new_idx = pd.MultiIndex.from_tuples([tuple(name_shortener(x) for x in idx) for idx in ser.index])
    res.index = new_idx
    return res


'/home/romulus/Documents/unc-informatique.pharmaco-chemistry-biblio/catk/catk/data/data.py' loaded
'/home/romulus/Documents/unc-informatique.pharmaco-chemistry-biblio/catk/catk/data/__init__.py' loaded
'/home/romulus/Documents/unc-informatique.pharmaco-chemistry-biblio/catk/catk/ca.py' loaded
'/home/romulus/Documents/unc-informatique.pharmaco-chemistry-biblio/catk/catk/__init__.py' loaded


Dataset

In [2]:
DATASET_FILENAME = "results/pharmaco_chemistry_2_cross_2022-10-24_15-57-13.csv"
dataset_full, margin_rows, margin_cols, number_of_papers = bex.load_results(DATASET_FILENAME)
margin_rows = series_shortener(margin_rows)
margin_cols = series_shortener(margin_cols)
dataset_full = df_shortener(dataset_full)
dataset_ww = dataset_full.xs("w/", level=2).xs("w/", level=2, axis=1)


display(dataset_full)
display(dataset_ww)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,abiotic,abiotic,abiotic,abiotic,abiotic,abiotic,abiotic,abiotic,abiotic,abiotic,...,pharmaco,pharmaco,pharmaco,pharmaco,pharmaco,pharmaco,pharmaco,pharmaco,toxicity,toxicity
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,heat stress/…,heat stress/…,heavy metal stress/…,heavy metal stress/…,hydric stress/…,hydric stress/…,ph stress/…,ph stress/…,salt stress/…,salt stress/…,...,obesity,obesity,rheumatism,rheumatism,sedative/analgesic,sedative/analgesic,wound,wound,toxicity,toxicity
Unnamed: 0_level_2,Unnamed: 1_level_2,Unnamed: 2_level_2,w/o,w/,w/o,w/,w/o,w/,w/o,w/,w/o,w/,...,w/o,w/,w/o,w/,w/o,w/,w/o,w/,w/o,w/
alkaloid,benzylamine,w/o,435715,3144,438336,523,436331,2528,438063,796,435270,3589,...,428022,10837,438047,812,423418,15441,427082,11777,381684,57175
alkaloid,benzylamine,w/,967,4,971,0,971,0,968,3,965,6,...,920,51,971,0,899,72,951,20,801,170
alkaloid,colchicine,w/o,427287,3118,429882,523,427896,2509,429609,796,426825,3580,...,419752,10653,429702,703,415298,15107,418931,11474,374687,55718
alkaloid,colchicine,w/,9395,30,9425,0,9406,19,9422,3,9410,15,...,9190,235,9316,109,9019,406,9102,323,7798,1627
alkaloid,cyclopeptide,w/o,433182,3143,435804,521,433799,2526,435528,797,432736,3589,...,425497,10828,435545,780,420912,15413,424595,11730,379416,56909
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
terpenoid/terpene,sesterterpene,w/,285,0,285,0,285,0,285,0,285,0,...,283,2,284,1,281,4,283,2,266,19
terpenoid/terpene,tetraterpene/…,w/o,400136,2613,402347,402,401105,1644,401994,755,400321,2428,...,392515,10234,401948,801,387340,15409,391144,11605,347508,55241
terpenoid/terpene,tetraterpene/…,w/,36546,535,36960,121,36197,884,37037,44,35914,1167,...,36427,654,37070,11,36977,104,36889,192,34977,2104
terpenoid/terpene,triterpene/saponin,w/o,412755,3074,415309,520,413340,2489,415038,791,412272,3557,...,405504,10325,415190,639,401397,14432,405055,10774,362184,53645


Unnamed: 0_level_0,Unnamed: 1_level_0,abiotic,abiotic,abiotic,abiotic,abiotic,abiotic,biotic,biotic,biotic,biotic,...,pharmaco,pharmaco,pharmaco,pharmaco,pharmaco,pharmaco,pharmaco,pharmaco,pharmaco,toxicity
Unnamed: 0_level_1,Unnamed: 1_level_1,heat stress/…,heavy metal stress/…,hydric stress/…,ph stress/…,salt stress/…,uv stress/…,antifeedant/…,attractant,germination,herbicidal,...,burns,cardiovascular,cytotoxicity,dementia/alzheimer,immuno-modulatory,obesity,rheumatism,sedative/analgesic,wound,toxicity
alkaloid,benzylamine,4,0,0,3,6,0,1,1,10,9,...,1,93,159,84,0,51,0,72,20,170
alkaloid,colchicine,30,0,19,3,15,2,1,1,179,5,...,41,987,1671,213,2,235,109,406,323,1627
alkaloid,cyclopeptide,5,2,2,2,6,2,1,1,32,2,...,2,134,676,64,1,60,32,100,67,436
alkaloid,imidazole,149,3,8,76,35,17,24,15,99,25,...,154,2010,3766,558,3,388,6,1419,658,3866
alkaloid,indole,194,84,344,70,571,80,172,126,1358,37,...,38,1400,4922,1059,2,493,31,1284,642,4160
alkaloid,indolizidine,0,0,0,0,0,0,3,0,2,0,...,0,0,35,3,0,0,1,4,1,35
alkaloid,isoquinoline,13,1,5,3,7,1,5,2,21,6,...,6,205,938,201,0,57,7,271,59,520
alkaloid,isoxazole,1,0,1,1,0,1,3,1,7,35,...,5,74,406,178,0,51,7,265,50,405
alkaloid,muscarine,1,0,0,0,0,0,0,0,0,0,...,1,44,2,42,0,2,0,27,5,36
alkaloid,oxazole,15,0,1,6,0,2,2,3,6,15,...,3,76,374,49,0,50,0,103,26,281


## Sankey diagram

To JSON format needed by `sankey.json`.

In [3]:
source = []
for idx in dataset_ww.index.get_level_values(1):
    dataset_ww.xs(idx, level=1)
    for idx2 in dataset_ww.columns.get_level_values(1):
        dataset_ww.xs(idx2, level=1, axis=1)
        value = int(dataset_ww.xs(idx, level=1).xs(idx2, level=1, axis=1).sum())
        source.append({"key": {"stk1": idx, "stk2": idx2}, "doc_count": value})


data_sankey = {"aggregations": {"table": {"buckets": source}}}


In [27]:
SANKEY_DATA = Path("vizu/data/sankey_data.json")
with open(SANKEY_DATA, "w") as json_output:
    json.dump(data_sankey, json_output, indent=2, ensure_ascii=False)


## Histograms

To JSON format needed by `vbar.json`.


In [28]:
def generate_vbar_data():
    for (c_class, com), row in dataset_ww.iterrows():
        for (a_class, acti), value in row.items():
            yield {"value": value, "acti": acti, "com": com, "class": a_class}


vbar_data = list(generate_vbar_data())

VBAR_DATA = Path("vizu/data/vbar_data.json")
with open(VBAR_DATA, "w") as json_output:
    json.dump(vbar_data, json_output, indent=2, ensure_ascii=False)


Now, we have to fill **manually** the select boxes in `vbar.json` from the following data.

In [30]:
# for activities classes
domain = json.dumps(list(dataset_ww.columns.get_level_values(0)), ensure_ascii=False)
display(domain)

# for compounds
options = json.dumps(list(dataset_ww.index.get_level_values(1)), ensure_ascii=False)
# note :: manulally remove extra \ in \u2026 if ascii is forced
display(options)


## For Correspondence Analysis





In [37]:
import metrics as m
SEED = 42
ca = catk.ca.CA(SEED)


source = {
    "x": [],
    "y": [],
    "size": [],
    "color": [],
    "name": [],
    "type": [],
    "shape": [],
    "croix": [],
}

THRESHOLD = 1000
ROW_THRESHOLD = number_of_papers // THRESHOLD
COL_THRESHOLD = number_of_papers // THRESHOLD
print(f"Thresholds for rows={ROW_THRESHOLD} and cols={COL_THRESHOLD}")

# for func in m.metrics:
    # print(func.label if func.label else func.__name__)

metrics_names = [metric.label if metric.label else metric.__name__ for metric in m.metrics]
json.dumps(metrics_names, ensure_ascii=False)

Thresholds for rows=439 and cols=439


`metrics_names` is to bet set manually in `mark.json` in `bind.options`.

In [8]:
rows_filter = margin_rows.xs(bex.SELECTORS[1], level=2) >= ROW_THRESHOLD
cols_filter = margin_cols.xs(bex.SELECTORS[1], level=2) >= COL_THRESHOLD
filtered = dataset_full.loc[rows_filter, cols_filter]
filtered.index.name = "Compounds"
filtered.columns.name = "Activities"
display(filtered)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,abiotic,abiotic,abiotic,abiotic,abiotic,abiotic,abiotic,abiotic,abiotic,abiotic,...,pharmaco,pharmaco,pharmaco,pharmaco,pharmaco,pharmaco,pharmaco,pharmaco,toxicity,toxicity
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,heat stress/…,heat stress/…,heavy metal stress/…,heavy metal stress/…,hydric stress/…,hydric stress/…,ph stress/…,ph stress/…,salt stress/…,salt stress/…,...,obesity,obesity,rheumatism,rheumatism,sedative/analgesic,sedative/analgesic,wound,wound,toxicity,toxicity
Unnamed: 0_level_2,Unnamed: 1_level_2,Unnamed: 2_level_2,w/o,w/,w/o,w/,w/o,w/,w/o,w/,w/o,w/,...,w/o,w/,w/o,w/,w/o,w/,w/o,w/,w/o,w/
alkaloid,benzylamine,w/o,435715,3144,438336,523,436331,2528,438063,796,435270,3589,...,428022,10837,438047,812,423418,15441,427082,11777,381684,57175
alkaloid,benzylamine,w/,967,4,971,0,971,0,968,3,965,6,...,920,51,971,0,899,72,951,20,801,170
alkaloid,colchicine,w/o,427287,3118,429882,523,427896,2509,429609,796,426825,3580,...,419752,10653,429702,703,415298,15107,418931,11474,374687,55718
alkaloid,colchicine,w/,9395,30,9425,0,9406,19,9422,3,9410,15,...,9190,235,9316,109,9019,406,9102,323,7798,1627
alkaloid,cyclopeptide,w/o,433182,3143,435804,521,433799,2526,435528,797,432736,3589,...,425497,10828,435545,780,420912,15413,424595,11730,379416,56909
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
terpenoid/terpene,sesquiterpene,w/,11850,36,11886,0,11844,42,11886,0,11862,24,...,11778,108,11833,53,11550,336,11589,297,10406,1480
terpenoid/terpene,tetraterpene/…,w/o,400136,2613,402347,402,401105,1644,401994,755,400321,2428,...,392515,10234,401948,801,387340,15409,391144,11605,347508,55241
terpenoid/terpene,tetraterpene/…,w/,36546,535,36960,121,36197,884,37037,44,35914,1167,...,36427,654,37070,11,36977,104,36889,192,34977,2104
terpenoid/terpene,triterpene/saponin,w/o,412755,3074,415309,520,413340,2489,415038,791,412272,3557,...,405504,10325,415190,639,401397,14432,405055,10774,362184,53645


In [34]:
score_df = {}
for name, metric in zip(metrics_names, m.metrics):
    score_df[name] = m.apply_metric(filtered, metric)


scores_summary_df = pd.DataFrame.from_dict(
    {f_name: [df.values.min(), df.values.mean(), df.values.max(), df.values.std()] for f_name, df in score_df.items()},
    orient="index",
    columns=["min", "mean", "max", "std"],
)

scores_summary_df

Unnamed: 0,min,mean,max,std
Simple projection,0.0,490.203247,55613.0,1972.028545
% of compound with the activity,0.0,0.039809,0.71826,0.071489
% of activity with the compound,0.0,0.027547,0.7824,0.056851
Fowlkes-Mallows index,0.0,0.023451,0.563134,0.039622
The odds of having both the compound and the activity,2.137598e-08,1.315835,125.621097,3.881873
Accuracy,0.6827274,0.935154,0.997738,0.060765


In [35]:
size = list(pd.concat([margin_rows[rows_filter], margin_cols[cols_filter]]).xs(bex.SELECTORS[True], level=2))

print(size)
shape = ["Activity" for _ in margin_rows[rows_filter].xs(bex.SELECTORS[True], level=2)] + [
    "Compound" for _ in margin_cols[cols_filter].xs(bex.SELECTORS[True], level=2)
]

print(shape)
legend_class = [name[0] for name in margin_rows[rows_filter].xs(bex.SELECTORS[True], level=2).index] + [
    name[0] for name in margin_cols[cols_filter].xs(bex.SELECTORS[True], level=2).index
]
legend_class = list(set(legend_class))
print(legend_class)



[971, 9425, 3505, 22464, 26967, 3624, 2602, 1731, 13092, 10401, 2693, 26110, 5014, 1100, 4860, 21245, 2925, 2744, 14624, 510, 15703, 2352, 6964, 12176, 98010, 5416, 4028, 9058, 2654, 7740, 20305, 2931, 472, 14329, 3154, 1435, 36685, 13916, 13885, 5133, 3018, 11886, 37081, 24001, 3148, 523, 2528, 799, 3595, 2339, 2761, 895, 7478, 656, 2004, 56983, 1101, 4591, 564, 948, 38566, 37965, 35653, 9142, 24298, 58756, 99508, 2148, 13952, 10370, 1847, 20930, 59402, 12004, 10888, 812, 15513, 11797, 57345]
['Activity', 'Activity', 'Activity', 'Activity', 'Activity', 'Activity', 'Activity', 'Activity', 'Activity', 'Activity', 'Activity', 'Activity', 'Activity', 'Activity', 'Activity', 'Activity', 'Activity', 'Activity', 'Activity', 'Activity', 'Activity', 'Activity', 'Activity', 'Activity', 'Activity', 'Activity', 'Activity', 'Activity', 'Activity', 'Activity', 'Activity', 'Activity', 'Activity', 'Activity', 'Activity', 'Activity', 'Activity', 'Activity', 'Activity', 'Activity', 'Activity', 'Activit

In [15]:


data_select = [metric.label if metric.label else metric.__name__ for metric in m.metrics]
for metric_name in data_select:
    ca.fit(score_df[metric_name])
    current_metrix = ca.contributions(K=2)["Coords (princ.)"]

    index = list(current_metrix.index)
    # index is a list of tuples get the first element and the second one in 2 different lists
    color = [i[0] for i in index]
    name = [i[1] for i in index]
    x = list(current_metrix[1])
    y = list(current_metrix[2])

    if metric_name == "accuracy_metric":
        x = [element * 100 for element in x]
        y = [element * 100 for element in y]

    source = {
        "x": source["x"] + x,
        "y": source["y"] + y,
        "size": source["size"] + size,
        "color": source["color"] + color,
        "name": source["name"] + name,
        "type": source["type"] + [metric_name] * len(current_metrix[1]),
        "shape": source["shape"] + shape,
        "croix": 0,
    }



In [17]:
source_df = pd.DataFrame(source)
display(source_df)

MARK_DATA = Path("vizu/data/mark_data.json")
source_df.to_json(MARK_DATA, orient="records", force_ascii=False, indent=2)

Unnamed: 0,x,y,size,color,name,type,shape,croix
0,0.388516,-0.024124,971,abiotic,heat stress/…,Simple projection,Activity,0
1,0.040071,-0.929503,9425,abiotic,heavy metal stress/…,Simple projection,Activity,0
2,0.977152,-0.170381,3505,abiotic,hydric stress/…,Simple projection,Activity,0
3,-0.039292,0.086074,22464,abiotic,ph stress/…,Simple projection,Activity,0
4,0.824716,-0.186538,26967,abiotic,salt stress/…,Simple projection,Activity,0
...,...,...,...,...,...,...,...,...
469,-0.001428,0.000567,10888,terpenoid/terpene,diterpene,Accuracy,Compound,0
470,-0.001130,0.000409,812,terpenoid/terpene,monoterpene,Accuracy,Compound,0
471,-0.001212,-0.000155,15513,terpenoid/terpene,sesquiterpene,Accuracy,Compound,0
472,0.013555,0.006049,11797,terpenoid/terpene,tetraterpene/…,Accuracy,Compound,0
