In [1]:
import json
import pandas as pd
import scopus.biblio_extractor as bex


# Hand-written sample of the bucket records assembled further down:
# each record pairs two string keys ("stk1", "stk2") with an integer "doc_count".
demo = [
    {"key": {"stk1": first, "stk2": second}, "doc_count": count}
    for first, second, count in (
        ("aa", "11", 7),
        ("aa", "111", 4),
        ("cc", "111", 4),
        ("bb", "1111", 8),
    )
]


def name_shortener(name, *, max_len=24, padding="…"):
    """Shorten ``name`` for display.

    Args:
        name: the label to shorten.
        max_len: longest length that is returned unchanged; also the cut-off
            applied to the first segment when that segment is itself too long.
        padding: marker appended to every shortened result.

    Returns:
        ``name`` itself when it has at most ``max_len`` characters. Otherwise
        only the part before the first ``bex.ALT_SEP`` is kept: if that part
        fits in ``max_len`` it is returned followed by ``bex.ALT_SEP`` and
        ``padding``; if not, its first ``max_len`` characters are returned
        followed by ``padding``. A shortened result can therefore be longer
        than ``max_len`` by the length of the appended marker(s).
    """
    if len(name) <= max_len:
        return name

    # Everything after the first separator is discarded.
    head, *_ = name.split(bex.ALT_SEP)
    if len(head) > max_len:
        kept = head[:max_len]
    else:
        # Keep the separator so the reader can tell that segments were dropped.
        kept = f"{head}{bex.ALT_SEP}"
    return f"{kept}{padding}"


# Exercise the three outcomes of name_shortener: unchanged, first segment kept,
# first segment truncated.
for sample in (
    "heat/resistance",
    "heat stress/heat tolerance/heat resistance",
    "heavy heat important stress/heat tolerance/heat resistance",
):
    print(name_shortener(sample))


def _shorten_index(index):
    """Return a new index whose labels have all been passed through ``name_shortener``.

    Args:
        index: a ``pd.MultiIndex`` or a flat ``pd.Index``.

    Returns:
        An index of the same length and with the same level name(s) as
        ``index``. For a ``MultiIndex`` every element of every tuple is
        shortened; for a flat index every label is shortened.
    """
    if isinstance(index, pd.MultiIndex):
        if len(index) == 0:
            # from_tuples cannot infer the number of levels from an empty list.
            return index.copy()
        shortened = [tuple(name_shortener(label) for label in labels) for labels in index]
        # Pass the names explicitly: from_tuples would otherwise drop them.
        return pd.MultiIndex.from_tuples(shortened, names=index.names)
    return pd.Index([name_shortener(label) for label in index], name=index.name)


def df_shortener(df):
    """Return a copy of ``df`` with shortened row and column labels.

    Args:
        df: the DataFrame to relabel; it is not modified.

    Returns:
        A copy of ``df`` whose index and columns were rebuilt with
        ``name_shortener`` applied to every label. Level names are preserved.
    """
    res = df.copy()
    res.index = _shorten_index(df.index)
    res.columns = _shorten_index(df.columns)
    return res


def series_shortener(ser):
    """Return a copy of ``ser`` with shortened index labels.

    Args:
        ser: the Series to relabel; it is not modified.

    Returns:
        A copy of ``ser`` whose index was rebuilt with ``name_shortener``
        applied to every label. Level names are preserved.
    """
    res = ser.copy()
    res.index = _shorten_index(ser.index)
    return res


heat/resistance
heat stress/…
heavy heat important str…


In [2]:

# Earlier results file, kept for reference:
# DATASET_FILENAME = ("results/pharmaco_chemistry_cross_2022-05-19_17-18-20.csv")
DATASET_FILENAME = "results/pharmaco_chemistry_2_cross_2022-10-24_15-57-13.csv"

# load_results returns four values; the two middle ones are discarded.
dataset, _, _, number_of_papers = bex.load_results(DATASET_FILENAME)

# Labels found at level 1 of the rows and of the columns, taken before shortening.
all_compounds = set(dataset.index.get_level_values(1))
all_activities = set(dataset.columns.get_level_values(1))

# Keep only the entries whose level-2 label is "w/" on both axes (xs drops the
# selected level), then shorten the remaining labels for display.
dataset = df_shortener(
    dataset
    .xs("w/", level=2)
    .xs("w/", level=2, axis=1)
)

dataset


Unnamed: 0_level_0,Unnamed: 1_level_0,abiotic,abiotic,abiotic,abiotic,abiotic,abiotic,biotic,biotic,biotic,biotic,...,pharmaco,pharmaco,pharmaco,pharmaco,pharmaco,pharmaco,pharmaco,pharmaco,pharmaco,toxicity
Unnamed: 0_level_1,Unnamed: 1_level_1,heat stress/…,heavy metal stress/…,hydric stress/…,ph stress/…,salt stress/…,uv stress/…,antifeedant/…,attractant,germination,herbicidal,...,burns,cardiovascular,cytotoxicity,dementia/alzheimer,immuno-modulatory,obesity,rheumatism,sedative/analgesic,wound,toxicity
alkaloid,benzylamine,4,0,0,3,6,0,1,1,10,9,...,1,93,159,84,0,51,0,72,20,170
alkaloid,colchicine,30,0,19,3,15,2,1,1,179,5,...,41,987,1671,213,2,235,109,406,323,1627
alkaloid,cyclopeptide,5,2,2,2,6,2,1,1,32,2,...,2,134,676,64,1,60,32,100,67,436
alkaloid,imidazole,149,3,8,76,35,17,24,15,99,25,...,154,2010,3766,558,3,388,6,1419,658,3866
alkaloid,indole,194,84,344,70,571,80,172,126,1358,37,...,38,1400,4922,1059,2,493,31,1284,642,4160
alkaloid,indolizidine,0,0,0,0,0,0,3,0,2,0,...,0,0,35,3,0,0,1,4,1,35
alkaloid,isoquinoline,13,1,5,3,7,1,5,2,21,6,...,6,205,938,201,0,57,7,271,59,520
alkaloid,isoxazole,1,0,1,1,0,1,3,1,7,35,...,5,74,406,178,0,51,7,265,50,405
alkaloid,muscarine,1,0,0,0,0,0,0,0,0,0,...,1,44,2,42,0,2,0,27,5,36
alkaloid,oxazole,15,0,1,6,0,2,2,3,6,15,...,3,76,374,49,0,50,0,103,26,281


In [3]:
# Build one bucket per (row level-1 label, column level-1 label) pair, shaped like
# {"key": {"stk1": "bb", "stk2": "1111"}, "doc_count": 8}
source = []

# unique() keeps first-appearance order and guarantees one bucket per pair even
# if a label occurs on several rows/columns (e.g. after name shortening).
row_labels = dataset.index.get_level_values(1).unique()
col_labels = dataset.columns.get_level_values(1).unique()

for idx in row_labels:
    # Slice the rows once per label instead of once per (row, column) pair.
    rows = dataset.xs(idx, level=1)
    for idx2 in col_labels:
        cell = rows.xs(idx2, level=1, axis=1)
        # Reduce to a scalar before int(): calling int() on a Series is
        # deprecated and only ever worked for a single matching column.
        value = int(cell.sum().sum())
        source.append({"key": {"stk1": idx, "stk2": idx2}, "doc_count": value})

# source

In [4]:
# Wrap the bucket list under aggregations -> table -> buckets; this is the
# structure that gets serialized to JSON.
data_sankey = {
    "aggregations": {
        "table": {
            "buckets": source,
        },
    },
}
# data_sankey

In [5]:


# Serialize the nested structure to the JSON file (overwrites any existing file).
with open("vizu/data/sankey_data.json", mode="w") as sankey_file:
    json.dump(data_sankey, fp=sankey_file)

