In [9]:
from itertools import islice, product
from pathlib import Path
from pprint import pprint

import altair as alt
import matplotlib.pyplot as plt
import numpy as np
import numpy.typing as npt
import pandas as pd
import seaborn as sns
from IPython.display import display

import catk
import scopus.biblio_extractor as bex
from filtre import *


# Rendering options: 4 decimals without scientific notation for NumPy arrays,
# 3 decimals for pandas floats, and compact DataFrame previews.
np.set_printoptions(suppress=True, precision=4)
pd.set_option("display.float_format", "{:.3f}".format)
pd.set_option("display.max_columns", 10)
pd.set_option("display.max_rows", 20)
pd.set_option("display.min_rows", 10)

# Default figure size in inches: (16.54, 11.7) is A3 landscape
# (A4 landscape would be (11.7, 8.27)).
sns.set_theme(
    style="dark",
    palette="muted",
    font_scale=1.10,
    rc={"figure.figsize": (16.54, 11.7)},
)

# Fixed seed so that seeded computations are repeatable between runs.
SEED = 42

# Results file analysed by this notebook.
DATASET_FILENAME = Path("results/pharmaco_chemistry_cross_2022-05-19_17-18-20.csv")
# Alternative input: DATASET_FILENAME = Path("results/activities_2022-01-29_16-33-05.csv")
print(f"{DATASET_FILENAME.stem = } {DATASET_FILENAME.suffix = }")


DATASET_FILENAME.stem = 'pharmaco_chemistry_cross_2022-05-19_17-18-20' DATASET_FILENAME.suffix = '.csv'


In [10]:
# Load the cross table; judging by the names, the loader also returns the row
# margins, the column margins and the corpus size (confirm against bex.load_results).
(dataset, margin_rows, margin_cols, number_of_papers) = bex.load_results(DATASET_FILENAME)

# Distinct labels found at level 1 of the row index and of the column index.
all_compounds = {*dataset.index.get_level_values(1)}
all_activities = {*dataset.columns.get_level_values(1)}

# Sub-table keeping only the entries labelled "w/" at level 2 on both axes.
# NOTE(review): other cells select this level through bex.SELECTORS[...];
# presumably "w/" is the same label - confirm.
with_with_matrix = dataset.xs("w/", level=2, axis=0).xs("w/", level=2, axis=1)


In [11]:
def apply_metric(data, func, write=False):
    """Score every 2x2 block of ``data`` with ``func``.

    The rows and the columns of ``data`` are consumed in consecutive pairs, so
    the table is read as a grid of 2x2 blocks. Each block is flattened to a
    length-4 vector ordered ``[r0c0, r0c1, r1c0, r1c1]`` (row-major inside the
    block) and handed to ``func``; the scalar results form the output table.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with an even number of rows and of columns. Both axes must carry
        the selector labels at index level 2, because the output labels are
        obtained by cross-selecting ``bex.SELECTORS[True]`` on that level.
    func : callable
        Maps a 1-D array of 4 values to a single scalar. Its ``__name__`` is
        used to build the output file name when ``write`` is true.
    write : bool, default False
        When true, also save the result as
        ``results/<dataset stem>_<func name><dataset suffix>``, where stem and
        suffix come from the module-level ``DATASET_FILENAME``.

    Returns
    -------
    pandas.DataFrame
        One row per pair of input rows and one column per pair of input
        columns, labelled with the remaining index levels of ``data``.
    """
    n_row_pairs, n_col_pairs = len(data.index) // 2, len(data.columns) // 2

    # View the table as (row pair, row selector, column pair, column selector),
    # then bring both selector axes last so each block becomes 4 contiguous values.
    blocks = data.values.reshape((n_row_pairs, 2, n_col_pairs, 2))
    values = np.moveaxis(blocks, 1, -2).reshape((n_row_pairs * n_col_pairs, 4))
    matrix = np.apply_along_axis(func, 1, values).reshape((n_row_pairs, n_col_pairs))

    # Only used for its labels: one entry per pair on each axis.
    sub = data.xs(bex.SELECTORS[True], axis=0, level=2).xs(bex.SELECTORS[True], axis=1, level=2)
    df = pd.DataFrame(matrix, index=sub.index, columns=sub.columns)
    df.index.name = "Compounds"
    df.columns.name = "Activities"

    if write:
        # ``__name__`` was previously misspelled ``__namme__``, which raised
        # AttributeError on every write=True call.
        filename = Path(f"{DATASET_FILENAME.stem}_{func.__name__}{DATASET_FILENAME.suffix}")
        df.to_csv(Path("results") / filename)
    return df



In [12]:
# A row / column is kept when its margin reaches one thousandth of the number
# of papers (integer division); both axes use the same cut-off.
ROW_THRESHOLD = COL_THRESHOLD = number_of_papers // 1000
print(f"Thresholds for rows={ROW_THRESHOLD} and cols={COL_THRESHOLD}")

# Boolean masks built from the margins selected by bex.SELECTORS[1] at level 2.
rows_filter = margin_rows.xs(bex.SELECTORS[1], level=2).ge(ROW_THRESHOLD)
cols_filter = margin_cols.xs(bex.SELECTORS[1], level=2).ge(COL_THRESHOLD)

filtered = dataset.loc[rows_filter, cols_filter]
filtered.index.name = "Compounds"
filtered.columns.name = "Activities"

# One score table per metric, keyed by the metric function's name.
# NOTE(review): `metrics` is not defined in the visible cells - presumably it is
# provided by `from filtre import *`; confirm.
score_df = {}
for metric in metrics:
    metric_name = metric.__name__
    score_df[metric_name] = apply_metric(filtered, metric, write=False)

# Min / mean / max / std of every score table, one row per metric.
scores_summary_df = pd.DataFrame(
    [[df.values.min(), df.values.mean(), df.values.max(), df.values.std()] for df in score_df.values()],
    index=list(score_df),
    columns=["min", "mean", "max", "std"],
)

# Fit the seeded correspondence-analysis object on the "tt_projection_metric" scores.
# Optional inspection once fitted: display(ca.axes()) / display(ca.contributions(K=2)).
ca = catk.ca.CA(SEED)
ca.fit(score_df["tt_projection_metric"])



Thresholds for rows=250 and cols=250


<catk.ca.CA at 0x19ca5bf08b0>

In [13]:

# Statistics of the fitted CA restricted to the first two axes. Computed once
# and reused for both the coordinates and the masses.
contributions = ca.contributions(K=2)
coords = contributions["Coords (princ.)"]

# Every index entry is a tuple: its first element becomes the colour group and
# its second element the point label.
index = list(coords.index)
color = [entry[0] for entry in index]
name = [entry[1] for entry in index]

# Flat table consumed by the charts: one row per point.
source = pd.DataFrame(
    {
        "x": list(coords[1]),
        "y": list(coords[2]),
        "size": list(contributions["Mass (%)"]),
        "color": color,
        "name": name,
    }
)

display(source)

# Scatter plot on the first two axes; the "size" encoding drives the circle
# area, so the mark-level size=60 only acts as a fallback.
mark = (
    alt.Chart(source)
    .mark_circle(size=60)
    .encode(
        x="x",
        y="y",
        color="color",
        size="size",
        tooltip=["name", "color"],
    )
    .properties(height=500, width=700)
    .interactive()
)

mark.save("json_chart/mark.json")


Unnamed: 0,x,y,size,color,name
0,0.797,0.026,22.586,abiotic,antioxidant
1,0.862,-0.074,0.353,abiotic,drought
2,-0.095,-0.134,7.376,abiotic,metal
3,-0.409,-0.474,3.942,abiotic,salt
4,0.356,-0.062,1.969,abiotic,uv
...,...,...,...,...,...
62,-0.277,-0.173,1.131,terpenoid/terpene,diterpene
63,0.077,0.280,0.281,terpenoid/terpene,monoterpene
64,-0.235,0.046,1.956,terpenoid/terpene,sesquiterpene
65,0.888,-0.095,3.526,terpenoid/terpene,tetraterpene/carotenoid/xanthophyll


In [14]:
# Same points drawn as rectangles coloured by "size", exported as a JSON spec.
# NOTE(review): x and y are encoded as ordinal (":O"), so every distinct
# coordinate value gets its own band - confirm this is intended rather than
# binned quantitative axes.
heat_map = (
    alt.Chart(source)
    .mark_rect(size=60)
    .encode(
        x=alt.X("x:O"),
        y=alt.Y("y:O"),
        color=alt.Color("size:Q"),
        tooltip=["name", "color"],
    )
    .interactive()
)
heat_map.save("json_chart/hit_map.json")