# Extract single leaf images from sheets dataset

## Imports

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
import sys
import warnings

warnings.simplefilter(action="ignore", category=UserWarning)

from pathlib import Path

from tqdm import tqdm

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from rich.progress import track

from plotly.subplots import make_subplots
import plotly.graph_objects as go

sys.path.insert(0, os.path.join("..", "scripts"))

import gav_mildiou_const as goc
import gav_mildiou_func as gof


## Constants

In [None]:
# Names of the score columns that are converted to numbers and inspected
# (unique values) further down in this notebook.
DATA_COLUMNS = ["oiv", "sporulation", "densite_sporulation", "necrose", "taille_necrose", "surface_necrosee"]

## Functions

In [None]:
def plot_balance(df):
    """Draw one histogram per score column on a 3x3 grid of subplots.

    Every non-empty entry of the grid below names a column of ``df``. Its
    values are sorted, converted to strings and counted by a Plotly
    histogram placed at the matching grid position. Empty strings leave the
    corresponding grid cell without a trace.

    Args:
        df: Dataframe holding the columns named in the grid.

    Returns:
        The Plotly figure containing the histograms.
    """
    grid = [
        ["oiv", "", ""],
        ["sporulation", "densite_sporulation", ""],
        ["necrose", "taille_necrose", "surface_necrosee"],
    ]

    fig = make_subplots(rows=3, cols=3, subplot_titles=np.array(grid).flatten())

    # Plotly subplot coordinates are 1-based, hence start=1.
    for row_number, row_columns in enumerate(grid, start=1):
        for col_number, column in enumerate(row_columns, start=1):
            if column:
                histogram = go.Histogram(
                    x=df[column].sort_values().astype(str),
                    texttemplate="%{y}",
                    textfont_size=20,
                    name=column,
                )
                fig.add_trace(histogram, row=row_number, col=col_number)

    legend_position = dict(yanchor="top", y=0.99, xanchor="right", x=0.99)
    fig.update_layout(
        xaxis_title="Value",
        yaxis_title="Count",
        height=800,
        legend=legend_position,
    )

    return fig


In [None]:
def show_unique_values(df):
    """Print, for each column of ``df``, the column name and its distinct values.

    Args:
        df: Dataframe whose columns are listed one per output line.
    """
    column_names = df.columns.to_list()
    for name in column_names:
        distinct = df[name].unique()
        print(f"{name}: {distinct}")

## Load source dataframe

In [None]:
# Values accepted for the graded score columns (oiv, densite_sporulation,
# taille_necrose, surface_necrosee); anything else is turned into NaN below.
allowed_vals = [1, 3, 5, 7, 9]


In [None]:
# Build all dataframes provided by the project helpers; the result is indexed
# by name (its keys are listed in the next cell, "raw_merged" is used below).
dfs = gof.build_all_dataframes()


In [None]:
dfs.keys()

In [None]:
# Cells made only of letters and/or underscores are replaced with NaN in
# every score column.
text_only_pattern = r"^([A-Za-z]|_)+$"

df = (
    gof.build_all_dataframes()["raw_merged"]
    # `rep` is the second "_"-separated token of the experiment name, with
    # "saisie" rewritten to the literal string "NA".
    .assign(rep=lambda x: x.experiment.str.split(pat="_", expand=True)[1])
    .assign(rep=lambda x: x.rep.str.replace("saisie", "NA"))
    .assign(
        # Keep only the first "_"-separated token as the experiment name...
        experiment=lambda x: x.experiment.str.split(pat="_", expand=True)[0],
        # ...and read the year as the text between "exp" and "dm" in it.
        year=lambda x: x.experiment.str.lower()
        .str.split(pat="exp", expand=True)[1]
        .str.split(pat="dm", expand=True)[0]
        .astype(int),
    )
    .assign(
        # `col=col` binds the column name at definition time; a plain closure
        # would see only the last value of the comprehension variable.
        **{
            col: (
                lambda x, col=col: x[col].replace(
                    text_only_pattern, np.nan, regex=True
                )
            )
            for col in DATA_COLUMNS
        }
    )
)

# .copy() makes the filtered frame independent of the unfiltered one, so the
# column assignments that follow do not raise SettingWithCopyWarning.
df = df[df.year.isin([20, 21, 22])].copy()

def try_for_number(val):
    """Convert ``val`` to an integer, or return NaN when that is impossible.

    ``int(val)`` is tried first, so numeric inputs are truncated
    (``3.7 -> 3``). When that fails, ``round(float(val))`` is tried, so
    numeric strings with a fractional part are rounded (``"3.7" -> 4``).

    Args:
        val: Any value, typically a raw dataframe cell.

    Returns:
        An ``int`` when one of the two conversions succeeds, ``np.nan``
        otherwise (text, ``None``, NaN, infinities, ...).
    """
    # The only errors int(), float() and round() raise for unconvertible
    # input; anything else (e.g. KeyboardInterrupt) must propagate.
    conversion_errors = (TypeError, ValueError, OverflowError)

    try:
        return int(val)
    except conversion_errors:
        pass

    try:
        return round(float(val))
    except conversion_errors:
        return np.nan


# Turn every score cell into an integer, or NaN when it cannot be converted.
for col in DATA_COLUMNS:
    df[col] = df[col].apply(try_for_number)


# Values outside the accepted set of a column are replaced with NaN.
allowed_by_column = {
    "oiv": allowed_vals,
    "sporulation": [0, 1],
    "densite_sporulation": allowed_vals,
    "necrose": [0, 1],
    "taille_necrose": allowed_vals,
    "surface_necrosee": allowed_vals,
}
for k, v in allowed_by_column.items():
    # where() keeps entries for which the condition holds and puts NaN
    # elsewhere; NaN entries never satisfy isin() and therefore stay NaN.
    df[k] = df[k].where(df[k].isin(v), np.nan)

df = (
    # Nullable integers keep the scores integral while still allowing
    # missing values.
    df.astype({col: "Int64" for col in DATA_COLUMNS})
    # `dai` is the first run of digits found in the sheet name (missing when
    # the name has no digit). Raw string: "\d" is an invalid escape otherwise.
    .assign(dai=lambda x: x.sheet.str.extract(r"(\d+)"))
    .drop_duplicates()
    .sort_values(["experiment", "image_name", "ligne", "colonne"])
    .reset_index(drop=True)
)


# df = gof.invert_axis(df, 0)

df

plot_balance(df)


In [None]:
df

In [None]:
df.oiv.plot.hist()

In [None]:
show_unique_values(df[DATA_COLUMNS])

In [None]:
df.shape

## Create sample dataframe

In [None]:
# Draw at most `sample_size` rows for every (year, variable, value) triple.
sample_size = 3
vals = [0, 1, 3, 5, 7, 9]
binary_vals = [0, 1]
values_per_variable = {
    # Numeric variables
    "densite_sporulation": vals,
    "taille_necrose": vals,
    "surface_necrosee": vals,
    "oiv": vals,
    # Binary variables
    "sporulation": binary_vals,
    "necrose": binary_vals,
}

data = []
for year in track(df.year.unique()):
    same_year = df.year == year
    for var, candidate_values in values_per_variable.items():
        for val in candidate_values:
            tmp_df = df[same_year & (df[var] == val)]
            # Never ask for more rows than the selection holds.
            data.append(tmp_df.sample(n=min(sample_size, tmp_df.shape[0])))
    # Trash (disabled): sampling of rows whose value is outside the lists above
    # for var in [
    #     "densite_sporulation",
    #     "taille_necrose",
    #     "surface_necrosee",
    #     "oiv",
    # ]:
    #     tmp_df = df[(df.year == year) & (~df[var].isin(vals))]
    #     data.append(tmp_df.sample(n=min(sample_size, tmp_df.shape[0])))
    # for var in [
    #     "sporulation",
    #     "necrose",
    # ]:
    #     tmp_df = df[(df.year == year) & (~df[var].isin([0, 1]))]
    #     data.append(tmp_df.sample(n=min(sample_size, tmp_df.shape[0])))


# A row can be drawn for several variables, hence the de-duplication.
sampled_rows = pd.concat(data).drop_duplicates().reset_index(drop=True)
df_ld = sampled_rows.assign(exp_folder=lambda x: "EXP-20" + x.year.astype(str))

plot_balance(df_ld)


In [None]:
df_ld.shape

In [None]:
# Save the sampled rows under the project's dataframes folder.
# NOTE(review): no `index=` argument is given, so pandas' default applies and
# the row index is written as an extra first column — confirm this is wanted.
df_ld.to_csv(Path.cwd().parent.joinpath(goc.dataframes_path, "ld_dataset_ilastik_train.csv"))

## Load existing dataset dataframe

In [None]:
# Reload the sample dataframe from the CSV file written by the cell above.
# NOTE(review): that file is written with its index, so an unnamed first
# column is expected here — confirm whether `index_col=0` should be passed.
d = pd.read_csv(str(Path.cwd().parent.joinpath(goc.dataframes_path, "ld_dataset_ilastik_train.csv")), sep=",")
d

## Balance overview

In [None]:
plot_balance(d)

In [None]:
d.sheet.unique()

In [None]:
# Both spellings below designate the same sheet; store them under one name.
sheet_aliases = ["Feuil1", "fichier total"]
d.loc[d.sheet.isin(sheet_aliases), "sheet"] = "fichier_total"

d.sheet.unique()

## Search folders related to experiment

In [None]:
Path.cwd()

In [None]:
# Folder that holds one sub-folder per experiment year; it is resolved
# relative to the parent of the current working directory.
root_folder = Path.cwd().parent.joinpath("data_in", "gav_phenotypage")
# Displays whether the folder exists on this machine.
root_folder.is_dir()

In [None]:
def get_ld_sheet(exp_year_folder, experiment, rep, dai, image_name) -> Path | str:
    fld_candidates = [
        f for f in root_folder.joinpath(exp_year_folder).glob(f"*{experiment}*")
    ]
    if len(fld_candidates) == 0:
        return "No match for experiment folder"
    elif len(fld_candidates) > 1:
        return f"Ambiguous experiment folder, {len(fld_candidates)} found"

    fld_candidate = fld_candidates[0]
    if fld_candidate.is_dir() is False:
        return "Experiment folder is not folder"

    name_parts = image_name.replace("-", "_").split("_")
    if len(name_parts[-1]) < 3:
        nparts = name_parts[:-1]
        end = name_parts[-1]
        image_name = "_".join(name_parts[:-1]) + "_" + end[0] + "0" + end[1]

    if len(name_parts) == 3:
        e, i, p = image_name.replace("-", "_").split("_")
        r_img_name = (
            f"{e}_{i}_T{int(dai) if type(dai) == int or type(dai) == float else 0}_{p}"
        )
    else:
        r_img_name = image_name.replace("-", "_")
    candidates = [
        fc
        for fc in fld_candidate.glob(
            f"**/*{r_img_name.replace('i', '?').replace('I', '?')}.*"
        ) if fc.suffix in [".JPG", ".jpg"]
    ]

    if len(candidates) == 0:
        return "No image matches name"
    elif len(candidates) == 1:
        return candidates[0]
    else:
        return f"Ambiguous image query"


In [None]:
def get_sheet_from_row(row) -> Path | str:
    """Look up the image file for ``row`` through ``get_ld_sheet``.

    Args:
        row: Object whose ``exp_folder``, ``experiment``, ``rep``, ``dai``
            and ``image_name`` attributes each expose ``to_list()`` — e.g. a
            one-row dataframe (TODO confirm against callers). Only the first
            element of each is used.

    Returns:
        Whatever ``get_ld_sheet`` returns: a file path or a failure message.
    """

    def first(column_name):
        return getattr(row, column_name).to_list()[0]

    return get_ld_sheet(
        exp_year_folder=first("exp_folder"),
        experiment=first("experiment"),
        rep=first("rep"),
        dai=first("dai"),
        image_name=first("image_name"),
    )

In [None]:
# Distinct combinations of the columns used to look up an image file.
# reset_index() is called without drop=True, so the previous index is kept
# as a column.
# NOTE(review): the lookup loop in the next cell iterates `d`, not `graber`.
graber = d[["exp_folder", "experiment", "rep", "dai", "image_name"]].drop_duplicates().reset_index()
graber

In [None]:
# Resolve an image file for every row of `d`, recording either the path found
# or the reason of the failure in `file_path`.
lookup_columns = ["exp_folder", "experiment", "rep", "dai", "image_name"]
try_data = {name: [] for name in [*lookup_columns, "file_path"]}

# `total` gives tqdm the length that the iterrows() generator cannot provide.
for _, row in tqdm(d.iterrows(), total=len(d)):
    for name in lookup_columns:
        try_data[name].append(row[name])

    # Only the lookup is guarded: one value is always appended to `file_path`
    # per row, so all columns keep the same length.
    try:
        outcome = str(
            get_ld_sheet(
                exp_year_folder=row.exp_folder,
                experiment=row.experiment,
                rep=row.rep,
                dai=row.dai,
                image_name=row.image_name,
            )
        )
    except Exception as e:  # best effort: keep the error text and carry on
        outcome = str(e)
    try_data["file_path"].append(outcome)


try_outcome = pd.DataFrame(data=try_data)
try_outcome


In [None]:
# A `file_path` without "/" is a failure message rather than a path; group
# the rows of `try_outcome` by that message.
failed_lookups = try_outcome[~try_outcome.file_path.str.contains("/")]
bad_outcomes = {
    message: try_outcome[try_outcome.file_path == message]
    for message in failed_lookups.file_path.unique()
}
bad_outcomes.keys()
# try_outcome.file_path.unique()

In [None]:
# Rows for which no image file matched the rebuilt name, sorted and
# de-duplicated.
no_match_rows = bad_outcomes["No image matches name"]
nimn = (
    no_match_rows.sort_values(["experiment", "image_name"])
    .drop_duplicates()
    .reset_index(drop=True)
)
nimn.experiment.unique()

In [None]:
nimn

In [None]:
# A `file_path` containing "/" is taken to be a resolved path.
has_path = try_outcome.file_path.str.contains("/")
good_outcomes = try_outcome[has_path]
good_outcomes

In [None]:
import shutil

# Copy every resolved image into one flat folder.
destination = Path.cwd().parent.joinpath("data_in", "images", "ld_copied")
# shutil.copyfile does not create missing folders, so make sure it exists.
destination.mkdir(parents=True, exist_ok=True)

# NOTE(review): files are stored under their bare name, so two sources with
# the same file name overwrite each other — confirm names are unique.
for file_ in tqdm(good_outcomes.file_path.to_list()):
    shutil.copyfile(file_, destination.joinpath(Path(file_).name))

In [None]:
df[df.image_name == "Exp22DM08_inoc2_T5_P29"]