In [None]:
import os
import warnings
import itertools

%load_ext autoreload
%autoreload 2

import markdown

import pandas as pd
import numpy as np


from PIL import Image as PilImage

import datapane as dp


from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cross_decomposition import PLSRegression
from sklearn.cluster import KMeans
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from yellowbrick.cluster import (
    KElbowVisualizer,
    SilhouetteVisualizer,
    InterclusterDistance,
)
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

import ipywidgets as ipw

import sys  
sys.path.insert(0, os.path.join("..", "scripts"))

import gav_oidium_func as gof
import gav_oidium_const as goc
import gav_oidium_text as got
import gav_oidium_plot_plotly as gop

import IPython.display as disp

import ipywidgets as widgets
from IPython.display import Image as IpImage
from IPython.display import display
from ipywidgets import Button, HBox, VBox

# Route DataFrame.plot / DataFrame.boxplot calls through Plotly.
pd.options.plotting.backend = "plotly"
# Show floats with a thousands separator and two decimals in displayed frames.
pd.options.display.float_format = "{:4,.2f}".format

# Silence every warning for the rest of the session.
warnings.simplefilter("ignore")

# os.path.abspath('') resolves to the current working directory, so this first
# chdir leaves the location unchanged.
os.chdir(os.path.abspath(''))
# Step one directory up; the relative data paths used later are resolved from there.
# NOTE(review): this is a relative move, so re-running the cell without
# restarting the kernel climbs one more level each time.
os.chdir("..")
os.getcwd()

In [None]:
def disp_mkdw(text):
    """Render ``text`` as Markdown in the cell output."""
    rendered = disp.Markdown(text)
    disp.display(rendered)

def disp_img(path):
    """Show the image stored at ``path`` in the cell output."""
    picture = disp.Image(path)
    disp.display(picture)

def get_yellow_fig(visualizer):
    """Finalize a Yellowbrick visualizer and return its Matplotlib figure.

    Args:
        visualizer: a fitted Yellowbrick visualizer exposing ``fig`` and ``show()``.

    Returns:
        The figure that was attached to the visualizer before ``show()`` ran.
    """
    # Grab the figure first: show() renders the plot, and the reference taken
    # here is the one that is guaranteed to hold the drawn axes.
    fig = visualizer.fig
    # show() finalizes the drawing (titles, legend, ...). The axes it returns
    # already belong to ``fig``; appending them to ``fig.axes`` only touched a
    # temporary list, so that step has been removed.
    visualizer.show()
    return fig

# Default ``height`` passed to most of the figures built below.
default_plot_height = 600

# Annotation variables other than the OIV score. They are used further down to
# detect identical observations that were coded with several OIV values and to
# draw one violin plot per variable.
conflict_columns = [
    "sporulation",
    "necrose",
    "taille_necrose",
    "surface_necrosee",
    "densite_sporulation",
]

In [None]:
txt_title = f"{goc.lvl_1_header} Collation and Review of Downy Mildew Annotations"
disp_mkdw(txt_title)

In [None]:
# Introduction paragraph of the report (Markdown).
txt_intro = f"""
{goc.lvl_2_header} Introduction
In this document we will collate all available OIV 452 annotations and then review them in the hope of predicting OIV 452 using the new proposed variables. 
When this fails we will try to understand the issues and propose actions to address the problems.

We will 1) explain what OIV 452 and downy mildew are 2) build a model-ready dataframe
"""
disp_mkdw(txt_intro)

In [None]:
# Definition of the disease studied in this report. The heading and attribution
# previously said "Powdery Mildew" although the quoted text and the link describe
# downy mildew; the variable name is kept because the report cell refers to it.
txt_def_pm = f"""
{goc.lvl_2_header} Downy Mildew
&ndash; *Downy mildew*, from [Wikipedia](https://en.wikipedia.org/wiki/Downy_mildew)

**Downy mildew** refers to any of several types of oomycete microbes that are obligate parasites of plants. 
Downy mildews exclusively belong to the Peronosporaceae family. In commercial agriculture, they are a 
particular problem for growers of crucifers, grapes and vegetables that grow on vines. 
The prime example is Peronospora farinosa featured in NCBI-Taxonomy and HYP3. 
This pathogen does not produce survival structures in the northern states of the United States, 
and overwinters as live mildew colonies in Gulf Coast states. It progresses northward with cucurbit production 
each spring. Yield loss associated with downy mildew is most likely related to soft rots that occur after plant 
canopies collapse and sunburn occurs on fruit. Cucurbit downy mildew only affects leaves of cucurbit plants.

**Symptoms**: Initial symptoms include large, angular or blocky, yellow areas visible on the upper surface. 
As lesions mature, they expand rapidly and turn brown. The under surface of infected leaves appears watersoaked. 
Upon closer inspection, a purple-brown mold (see arrow) becomes apparent. Small spores shaped like footballs can be 
observed among the mold with a 10x hand lens. In disease-favorable conditions (cool nights with long dew periods), 
downy mildew will spread rapidly, destroying leaf tissue without affecting stems or petioles.
"""
disp_mkdw(txt_def_pm)

In [None]:
txt_def_oiv_452_1 = markdown.markdown(
    f"""
{goc.lvl_2_header} OIV 452
{got.txt_oiv_452_spec}
"""
)
disp_mkdw(txt_def_oiv_452_1)

In [None]:
img_def_oiv_452_1 = os.path.join(goc.datain_path, "images", "OIV_examples.png")
disp_img(img_def_oiv_452_1)

In [None]:
txt_oiv_necrose = f"""
{goc.lvl_3_header} OIV 452-2
A new version of the annotation specification added necrosis to the observed traits.
"""
disp_mkdw(txt_oiv_necrose)

In [None]:
plt_oiv_necrosis = gop.plot_sample_oiv_images(height=900)
plt_oiv_necrosis

In [None]:
txt_new_var_spex = f"""
{goc.lvl_3_header} New variables
{got.txt_what_we_want}
"""

img_new_var_spex = os.path.join(goc.datain_path, "images", "oiv_452-1_desc.png")

disp_mkdw(txt_new_var_spex)
disp_img(img_new_var_spex)

In [None]:
# Opening text of the "Build dataframe" section (Markdown).
txt_bdf_intro = f"""{goc.lvl_2_header} Build dataframe

Experiment data is divided by year and experiment, in order to proceed to model building we need to first collate all data

{goc.lvl_3_header} Locating the files
Files containing experiment's phenotyping data are stored by year and experiment, the data files are 
Excel workbooks which contain the word "saisie", 
we're going to parse all the folders year by year and retrieve the files.

- Files containing DM for downy mildew, ie mildiou, are selected for OIV analysis
- Files containing PM for powdery mildew, ie oïdium, are discarded

{goc.lvl_3_header} Extracting data from the sheets
Each Excel file contains one or more sheets. We parse all available sheets and discard them if:
- A valid header is not found
- The dataframe within the sheet is not valid
"""

disp_mkdw(txt_bdf_intro)

In [None]:
files = gof.get_distant_excels()
len(files)

In [None]:
gof.copy_excel_files(files)

In [None]:
# Outcome of the sheet filtering; rows without a CSV file name are ignored below.
df_result = gof.filter_csvs()

# Full path of every extracted CSV that has a file name.
lcl_csv_files = (
    df_result.csv_file_name.dropna()
    .map(lambda filename: os.path.join(goc.oidium_extracted_csvs_path, filename))
    .to_list()
)

txt_bdf_sample_sheet_header = "Sample of the first sheet"

# The first CSV in the list serves as the sample shown in the notebook.
first_csv_path = lcl_csv_files[0]
df_bdf_sample_sheet_df = pd.read_csv(first_csv_path)


disp_mkdw(txt_bdf_sample_sheet_header)
df_bdf_sample_sheet_df.head()

In [None]:
# Heading and summary of the sheet filtering step (Markdown).
txt_bdf_filtering_review_title = f"{goc.lvl_3_header} Filtering review"

# The count is the number of rows of df_result whose comment is 'success'.
txt_bdf_filtering_review = f"""
After checking all the available sheets we end up with {df_result[df_result.comment == 'success'].shape[0]} valid sheets.

{got.txt_rejected_csvs}
"""

disp_mkdw(txt_bdf_filtering_review_title)
disp_mkdw(txt_bdf_filtering_review)

In [None]:
plot_dfb_filtering_result = gop.plot_rejected_hist(df_result)
plot_dfb_filtering_result

In [None]:
df_bdf_filtering_outcome = gof.sheet_filtering_out_df(df_result)
df_bdf_filtering_outcome

In [None]:
txt_bdf_why_rejected = got.txt_rejected_csvs
disp_mkdw(txt_bdf_why_rejected)

In [None]:
# Introduction to the merge step (Markdown).
txt_bdf_merge_sheets_intro = f"""
{goc.lvl_3_header} Merging all sheets into a single dataframe
We are going to merge all the sheets into a single dataframe. Since all the phenotyping was written manually we're going to check data consistency with the following rules 
"""
disp_mkdw(txt_bdf_merge_sheets_intro)

In [None]:
txt_bdf_consistency_check = f"""
{goc.lvl_4_header} Consistency rules
{got.txt_oiv_452_spec_req}
The data histogram shows that there are numerous inconsistencies in the data:
- Variables are not always limited to their set values
- Variables are inconsistent within themselves, ie. sporulation may be set to 1 with an OIV 9 which is impossible since an OIV 9 means no sporulation at all
"""
disp_mkdw(txt_bdf_consistency_check)

In [None]:
# Bookkeeping of the cleaning pipeline: each key is a stage name and each value
# a tuple (rows remaining after the stage, rows removed). It is later handed to
# gop.observations_sankey.
clean_steps = {}
# Merge every extracted CSV into one raw dataframe.
df_raw_merged = gof.build_raw_merged(lcl_csv_files)
# Nothing has been removed at this first stage.
clean_steps["raw_merge"] = (df_raw_merged.shape[0], 0)

# Inconsistency plot of the raw, uncleaned data.
plot_bdf_inconsistency_raw = gop.plot_inconsistencies(
    df_raw_merged,
    sort_values=False,
    height=700,
)

plot_bdf_inconsistency_raw

In [None]:
df_raw_merged

In [None]:
txt_bdf_consistency_error = f"""
{goc.lvl_4_header} Consistency errors overview
First we're going to check what are the inconsistencies and how often do they appear
"""
disp_mkdw(txt_bdf_consistency_error)

In [None]:
txt_bdf_inconsistency_legend = f"""
{goc.lvl_5_header} Sheets with inconsistent data
- **oob**: Out of bounds, value outside of permitted values
- **n_inc**: Linked values inconsistent

"""
df_bdf_inconsistent = gof.build_inconsistencies_dataframe(df_raw_merged)

disp_mkdw(txt_bdf_inconsistency_legend)
df_bdf_inconsistent

In [None]:
# Flag columns of df_bdf_inconsistent that are summed below.
cols = [
    "sporulation_oob",
    "sporulation_ds_inc",
    "densite_sporulation_oob",
    "necrose_oob",
    "necrose_sn_inc",
    "necrose_tn_inc",
    "taille_necrose_oob",
    "surface_necrosee_oob",
    "oiv_oob",
    "oiv_s_inc",
    "ligne_oob",
]

# One row per flag column, holding the sum of that column.
df_bdf_inconsistency_count = pd.Series(
    {col: df_bdf_inconsistent[col].sum() for col in cols},
    name="Inconsistency type count",
).to_frame()

df_bdf_inconsistency_count

In [None]:
txt_bdf_consistent_dataframe = f"""
{goc.lvl_5_header} Consistent dataframe
"""
disp_mkdw(txt_bdf_consistent_dataframe)

In [None]:
# Cleaned version of the raw merge, as returned by gof.clean_merged_dataframe.
df_merged = gof.clean_merged_dataframe(df_raw_merged)
# Record (rows remaining, rows lost compared with the raw merge).
clean_steps["clean_raw_merge"] = (
    df_merged.shape[0],
    df_raw_merged.shape[0] - df_merged.shape[0],
)
df_merged

In [None]:

txt_bdf_count_after_clean = f"After removing inconsistent lines we went from {df_raw_merged.shape[0]} to {df_merged.shape[0]} consistent rows"
disp_mkdw(txt_bdf_count_after_clean)

In [None]:
plot_bdf_inconsistency_clean = gop.plot_inconsistencies(
    df_merged,
    height=900,
    title="No visible errors remain in the data",
)
plot_bdf_inconsistency_clean

In [None]:
txt_bdf_data_overview_intro = f"""
{goc.lvl_3_header} Data overview
"""
disp_mkdw(txt_bdf_data_overview_intro)

In [None]:
txt_bdf_data_overview_set_balance = f"{goc.lvl_4_header} Set balance"
disp_mkdw(txt_bdf_data_overview_set_balance)

In [None]:
plot_bdf_data_overview_set_balance = gop.plot_balance_histogram(
    labels=df_merged.oiv.sort_values().astype(str),
    color=df_merged.oiv.sort_values().astype(str),
    is_text=True,
    height=default_plot_height,
)
plot_bdf_data_overview_set_balance


In [None]:
txt_bdf_data_overview_nan_values = f"""
{goc.lvl_4_header} NaN values
NaN values happen when:
- If "necrosis" is 0, "surface_necrosee" and "taille_necrose" are NaN
- If "sporulation" is 0, "densite_sporulation" is NaN
"""
disp_mkdw(txt_bdf_data_overview_nan_values)

In [None]:
# Number of missing values per column of df_merged, largest first.
df_bdf_data_overview_nan_values = (
    df_merged.isna()
    .sum()
    .to_frame(name="NaN count")
    .sort_values(by=["NaN count"], ascending=False)
)

df_bdf_data_overview_nan_values

In [None]:
# Introduction to the numeric dataframe step (Markdown).
txt_bdf_numeric_dataframe_intro = f"""
{goc.lvl_3_header} Numeric dataframe
We remove all columns that are not useful for a classification model, such as column, line, etc... and drop all rows with **NaN** values 
as they **will not be accepted by the models**.
After removing these columns there will be duplicates that are removed.
"""
disp_mkdw(txt_bdf_numeric_dataframe_intro)

In [None]:
# Keep only complete, non-object, de-duplicated observations
# (the "colonne" column is dropped first).
df_num = (
    df_merged.drop(["colonne"], axis=1)
    .dropna()
    .select_dtypes(exclude=object)
    .drop_duplicates()
)
# Reorder the columns by position.
# NOTE(review): this relies on at least six columns surviving the filtering
# above, in a fixed order; any column beyond the sixth is dropped by the
# selection. Confirm against gof.clean_merged_dataframe, or select by name.
df_num_cols = df_num.columns
df_num_cols = [
    df_num_cols[3],
    df_num_cols[0],
    df_num_cols[2],
    df_num_cols[4],
    df_num_cols[1],
    df_num_cols[5],
]
df_num = df_num[df_num_cols].sort_values(["oiv", "sporulation", "necrose"])

# Record (rows remaining, rows lost compared with the cleaned merge).
clean_steps["numeric_dataframe"] = (
    df_num.shape[0],
    df_merged.shape[0] - df_num.shape[0],
)

In [None]:
txt_bdf_numeric_dataframe_new_set_balance = f"{goc.lvl_4_header} New set balance"
plot_bdf_numeric_dataframe_new_set_balance = gop.plot_balance_histogram(
    labels=df_num.oiv.sort_values().astype(str),
    color=df_num.oiv.sort_values().astype(str),
    is_text=True,
    height=default_plot_height,
)
disp_mkdw(txt_bdf_numeric_dataframe_new_set_balance)
plot_bdf_numeric_dataframe_new_set_balance


In [None]:
txt_bdf_numeric_dataframe_no_9 = f"""
Since OIV 9 implies no sporulation, there are no longer rows with OIV value 9
There are only **{df_num.shape[0]}** observations left, two hypothesis:
- There are only this amount of phenotypes possible
- The human eye can only discriminate this many

**There are no rows with OIV 9, this makes building a model pointless, we'll find another way**
"""
disp_mkdw(txt_bdf_numeric_dataframe_no_9)

In [None]:
txt_bdf_invert_axis = f"""
{goc.lvl_3_header} {got.txt_lvl2_header_invert_axes}
{got.txt_fail}
"""
disp_mkdw(txt_bdf_invert_axis)

In [None]:
df_inverted = gof.invert_axis(df_merged)

df_inverted = df_inverted[
    [df_inverted.columns[i] for i in [9, 8, 4, 6, 0, 3, 2, 10, 5, 7, 1]]
].sort_values(["oiv", "sporulation", "necrose"])

txt_bdf_invert_axis_df_head = f"{goc.lvl_4_header} The head of new dataframe"
txt_bdf_invert_axis_df_shape = str(df_inverted.shape)

disp_mkdw(txt_bdf_invert_axis_df_head)

In [None]:
df_inverted.head(10)

In [None]:

disp_mkdw(txt_bdf_invert_axis_df_shape)

In [None]:
# Numeric view of the inverted dataframe: "colonne" and all object-typed
# columns are removed, duplicates are dropped and the rows are ordered by OIV
# and then by each annotation variable. Unlike df_num, no dropna() is applied.
df_inv_num = (
    df_inverted.drop(["colonne"], axis=1)
    .select_dtypes(exclude=object)
    .drop_duplicates()
    .sort_values(
        [
            "oiv",
            "necrose",
            "taille_necrose",
            "surface_necrosee",
            "sporulation",
            "densite_sporulation",
        ]
    )
)

# Record (rows remaining, rows lost compared with the cleaned merge).
clean_steps["inverted_numeric_dataframe"] = (
    df_inv_num.shape[0],
    df_merged.shape[0] - df_inv_num.shape[0],
)

# Shape rendered as text so it can be shown in the notebook and in the report.
txt_bdf_invert_axis_df_inv_shape = str(df_inv_num.shape)


In [None]:
txt_bdf_invert_axis_df_inv_head = f"""
{goc.lvl_4_header} The numeric dataframe
We remove some columns from dataframe as they contain metadata that does not contribute to the OIV classification. After this operation we obtain the final dataframe used to build the models.
"""

disp_mkdw(txt_bdf_invert_axis_df_inv_head)

In [None]:
df_inv_num

In [None]:
disp_mkdw(txt_bdf_invert_axis_df_inv_shape)

In [None]:
txt_bdf_fdf_overview = f"{goc.lvl_4_header} Final dataset overview"
txt_bdf_fdf_overview_sankey = f"{goc.lvl_5_header} Evolution of available rows"
plot_bdf_fdf_overview_sankey = gop.observations_sankey(
    clean_steps=clean_steps,
    width=None,
    height=default_plot_height,
)
txt_bdf_fdf_overview_sankey_explain = f"""
We started with {df_raw_merged.shape[0]} annotations and after removing inconsistent data, 
columns that are not needed and duplicates we end up with {df_inv_num.shape[0]} observations
"""

disp_mkdw(txt_bdf_fdf_overview)
disp_mkdw(txt_bdf_fdf_overview_sankey)


In [None]:
plot_bdf_fdf_overview_sankey

In [None]:
disp_mkdw(txt_bdf_fdf_overview_sankey_explain)

In [None]:
txt_bdf_fdf_overview_balance = f"{goc.lvl_5_header} New set balance"
disp_mkdw(txt_bdf_fdf_overview_balance)

In [None]:
plot_bdf_fdf_overview_balance = px.histogram(
    x=df_inv_num.oiv.sort_values().astype(str),
    color=df_inv_num.oiv.sort_values().astype(str),
    text_auto=True,
    height=default_plot_height,
)
plot_bdf_fdf_overview_balance

In [None]:
txt_bdf_fdf_overview_outcome = f"""
{goc.lvl_4_header} Result
There are {df_inv_num.shape[0]} observations left instead of the previous {df_num.shape[0]}
Two hypothesis:
- There are only this amount of phenotypes possible
- The human eye can only discriminate this many
"""
disp_mkdw(txt_bdf_fdf_overview_outcome)

In [None]:
txt_bdf_fdf_overview_cm = f"{goc.lvl_5_header} Correlation matrix"
disp_mkdw(txt_bdf_fdf_overview_cm)

In [None]:
plot_bdf_fdf_overview_cm = px.imshow(
    df_inv_num.drop_duplicates().corr(),
    text_auto=True,
    height=default_plot_height,
)
plot_bdf_fdf_overview_cm

In [None]:
txt_bdf_fdf_homogeinity_intro = f"""
{goc.lvl_5_header} Data boxplot/heatmap per OIV score and average
Plotting a heat of all the variables against each OIV to see if we can detect clusters visualy.
"""

plot_bdf_fdf_vclusters_hm = [
    gop.plot_oiv_homogeneity(
        df_src=df_inv_num,
        oiv=i,
        height=400,
    )
    for i in [1, 3, 5, 7, 9]
]

plot_bdf_fdf_vclusters_hm[1]

In [None]:
plot_bdf_fdf_vclusters_bp = [
    df_inv_num[df_inv_num.oiv == i]
    .drop("oiv", axis=1)
    .boxplot()
    .update_layout(title=f"OIV {i}", height=400)
    for i in [1, 3, 5, 7, 9]
]
plot_bdf_fdf_vclusters_bp[1]


In [None]:
plot_bdf_fdf_vclusters_violin = [
    px.violin(
        df_inv_num,
        color="oiv",
        y=col,
        box=True,
    )
    for col in conflict_columns
]

plot_bdf_fdf_vclusters_violin[3]


In [None]:
txt_bdf_fdf_averages_intro = f"""
Variables averages per OIV value
"""

disp_mkdw(txt_bdf_fdf_averages_intro)
plot_bdf_fdf_vclusters_avg = gop.plot_avg_by_oiv(df_inv_num, height=400)
plot_bdf_fdf_vclusters_avg


In [None]:
# Short interpretation of the averages plot (Markdown); the value was written
# with the letter "O" instead of a zero.
txt_bdf_fdf_averages_out = f"""
Only "densite_sporulation" looks correlated to OIV, but only at -0.7
"""
disp_mkdw(txt_bdf_fdf_averages_out)

In [None]:
txt_bdf_fdf_homogeinity = got.txt_homogenity_txt
disp_mkdw(txt_bdf_fdf_homogeinity)

In [None]:
txt_bdf_fdf_homogeinity_means = got.txt_homogenity_avg_txt
disp_mkdw(txt_bdf_fdf_homogeinity_means)

In [None]:
txt_models_intro = f"{goc.lvl_2_header} Predicting OIV with the other variables"

Xi = df_inv_num
yi = df_inv_num.oiv.astype(int)
Xi = Xi.drop(["oiv"], axis=1)
scaler = StandardScaler()
scaler.fit(Xi)
Xi = scaler.transform(Xi)

disp_mkdw(txt_models_intro)

In [None]:
disp_mkdw(got.txt_model_def_pca)
plot_model_pca = gop.plot_model(
    X=PCA().fit_transform(Xi),
    color=yi.astype(str),
    title="Inverted PCA 2D",
    # height=default_plot_height,
)
plot_model_pca

In [None]:
disp_mkdw(got.txt_model_def_plsda)

pls_data_all_inv = PLSRegression(n_components=Xi.shape[1])
x_new = pls_data_all_inv.fit(Xi, yi).transform(Xi)

plot_model_def_plsda = gop.plot_model(
    X=x_new,
    color=yi.astype(str),
    title=f"Inverted PLS-DA, score: {pls_data_all_inv.score(Xi, yi)}",
    axis_title_root="X-variate ",
    # height=default_plot_height,
)
plot_model_def_plsda

In [None]:
disp_mkdw(got.txt_model_def_lda)
lda_data_all_inv = LinearDiscriminantAnalysis()
x_new = lda_data_all_inv.fit(Xi, yi).transform(Xi)
plot_model_def_lda = gop.plot_model(
    X=x_new,
    color=yi.astype(str),
    title=f"Inverted LDA score: {lda_data_all_inv.score(Xi, yi)}",
    axis_title_root="X-variate ",
    # height=default_plot_height,
)
plot_model_def_lda

In [None]:
txt_model_check_overlap_intro = f"""
{goc.lvl_3_header} Check overlapping
Some observations seem to overlap, we're going to check that one point in the vectorial space codes only one OIV
"""

In [None]:
# Every distinct combination of the annotation variables, OIV score left out.
df_unique_lines = (
    df_inv_num.drop(["oiv"], axis=1)
    .drop_duplicates(conflict_columns)
    .reset_index(drop=True)
)

# Columns that lead the table: the variables, one flag column per OIV level
# and the number of OIV scores given to the combination.
dup_base_columns = (
    conflict_columns + [f"oiv {oiv}" for oiv in [1, 3, 5, 7, 9]] + ["OIV count"]
)

# Collect one record per combination and build the frame once at the end.
# DataFrame.append no longer exists in pandas 2.x, and growing a frame row by
# row is quadratic.
dup_records = []
for i in range(df_unique_lines.shape[0]):
    unique_row = df_unique_lines.iloc[[i]]
    # Merging on the shared columns lists every OIV score attached to this
    # exact combination of values.
    oivs = pd.merge(left=unique_row, right=df_inv_num).oiv.to_list()
    record = unique_row.iloc[0].to_dict()
    for oiv in oivs:
        record[f"oiv {oiv}"] = True
    record["OIV count"] = len(oivs)
    dup_records.append(record)

df_dup = pd.DataFrame(dup_records)
# Put the expected columns first (created even when no record filled them) and
# keep any additional column that the records introduced after them.
df_dup = df_dup.reindex(
    columns=dup_base_columns
    + [col for col in df_dup.columns if col not in dup_base_columns]
)

txt_model_conflict_row = f"{goc.lvl_4_header} Conflicted rows for OIV coding"

disp_mkdw(txt_model_conflict_row)
# np.nan replaces the np.NaN alias, which was removed in NumPy 2.0.
df_dup[df_dup["OIV count"] > 1].sort_values(["OIV count"], ascending=False).replace(np.nan, " ")


In [None]:
# Text introducing the conflict location plots (Markdown).
txt_model_conflict_loc = f"""
{goc.lvl_4_header} Where are the conflicts
We're going to plot the observations in the latent space of a PCA to visualize where they are located.
"""

# Order the combinations by the number of OIV scores they received, keep that
# count for the colouring and pass only the annotation variables to the PCA.
df = df_dup.sort_values(["OIV count"])
color = df["OIV count"]
df = df[conflict_columns]

# One figure per pair of principal components (zero-based indices).
plot_model_conflict_locs = [
    gop.plot_pca(
        pca=PCA(),
        df=df,
        pcx=pcx,
        pcy=pcy,
        title=f"PCA for PC{pcx + 1}, PC{pcy + 1} with color by coding count",
        pca_columns=conflict_columns,
        color=color,
    )
    for pcx, pcy in [(0, 1), (0, 2), (1, 2)]
]

disp_mkdw(txt_model_conflict_loc)
plot_model_conflict_locs[0]


In [None]:
txt_model_sbs_intro = f"""
{goc.lvl_3_header} Sheet by sheet prediction
The prediction is bad at {pls_data_all_inv.score(Xi, yi)}, we try next to predict sheet by sheet to see the results
"""

disp_mkdw(txt_model_sbs_intro)

In [None]:
df_sbs_plsda = (
    gof.build_sbs_plsda(df_inverted, gof.build_dup_df(df_inv_num)["df_dup"])
    .sort_values(
        [
            "row_count",
            "score",
            "experiment",
            "sheet",
        ],
        ascending=False,
    )
    .reset_index(drop=True)
)
df_sbs_plsda


In [None]:
plot_sbs_plsda = px.scatter(
    data_frame=df_sbs_plsda[
        ((df_sbs_plsda.score > -1) & (df_sbs_plsda.score <= 1))
    ].assign(row_count=lambda x: x.row_count.astype(float)),
    y="score",
    x="dup_rate",
    color="row_count",
    color_continuous_scale=px.colors.sequential.OrRd,
    trendline="ols",
    trendline_color_override="blue",
    height=default_plot_height,
)
plot_sbs_plsda

In [None]:
disp_mkdw(got.txt_duprate_vs_prediction)

In [None]:
txt_model_rem_var_intro = f"""
{goc.lvl_3_header} Removing some variables from the dataset
Necrosis and sporulation are heavily linked to the other variables, we will test models build without them
"""

df_inv_num_wosn = (
    df_inv_num[["taille_necrose", "surface_necrosee", "densite_sporulation", "oiv"]]
    .drop_duplicates()
    .reset_index(drop=True)
)

yi_wond = df_inv_num_wosn.oiv
X_wond = df_inv_num_wosn.drop(["oiv"], axis=1)
scaler = StandardScaler()
scaler.fit(X_wond)
X_wond = scaler.transform(X_wond)

disp_mkdw(txt_model_rem_var_intro)


In [None]:
plot_model_rem_var_pca =gop.plot_model(
    X=PCA().fit_transform(X_wond),
    color=yi_wond.astype(str),
    title="Inverted PCA 2D without sporulation nor necrosis",
    height=default_plot_height,
)
plot_model_rem_var_pca

In [None]:
# PLS-DA on the reduced variable set. The model gets its own name so that
# ``pls_data_all_inv`` (fitted on all variables and quoted in the
# "Sheet by sheet prediction" text) is not silently replaced by a model trained
# on different features when cells are re-run.
pls_data_wosn = PLSRegression(n_components=X_wond.shape[1]).fit(X_wond, yi_wond)
plot_model_rem_var_plsda = gop.plot_model(
    # Scores of the training samples, as in the original cell.
    X=pls_data_wosn.x_scores_,
    color=yi_wond.astype(str),
    title=f"Inverted PLS-DA without sporulation nor necrosis, score: {pls_data_wosn.score(X_wond, yi_wond)}",
    axis_title_root="X-variate ",
    height=default_plot_height,
)
plot_model_rem_var_plsda

In [None]:
txt_model_rem_var_outro = "No visible change"
disp_mkdw(txt_model_rem_var_outro)

In [None]:
txt_noiv_header = f"""
{goc.lvl_2_header} {got.txt_lvl2_header_kmeans}
{got.txt_kmeans}
{got.txt_noiv_sel_cut}
{got.txt_noiv_sel_cut_outcome}
"""
disp_mkdw(txt_noiv_header)

In [None]:
# Distinct observations without the OIV score: the input of the clustering.
X_km = df_inv_num.drop(["oiv"], axis=1).drop_duplicates().reset_index(drop=True)

# 2 x 3 grid: one heatmap per ordering of the three variables.
plot_noiv_cut = make_subplots(rows=2, cols=3)

for (r, c), sort_order in zip(
    # Enumerate exactly the six cells of the grid (previously a 3 x 3 product
    # that only worked because zip stopped after six permutations).
    itertools.product([1, 2], [1, 2, 3]),
    itertools.permutations(
        ["taille_necrose", "surface_necrosee", "densite_sporulation"]
    ),
):
    ordered_columns = list(sort_order)
    plot_noiv_cut.add_trace(
        go.Heatmap(
            # Select the columns in the permutation's order so that each
            # heatmap column sits under its own label in ``x``.
            z=X_km[ordered_columns]
            .sort_values(ordered_columns)
            .drop_duplicates()
            .reset_index(drop=True),
            x=ordered_columns,
        ),
        row=r,
        col=c,
    )

# The per-trace ``x`` labels already name the columns. The former ``xaxis``
# override relabelled the first subplot with the last permutation.
plot_noiv_cut.update_layout(
    height=800,
    margin=dict(l=20, r=20, t=20, b=20),
)
plot_noiv_cut

In [None]:
disp_mkdw(got.txt_kmeans_pca)

In [None]:
xkm_pca = PCA()
x_pca = xkm_pca.fit_transform(X_km)

In [None]:
# 3D scatter of the first three principal components. The stray ``fig`` alias
# was dropped and the axes are named after the components they show.
plot_noiv_kmeans_pca = px.scatter_3d(
    x=x_pca[:, 0],
    y=x_pca[:, 1],
    z=x_pca[:, 2],
    labels={"x": "PC1", "y": "PC2", "z": "PC3"},
    title="PCA",
    height=default_plot_height,
)
plot_noiv_kmeans_pca

In [None]:
# Explained variance (in percent) per principal component. Components are
# numbered from 1, matching the loadings table and the PCA plot titles.
plot_noiv_kmeans_pca_variance = gop.plot_variance(
    df_ev=pd.DataFrame.from_dict(
        {
            "pc": [
                f"PC{i + 1}"
                for i in range(len(xkm_pca.explained_variance_ratio_))
            ],
            "exp_var_per": xkm_pca.explained_variance_ratio_ * 100,
        }
    ),
    height=default_plot_height,
)
plot_noiv_kmeans_pca_variance

In [None]:
# One row per input variable, one column per principal component: the
# component weights multiplied by that component's explained-variance ratio.
df_loadings: pd.DataFrame = pd.DataFrame(
    xkm_pca.components_.T * xkm_pca.explained_variance_ratio_,
    columns=[f"PC{i+1}" for i in range(len(xkm_pca.components_))],
    index=X_km.columns,
)
# Transposed so that the components are on the x axis; the bar chart is drawn
# through the pandas plotting backend configured at the top of the notebook.
plot_noiv_kmeans_pca_loadings = df_loadings.T.plot.bar()
plot_noiv_kmeans_pca_loadings.update_layout(
    height=default_plot_height,
    title="Loadings",
)
plot_noiv_kmeans_pca_loadings

In [None]:
txt_noiv_pca_outcome = "It appears that **3** components are enough"
disp_mkdw(txt_noiv_pca_outcome)

In [None]:
plot_noiv_kmeans_list = [
    px.scatter_3d(
        data_frame=pd.DataFrame(
            {
                "x": x_pca[:, 0],
                "y": x_pca[:, 1],
                "z": x_pca[:, 2],
                "color": KMeans(n_clusters=nc, init="k-means++", random_state=42)
                .fit_predict(x_pca)
                .astype(int)
                .astype(str),
            }
        ).sort_values(["color"]),
        x="x",
        y="y",
        z="z",
        height=400,
        color="color",
        title=f"{nc} classes"
    )
    for nc in range(2, 11)
]

disp_mkdw(got.txt_kmeans_explore_cluster_count)
plot_noiv_kmeans_list[0]


In [None]:
disp_mkdw(got.txt_kmeans_elbow)

In [None]:
elbow_model = KMeans(init="k-means++", random_state=42)
elb_visualizer = KElbowVisualizer(elbow_model, k=(2, 13))
elb_visualizer.fit(x_pca)

get_yellow_fig(elb_visualizer)


In [None]:
disp_mkdw(got.txt_kmeans_silhouette)

In [None]:
plot_noiv_kmeans_silhouette_list = [
    get_yellow_fig(
        SilhouetteVisualizer(
            KMeans(init="k-means++", n_clusters=nc, random_state=42)
        ).fit(x_pca)
    )
    for nc in range(2, 11)
]
plot_noiv_kmeans_silhouette_list[0]


In [None]:
disp_mkdw(got.txt_kmeans_outcome)
disp_mkdw(got.txt_icdm)

In [None]:
plot_noiv_kmeans_icdm = [
    get_yellow_fig(
        InterclusterDistance(
            KMeans(init="k-means++", n_clusters=nc, random_state=42)
        ).fit(x_pca)
    )
    for nc in [3, 6, 8]
]

plot_noiv_kmeans_icdm[1]

In [None]:
disp_mkdw(got.txt_noiv_select_oiv)

In [None]:
# One heatmap per cluster count from 2 to 10. The loop variable now drives
# ``n_clusters``; it was previously ignored in favour of a hard-coded 3, which
# produced nine identical figures.
plot_noiv_kmeans_heat_map = [
    px.imshow(
        (
            X_km.assign(
                noiv=KMeans(n_clusters=nc, init="k-means++", random_state=42)
                .fit_predict(x_pca)
                .astype(int)
            )
            .drop(["sporulation", "necrose"], axis=1)
            .drop_duplicates()
            .sort_values(
                [
                    "noiv",
                    "taille_necrose",
                    "surface_necrosee",
                    "densite_sporulation",
                ]
            )
            .reset_index(drop=True)
        )[
            [
                "noiv",
                "taille_necrose",
                "surface_necrosee",
                "densite_sporulation",
            ]
        ],
        height=400,
        title=f"{nc} classes",
    )
    for nc in range(2, 11)
]
# Index 1 is the 3-cluster heatmap, i.e. the figure this cell used to display.
plot_noiv_kmeans_heat_map[1]


In [None]:
disp_mkdw(got.txt_km_hm_conclusion)

In [None]:
txt_noiv_conclusion = f"""
{goc.lvl_2_header} {got.txt_lvl2_header_conclusion}
{got.txt_conclusion}
"""

disp_mkdw(txt_noiv_conclusion)

In [None]:
dp.Report(
    dp.Page(
        title=txt_title.replace("#", ""),
        blocks=[
            dp.Text(txt_title),
            dp.Group(
                dp.Media(
                    file=os.path.join(
                        goc.datain_path,
                        "images",
                        "oiv_samples",
                        "smp_oiv_5.png",
                    )
                ),
                # dp.Plot(plot_bdf_fdf_overview_sankey),
                dp.Plot(elb_visualizer.fig),
                columns=2,
            ),
        ],
    ),
    dp.Page(title="Introduction", blocks=[dp.Text(txt_intro)]),
    dp.Page(
        title="Definitions",
        blocks=[
            dp.Text(txt_def_pm),
            dp.Group(
                dp.Text(txt_def_oiv_452_1),
                dp.Media(file=img_def_oiv_452_1),
                columns=2,
            ),
            dp.Text(txt_oiv_necrose),
            dp.Plot(plt_oiv_necrosis),
            dp.Group(
                dp.Text(txt_new_var_spex),
                dp.Media(file=img_new_var_spex),
                columns=2,
            ),
        ],
    ),
    dp.Page(
        title="Build Dataframe",
        blocks=[
            dp.Text(txt_bdf_intro),
            # dp.Text(txt_bdf_sample_sheet_header),
            # dp.DataTable(df_bdf_sample_sheet_df.head(20)),
            dp.Text(txt_bdf_filtering_review_title),
            dp.Group(
                dp.Text(txt_bdf_filtering_review),
                dp.Select(
                    blocks=[
                        dp.Plot(
                            plot_dfb_filtering_result,
                            label="Filtering result plot",
                        ),
                        dp.DataTable(
                            df_bdf_filtering_outcome,
                            label="Filtering result errors",
                        ),
                    ]
                ),
                columns=2,
            ),
            dp.Text(txt_bdf_merge_sheets_intro),
            dp.Group(
                dp.Plot(plot_bdf_inconsistency_raw),
                dp.Text(txt_bdf_consistency_check),
                columns=2,
            ),
            dp.Select(
                dp.Text("_", label="No info"),
                dp.Group(
                    dp.Text(txt_bdf_consistency_error),
                    dp.Text(txt_bdf_inconsistency_legend),
                    dp.Select(
                        dp.DataTable(df_bdf_inconsistent, label="Inconsistencies per sheet"),
                        dp.DataTable(df_bdf_inconsistency_count, label="Inconsistencies count"),
                    ),
                    label="Consistency errors overview"
                )
            ),            
            dp.Text(txt_bdf_consistent_dataframe),
            dp.Select(
                dp.Plot(
                    plot_bdf_inconsistency_clean,
                    label="Inconsistency check after cleanup",
                ),
                dp.Group(
                    dp.DataTable(df_merged),
                    label="New dataframe",
                ),
            ),
            dp.Text(txt_bdf_count_after_clean),
            dp.Text(txt_bdf_data_overview_intro),
            dp.Group(
                dp.Group(
                    dp.Text(txt_bdf_data_overview_set_balance),
                    dp.Plot(plot_bdf_data_overview_set_balance),
                ),
                dp.Group(
                    dp.Text(txt_bdf_data_overview_nan_values),
                    dp.DataTable(df_bdf_data_overview_nan_values),
                ),
                columns=2,
            ),
            dp.Text(txt_bdf_numeric_dataframe_intro),
            dp.Group(
                dp.Group(
                    dp.Text(txt_bdf_numeric_dataframe_new_set_balance),
                    dp.Plot(plot_bdf_numeric_dataframe_new_set_balance),
                ),
                dp.Group(
                    dp.Text(txt_bdf_numeric_dataframe_no_9),
                ),
                columns=2,
            ),
            dp.Text(txt_bdf_invert_axis),
            dp.Group(
                dp.Group(
                    dp.Text(txt_bdf_invert_axis_df_head),
                    dp.DataTable(df_inverted),
                    dp.Text(txt_bdf_invert_axis_df_shape),
                ),
                dp.Group(
                    dp.Text(txt_bdf_invert_axis_df_inv_head),
                    dp.DataTable(df_inv_num),
                    dp.Text(txt_bdf_invert_axis_df_inv_shape),
                ),
                columns=2,
            ),
            dp.Text(txt_bdf_fdf_overview),
            # dp.Text(txt_bdf_fdf_overview_sankey),
            # dp.Plot(plot_bdf_fdf_overview_sankey),
            # dp.Text(txt_bdf_fdf_overview_sankey_explain),
            dp.Group(
                dp.Text(txt_bdf_fdf_overview_outcome),
                dp.Select(
                    dp.Plot(plot_bdf_fdf_overview_balance, label="New set balance"),
                    dp.Plot(plot_bdf_fdf_overview_cm, label="Correlation matrix"),
                ),
                columns=2,
            ),
            dp.Text(txt_bdf_fdf_homogeinity_intro),
            dp.Select(
                dp.Group(
                    *plot_bdf_fdf_vclusters_bp,
                    plot_bdf_fdf_vclusters_avg,
                    label="Box plot",
                    columns=3
                ),
                dp.Group(
                    *plot_bdf_fdf_vclusters_hm,
                    plot_bdf_fdf_vclusters_avg,
                    label="Heat maps",
                    columns=3
                ),
                dp.Group(
                    *plot_bdf_fdf_vclusters_violin,
                    plot_bdf_fdf_vclusters_avg,
                    label="Violin plots",
                    columns=3
                ),
            ),
            # dp.Text(txt_bdf_fdf_averages_out),
        ],
    ),
    dp.Page(
        title="Models",
        blocks=[
            dp.Text(f"""
                {goc.lvl_2_header} Models
                In this section we're going to try to find a relation between the newly added variables and the OIV value.
            """),
            dp.Select(
                dp.Group(
                    dp.Plot(plot_model_pca),
                    dp.Plot(plot_model_def_plsda),
                    dp.Plot(plot_model_def_lda),
                    columns=3,
                    label="Model plots",
                ),
                dp.Group(
                    dp.Text(got.txt_model_def_pca),
                    dp.Text(got.txt_model_def_plsda),
                    dp.Text(got.txt_model_def_lda),
                    columns=3,
                    label="Model definitions",
                ),
            ),
            dp.Text(txt_model_check_overlap_intro),
            dp.Text(txt_model_conflict_row),
            dp.DataTable(df_dup),
            dp.Text(txt_model_conflict_loc),
            dp.Group(*plot_model_conflict_locs, columns=3),
            dp.Text(txt_model_sbs_intro),
            dp.Select(
                dp.Plot(plot_sbs_plsda, label="Sheet by sheet prediction"),
                dp.DataTable(df_sbs_plsda, label="Duplicate rate over prediction score"),
            ),
            dp.Text(got.txt_duprate_vs_prediction),
            dp.Text(txt_model_rem_var_intro),
            dp.Group(
                dp.Plot(plot_model_rem_var_plsda),
                dp.Plot(plot_model_rem_var_pca),
                columns=2,
            ),
            dp.Text(txt_model_rem_var_outro),
        ],
    ),
    dp.Page(
        title="OIV alternative",
        blocks=[
            dp.Text(txt_noiv_header),
            dp.Plot(plot_noiv_cut),
            dp.Text(got.txt_kmeans_pca),
            dp.Group(
                dp.Plot(plot_noiv_kmeans_pca),
                dp.Plot(plot_noiv_kmeans_pca_variance),
                dp.Plot(plot_noiv_kmeans_pca_loadings),
                columns=3,
            ),
            dp.Text(txt_noiv_pca_outcome),
            dp.Text(got.txt_kmeans_explore_cluster_count),
            dp.Group(*plot_noiv_kmeans_list, columns=3),
            dp.Group(
                dp.Text(got.txt_kmeans_elbow),
                dp.Plot(elb_visualizer.fig),
                columns=2,
            ),
            dp.Text(got.txt_kmeans_silhouette),
            dp.Group(*plot_noiv_kmeans_silhouette_list, columns=3),
            dp.Text(got.txt_kmeans_outcome),
            # dp.Text(got.txt_icdm),
            # dp.Group(*plot_noiv_kmeans_icdm, columns=3),
            dp.Text(got.txt_noiv_select_oiv),
            dp.Group(*plot_noiv_kmeans_heat_map, columns=3),
            dp.Text(got.txt_km_hm_conclusion),
        ],
    ),
    dp.Page(
        title="Conclusion",
        blocks=[
            dp.Text(txt_noiv_conclusion),
        ],
    ),
    layout=dp.PageLayout.TOP,
).save(path=os.path.join(".", "data_out", "reports", "mildiou-report.html"))


## Sheet by sheet predictor

In [None]:
# Experiment selector, filled with the sorted distinct experiments of
# df_inverted. The keyword was misspelled "desciption", which left the dropdown
# without its label.
cb_exp = widgets.Dropdown(
    options=list(df_inverted.experiment.sort_values(ascending=True).unique()),
    description="Experiment",
)

# Sheet selector; its options are filled once an experiment is chosen.
cb_sheet = widgets.Dropdown(options=[], description="Sheet")

# Output area that receives the prediction plot.
op_plot = widgets.Output()


def plot_prediction(df, exp, sheet):
    """Fit a PLS model on one sheet and plot its first two score components.

    Args:
        df: dataframe holding an ``oiv`` column plus the predictor columns.
        exp: experiment name, used only in the plot title.
        sheet: sheet name, used only in the plot title.

    Returns:
        Whatever ``gop.plot_model`` returns for the fitted scores.
    """
    target = df.oiv
    predictors = df.drop(["oiv"], axis=1)
    # Standardize the predictors with statistics computed on this sheet only.
    scaled = StandardScaler().fit(predictors).transform(predictors)
    # Use as many components as there are predictor columns.
    model = PLSRegression(n_components=scaled.shape[1])
    model.fit(scaled, target)
    model_score = model.score(scaled, target)
    return gop.plot_model(
        X=model.x_scores_,
        x_comp=0,
        y_comp=1,
        height=800,
        color=target.astype(str),
        title=f"PLS-DA score for {exp} sheet {sheet}: {model_score}",
        axis_title_root="X-variate ",
    )


def predict_sheet(exp, sheet):
    """Redraw the prediction plot for one experiment/sheet pair in ``op_plot``.

    Args:
        exp: experiment to select in ``df_inverted``.
        sheet: sheet to select within that experiment.
    """
    op_plot.clear_output()
    # Everything runs inside the output widget's context, so the figure is
    # rendered in that widget.
    with op_plot:
        in_sheet = (df_inverted.experiment == exp) & (df_inverted.sheet == sheet)
        sheet_numeric = (
            df_inverted[in_sheet]
            .select_dtypes(exclude=object)
            .drop(["colonne"], axis=1)
            .drop_duplicates()
        )
        figure = plot_prediction(sheet_numeric, exp, sheet)
        display(figure)


def on_experiment_change(change):
    """Refresh the sheet dropdown for the new experiment and redraw the plot once.

    Args:
        change: change notification of ``cb_exp``; ``change.new`` is the
            newly selected experiment.
    """
    sheets = list(df_inverted[df_inverted.experiment == change.new].sheet.unique())
    # Replacing ``options`` can itself change ``cb_sheet.value`` and fire
    # on_sheet_change, which made the model be fitted and plotted twice.
    # Detach that handler while the dropdown is rebuilt.
    try:
        cb_sheet.unobserve(on_sheet_change, names="value")
    except ValueError:
        # The handler was not attached; nothing to detach.
        pass
    try:
        cb_sheet.options = sheets
        cb_sheet.index = 0
    finally:
        cb_sheet.observe(on_sheet_change, names="value")
    predict_sheet(change.new, cb_sheet.value)


def on_sheet_change(change):
    """Redraw the plot for the currently selected experiment and the new sheet.

    Args:
        change: change notification of ``cb_sheet``; ``change.new`` is the
            newly selected sheet.
    """
    current_experiment = cb_exp.value
    predict_sheet(current_experiment, change.new)


# React to a new selection in either dropdown.
cb_exp.observe(on_experiment_change, names="value")
cb_sheet.observe(on_sheet_change, names="value")

display(VBox([HBox([cb_exp, cb_sheet]), op_plot]))

# The experiment dropdown starts with its first entry already selected, which
# fires no change event. Fill the sheet list for that entry here; assigning the
# options selects the first sheet and triggers on_sheet_change, which draws the
# initial plot.
if cb_exp.value is not None:
    cb_sheet.options = list(
        df_inverted[df_inverted.experiment == cb_exp.value].sheet.unique()
    )
