# Preliminary analysis

This notebook will:
- Retrieve all available Excel files
- Translate them to CSV and merge them
- Build models to assess the possibility of predicting OIV from various visual variables

## Imports

We need:
- Base python libraries for file management
- tqdm for progress tracking
- Pandas and Numpy for the dataframes
- SkLearn for statistics
- Plotly for ... plotting

In [None]:
import os
import shutil
from pathlib import Path
import itertools

from tqdm import tqdm

import pandas as pd
import numpy as np

import warnings
warnings.simplefilter("ignore")

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, OneHotEncoder, normalize
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.cross_decomposition import PLSRegression, CCA, PLSSVD

from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

# pd.options.plotting.backend = "plotly"
pd.options.display.float_format = '{:4,.2f}'.format

## Consts

Consts for paths and the columns needed in a dataframe

In [None]:
# Folder receiving the local copies of the source Excel workbooks
# (the trailing "" makes os.path.join end the path with a separator)
excel_file_path = os.path.join("..", "data_in", "oidium_source_excels", "")
# Folder receiving one CSV per successfully parsed sheet
oidium_extracted_csvs_path = os.path.join("..", "data_in", "oidium_extracted_csvs", "")
# Text file caching the list of workbook paths found on the remote drive
excel_file_list_path = os.path.join(excel_file_path, "excel_list.txt")

# Columns a sheet must contain (after lower-casing and space removal) to be kept
needed_columns = ["nomphoto", "oiv", "s", "sq", "n", "fn", "tn", "ligne", "colonne"]


## Functions

Check that the dataframe has at least the needed columns

In [None]:
def check_list_in_list(required_columns, available_columns):
    """Tell whether every required column is available.

    Args:
        required_columns: names that must be present.
        available_columns: names actually found.

    Returns:
        True when nothing is missing, otherwise the list of missing names,
        in the order they appear in ``required_columns``.
    """
    missing = [name for name in required_columns if name not in available_columns]
    if missing:
        return missing
    return True


Plot model variance

In [None]:
def plot_variance(df_ev):
    """Plot individual and cumulative explained variance per component.

    Args:
        df_ev: dataframe with a "pc" column (component labels, used as x)
            and an "exp_var_per" column (explained variance, used as y).

    Returns:
        A plotly figure holding a bar trace named "individual" and a
        scatter trace named "cumulative".
    """
    # assign() returns a new dataframe, so the caller's frame is not modified
    df_ev = df_ev.assign(cumulative=df_ev["exp_var_per"].cumsum())
    ev_fig = go.Figure()
    ev_fig.add_trace(
        go.Bar(
            x=df_ev["pc"],
            y=df_ev["exp_var_per"],
            name="individual",
        )
    )
    ev_fig.add_trace(
        go.Scatter(
            x=df_ev["pc"],
            y=df_ev["cumulative"],
            name="cumulative",
        )
    )
    ev_fig.update_layout(
        height=800,
        width=1000,
        title="Explained variance by different principal components",
        xaxis_title="Principal component",
        yaxis_title="Explained variance in percent",
    )
    return ev_fig

Plot an histogram of the variables needed for the OIV so inconsistencies can be detected

In [None]:
def plot_inconsistencies(df, sort_values: bool = True):
    """Draw a 3x3 grid of histograms of the variables used to check the OIV data.

    Args:
        df: dataframe containing the columns listed in ``columns`` below.
        sort_values: when True, each column is sorted before being converted
            to strings; otherwise values are used in dataframe order.

    Returns:
        The plotly figure with one histogram per non-empty grid cell.
    """
    # Grid layout of the subplots; an empty string leaves that cell without a trace
    columns = [
        ["sporulation", "densite_sporulation", ""],
        ["necrose", "surface_necrosee", "taille_necrose"],
        ["ligne", "colonne", "oiv"],
    ]

    fig = make_subplots(rows=3, cols=3, subplot_titles=np.array(columns).flatten())

    for idl, l in enumerate(columns):
        for idc, c in enumerate(l):
            if not c:
                continue
            fig.add_trace(
                go.Histogram(
                    # Values are cast to str so every distinct value (NaN included) gets its own bar
                    x=df[c].sort_values().astype(str) if sort_values is True else df[c].astype(str),
                    texttemplate="%{y}",
                    textfont_size=20,
                    name=c,
                ),
                # plotly subplot coordinates are 1-based
                row=idl + 1,
                col=idc + 1,
            )

    fig.update_layout(
        height=1000,
        width=1400,
        title="Dataframe consistency check",
        xaxis_title="Value",
        yaxis_title="Count",
    )

    return fig


Generate oiv_cat from dataframe

In [None]:
def get_oiv_cat(df):
    """Return the ``oiv`` column of ``df`` converted to strings (categorical labels)."""
    oiv_values = df["oiv"]
    return oiv_values.astype(str)

## Build dataframe

### Retrieve all excel files

#### Retrieve file paths

Get the paths of all related files on the remote server

- Files containing DM for downy mildew, i.e. mildiou, are selected for OIV analysis
- Files containing PM for powdery mildew, i.e. oïdium, are discarded

In [None]:
# Reuse the cached list of workbook paths when it exists; otherwise scan the drive
if os.path.isfile(excel_file_list_path):
    with open(excel_file_list_path, "r", encoding="UTF8") as f:
        # The cache stores all paths in a single string separated by "?"
        files = f.read().split("?")
else:
    # NOTE(review): "Z:" is a hard-coded drive letter, presumably the mounted remote share; confirm
    files = [
        os.path.join(root, name)
        for root, _, files in tqdm(os.walk("Z:",topdown=False))
        for name in files
        # Keep only data-entry workbooks whose name contains "DM"
        if name.endswith("_saisie.xlsx") and "DM" in name
    ]
    # Cache the result so the walk is not repeated on the next run
    with open(excel_file_list_path, "w+", encoding="UTF8") as f:
        f.write("?".join(files))



Amount of files selected

In [None]:
len(files)

Files overview

In [None]:
files

#### Copy files

Filenames starting with "~$" are temporary lock files created by Excel and will be ignored

In [None]:
# Copy each remote workbook into the local folder, skipping files already present locally
for file in tqdm(files):
    file_name = os.path.basename(file)
    # Only the base name is compared, so two remote files sharing a name
    # in different folders end up as a single local copy (first one wins)
    if not file_name.startswith("~$") and not os.path.isfile(
        os.path.join(
            excel_file_path,
            file_name,
        )
    ):
        shutil.copy(src=file, dst=excel_file_path)


### Clean excels

#### List local excels

In [None]:
# Every "*_saisie.xlsx" workbook found (recursively) under the local copy folder
lcl_excel_files = [
    os.path.join(folder, entry)
    for folder, _, entries in os.walk(excel_file_path, topdown=False)
    for entry in entries
    if entry.endswith("_saisie.xlsx")
]

In [None]:
len(lcl_excel_files)

#### Look up a weird thing

The sheet "fichier total" in the file "Exp19DM04_inoc2_saisie.xlsx" renders a large amount of NaN values that are not found later

In [None]:
# pd.read_excel opens and closes the workbook itself, so no file handle is left open
# (the former pd.ExcelFile(...) object was never closed).
# NOTE(review): this path points at "../data_in", not at the folder the workbooks
# are copied into (excel_file_path); confirm the file really lives there.
df_weird = pd.read_excel(
    os.path.join("..", "data_in", "Exp19DM04_inoc2_saisie.xlsx"),
    sheet_name="fichier total",
    skiprows=8,  # the data header is on the 9th row of this sheet
)
df_weird

In [None]:
df_weird.N.unique()

#### Build CSVs

We look for 2 particular headers, sheets will be discarded if:
- the header is not found
- the dataframe is corrupted, ie unable to find images or a column is malformed

In [None]:
path_to_df_result = os.path.join("..", "data_in", "excel_extraction.csv")

if os.path.isfile(path_to_df_result):
    # The extraction report already exists: reuse it instead of re-parsing every workbook
    df_result = pd.read_csv(path_to_df_result)
else:
    # One dict per parsed sheet. The report dataframe is built once at the end:
    # DataFrame.append was removed in pandas 2.0 and is quadratic inside a loop.
    extraction_records = []

    def add_result(
        file,
        sheet,
        outcome,
        comment="success",
        csv_file_name=np.nan,
    ):
        """Record the outcome of parsing one sheet of one workbook."""
        extraction_records.append(
            {
                "file": file,
                "sheet": sheet,
                "outcome": outcome,
                "comment": comment,
                "csv_file_name": csv_file_name,
            }
        )

    def lower_dataframe(df):
        """Lower-case and remove spaces from column names and text cells.

        The "nomphoto" column is left untouched. Returns the dataframe, or
        False when the normalisation raises (e.g. non-text column labels).

        NOTE(review): on object columns mixing text and numbers, the ``.str``
        accessor yields NaN for the non-text cells; confirm this is intended.
        """
        try:
            df.columns = df.columns.str.lower().str.replace(" ", "")
            for c in df.columns:
                if c != "nomphoto" and df[c].dtype == object:
                    df[c] = df[c].str.lower().str.replace(" ", "")
        except Exception:
            return False
        return df

    def find_header_row(df):
        """Return the index of the first row holding "numinc" (else "num"), or None."""
        for marker in ("numinc", "num"):
            hits = df[df == marker].dropna(axis=1, how="all").dropna(how="all")
            if hits.shape[0] > 0:
                # Take the first hit instead of crashing when the marker appears several times
                return int(hits.index[0])
        return None

    # Make sure the CSV output folder exists before writing into it
    os.makedirs(oidium_extracted_csvs_path, exist_ok=True)

    for lcl_excel_file in tqdm(lcl_excel_files):
        excel_name = os.path.basename(lcl_excel_file)
        # The context manager closes the workbook once all its sheets are parsed
        with pd.ExcelFile(lcl_excel_file) as tst_excel_file:
            for sheet_name in tst_excel_file.sheet_names:
                # First pass: raw parse, only used to locate the header row
                df = lower_dataframe(df=tst_excel_file.parse(sheet_name=sheet_name))
                if df is False:
                    add_result(
                        file=excel_name,
                        sheet=sheet_name,
                        outcome=False,
                        comment="Corrupted dataframe",
                    )
                    continue

                header_row = find_header_row(df)
                if header_row is None:
                    add_result(
                        file=excel_name,
                        sheet=sheet_name,
                        outcome=False,
                        comment="No header",
                    )
                    continue

                # Second pass: re-parse so that the located row becomes the header
                df = lower_dataframe(
                    df=tst_excel_file.parse(
                        sheet_name,
                        skiprows=header_row + 1,
                    )
                )
                if df is False:
                    add_result(
                        file=excel_name,
                        sheet=sheet_name,
                        outcome=False,
                        comment="Corrupted dataframe",
                    )
                    continue

                res = check_list_in_list(
                    required_columns=needed_columns,
                    available_columns=df.columns.to_list(),
                )
                if res is not True:
                    add_result(
                        file=excel_name,
                        sheet=sheet_name,
                        outcome=False,
                        comment=f"Missing columns: {res}",
                    )
                    continue

                csv_file_name = f"{Path(lcl_excel_file).stem}_{sheet_name}.csv"
                # Tag rows with their origin and drop rows without a photo name
                df = df.assign(
                    exp=Path(lcl_excel_file).stem,
                    sheet=sheet_name,
                ).dropna(subset=["nomphoto"])
                if df.shape[0] > 0:
                    df.to_csv(
                        os.path.join(oidium_extracted_csvs_path, csv_file_name),
                        index=False,
                    )
                    add_result(
                        file=excel_name,
                        sheet=sheet_name,
                        outcome=True,
                        csv_file_name=csv_file_name,
                    )
                else:
                    add_result(
                        file=excel_name,
                        sheet=sheet_name,
                        outcome=False,
                        comment="Corrupted dataframe, failed to retrieve photos",
                    )

    df_result = pd.DataFrame(
        extraction_records,
        columns=[
            "file",
            "sheet",
            "outcome",
            "comment",
            "csv_file_name",
        ],
    )
    df_result.to_csv(path_to_df_result, index=False)


#### What just happened?

Number of sheets parsed

In [None]:
df_result.shape

##### Why were sheets rejected?

In [None]:
px.histogram(
    data_frame=df_result.sort_values(["comment"]),
    x="comment",
    color="comment",
    width=1400,
    height=800,
    text_auto=True,
).update_layout(
    font=dict(
        family="Courier New, monospace",
        size=18,
    )
)


##### Corrupted dataframes

- When the cause of the corruption is "Corrupted dataframe", the files are info files without data
- When the cause of the corruption is "Corrupted dataframe, failed to retrieve photos", a formula has an error

In [None]:
# Keep only the report rows whose rejection reason is one of the two "corrupted" comments
df_corrupted = df_result[
    df_result.comment.isin(
        [
            "Corrupted dataframe",
            "Corrupted dataframe, failed to retrieve photos",
        ]
    )
].reset_index(drop=True)

# Export the list with ";" as separator
df_corrupted.to_csv(
    os.path.join("..", "data_in", "corrupted_excels.csv"),
    index=False,
    sep=";",
)

df_corrupted


### Merge dataframes

#### Retrieve CSVs

In [None]:
# Paths of the extracted CSVs: report rows without a CSV name (rejected sheets) are skipped
lcl_csv_files = [
    os.path.join(oidium_extracted_csvs_path, filename)
    for filename in df_result.csv_file_name.dropna().to_list()
]


Number of sheets successfully converted to CSVs

In [None]:
len(lcl_csv_files)

#### About The Columns

Which columns are common to all dataframes

In [None]:
# columns_occ: number of CSV files in which each column name appears
# common_columns: sorted names present in every CSV file
columns_occ = {}
common_columns = None
for filepath in lcl_csv_files:
    # nrows=0 parses the header line only; the data rows are not needed here
    cu_columns = pd.read_csv(filepath, nrows=0).columns.to_list()
    for c in cu_columns:
        columns_occ[c] = columns_occ.get(c, 0) + 1
    if common_columns is None:
        common_columns = set(cu_columns)
    else:
        common_columns = common_columns.intersection(cu_columns)
# An empty file list yields an empty result instead of raising IndexError
common_columns = sorted(common_columns or [])
common_columns

What are all the columns found in the dataframe

In [None]:
all_columns = sorted(list(columns_occ.keys()))
all_columns

#### Merge

##### Load dataframe

In [None]:
# Stack all extracted CSVs (restricted to their shared columns) and give the
# short Excel column names explicit ones
df_raw_merged = (
    pd.concat([pd.read_csv(filepath)[common_columns] for filepath in lcl_csv_files])
    .rename(
        columns={
            "exp": "experiment",
            "nomphoto": "image_name",
            "s": "sporulation",
            "fn": "surface_necrosee",
            "n": "necrose",
            "sq": "densite_sporulation",
            "tn": "taille_necrose",
        }
    )
    # These two columns are only present when every CSV has them;
    # errors="ignore" avoids a KeyError when they are not among common_columns
    .drop(["n°tubestock", "plaque"], axis=1, errors="ignore")
)
df_raw_merged


Check weird sheet

In [None]:
df_raw_merged[
    (df_raw_merged.experiment == "Exp19DM04_inoc2_saisie")
    & (df_raw_merged.sheet == "fichier total")
    & ~(df_raw_merged.sporulation.isna())
    & ~(df_raw_merged.necrose.isna())
]


How many different observations are here at the beginning 

In [None]:
df_raw_merged.drop(
    ["colonne", "experiment", "ligne", "image_name", "sheet"], axis=1
).drop_duplicates().reset_index(drop=True)


##### Remove observations with wrong values

In [None]:
plot_inconsistencies(df_raw_merged, sort_values=False)


In [None]:
df_raw_merged.shape

Clean dataframe

- _Sporulation_ must be 1 or 0
- _densite_sporulation_ must be a number and not 0
- _Necrosis_ must be 1 or 0
- _surface_necrosee_ must be an integer or NaN
- _taille_necrose_ must be an integer or NaN
- _ligne_ must not be NaN
- _colonne_ must not be NaN
- _OIV_ must be an odd integer

Sheets that have incorrect values

In [None]:
# Values accepted for the odd-scale columns (oiv, densite_sporulation, ...)
odd_numbers = [1, 3, 5, 7, 9]

In [None]:
# Rows are kept only when every checked column holds an accepted value:
# binary flags must be 0/1, odd-scale columns must be odd (or missing where allowed)
# NOTE(review): `colonne` is not checked here; confirm whether NaN columns should be rejected
rows_to_keep = (
    df_raw_merged.sporulation.isin([0, 1])
    & df_raw_merged.necrose.isin([0, 1])
    & df_raw_merged.ligne.notna()
    & df_raw_merged.oiv.isin(odd_numbers)
    & (
        df_raw_merged.densite_sporulation.isna()
        | df_raw_merged.densite_sporulation.isin(odd_numbers)
    )
    & (
        df_raw_merged.taille_necrose.isna()
        | df_raw_merged.taille_necrose.isin(odd_numbers)
    )
    & (
        df_raw_merged.surface_necrosee.isna()
        | df_raw_merged.surface_necrosee.isin(odd_numbers)
    )
)
df_clean_merged = df_raw_merged[rows_to_keep]
plot_inconsistencies(df_clean_merged, sort_values=False)

In [None]:
df_clean_merged.shape

Build dataframe with all Excels and sheets that contain corrupted values

In [None]:
# Step 1: for each validity rule, select the rows violating it and tag them with the
# offending column ("because"); then keep one line per (experiment, sheet, because)
df_inconsistent = (
    pd.concat(
        [
            df_raw_merged[~df_raw_merged.sporulation.isin([0, 1])].assign(
                because="sporulation"
            ),
            df_raw_merged[
                ~(
                    df_raw_merged.densite_sporulation.isin(odd_numbers)
                    | df_raw_merged.densite_sporulation.isna()
                )
            ].assign(because="densite_sporulation"),
            df_raw_merged[~df_raw_merged.necrose.isin([0, 1])].assign(because="necrose"),
            df_raw_merged[~df_raw_merged.ligne.notna()].assign(because="ligne"),
            df_raw_merged[
                ~(
                    df_raw_merged.taille_necrose.isin(odd_numbers)
                    | df_raw_merged.taille_necrose.isna()
                )
            ].assign(because="taille_necrose"),
            df_raw_merged[
                ~(
                    df_raw_merged.surface_necrosee.isin(odd_numbers)
                    | df_raw_merged.surface_necrosee.isna()
                )
            ].assign(because="surface_necrosee"),
            df_raw_merged[~df_raw_merged.oiv.isin(odd_numbers)].assign(because="oiv"),
        ]
    )[["experiment", "sheet", "because"]]
    .sort_values(["experiment", "sheet", "because"])
    .drop_duplicates()
    .reset_index(drop=True)
)

# Step 2: pivot the "because" tag into one 0/1 indicator column per rule and
# sum them so each (experiment, sheet) ends up on a single line
df_inconsistent = (
    df_inconsistent.assign(
        sporulation=np.where(df_inconsistent.because == "sporulation", 1, 0),
        densite_sporulation=np.where(
            df_inconsistent.because == "densite_sporulation", 1, 0
        ),
        necrose=np.where(df_inconsistent.because == "necrose", 1, 0),
        ligne=np.where(df_inconsistent.because == "ligne", 1, 0),
        taille_necrose=np.where(df_inconsistent.because == "taille_necrose", 1, 0),
        surface_necrosee=np.where(df_inconsistent.because == "surface_necrosee", 1, 0),
        oiv=np.where(df_inconsistent.because == "oiv", 1, 0),
    )
    .drop(["because"], axis=1)
    .groupby(["experiment", "sheet"])
    .agg("sum")
    .reset_index(drop=False)
    .drop_duplicates()
)

# Export the per-sheet report with ";" as separator
df_inconsistent.to_csv(
    os.path.join("..", "data_in", "inconsistent_excels.csv"),
    index=False,
    sep=";",
)

df_inconsistent


Merge all dataframes and:
- Remove rows with unwanted values
- Drop unwanted columns
- Change column names
- Set numerical columns
- Remove duplicates

In [None]:
# Cast the numeric columns to the nullable "Int64" dtype (keeps NaN as <NA>),
# then remove duplicated rows
df_merged = (
    df_clean_merged.sort_values(["image_name"])
    .assign(
        colonne=lambda x: x.colonne.astype("Int64"),
        necrose=lambda x: x.necrose.astype("Int64"),
        oiv=lambda x: x.oiv.astype("Int64"),
        sporulation=lambda x: x.sporulation.astype("Int64"),
        surface_necrosee=lambda x: x.surface_necrosee.astype("Int64"),
        densite_sporulation=lambda x: x.densite_sporulation.astype("Int64"),
        taille_necrose=lambda x: x.taille_necrose.astype("Int64"),
    )
    .drop_duplicates()
    .reset_index(drop=True)
)
# Reorder the columns by position.
# NOTE(review): the indices rely on the column order produced upstream (exactly 11
# columns, in that order); confirm they still map to the intended columns
cols = df_merged.columns.to_list()
cols = [cols[1] ,cols[8] ,cols[5] ,cols[3] ,cols[0] ,cols[4] ,cols[2] ,cols[7] ,cols[9] ,cols[10] , cols[6] ]
df_merged = df_merged[cols]

df_merged


In [None]:
plot_inconsistencies(df_merged)

There are more NaN values for _taille_necrose_ and _surface_necrosee_ than there are plants with necrosis

##### Set balance

In [None]:
px.histogram(
    x=df_merged.oiv.sort_values().astype(str),
    color=df_merged.oiv.sort_values().astype(str),
    text_auto=True,
    width=1000,
    height=600,
)


##### NAs

In [None]:
# (column name, number of missing values) for every column of df_merged
nan_count = [(c, df_merged[c].isna().sum()) for c in df_merged.columns]
nan_count

## Data overview

### Dataframe

In [None]:
# Numeric view of the merged data: drop `colonne`, drop every object (text) column,
# then keep each distinct combination of values once
df_num = (
    df_merged.drop(["colonne"], axis=1)
    .select_dtypes(exclude=object)
    .drop_duplicates()
    .reset_index(drop=True)
)
df_num


In [None]:
df_num.dtypes

### Violin

In [None]:
fig = make_subplots(rows=1, cols=len(df_num.columns))
for i, var in enumerate(df_num.columns):
    fig.add_trace(
        go.Violin(y=df_num[var], name=var),
        row=1,
        col=i + 1,
    )
fig.update_layout(
    height=600,
    width=1200,
)
fig

### Pairwise

In [None]:
fig = px.scatter_matrix(
    df_num,
    color=get_oiv_cat(df_num),
    height=800,
    width=1000,
    dimensions=df_num.select_dtypes(np.number).columns,
)
fig.update_yaxes(tickangle=45, tickfont=dict(family="Rockwell", color="crimson", size=14))


### Heat map

In [None]:
corr_matrix = df_num.drop_duplicates().corr()
corr_matrix

In [None]:
px.imshow(
    corr_matrix,
    text_auto=True,
    height=600,
    width=800,
)


In [None]:
df_num.dtypes

Prepare dataframe for models

In [None]:
# Complete, distinct observations only: rows with any missing value are discarded
model_frame = df_num.dropna().drop_duplicates().reset_index(drop=True)

# Target is the OIV score; every other numeric column is a standardised feature
y = model_frame["oiv"]
scaler = StandardScaler()
X = scaler.fit_transform(model_frame.drop(["oiv"], axis=1))

X.shape


### PCA

In [None]:
pca_data = PCA()
x_new = pca_data.fit_transform(X)

In [None]:
# First two principal components, each divided by its maximum, coloured by OIV.
# The title said "PCA 3D" although this is the 2D projection.
px.scatter(
    x=x_new[:, 0] / x_new[:, 0].max(),
    y=x_new[:, 1] / x_new[:, 1].max(),
    color=y.astype(str),
    height=800,
    width=1000,
    title="PCA 2D",
)

In [None]:
plot_variance(
    df_ev=pd.DataFrame.from_dict(
        {
            "pc": [f"PC{i}" for i in range(len(pca_data.explained_variance_ratio_))],
            "exp_var_per": pca_data.explained_variance_ratio_ * 100,
        }
    )
)

In [None]:
px.scatter_3d(
    x=x_new[:, 0] / x_new[:, 0].max(),
    y=x_new[:, 1] / x_new[:, 1].max(),
    z=x_new[:, 2] / x_new[:, 2].max(),
    color=y.astype(str),
    height=800,
    width=1000,
    title="PCA 3D",
)    


### PLs-DA

In [None]:
pls_data_all = PLSRegression(n_components=X.shape[1])
x_new = pls_data_all.fit(X, y).transform(X)

pls_data_all.score(X, y)


#### Scatter 2D

In [None]:
fig = px.scatter(
    x=pls_data_all.x_scores_[:, 0] / pls_data_all.x_scores_[:, 0].max(),
    y=pls_data_all.x_scores_[:, 1] / pls_data_all.x_scores_[:, 1].max(),
    color=y.astype(str),
    height=800,
    width=1000,
)

fig.update_traces(
    marker=dict(
        size=12,
        line=dict(width=2, color="DarkSlateGrey"),
    ),
    selector=dict(mode="markers"),
)

fig


#### Scatter 3D

In [None]:
# First three PLS X-scores, each divided by its maximum, coloured by OIV.
# The title said "PCA 3D" although the plotted scores come from the PLS model.
px.scatter_3d(
    x=pls_data_all.x_scores_[:, 0] / pls_data_all.x_scores_[:, 0].max(),
    y=pls_data_all.x_scores_[:, 1] / pls_data_all.x_scores_[:, 1].max(),
    z=pls_data_all.x_scores_[:, 2] / pls_data_all.x_scores_[:, 2].max(),
    color=y.astype(str),
    height=800,
    width=1000,
    title="PLS 3D",
)


## Inverting the scale

This has not been successful; we are going to try switching from a resistance scale to a susceptibility scale, which allows us to keep all dimensions for all observations.

### Dataframe

In [None]:
# Invert the odd-scale columns (value -> 10 - value), then replace the missing
# values with 0 so that no observation has to be dropped
df_inverted = (
    df_merged.assign(
        surface_necrosee=lambda x: 10 - x.surface_necrosee,
        densite_sporulation=lambda x: 10 - x.densite_sporulation,
        taille_necrose=lambda x: 10 - x.taille_necrose,
        oiv=lambda x: 10 - x.oiv,
    )
    # Second assign: runs on the inverted values, so missing entries stay missing
    # through the inversion and only then become 0
    .assign(
        surface_necrosee=lambda x: x.surface_necrosee.fillna(0),
        densite_sporulation=lambda x: x.densite_sporulation.fillna(0),
        taille_necrose=lambda x: x.taille_necrose.fillna(0),
        sporulation=lambda x: x.sporulation.fillna(0),
    )
    .drop_duplicates()
    .sort_values(
        [
            "oiv",
            "experiment",
            "sheet",
        ]
    )
)
df_inverted


### Build a numeric dataframe without duplicates

We keep only the target variables

In [None]:
df_inv_num = (
    df_inverted.drop(["colonne"], axis=1)
    .select_dtypes(exclude=object)
    .drop_duplicates()
)
df_inv_num


### Violin plot

In [None]:

# One violin per non-text column of the inverted data. The text identifier
# columns (experiment, sheet, image name, ...) are excluded: a violin of
# string labels carries no information.
violin_columns = df_inverted.select_dtypes(exclude=object).columns
fig = make_subplots(rows=1, cols=len(violin_columns))
for i, var in enumerate(violin_columns):
    fig.add_trace(
        go.Violin(y=df_inverted[var], name=var),
        row=1,
        col=i + 1,  # plotly subplot coordinates are 1-based
    )
fig.update_traces(points="all", jitter=0.3).update_layout(
    height=1000,
    width=1400,
)
fig

### OIV distribution

In [None]:
px.histogram(
    x=df_inv_num.oiv.sort_values().astype(str),
    color=df_inv_num.oiv.sort_values().astype(str),
    text_auto=True,
    width=1000,
    height=600,
)


### Prepare data for models

In [None]:
# Projection columns that the PCA / PLS cells add to df_inv_num afterwards.
# They are excluded here so that re-running this cell never feeds model
# outputs back in as features.
derived_columns = ["x_pca", "y_pca", "z_pca", "x_pls", "y_pls", "z_pls"]

# Target is the inverted OIV score; the remaining columns are standardised features
yi = df_inv_num.oiv
Xi = df_inv_num.drop(columns=["oiv"] + derived_columns, errors="ignore")
scaler = StandardScaler()
Xi = scaler.fit_transform(Xi)

Xi.shape


### Build models

#### PCA

In [None]:
# PCA on the standardised inverted data (replaces the earlier `pca_data` / `x_new`)
pca_data = PCA()
x_new = pca_data.fit_transform(Xi)

# Store the first three component scores next to the observations.
# Note: this adds columns to df_inv_num in place.
df_inv_num["x_pca"] = x_new[:, 0]
df_inv_num["y_pca"] = x_new[:, 1]
df_inv_num["z_pca"] = x_new[:, 2]

#### Pls-DA

In [None]:
# PLS regression of the inverted OIV on the standardised features, using as many
# components as there are feature columns
pls_data_all_inv = PLSRegression(n_components=Xi.shape[1])
# Note: `x_new` held the PCA scores until this line; it now holds the PLS scores
x_new = pls_data_all_inv.fit(Xi, yi).transform(Xi)

# Store the first three PLS X-scores next to the observations (in-place column additions)
df_inv_num["x_pls"] = pls_data_all_inv.x_scores_[:, 0]
df_inv_num["y_pls"] = pls_data_all_inv.x_scores_[:, 1]
df_inv_num["z_pls"] = pls_data_all_inv.x_scores_[:, 2]

pls_data_all_inv.score(Xi, yi)

### Plots

In [None]:
col_pal = px.colors.qualitative.Plotly


#### Scatter 2D

In [None]:
fig = make_subplots(rows=1, cols=2, subplot_titles=["PCA", "PLS"])
col_pal_iterator = itertools.cycle(col_pal)

# One colour per OIV level, shared by the PCA panel (left) and the PLS panel (right)
for i, new_colour in zip(odd_numbers, col_pal_iterator):
    df_tmp = df_inv_num[df_inv_num.oiv == i]
    # Only the PLS trace gets a legend entry, so each OIV level is listed once
    panels = [
        ("x_pca", "y_pca", dict(showlegend=False)),
        ("x_pls", "y_pls", dict(name=f"OIV {i}")),
    ]
    for panel_col, (x_name, y_name, legend_options) in enumerate(panels, start=1):
        fig.add_trace(
            go.Scatter(
                x=df_tmp[x_name],
                y=df_tmp[y_name],
                mode="markers",
                text=df_tmp.index,
                line=dict(color=new_colour),
                **legend_options,
            ),
            row=1,
            col=panel_col,
        )

fig.update_layout(height=800, width=1400, title="PCA vs PLS")
fig


#### Scatter 3D

In [None]:
# 3D traces can only be added to "scene" subplots: without `specs`,
# make_subplots creates 2D "xy" cells and add_trace rejects Scatter3d.
fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=["PCA", "PLS"],
    specs=[[{"type": "scene"}, {"type": "scene"}]],
)
col_pal_iterator = itertools.cycle(col_pal)

# One colour per OIV level, shared by the PCA panel (left) and the PLS panel (right)
for i in odd_numbers:
    new_colour = next(col_pal_iterator)
    df_tmp = df_inv_num[df_inv_num.oiv == i]
    fig.add_trace(
        go.Scatter3d(
            x=df_tmp["x_pca"],
            y=df_tmp["y_pca"],
            z=df_tmp["z_pca"],
            mode="markers",
            text=df_tmp.index,
            line=dict(color=new_colour),
            showlegend=False,  # the PLS trace below carries the legend entry
        ),
        row=1,
        col=1,
    )
    fig.add_trace(
        go.Scatter3d(
            x=df_tmp["x_pls"],
            y=df_tmp["y_pls"],
            z=df_tmp["z_pls"],
            mode="markers",
            name=f"OIV {i}",
            text=df_tmp.index,
            line=dict(color=new_colour),
        ),
        row=1,
        col=2,
    )

fig.update_layout(height=800, width=1400, title="PCA vs PLS")
fig


In [None]:
plot_variance(
    df_ev=pd.DataFrame.from_dict(
        {
            "pc": [f"PC{i}" for i in range(len(pca_data.explained_variance_ratio_))],
            "exp_var_per": pca_data.explained_variance_ratio_ * 100,
        }
    )
)

In [None]:
# At this point `x_new` holds the PLS scores of the inverted data and `y` belongs
# to the earlier, non-inverted dataset (different row count). Recompute the PCA
# scores from the PCA fitted on Xi and colour them with the matching target yi.
pca_scores_inv = pca_data.transform(Xi)
px.scatter_3d(
    x=pca_scores_inv[:, 0] / pca_scores_inv[:, 0].max(),
    y=pca_scores_inv[:, 1] / pca_scores_inv[:, 1].max(),
    z=pca_scores_inv[:, 2] / pca_scores_inv[:, 2].max(),
    # to_numpy(): yi keeps the (non-contiguous) index of df_inv_num
    color=yi.astype(str).to_numpy(),
    height=800,
    width=1000,
    title="PCA 3D",
)


### Pls-da

In [None]:
# NOTE(review): this refits the PLS model on `X` / `y` (the non-inverted dataset
# prepared before the "Inverting the scale" section), not on `Xi` / `yi`;
# it repeats the earlier PLS cell. Confirm whether the inverted data was intended.
pls_data_all = PLSRegression(n_components=X.shape[1])
x_new = pls_data_all.fit(X, y).transform(X)

pls_data_all.score(X, y)

In [None]:
# df_inv_num["x"] = 

fig = px.scatter(
    x=pls_data_all.x_scores_[:, 0] / pls_data_all.x_scores_[:, 0].max(),
    y=pls_data_all.x_scores_[:, 1] / pls_data_all.x_scores_[:, 1].max(),
    color=y.astype(str),
    height=800,
    width=1000,
)

fig.update_traces(
    marker=dict(
        size=12,
        line=dict(width=2, color="DarkSlateGrey"),
    ),
    selector=dict(mode="markers"),
)

fig


In [None]:
# First three PLS X-scores, each divided by its maximum, coloured by OIV.
# The title said "PCA 3D" although the plotted scores come from the PLS model.
px.scatter_3d(
    x=pls_data_all.x_scores_[:, 0] / pls_data_all.x_scores_[:, 0].max(),
    y=pls_data_all.x_scores_[:, 1] / pls_data_all.x_scores_[:, 1].max(),
    z=pls_data_all.x_scores_[:, 2] / pls_data_all.x_scores_[:, 2].max(),
    color=y.astype(str),
    height=800,
    width=1000,
    title="PLS 3D",
)


### Sheet by sheet data Pls-da

In [None]:
# Fit one PLS model per (experiment, sheet) and collect its score.
# Results are gathered in a list and turned into a dataframe once:
# DataFrame.append was removed in pandas 2.0, and inside the former bare
# `except` that removal silently turned every sheet into a "failure".
sheet_records = []
failures = []  # (experiment, sheet) pairs for which the model could not be fitted
failure_reasons = {}  # (experiment, sheet) -> repr of the exception raised

for idx, row in df_inverted[["experiment", "sheet"]].drop_duplicates().iterrows():
    try:
        df_sheet = (
            df_inverted[
                (df_inverted.experiment == row["experiment"])
                & (df_inverted.sheet == row["sheet"])
            ]
            .select_dtypes(exclude=object)
            .drop(["colonne"], axis=1)
            .drop_duplicates()
        )
        # Sheet-local names, so the global X / y used by the other models are not overwritten
        X_sheet = StandardScaler().fit_transform(df_sheet.drop(["oiv"], axis=1))
        y_sheet = df_sheet.oiv
        cur_pls_da = PLSRegression(n_components=X_sheet.shape[1])
        cur_pls_da.fit(X_sheet, y_sheet)

        sheet_records.append(
            {
                "experiment": row["experiment"],
                "sheet": row["sheet"],
                "row_count": df_sheet.shape[0],
                "score": cur_pls_da.score(X_sheet, y_sheet),
            }
        )
    except Exception as exc:
        # Best effort: a sheet that cannot be modelled is reported, not fatal
        failures.append((row["experiment"], row["sheet"]))
        failure_reasons[(row["experiment"], row["sheet"])] = repr(exc)

df_sheet_plsda = (
    pd.DataFrame(
        sheet_records,
        columns=["experiment", "sheet", "row_count", "score"],
    )
    .sort_values(
        [
            "row_count",
            "score",
            "experiment",
            "sheet",
        ],
        ascending=False,
    )
    .reset_index(drop=True)
)
df_sheet_plsda


In [None]:
failures

In [None]:
px.scatter(
    data_frame=df_sheet_plsda[df_sheet_plsda.score > 0],
    x="row_count",
    y="score",
)

In [None]:
df_inv_num[(df_inv_num < 0).all(1)]

In [None]:
# Rows of df_num reaching the maximum of each score column.
# `.max` has to be called: comparing a column with the bound method itself
# is False everywhere and always produced empty dataframes.
[
    df_num[df_num[c] == df_num[c].max()]
    for c in [
        "necrose",
        "surface_necrosee",
        "sporulation",
        "densite_sporulation",
        "taille_necrose",
    ]
]

In [None]:
df_num[df_num["necrose"] == df_num["necrose"].max()]