In [None]:
import pandas as pd
from pathlib import Path
import os

%load_ext autoreload
%autoreload 2

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cross_decomposition import PLSRegression
from sklearn.cluster import KMeans
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

from matplotlib import pyplot as plt

import plotly.express as px
import numpy as np

import sys  
sys.path.insert(0, os.path.join("..", "scripts"))

import gav_oidium_func as gof
import gav_oidium_const as goc
import gav_oidium_text as got
import gav_oidium_plot_plotly as gop

In [None]:
# Load the Col0root2_fixed.csv measurements, discard the spatial coordinate
# columns and order the rows by label.
# NOTE(review): absolute, machine-specific path — consider making it relative.
df_col = pd.read_csv(
    "/Users/mavi/Data/iRoCS for Jessis course/Col0root2_fixed.csv",
    sep=",",
)
df_col = df_col.drop(columns=["x (micron)", "y (micron)", "z (micron)"])
df_col = df_col.sort_values("label")

# Target: the label column cast to int. Features: every other column, standardised.
y_col = df_col["label"].astype(int)
scaler = StandardScaler()
X_col = scaler.fit_transform(df_col.drop(columns=["label"]))

# Plot the PCA projection of the standardised features, coloured by label.
gop.plot_model(
    X=PCA().fit_transform(X_col),
    color=y_col.astype(str),
    title="Inverted PCA 2D",
    height=800,
)


In [None]:
# Hold out 20% of the Col0root2 rows for testing; the seed keeps the split fixed.
X_train_col, X_test_col, y_train_col, y_test_col = train_test_split(
    X_col,
    y_col,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit a 100-tree random forest on the training split.
# random_state is pinned so the fitted forest — and the feature importances
# and accuracy derived from it below — is the same on every run.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_col, y_train_col)

In [None]:
# Display the fitted forest's feature importance values.
rf.feature_importances_

In [None]:
# Horizontal bar chart of the feature importances, ordered by ascending importance.
importances = rf.feature_importances_
sorted_idx = importances.argsort()
feature_names = df_col.drop(columns=["label"]).columns
plt.barh(feature_names[sorted_idx], importances[sorted_idx])
plt.xlabel("Random Forest Feature Importance")

In [None]:
# Score the forest on the held-out split.
y_pred = rf.predict(X_test_col)
accuracy = accuracy_score(y_test_col, y_pred)
# The score is computed on X_test_col / y_test_col, so it is reported as test accuracy.
print("Accuracy (test) for %s: %0.1f%% " % ("Random Forest", accuracy * 100))

In [None]:
# Load the C28root3_fixed.csv measurements, discard the spatial coordinate
# columns and order the rows by label.
# NOTE(review): absolute, machine-specific path — consider making it relative.
df_c8 = (
    pd.read_csv("/Users/mavi/Data/iRoCS for Jessis course/C28root3_fixed.csv", sep=",")
    .drop(columns=["x (micron)", "y (micron)", "z (micron)"])
    .sort_values("label")
)
# Keep only the rows whose label is strictly positive.
df_c8 = df_c8[df_c8.label > 0]

# Target: the label column cast to int. Features: every other column, standardised.
y_c8 = df_c8.label.astype(int)
scaler = StandardScaler()
X_c8 = scaler.fit_transform(df_c8.drop(columns=["label"]))

# plotly.express and numpy are already imported in the first cell, so they are
# not re-imported here.
gop.plot_model(
    X=PCA().fit_transform(X_c8),
    color=y_c8.astype(str),
    title="Inverted PCA 2D",
    height=800,
)


In [None]:
# Hold out 20% of the C28root3 rows for testing; the seed keeps the split fixed.
X_train_c8, X_test_c8, y_train_c8, y_test_c8 = train_test_split(
    X_c8, y_c8, test_size=0.2, random_state=42
)

# random_state is pinned so the forest, its importances and its accuracy are
# the same on every run.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_c8, y_train_c8)

# Horizontal bar chart of the feature importances, ordered by ascending importance.
sorted_idx = rf.feature_importances_.argsort()
plt.barh(df_c8.drop(["label"], axis=1).columns[sorted_idx], rf.feature_importances_[sorted_idx])
plt.xlabel("Random Forest Feature Importance")

In [None]:
# Score the forest on the held-out split.
y_pred = rf.predict(X_test_c8)
accuracy = accuracy_score(y_test_c8, y_pred)
# The score is computed on X_test_c8 / y_test_c8, so it is reported as test accuracy.
print("Accuracy (test) for %s: %0.1f%% " % ("Random Forest", accuracy * 100))

In [None]:

# lbl = df_c8.label
# df_c8=(df_c8-df_c8.min())/(df_c8.max()-df_c8.min())

# Stack both roots into one frame, tagging every row with its origin.
# The tags follow the frame names: df_c8 rows get "col8", df_col rows get "col"
# (they were previously assigned the other way round).
df_all = pd.concat([df_c8.assign(src="col8"), df_col.assign(src="col")])
df_all

In [None]:
# Display df_all without duplicated rows.
# NOTE(review): the result is not assigned, so df_all itself still contains any duplicates.
df_all.drop_duplicates()

In [None]:
# Target: the label column cast to int. Features: everything except the
# `src` tag and the label, standardised.
y_all = df_all["label"].astype(int)
scaler = StandardScaler()
X_all = scaler.fit_transform(df_all.drop(columns=["src", "label"]))

# Plot the PCA projection of the combined data, coloured by label.
gop.plot_model(
    X=PCA().fit_transform(X_all),
    color=y_all.astype(str),
    title="PCA 2D",
    height=800,
)

In [None]:
# Hold out 20% of the combined rows for testing; the seed keeps the split fixed.
X_train_all, X_test_all, y_train_all, y_test_all = train_test_split(
    X_all, y_all, test_size=0.2, random_state=42
)

# random_state is pinned so the forest, its importances and its accuracy are
# the same on every run.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_all, y_train_all)

# Feature names must come from exactly the columns the model was trained on:
# X_all was built without both `src` and `label`, so both are dropped here too.
feature_names = df_all.drop(["src", "label"], axis=1).columns

# Horizontal bar chart of the feature importances, ordered by ascending importance.
sorted_idx = rf.feature_importances_.argsort()
plt.barh(feature_names[sorted_idx], rf.feature_importances_[sorted_idx])
plt.xlabel("Random Forest Feature Importance")

In [None]:
# Score the forest on the held-out split.
y_pred = rf.predict(X_test_all)
accuracy = accuracy_score(y_test_all, y_pred)
# The score is computed on X_test_all / y_test_all, so it is reported as test accuracy.
print("Accuracy (test) for %s: %0.1f%% " % ("Random Forest", accuracy * 100))

## All together now!

In [None]:
# Show the current working directory; the relative data paths below are resolved against it.
Path.cwd()

In [None]:
# Folder holding the training CSV files.
train_dir = Path.cwd().joinpath(
    "..",
    "data_in",
    "iRoCS for Jessis course",
    "train",
)

# Columns removed from every file before the frames are stacked.
unused_columns = [
    "x (micron)",
    "y (micron)",
    "z (micron)",
    "normals",
    "indices",
    "vertices (micron)",
]

# Read each file, drop the unused columns, order its rows and tag them with
# the file stem.
# NOTE(review): Path.glob gives no ordering guarantee, and the row index
# created by reset_index below depends on the order the files are read in.
frames = []
for csv_path in train_dir.glob("*.csv"):
    frame = pd.read_csv(str(csv_path), sep=";")
    frame = frame.drop(columns=unused_columns)
    frame = frame.sort_values(["label", "volume (cube microns)"])
    frames.append(frame.assign(src=csv_path.stem))

df_glob = pd.concat(frames).drop_duplicates().reset_index(drop=True)

# Keep rows with 0 < label < 8 and a z-distance from the QC below 200.
# The filter runs after reset_index, so surviving rows keep their pre-filter index labels.
keep_rows = (
    (df_glob.label < 8)
    & (df_glob["distance from QC (z) (micron)"] < 200)
    & (df_glob.label > 0)
)
df_glob = df_glob[keep_rows]

df_glob


In [None]:
# Target: the label column cast to int. Features: everything except the
# `src` tag and the label, standardised.
y = df_glob["label"].astype(int)
scaler = StandardScaler()
X = scaler.fit_transform(df_glob.drop(columns=["src", "label"]))

In [None]:
# Project the standardised features onto their principal components.
# The estimator is bound to `sk_pca` rather than `pca`, because the name `pca`
# is taken by the class imported with `from pca import pca` further down;
# re-running this cell would otherwise overwrite that class.
sk_pca = PCA()
pca_transformed = sk_pca.fit_transform(X)

# Plot the projection coloured by label; the frame's index labels are passed
# as hover data so that individual points can be looked up in df_glob.
gop.plot_model(
    X=pca_transformed,
    color=y.astype(str),
    title="Inverted PCA 2D",
    height=800,
    hover_data=df_glob.index.to_list(),
)


In [None]:
# Inspect the row with index label 12748 (one of the two rows dropped below).
df_glob.loc[12748]

In [None]:
# Inspect the row with index label 4572 (one of the two rows dropped below).
df_glob.loc[4572]

In [None]:
# Index labels of the two rows excluded from the final frame.
rows_to_drop = [12748, 4572]
df_final = df_glob.drop(index=rows_to_drop)

In [None]:
from pca import pca

In [None]:
# Fit the `pca` package's model on the standardised matrix X.
# NOTE(review): X.shape[0] is the number of rows of X, so this asks for one
# component per sample; a PCA cannot yield more components than
# min(n_samples, n_features) — confirm how the `pca` package treats this value.
model = pca(n_components=X.shape[0])
results = model.fit_transform(X)

In [None]:
# Draw the model's default plot and keep the returned figure and axes.
fig, ax = model.plot()

In [None]:
# 3-D scatter plot of the fitted model.
fig, ax = model.scatter3d()

In [None]:
# Biplot over components 0, 1 and 2 with the class labels passed as y.
# n_feat=2 presumably limits the plot to two features — confirm in the `pca` docs.
fig, ax = model.biplot(n_feat=2, PC=[0,1,2], y=y)

In [None]:
# 2-D and 3-D scatter plots with the legend, SPE and Hotelling T2 options switched on.
model.scatter(legend=True, SPE=True, hotellingt2=True)
model.scatter3d(legend=True, SPE=True, hotellingt2=True)

In [None]:
# Rows of X selected by the boolean mask stored in results['outliers']['y_bool']
# (presumably the rows the model flagged as outliers — confirm in the `pca` docs).
X[results['outliers']['y_bool'],:]

In [None]:
# Display the frame that remains after the two rows were dropped.
df_final

In [None]:
# Number of rows per label in the final frame.
df_final.label.value_counts()

In [None]:
# plt.matshow(df_final.corr())

# Correlation heat-map of the de-duplicated final frame.
# df_final still carries the string column `src`; DataFrame.corr() raises on
# non-numeric columns in pandas >= 2.0, so only numeric/bool columns are kept
# (the same columns older pandas versions silently used).
px.imshow(
    df_final.drop_duplicates().select_dtypes(include=["number", "bool"]).corr(),
    text_auto=True,
    height=1000,
)


In [None]:
# Rebuild the labels and the standardised feature matrix from df_final.
y = df_final["label"].astype(int)
scaler = StandardScaler()
X = scaler.fit_transform(df_final.drop(columns=["src", "label"]))

# Refit the `pca` package's model on the new matrix.
# NOTE(review): X.shape[0] is the row count of X — confirm how the `pca`
# package treats a component count larger than the number of features.
model = pca(n_components=X.shape[0])
results = model.fit_transform(X)

# Biplot over components 0, 1 and 2 with the class labels passed as y.
fig, ax = model.biplot(n_feat=2, PC=[0, 1, 2], y=y)

In [None]:
# Hold out 20% of the final rows for testing; the seed keeps the split fixed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# random_state is pinned so the forest, its importances and its accuracy are
# the same on every run.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# This model was trained on df_final (minus `src` and `label`), so the bar
# labels must come from df_final as well — not from df_all, which belongs to
# the earlier two-file experiment.
feature_names = df_final.drop(["src", "label"], axis=1).columns

# Horizontal bar chart of the feature importances, ordered by ascending importance.
sorted_idx = rf.feature_importances_.argsort()
plt.barh(
    feature_names[sorted_idx],
    rf.feature_importances_[sorted_idx],
)
plt.xlabel("Random Forest Feature Importance")


In [None]:
# Score the forest on the held-out split.
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
# The score is computed on X_test / y_test, so it is reported as test accuracy.
print("Accuracy (test) for %s: %0.1f%% " % ("Random Forest", accuracy * 100))

In [None]:
# Predicted vs. true labels, followed by true vs. true (points on the identity
# line) drawn on the same axes for reference.
plt.scatter(y_pred, y_test)
plt.scatter(y_test, y_test)

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

# Confusion matrix of the held-out split: true labels first, predictions second.
cm = confusion_matrix(y_test, y_pred)

# Plot the matrix; the display object is kept in cm_display.
cm_display = ConfusionMatrixDisplay(cm).plot()

In [None]:
# Size of the held-out label vector.
y_test.shape

In [None]:
# Location of the separate test CSV.
dft_path = Path.cwd().joinpath(
    "..",
    "data_in",
    "iRoCS for Jessis course",
    "test",
    "C28root1 old version.csv",
)

# Unlike the training frame, no label / distance filter is applied to this file.
test_raw = pd.read_csv(str(dft_path), sep=";")

# Columns that are not model features; they are set aside so they can be
# re-attached when the predictions are written out.
non_feature_columns = [
    "x (micron)",
    "y (micron)",
    "z (micron)",
    "normals",
    "indices",
    "vertices (micron)",
]
dropped_data = test_raw[non_feature_columns]

# Copy the label into a trailing `y` column, then remove the non-feature
# columns and the original label column.
df_test = test_raw.assign(y=test_raw["label"]).drop(
    columns=non_feature_columns + ["label"]
)

df_test


In [None]:
# Standardise the test features with the scaler currently bound to `scaler`,
# then classify them with the current forest.
Xt = scaler.transform(df_test.drop(columns=["y"]))

yt = rf.predict(Xt)

In [None]:
# confusion_matrix expects (y_true, y_pred): df_test.y holds the true labels
# and yt the predictions. With this order, rows are true labels and columns
# are predictions, matching the axis labels ConfusionMatrixDisplay draws.
cm = confusion_matrix(df_test.y, yt)

cm_display = ConfusionMatrixDisplay(cm).plot()

In [None]:
# Number of predictions produced for the test file.
yt.shape

In [None]:
# Distinct predicted labels alongside the distinct true labels.
# np.unique's second positional parameter is `return_index`, so passing the
# true labels there only switched that flag on; each array is now examined
# with its own call.
np.unique(yt), np.unique(df_test.y)

In [None]:
# Predicted labels (x axis) against the true labels (y axis) for the test file.
plt.scatter(yt, df_test.y.to_list())

In [None]:
# Re-attach the columns that were set aside before prediction and add the
# predicted label. A new frame is built instead of writing into df_test, so
# df_test keeps only the feature columns plus `y` and the prediction cell can
# be re-run after this one.
df_predicted = df_test.join(
    dropped_data[
        [
            "x (micron)",
            "y (micron)",
            "z (micron)",
            "normals",
            "indices",
            "vertices (micron)",
        ]
    ]
).assign(label=yt)

predicted_path = Path.cwd().joinpath(
    "..",
    "data_in",
    "iRoCS for Jessis course",
    "predicted",
    "C28root1 old version.csv",
)
# to_csv does not create missing folders, so make sure the target folder exists.
predicted_path.parent.mkdir(parents=True, exist_ok=True)

df_predicted.to_csv(
    str(predicted_path),
    sep=";",
    index=False,
)


In [None]:
pd.Da