In [None]:
import os
import random
import sys

# Make the project root the working directory and put it on the import path.
#
# The notebook is expected to live one level below the root. The root is
# recognised by the `functions` package that the next cell imports from.
# Checking for that marker keeps this cell idempotent: without the guard every
# re-run climbed one more directory and ended up above the project root.
# NOTE(review): assumes the notebook's own folder has no `functions/`
# directory - confirm against the repository layout.
_cwd = os.getcwd()
if os.path.isdir(os.path.join(_cwd, "functions")):
    # Already at the root (e.g. the cell was run before in this kernel).
    root_dir = _cwd
else:
    # Go up one directory level to reach root
    root_dir = os.path.dirname(_cwd)
    os.chdir(root_dir)

# Add root to Python path (only once, so re-runs do not stack duplicates)
if root_dir not in sys.path:
    sys.path.insert(0, root_dir)

print(f"Current working directory: {os.getcwd()}")
print(f"Root directory added to path: {root_dir}")

In [None]:
%load_ext autoreload
%autoreload 2
from functions._utils_ import *


### DATASET PROCESSING

In [None]:
# Dataset source: https://datadryad.org/dataset/doi:10.5061/dryad.cjsxksn3p
# Location of the pickle, relative to CURRENT_DIR.
_prostate_dataset_parts = (
    "test_rawdata",
    "A1-dataset_prostate_cancer",
    "Benign_vs_Cancer.pkl",
)
PROSTATE_CANCER_DATASET_PATH = os.path.join(CURRENT_DIR, *_prostate_dataset_parts)

# Keep the loader around and expose what it read through its `data` attribute.
load_dataset = RamanDataLoader(PROSTATE_CANCER_DATASET_PATH)
rawdata = load_dataset.data

In [None]:
# Split the raw table into one subset per value of its 'Cohort' column.
cohort_column = rawdata['Cohort']
chum_df, uhn_df, chuq_df = (
    rawdata[cohort_column == cohort_name]
    for cohort_name in ('CHUM', 'UHN', 'CHUQc-UL')
)
# Log the overall shape followed by the shape of each cohort subset.
console_log(rawdata.shape, chum_df.shape, uhn_df.shape, chuq_df.shape)

In [None]:
# hirushu_dir = os.getcwd() + '/test_rawdata/Tamura/引き継ぎ/ヒルシュ'
# hirushu_dir

# normal_dfs = []
# window_size = 50
# for k in range(1, 4):
#     csv_path = os.path.join(hirushu_dir, 'データ', 'merged_data_raw', f'Case{k}', 'normal', 'normal.csv')
#     loader = RamanDataLoader(csv_path)
#     df = loader.data
#     processor = RamanNoiseProcessor(df)
#     processed_df = processor.baselineAndGaussianNoise(window_size=window_size)
#     normal_dfs.append(processed_df)

In [None]:
# Baseline-correction method. The same string is also used as a folder name
# when the preprocessed pickles are looked up in the cells below.
# https://ramanspy.readthedocs.io/en/latest/preprocessing.html
# https://www.nature.com/articles/s41377-024-01394-5
method_name = "ModPoly"

# Class names: index 0 is passed for the benign data, index 1 for the cancer data.
labels = ["benign", "cancer"]

# Raman region of interest, handed to the cropper, the baseline corrector and the pipeline.
region = (600, 1600)


def custom_params(index=1):
    """Return one of the hand-tuned baseline-correction parameter presets.

    Args:
        index: Preset selector. ``1`` is the flexible preset, ``2`` the
            conservative one and ``3`` the balanced one. Any other value
            selects the default preset.

    Returns:
        dict: A newly built parameter dictionary. Presets 1-3 carry
        ``poly_order``, ``tol``, ``max_iter``, ``lam`` and ``p``; the default
        preset carries only the first three.

    NOTE(review): whether ``lam`` and ``p`` are honoured depends on the
    baseline method that receives them - confirm before relying on them.
    """
    # Rebuilt on every call so callers always receive a fresh dict.
    presets = (
        # flexible approach: high order, many iterations, very strong penalty
        (1, {
            "poly_order": 6,
            "tol": 0.0005,
            "max_iter": 400,
            "lam": 1e7,
            "p": 0.005,
        }),
        # more conservative approach: lower order for a smoother baseline
        (2, {
            "poly_order": 4,
            "tol": 0.0001,
            "max_iter": 300,
            "lam": 1e5,
            "p": 0.001,
        }),
        # balanced approach: moderate flexibility, strong penalty
        (3, {
            "poly_order": 5,
            "tol": 0.001,
            "max_iter": 250,
            "lam": 1e6,
            "p": 0.01,
        }),
    )

    for preset_id, preset in presets:
        if index == preset_id:
            return preset

    # Default parameters
    return {
        "poly_order": 6,
        "tol": 0.001,
        "max_iter": 200,
    }
        

# Build the baseline-correction step for `method_name` over `region`.
# NOTE(review): the parameters are given inline here; the `custom_params()`
# presets defined earlier in this cell are not used.
baseline_corrector = BaselineCorrection(region=region)
baseline_params = {"poly_order": 6, "tol": 0.001}
baseline_corrector_method = baseline_corrector.get_method(
    method_name=method_name,
    custom_params=baseline_params,
)
# baseline_corrector_method = Transformer1DBaseline()
pprint(baseline_corrector_method, indent=2)

# Preprocessing chain, applied in list order:
# crop -> despike -> Savitzky-Golay smoothing -> baseline correction -> SNV.
crop_step = rp.preprocessing.misc.Cropper(region=region)
despike_step = rp.preprocessing.despike.WhitakerHayes()
smooth_step = rp.preprocessing.denoise.SavGol(window_length=11, polyorder=3)
# Alternative normaliser: rp.preprocessing.normalise.Vector()
normalise_step = SNV()   # Use SNV normalization as in the Readme

preprocess_steps_test = [
    crop_step,
    despike_step,
    smooth_step,
    baseline_corrector_method,
    normalise_step,
]

In [None]:
# `save_pkl` is forwarded to the pipeline's `save_pkl` argument.
# `duplicate_save` forces preprocessing to run even when a pickle was loaded.
save_pkl = True
duplicate_save = True

# Folder that holds the preprocessed pickles for the current baseline method.
preprocessed_dir = os.path.join(
    CURRENT_DIR, "data", method_name, "preprocessed_data")

# Arguments shared by both CHUM preprocessing runs.
chum_common_kwargs = dict(
    region=region,
    preprocessing_steps=preprocess_steps_test,
    save_pkl=save_pkl,
)

# --- CHUM / benign ---
# The `is None` check treats a None result from load_pickle as "nothing cached".
chumDF_benign = load_pickle(os.path.join(preprocessed_dir, "chum_benign.pkl"))
if duplicate_save or chumDF_benign is None:
    chumDF_benign = RamanPipeline().preprocess(
        dfs=[processDFA1(chum_df[chum_df['Label'] == 'Benign'])],
        label=labels[0],
        visualize_steps=False,
        save_pkl_name="chum_benign",
        **chum_common_kwargs,
    )

# --- CHUM / cancer (this run also visualises the preprocessing steps) ---
chumDF_cancer = load_pickle(os.path.join(preprocessed_dir, "chum_cancer.pkl"))
if duplicate_save or chumDF_cancer is None:
    chumDF_cancer = RamanPipeline().preprocess(
        dfs=[processDFA1(chum_df[chum_df['Label'] == 'Cancer'])],
        label=labels[1],
        visualize_steps=True,
        max_plot_visualize_steps=20,
        show_parameters_in_title=True,
        save_pkl_name="chum_cancer",
        **chum_common_kwargs,
    )

In [None]:
# UHN cohort: try the cached pickle first. Preprocess again when nothing was
# loaded (load_pickle gave None) or whenever `duplicate_save` is set.
uhn_preprocessed = {}
for class_key, raw_label, class_label in (
    ("benign", "Benign", labels[0]),
    ("cancer", "Cancer", labels[1]),
):
    cached = load_pickle(os.path.join(
        CURRENT_DIR,
        "data", method_name,
        "preprocessed_data",
        f"uhn_{class_key}.pkl"
    ))
    if cached is None or duplicate_save:
        cached = RamanPipeline().preprocess(
            dfs=[processDFA1(uhn_df[uhn_df['Label'] == raw_label])],
            label=class_label,
            region=region,
            preprocessing_steps=preprocess_steps_test,
            visualize_steps=False,
            save_pkl=save_pkl,
            save_pkl_name=f"uhn_{class_key}"
        )
    uhn_preprocessed[class_key] = cached

uhnDF_benign = uhn_preprocessed["benign"]
uhnDF_cancer = uhn_preprocessed["cancer"]

In [None]:
# CHUQc-UL cohort: try the cached pickle first. Preprocess again when nothing
# was loaded (load_pickle gave None) or whenever `duplicate_save` is set.
chuq_preprocessed = {}
for class_key, raw_label, class_label in (
    ("benign", "Benign", labels[0]),
    ("cancer", "Cancer", labels[1]),
):
    cached = load_pickle(os.path.join(
        CURRENT_DIR,
        "data", method_name,
        "preprocessed_data",
        f"chuq_{class_key}.pkl"
    ))
    if cached is None or duplicate_save:
        cached = RamanPipeline().preprocess(
            dfs=[processDFA1(chuq_df[chuq_df['Label'] == raw_label])],
            label=class_label,
            region=region,
            preprocessing_steps=preprocess_steps_test,
            visualize_steps=False,
            save_pkl=save_pkl,
            save_pkl_name=f"chuq_{class_key}"
        )
    chuq_preprocessed[class_key] = cached

chuqDF_benign = chuq_preprocessed["benign"]
chuqDF_cancer = chuq_preprocessed["cancer"]

### MODEL TRAINING PREPARATION

In [None]:
# (container, tag) pairs for the cohorts used for evaluation.
cancer_spectra = [
    (uhnDF_cancer, "uhndf-cancer"),
    (chuqDF_cancer, "chuqdf-cancer"),
]
benign_spectra = [
    (chuqDF_benign, "chuqdf-benign"),
    (uhnDF_benign, "uhndf-benign"),
]

# Cancer groups first, then benign; the three lists below share this order.
evaluation_groups = cancer_spectra + benign_spectra
labels_spectra = [tag for _container, tag in evaluation_groups]
test_spectra = [container["processed"] for container, _tag in evaluation_groups]
# Flatten every container's "labels" entry into a single list.
true_labels = [
    sample_label
    for container, _tag in evaluation_groups
    for sample_label in container["labels"]
]

ramanML = RamanML()

In [None]:
kernel = "linear"
MODEL_SHORT = f"CCCV-SVC-{kernel.upper()}"
# MODEL_SHORT = "RF"  # Change to "SVC" for SVC model

# Every trainer receives the CHUM benign/cancer data and the same split settings.
training_inputs = dict(
    normal_data=([chumDF_benign["processed"]], labels[0]),
    disease_data=([chumDF_cancer["processed"]], labels[1]),
    param_search=False,
    test_size=0.2,
)

if "SVC" in MODEL_SHORT:
    svc_estimator = ramanML.SVCMODEL(
        kernel=kernel,
        C=1.0,
        gamma='scale',
        class_weight='balanced',
        probability=True,
    )
    if "CCCV" in MODEL_SHORT:
        # "CCCV" in the short name additionally passes calibration settings.
        mlresult = ramanML.train_svc(
            SVC_model=svc_estimator,
            calibrate={"method": 'sigmoid', "cv": 2, "ensemble": False},
            **training_inputs,
        )
    else:
        mlresult = ramanML.train_svc(SVC_model=svc_estimator, **training_inputs)
else:
    rf_estimator = ramanML.RFMODEL(
        n_estimators=200,                     # More trees for stability
        criterion="gini",                     # Or "entropy" for information gain
        max_depth=None,                       # Let trees grow deep (or set to 10-30 to reduce overfitting)
        min_samples_split=5,
        min_samples_leaf=10,
        max_features="sqrt",                  # Good default for classification
        class_weight="balanced_subsample",    # Important for imbalanced classes
        random_state=42,                      # For reproducibility
        n_jobs=-1,                            # Use all CPU cores
        bootstrap=True,                       # Default
        oob_score=True,
    )
    mlresult = ramanML.train_rf(RF_model=rf_estimator, **training_inputs)


# Report: confusion matrix, classification report, cross-validation accuracy.
pprint(translate_confusion_matrix(
    mlresult["confusion_matrix"], labels), indent=2)
pprint(mlresult["classification_report"], indent=2)
cv_scores = mlresult['cross_val_score']
console_log(f"CV Accuracy: {cv_scores.mean():.3f} ± {cv_scores.std():.3f}")
# console_log(f"Decision Function Score: {mlresult['decision_function_score'].mean():.3f} ± {mlresult['decision_function_score'].std():.3f}")

### TRAINED MODEL PREDICT

In [None]:
sample_indices = None

# Arguments common to both model families; the branch adds the specific ones.
predict_kwargs = {
    "test_spectra": test_spectra,
    "true_labels": true_labels,
    # "model": mlresult["model"],
}
if "SVC" in MODEL_SHORT:
    predict_kwargs.update(
        sample_indices=sample_indices,
        calculate_pca_boundary=True,
    )
else:
    predict_kwargs.update(
        positive_label=labels[1],
        threshold=0.62,
        use_threshold=True,
    )
predict_data = ramanML.predict(**predict_kwargs)

pprint(predict_data["label_percentages"], indent=2)
# pprint(predict_data["most_common_label"])
if 'y_true' in predict_data:
    y_true_count = len(predict_data['y_true'])
    pprint(f"y_true exist: {y_true_count}")
    # pprint(predict_data["prediction_breakdown"], indent=2)

### MODEL AND PREDICTED RESULT VISUALIZATION

In [None]:
visualizer = RamanVisualizer(ML_PROPERTY=ramanML)

# Presentation options for the heatmap.
heatmap_options = dict(
    class_labels=labels,  # class names, e.g. ["benign", "cancer"]
    title=f"{MODEL_SHORT} Confusion Matrix",
    normalize=False,
    cmap="Blues",
    figsize=(8, 6),
    fmt="d",
)
predict_accuracy, plotdata = visualizer.confusion_matrix_heatmap(
    y_true=predict_data["y_true"],  # true labels for test set
    y_pred=predict_data["y_pred"],  # predicted labels for test set
    **heatmap_options,
)

# Log one line per entry of the returned accuracy mapping.
console_log("Predict Accuracy:")
console_log("=====================================")
for label, percentage in predict_accuracy.items():
    console_log(f"{label}: {percentage:.2f}%")
# console_log("=====================================")
# pprint(predict_data["prediction_breakdown"], indent=2)

In [None]:
# get_shap = visualizer.shap_explain(reduce_features=False,
#                                    # nsamples=30,
#                                    # max_background_samples=30,
#                                    # max_test_samples=15,
#                                    show_plots=True,
#                                    fast_mode=True)

In [None]:
# Samples to inspect. Leave the list empty to draw one random entry from every
# non-empty group of predict_data["prediction_breakdown"] instead.
# NOTE(review): the indices presumably address the flattened test set -
# confirm they are in range for the current data.
sample_indices = [15, 2565, 2581, 315]  # [1538, 2565, 2678, 315]
if not sample_indices:
    for key, value in predict_data["prediction_breakdown"].items():
        try:
            if len(value) > 0:
                # randrange() excludes its upper bound. The earlier
                # randint(0, len(value)) includes it, so it could pick
                # len(value) and raise IndexError; that group was then skipped
                # with only a logged error.
                picked = value[random.randrange(len(value))]
                sample_indices.append(picked[0])
        except Exception as e:
            # Best effort: report the failing group and continue with the rest.
            console_log(f"Error processing value for key {key}: {e}")

console_log(f"Sample indices for inspection: {sample_indices}")
inspect_spectra = visualizer.inspect_spectra(test_spectra=test_spectra,
                                             true_labels=true_labels,
                                             n_samples=1,
                                             positive_label=labels[1],
                                             negative_label=labels[0],
                                             show_lime_plots=False,
                                             sample_indices=sample_indices,
                                             )

In [None]:
# One entry of `labels_spectra` per container in `test_spectra` (same order).
distribution_title = "Spectral Container Distribution"
visualizer.plot_container_distribution(
    title=distribution_title,
    container_labels=labels_spectra,
    spectral_containers=test_spectra,
)

In [None]:
# lime_spectra = visualizer.lime_explain(
#     test_spectra=test_spectra,
#     true_labels=true_labels,
#     sample_indices=[345],
#     positive_label=labels[1],
#     negative_label=labels[0],
#     show_plots=True,
# )

In [None]:
# 2-D PCA view with the decision boundary drawn.
pca_title = f"{MODEL_SHORT} PCA of Training Data"
visualizer.pca2d(
    show_decision_boundary=True,
    sample_limit=500,  # Limit samples for faster plotting
    title=pca_title,
)

### SAVING MODEL

In [None]:
save_model = MLModel()

# Class name of the fitted estimator, reused in several metadata fields.
model_class_name = ramanML._model.__class__.__name__

# Descriptive metadata about the model itself.
model_meta = {
    "model_short": MODEL_SHORT,
    "model_type": model_class_name,
    "model_name": f"{MODEL_SHORT} Raman Prostate Cancer Model",
    "model_version": "1.0",
    "model_description": f"{model_class_name} model for prostate cancer classification based on Raman spectroscopy data",
    "model_author": "MUHAMMAD HELMI BIN ROZAIN",
}

# Provenance and evaluation figures stored next to the model.
cv_scores = mlresult["cross_val_score"]
evaluation_meta = {
    "dataset": {
        "url": "https://datadryad.org/dataset/doi:10.5061/dryad.cjsxksn3p",
        "description": "Prostate cancer dataset from Dryad",
    },
    "predict_accuracy": predict_accuracy,
    "cross_val_score": {
        "std": cv_scores.std(),
        "mean": cv_scores.mean(),
    },
    "classification_report": mlresult["classification_report"],
    "training_time": mlresult["training_time"],
    "region": region,
    "preprocessing_info": chumDF_cancer.get("preprocessing_info", None),
}

save_model.save(
    model=ramanML._model,
    labels=labels,
    filename=f"{MODEL_SHORT}_raman_prostate_model",
    common_axis=ramanML.common_axis,
    n_features_in=ramanML.n_features_in,
    meta=model_meta,
    other_meta=evaluation_meta,
)

In [None]:
# MODEL_SHORTS = ["SVC-LINEAR", "RF"]
# for mshort in MODEL_SHORTS:
#     ramanML = RamanML()
#     if mshort == "SVC-LINEAR":
#         mlresult = ramanML.train_svc(normal_data=([chumDF_benign["processed"]], labels[0]),
#                                      disease_data=([chumDF_cancer["processed"]], labels[1]), param_search=False, test_size=0.2,
#                                      SVC_model=ramanML.SVCMODEL(kernel='linear', C=1.0, gamma='scale', class_weight='balanced'),)
#     else:
#         mlresult = ramanML.train_rf(normal_data=([chumDF_benign["processed"]], labels[0]),
#                                     disease_data=([chumDF_cancer["processed"]], labels[1]), param_search=False, test_size=0.2,
#                                     RF_model=ramanML.RFMODEL(n_estimators=200,           # More trees for stability
#                                                              criterion="gini",           # Or "entropy" for information gain
#                                                              # Let trees grow deep (or set to 10-30 to reduce overfitting)
#                                                              max_depth=None,
#                                                              min_samples_split=2,        # Default, can increase to reduce overfitting
#                                                              min_samples_leaf=1,         # Default, can increase to reduce overfitting
#                                                              max_features="log2",        # Good default for classification
#                                                              class_weight="balanced",    # Important for imbalanced classes
#                                                              random_state=42,            # For reproducibility
#                                                              n_jobs=-1,                  # Use all CPU cores
#                                                              bootstrap=True,             # Default
#                                                              oob_score=True,),)

#     predict_data = ramanML.predict(
#         test_spectra=test_spectra,
#         # model = mlresult["model"]
#     )

#     predict_accuracy, plotdata = RamanVisualizer(None).confusion_matrix_heatmap(
#         y_true=true_labels,          # true labels for test set
#         y_pred=predict_data["y_pred"],  # predicted labels for test set
#         # class names, e.g. ["benign", "cancer"]
#         class_labels=labels,
#         title="SVC Confusion Matrix",
#         normalize=False,
#         cmap="Blues",
#         figsize=(8, 6),
#         fmt="s",
#         show_heatmap=False,
#     )

#     save_to_onnx = MLModel()
#     save_to_onnx.save(
#         model=mlresult["model"],
#         labels=labels,
#         filename=f"{mshort}_raman_prostate_model",
#         common_axis=ramanML.common_axis,
#         n_features_in=ramanML.n_features_in,
#         meta={
#             "model_type": mlresult["model"].__class__.__name__,
#             "model_name": f"{mshort} raman prostate",
#             "model_version": "1.0",
#             "model_description": "SVC model for prostate cancer classification based on Raman spectroscopy data",
#             "model_author": "MUHAMMAD HELMI BIN ROZAIN",
#         },
#         other_meta={
#             "dataset": {"url": "https://datadryad.org/dataset/doi:10.5061/dryad.cjsxksn3p",
#                         "description": "Prostate cancer dataset from Dryad"},
#             "predict_accuracy": predict_accuracy,
#             "cross_val_score": {
#                 "std": mlresult["cross_val_score"].std(),
#                 "mean": mlresult["cross_val_score"].mean(),
#             },
#             "classification_report": mlresult["classification_report"],
#         }

#     )