In [1]:
from itertools import product

import pandas as pd
import numpy as np
from sklearn.utils import resample
from sklearn.metrics import (accuracy_score, recall_score, precision_score,
                             roc_auc_score, confusion_matrix)
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import (GridSearchCV, StratifiedKFold,
                                     train_test_split)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import resample
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA
from scipy import stats
from statsmodels.stats.contingency_tables import mcnemar

from src.radiomics.models.data import load_data, get_formatted_data
from src.radiomics.models.utils import RemoveHighlyCorrelatedFeatures

In [2]:

# Clinical table indexed by patient id. The path is relative to the notebook
# directory, like the "../data/..." paths passed to load_data further down,
# so the notebook does not depend on one user's home directory.
clinical_df = pd.read_csv("../data/clinical_info_updated.csv").set_index(
    "patient_id")

In [3]:
# Keep every patient except the four listed ones, preserving the original
# row order of the clinical table.
ids = [
    patient_id for patient_id in clinical_df.index
    if patient_id not in
    {"PatientLC_71", "PatientLC_21", "PatientLC_63", "PatientLC_72"}
]
clinical_df = clinical_df.loc[ids]


In [4]:
clinical_df.shape

(105, 16)

In [5]:
def get_gridsearch(n_jobs=23, random_state=None):
    """Build the (unfitted) grid search over the radiomics pipeline.

    The pipeline standardises the features, applies
    ``RemoveHighlyCorrelatedFeatures``, reduces the feature set (either
    ``SelectKBest`` with ``f_classif`` or ``PCA``) and ends with a logistic
    regression or a random forest. The grid holds 32 candidates, scored by
    ROC AUC over a default ``StratifiedKFold``; the best candidate is
    refitted on the full data passed to ``fit``.

    Parameters
    ----------
    n_jobs : int, default=23
        Number of parallel jobs handed to ``GridSearchCV``. The default
        keeps the value the notebook was originally run with.
    random_state : int or None, default=None
        Seed forwarded to both classifiers (the ``sag`` solver and the
        random forest are stochastic). ``None`` keeps the original,
        unseeded behaviour.

    Returns
    -------
    GridSearchCV
        A new, unfitted search object.
    """
    # NOTE(review): penalty="none" is kept as originally written; that
    # spelling was deprecated in scikit-learn 1.2 and removed in 1.4 in
    # favour of penalty=None - adjust when upgrading.
    clf_lr = LogisticRegression(penalty="none",
                                solver='sag',
                                max_iter=1000,
                                random_state=random_state)
    clf_rf = RandomForestClassifier(random_state=random_state)

    # The last two steps are placeholders; every grid entry below fills them.
    pipe = Pipeline(steps=[
        ('normalization', StandardScaler()),
        ('feature_removal', RemoveHighlyCorrelatedFeatures()),
        ('feature_selection', None),
        ('classifier', None),
    ])

    F_OPTIONS = [1, 2, 3, 5]  # k values tried for SelectKBest
    K_OPTIONS = list(range(1, 11))  # n_components values tried for PCA

    param_grid = [
        {
            'feature_selection': [SelectKBest(f_classif)],
            'feature_selection__k': F_OPTIONS,
            'classifier': [clf_rf],
            'classifier__n_estimators': [100, 150]
        },
        {
            'feature_selection': [SelectKBest(f_classif)],
            'feature_selection__k': F_OPTIONS,
            'classifier': [clf_lr],
        },
        {
            'feature_selection': [PCA()],
            'feature_selection__n_components': K_OPTIONS,
            'classifier': [clf_lr],
        },
        {
            'feature_selection': [PCA()],
            'feature_selection__n_components': K_OPTIONS,
            'classifier': [clf_rf],
        },
    ]

    return GridSearchCV(pipe,
                        param_grid,
                        cv=StratifiedKFold(),
                        n_jobs=n_jobs,
                        refit=True,
                        verbose=1,
                        scoring="roc_auc")



In [6]:
search = get_gridsearch()

In [7]:
# Load the radiomics feature file together with the clinical info through the
# project helper. `df` is read as a global by get_scores and
# get_scores_fusion. The "_auto" feature file is kept commented out as an
# alternative input.
df  = load_data(
    "../data/processed/radiomics/extracted_features.csv",
    # "../data/processed/radiomics/extracted_features_auto.csv",
    "../data/clinical_info_updated.csv",
)

In [48]:
# Outer cross-validation passed to get_scores: 5 stratified folds, shuffled
# with a fixed seed so the fold assignment is reproducible.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2)

In [49]:
def get_scores(*, search, skf, modality, voi):
    """Collect held-out predictions of ``search`` for one modality and VOI.

    For every split yielded by ``skf`` the search is fitted on the training
    rows and then used to predict the test rows of that split. Features and
    target come from ``get_formatted_data`` applied to the module-level
    ``df``.

    Parameters
    ----------
    search : estimator
        Object exposing ``fit``, ``predict`` and ``predict_proba`` (e.g. the
        result of ``get_gridsearch``). It is refitted in place on each split.
    skf : cross-validation splitter
        Object exposing ``split(X, y)``.
    modality, voi
        Forwarded unchanged to ``get_formatted_data``.

    Returns
    -------
    pandas.DataFrame
        Copy of the target frame with two added columns: ``y_pred`` (output
        of ``predict``) and ``y_pred_proba`` (second column of
        ``predict_proba``). Rows never placed in a test split keep the
        placeholder value 0.
    """
    X, y = get_formatted_data(df, modality=modality, voi=voi, return_df=True)
    y_output = y.copy()
    y_output["y_pred"] = 0
    # Float placeholder: probabilities are written fold by fold, and writing
    # floats into an integer column relies on silent upcasting, which pandas
    # has deprecated.
    y_output["y_pred_proba"] = 0.0
    pred_col = y_output.columns.get_loc("y_pred")
    proba_col = y_output.columns.get_loc("y_pred_proba")

    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        # Squeeze the target frame slice down to the array passed to fit.
        y_train = np.squeeze(y.iloc[train_index].values)

        search.fit(X_train, y_train)

        y_output.iloc[test_index, pred_col] = np.squeeze(
            search.predict(X_test))
        y_output.iloc[test_index, proba_col] = np.squeeze(
            search.predict_proba(X_test)[:, 1])

    return y_output


In [50]:
def get_scores_fusion(*, skf, voi):
    """Collect held-out predictions of a late-fusion PT + CT model.

    Two independent grid searches (from ``get_gridsearch``) are fitted per
    split, one on the PT features and one on the CT features. Their
    ``predict_proba`` outputs (second column) are stacked into a two-column
    input for a logistic regression, which produces the final prediction.
    Features and target come from ``get_formatted_data`` applied to the
    module-level ``df``.

    Parameters
    ----------
    skf : cross-validation splitter
        Object exposing ``split(X, y)``.
    voi
        Forwarded unchanged to ``get_formatted_data`` for both modalities.

    Returns
    -------
    pandas.DataFrame
        Copy of the target frame (taken from the CT call) with two added
        columns: ``y_pred`` and ``y_pred_proba`` from the fusion model. Rows
        never placed in a test split keep the placeholder value 0.
    """
    # assumes the PT and CT frames hold the same patients in the same order,
    # since both are sliced with the same positional indices - TODO confirm
    X_pt, _ = get_formatted_data(df, modality="PT", voi=voi, return_df=True)
    X_ct, y = get_formatted_data(df, modality="CT", voi=voi, return_df=True)
    y_output = y.copy()
    y_output["y_pred"] = 0
    # Float placeholder: probabilities are written fold by fold, and writing
    # floats into an integer column relies on silent upcasting, which pandas
    # has deprecated.
    y_output["y_pred_proba"] = 0.0
    pred_col = y_output.columns.get_loc("y_pred")
    proba_col = y_output.columns.get_loc("y_pred_proba")

    search_pt = get_gridsearch()
    search_ct = get_gridsearch()
    model_fusion = LogisticRegression(penalty="none",
                                      solver='sag',
                                      max_iter=1000)

    def stack_probas(x_pt, x_ct):
        """Return the two searches' second-class probabilities as (n, 2)."""
        return np.stack(
            [
                search_pt.predict_proba(x_pt)[:, 1],
                search_ct.predict_proba(x_ct)[:, 1],
            ],
            axis=1,
        )

    for train_index, test_index in skf.split(X_pt, y):
        X_train_pt, X_test_pt = X_pt.iloc[train_index], X_pt.iloc[test_index]
        X_train_ct, X_test_ct = X_ct.iloc[train_index], X_ct.iloc[test_index]
        y_train = np.squeeze(y.iloc[train_index].values)

        search_pt.fit(X_train_pt, y_train)
        search_ct.fit(X_train_ct, y_train)
        # NOTE(review): the fusion model is fitted on probabilities the two
        # searches predict for the very rows they were just refitted on;
        # out-of-fold probabilities would be the usual stacking input -
        # confirm this is intended.
        model_fusion.fit(stack_probas(X_train_pt, X_train_ct), y_train)

        test_probas = stack_probas(X_test_pt, X_test_ct)
        y_output.iloc[test_index, pred_col] = model_fusion.predict(test_probas)
        y_output.iloc[test_index, proba_col] = model_fusion.predict_proba(
            test_probas)[:, 1]

    return y_output



In [51]:
def compute_all_mcnemar(x1, x2):
    """Run three variants of McNemar's test on the cross-table of x1 and x2.

    The contingency table is ``confusion_matrix(x1, x2)``, i.e. the two
    label vectors tabulated against each other (not against a ground truth).

    Returns a dict with the table under ``"confusion_matrix"`` and the
    p-values of the asymptotic test without correction (``"pvalue"``), the
    asymptotic test with ``correction=True`` (``"pvalue_corrected"``) and the
    exact test (``"pvalue_exact"``).
    """
    table = confusion_matrix(x1, x2)
    # Result key -> keyword arguments of the matching mcnemar call.
    variants = (
        ("pvalue", {"exact": False, "correction": False}),
        ("pvalue_corrected", {"exact": False, "correction": True}),
        ("pvalue_exact", {"exact": True, "correction": False}),
    )
    results = {"confusion_matrix": table}
    for key, options in variants:
        results[key] = mcnemar(table, **options).pvalue
    return results

In [52]:
# Sanity check of the exact test on a table with a huge diagonal: the
# p-value shown below equals 2 * P(X <= 1) for X ~ Binomial(9, 0.5)
# = 20 / 512, i.e. it is driven by the off-diagonal counts (8 and 1) only.
cm = np.array([[100000, 8], [1, 100000]])
mcnemar(cm, exact=True, correction=False).pvalue

0.0390625

In [53]:
# output_1 = get_scores_fusion(skf=skf, voi="GTV_L")


In [54]:
# Held-out predictions for modality "CT" on VOI "GTV_L" (one grid-search fit
# per outer fold, hence the five log lines below).
output_1 = get_scores(search=search, skf=skf, modality="CT", voi="GTV_L")

Fitting 5 folds for each of 32 candidates, totalling 160 fits
Fitting 5 folds for each of 32 candidates, totalling 160 fits
Fitting 5 folds for each of 32 candidates, totalling 160 fits
Fitting 5 folds for each of 32 candidates, totalling 160 fits
Fitting 5 folds for each of 32 candidates, totalling 160 fits


In [55]:
# Held-out predictions for modality "PT" on VOI "GTV_L"; the same `search`
# object is reused and refitted on every fold.
output_2 = get_scores(search=search, skf=skf, modality="PT", voi="GTV_L")

Fitting 5 folds for each of 32 candidates, totalling 160 fits
Fitting 5 folds for each of 32 candidates, totalling 160 fits
Fitting 5 folds for each of 32 candidates, totalling 160 fits
Fitting 5 folds for each of 32 candidates, totalling 160 fits
Fitting 5 folds for each of 32 candidates, totalling 160 fits


In [56]:
patient_ids = output_1.index

In [57]:
# McNemar tests on the CT predictions (output_1) cross-tabulated against the
# PT predictions (output_2) for the same patients.
compute_all_mcnemar(
    output_1.loc[patient_ids, "y_pred"].values,
    output_2.loc[patient_ids, "y_pred"].values,
)


{'confusion_matrix': array([[26, 10],
        [13, 56]]),
 'pvalue': 0.5316145768816123,
 'pvalue_corrected': 0.6766573217164242,
 'pvalue_exact': 0.6776394844055175}

In [18]:
accuracy_score(output_1.plc_status, output_1.y_pred)

0.8

In [19]:
accuracy_score(output_2.plc_status, output_2.y_pred)

0.8380952380952381

In [20]:
roc_auc_score(output_1.plc_status, output_1.y_pred_proba)

0.8326981707317074

In [21]:
roc_auc_score(output_2.plc_status, output_2.y_pred_proba)

0.8405106707317073

In [22]:
clinical_df.columns

Index(['plc_status', 'patient_age', 'patient_sex', 'SUVmax_lesion',
       'SUVmean_lesion', 'MTV', 'TLG', 'PET_lymphangitis_Visual_analysis',
       'Peri bronchovascular thickening', 'LymphangitisCT', 'pT', 'pN', 'M',
       'stage', 'pathologic type', 'is_chuv'],
      dtype='object')

In [30]:
# col_clin = "PET_lymphangitis_Visual_analysis"
# col_clin = "LymphangitisCT"
col_clin = "Peri bronchovascular thickening"

In [31]:
# Restrict to patients whose selected clinical variable is not missing.
# Note: this overwrites the `patient_ids` defined earlier from output_1.index.
patient_ids = clinical_df[~clinical_df[col_clin].isna()].index

In [32]:
# McNemar tests on the binarised clinical variable (non-zero -> True)
# cross-tabulated against the CT predictions (output_1).
compute_all_mcnemar(
    clinical_df.loc[patient_ids, col_clin] != 0,
    output_1.loc[patient_ids, "y_pred"],
)


{'confusion_matrix': array([[28, 26],
        [ 5, 39]]),
 'pvalue': 0.00016213175204392638,
 'pvalue_corrected': 0.00032801631501352865,
 'pvalue_exact': 0.00019219517707824712}

In [35]:
roc_auc_score(output_1.loc[patient_ids, "plc_status"],output_1.loc[patient_ids, "y_pred_proba"])

0.8228070175438597

In [36]:
# ROC AUC obtained when the raw clinical variable itself is used as the score
# for plc_status, on the same patient subset.
roc_auc_score(clinical_df.loc[patient_ids, "plc_status"],
              clinical_df.loc[patient_ids, col_clin])


0.7807017543859649

In [37]:
# McNemar tests on the binarised clinical variable (non-zero -> True)
# cross-tabulated against the PT predictions (output_2).
compute_all_mcnemar(
    clinical_df.loc[patient_ids, col_clin] != 0,
    output_2.loc[patient_ids, "y_pred"],
)

{'confusion_matrix': array([[30, 24],
        [ 6, 38]]),
 'pvalue': 0.0010150009471130682,
 'pvalue_corrected': 0.0019107751373644388,
 'pvalue_exact': 0.0014309063553810128}

In [None]:
roc_auc_score(output_2.loc[patient_ids, "plc_status"],output_2.loc[patient_ids, "y_pred_proba"])