In [1]:
import pandas as pd
import numpy as np
from sklearn.utils import resample
from sklearn.metrics import (accuracy_score, recall_score, precision_score,
                             roc_auc_score, confusion_matrix)
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import (GridSearchCV, StratifiedKFold,
                                     train_test_split)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import resample
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA


In [2]:
clinical_df = pd.read_csv("/home/valentin/python_wkspce/plc_segmentation/data/clinical_info_updated.csv").set_index("patient_id")

In [3]:
clinical_df.shape

(107, 16)

In [4]:
ids = clinical_df.index
ids = [
    i for i in ids if i not in
    ["PatientLC_71", "PatientLC_21", "PatientLC_63", "PatientLC_72"]
]
clinical_df = clinical_df.loc[ids, :]


In [10]:
len(list(set(ids).intersection(ids_test)))

82

In [11]:
clinical_df[(clinical_df.plc_status == 1) & (
    clinical_df.plc_status != clinical_df["PET_lymphangitis_Visual_analysis"])
            & (clinical_df.is_chuv == 1) &
            (~clinical_df["PET_lymphangitis_Visual_analysis"].isna())].shape


(2, 16)

In [12]:
df_test = clinical_df.loc[ids_test, :]

In [13]:
roc_auc_score(df_test["plc_status"], df_test["LymphangitisCT"])

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [14]:
def append_score(df, y_true, y_pred, y_score, pos_label=1, neg_label=0):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    sensitivity = recall_score(y_true, y_pred, pos_label=pos_label)
    precision = precision_score(y_true, y_pred, pos_label=pos_label)
    specificity = tn / (tn + fp)
    npv = tn / (tn + fn) if (tn + fn) != 0 else 0
    accuracy = accuracy_score(y_true, y_pred)
    roc_auc = roc_auc_score(y_true, y_score)
    return df.append(
        {
            'roc_auc': roc_auc,
            'specificity': specificity,
            'sensitivity': sensitivity,
            'accuracy': accuracy,
            'precision': precision,
            'npv': npv,
        },
        ignore_index=True)


In [15]:
df_test.columns

Index(['plc_status', 'patient_age', 'patient_sex', 'SUVmax_lesion',
       'SUVmean_lesion', 'MTV', 'TLG', 'PET_lymphangitis_Visual_analysis',
       'Peri bronchovascular thickening', 'LymphangitisCT', 'pT', 'pN', 'M',
       'stage', 'pathologic type', 'is_chuv'],
      dtype='object')

In [16]:
def print_results(d):
    for key, item in d.items():
        print(f"{key}: {item[0]:0.2f} ({item[1]:0.2f} - {item[2]:0.2f})")



In [17]:
def bootstrap_pred(y_true, y_pred, y_score, bootstraps=1000):
    assert len(y_true) == len(y_pred)
    assert len(y_true) == len(y_score)

    score = pd.DataFrame()
    for _ in range(bootstraps):
        y_true_resampled, y_pred_resampled, y_score_resampled = resample(
            y_true,
            y_pred,
            y_score,
            replace=True,
            n_samples=len(y_true),
            stratify=y_true,
        )

        score = append_score(score,
                             y_true_resampled,
                             y_pred_resampled,
                             y_score_resampled,
                             pos_label=1)

    ic_score = {
        'roc_auc': [],
        'specificity': [],
        'sensitivity': [],
        'accuracy': [],
        'precision': [],
        'npv': []
    }

    for col in score.columns:
        ic_score[col].append(np.mean(score[col].values))
        ic_score[col].append(np.percentile(score[col].values, 2.5))
        ic_score[col].append(np.percentile(score[col].values, 97.5))
    return ic_score


In [49]:
df_test.columns

Index(['plc_status', 'patient_age', 'patient_sex', 'SUVmax_lesion',
       'SUVmean_lesion', 'MTV', 'TLG', 'PET_lymphangitis_Visual_analysis',
       'Peri bronchovascular thickening', 'LymphangitisCT', 'pT', 'pN', 'M',
       'stage', 'pathologic type', 'is_chuv'],
      dtype='object')

In [50]:
# X_test = df_test[''].values
# X_test = df_test['Peri bronchovascular thickening'].values
X_test = df_test['PET_lymphangitis_Visual_analysis'].values
y_test = df_test.plc_status.values

In [51]:
df_test["PET_lymphangitis_Visual_analysis"]

patient_id
PatientLC_14    1.0
PatientLC_17    0.0
PatientLC_47    0.0
PatientLC_49    1.0
PatientLC_35    1.0
               ... 
PatientLC_86    1.0
PatientLC_91    0.0
PatientLC_36    1.0
PatientLC_6     1.0
PatientLC_4     1.0
Name: PET_lymphangitis_Visual_analysis, Length: 82, dtype: float64

In [52]:
print_results(bootstrap_pred(y_test, (X_test != 0), X_test))


roc_auc: 0.88 (0.78 - 0.96)
specificity: 0.80 (0.60 - 0.95)
sensitivity: 0.95 (0.89 - 1.00)
accuracy: 0.91 (0.85 - 0.96)
precision: 0.94 (0.88 - 0.98)
npv: 0.85 (0.70 - 1.00)


In [22]:
def clean_df(df):
    col_to_drop = [c for c in df.columns if c.startswith('diagnostics')]
    col_to_drop.extend(
        [c for c in df.columns if 'glcm' in c and 'original' not in c])
    col_to_drop.extend([c for c in df.columns if 'RootMeanSquared' in c])
    col_to_drop.extend(
        [c for c in df.columns if '_MeanAbsoluteDeviation' in c])
    col_to_drop.extend([c for c in df.columns if '_Median' in c])
    col_to_drop.extend([c for c in df.columns if '_Range' in c])
    col_to_drop.extend([c for c in df.columns if '_InterquartileRange' in c])
    col_to_drop.extend([c for c in df.columns if 'Percentile' in c])
    col_to_drop.extend([c for c in df.columns if '_RootMeanSquared' in c])
    col_to_drop.extend([c for c in df.columns if '_TotalEnergy' in c])
    col_to_drop.extend([c for c in df.columns if '_Uniformity' in c])
    col_to_drop.extend([c for c in df.columns if 'wavelet' in c])
    col_to_drop.extend([c for c in df.columns if 'shape' in c])

    return df.drop(col_to_drop, axis=1)



In [23]:
def load_data(path_to_features, path_to_outcomes):
    df = pd.read_csv(path_to_features)
    df = df[~df.patient_id.isin(
        ["PatientLC_71", "PatientLC_21", "PatientLC_63", "PatientLC_72"])]
    clinical_df = pd.read_csv(path_to_outcomes).set_index("patient_id")
    df["plc_status"] = df["patient_id"].map(
        lambda x: clinical_df.loc[x, "plc_status"])

    df["is_chuv"] = df["patient_id"].map(
        lambda x: clinical_df.loc[x, "is_chuv"])

    df = df.drop(["Unnamed: 0"], axis=1)
    df = clean_df(df)

    return df


def get_formatted_data(
    df,
    modality="CT",
    voi="GTV_L",
    ids_train=None,
    ids_test=None,
):
    df = df[(df["modality"] == modality) & (df["voi"] == voi)]
    ids = df["patient_id"].values
    df = df.set_index("patient_id")

    outcomes_df = df["plc_status"]
    df = df.drop(["plc_status", "voi", "modality"], axis=1)

    X_train = df.loc[ids_train, :].values
    X_test = df.loc[ids_test, :].values
    feature_names = df.columns

    y_train = outcomes_df.loc[ids_train].values
    y_test = outcomes_df.loc[ids_test].values

    return X_train, X_test, y_train, y_test, feature_names

In [24]:
def get_gridsearch():
    scaler = StandardScaler()

    clf_lr = LogisticRegression(penalty="none", solver='sag', max_iter=1000)
    clf_rf = RandomForestClassifier()

    pipe = Pipeline(steps=[
        ('normalization', scaler),
        ('feature_selection', None),
        ('classifier', None),
    ])

    F_OPTIONS = [1, 2, 3, 5]
    K_OPTIONS = [k for k in range(1, 11)]

    param_grid = [
        {
            'feature_selection': [SelectKBest(f_classif)],
            'feature_selection__k': F_OPTIONS,
            'classifier': [clf_rf],
            'classifier__n_estimators': [100, 150]
        },
        {
            'feature_selection': [SelectKBest(f_classif)],
            'feature_selection__k': F_OPTIONS,
            'classifier': [clf_lr],
        },
        {
            'feature_selection': [PCA()],
            'feature_selection__n_components': K_OPTIONS,
            'classifier': [clf_lr],
        },
        {
            'feature_selection': [PCA()],
            'feature_selection__n_components': K_OPTIONS,
            'classifier': [clf_rf],
        },
    ]

    return GridSearchCV(pipe,
                        param_grid,
                        cv=StratifiedKFold(),
                        n_jobs=23,
                        refit=True,
                        verbose=1,
                        scoring="roc_auc")



In [25]:
search = get_gridsearch()

In [30]:
df  = load_data(
    "../data/processed/radiomics/extracted_features.csv",
    # "../data/processed/radiomics/extracted_features_auto.csv",
    "../data/clinical_info_updated.csv",
)

In [31]:
df["voi"]

0      GTV_L
1      GTV_T
2      GTV_N
3      GTV_L
4      GTV_T
       ...  
649    GTV_T
650    GTV_N
651    GTV_L
652    GTV_T
653    GTV_N
Name: voi, Length: 630, dtype: object

In [32]:
X_train, X_test, y_train, y_test, feature_names = get_formatted_data(df, modality="PT", voi="GTV_L",ids_test=ids_test, ids_train=ids_train)

In [33]:
X_train.shape

(23, 77)

In [34]:
search.fit(X_train, y_train)

Fitting 5 folds for each of 32 candidates, totalling 160 fits


 nan nan nan nan nan nan nan nan nan nan nan nan nan nan]


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=Pipeline(steps=[('normalization', StandardScaler()),
                                       ('feature_selection', None),
                                       ('classifier', None)]),
             n_jobs=23,
             param_grid=[{'classifier': [RandomForestClassifier()],
                          'classifier__n_estimators': [100, 150],
                          'feature_selection': [SelectKBest(k=1)],
                          'feature_selection__k': [1...
                          'feature_selection__k': [1, 2, 3, 5]},
                         {'classifier': [LogisticRegression(max_iter=1000,
                                                            penalty='none',
                                                            solver='sag')],
                          'feature_selection': [PCA()],
                          'feature_selection__n_components': [1, 2, 3, 4, 5, 6,
     

In [35]:
search.best_estimator_

Pipeline(steps=[('normalization', StandardScaler()),
                ('feature_selection', SelectKBest(k=1)),
                ('classifier', RandomForestClassifier())])

In [36]:
print_results(bootstrap_pred(y_test, search.predict(X_test), search.predict_proba(X_test)[:, 1]))

roc_auc: 0.65 (0.55 - 0.73)
specificity: 0.90 (0.75 - 1.00)
sensitivity: 0.31 (0.19 - 0.42)
accuracy: 0.45 (0.37 - 0.54)
precision: 0.91 (0.76 - 1.00)
npv: 0.30 (0.25 - 0.34)


In [37]:
X_train, X_test, y_train, y_test, feature_names = get_formatted_data(df, modality="PT", voi="autoGTV_L",ids_test=ids_test, ids_train=ids_train)

KeyError: "None of [Index(['Case_14', 'Case_4', 'Case_5', 'Case_10', 'Case_1', 'Case_9', 'Case_17',\n       'Case_16', 'PatientLC_54', 'Case_18', 'PatientLC_94', 'Case_19',\n       'Case_3', 'Case_7', 'Case_11', 'PatientLC_81', 'Case_13',\n       'PatientLC_93', 'Case_12', 'PatientLC_61', 'Case_2', 'Case_20',\n       'Case_15'],\n      dtype='object', name='patient_id')] are in the [index]"

In [38]:
suvmax_test = X_test[:, feature_names == "original_firstorder_Maximum"]

In [39]:
np.mean(suvmax_test)

2.780850169498746

In [40]:
print_results(bootstrap_pred(y_test, (suvmax_test>0.7), suvmax_test))

roc_auc: 0.78 (0.63 - 0.91)
specificity: 0.20 (0.05 - 0.40)
sensitivity: 0.97 (0.92 - 1.00)
accuracy: 0.78 (0.73 - 0.83)
precision: 0.79 (0.76 - 0.83)
npv: 0.67 (0.25 - 1.00)
