In [None]:
import pandas as pd
import numpy as np
import statistics

from sklearn.cluster import KMeans
from sklearn.model_selection import StratifiedKFold

from sksurv.linear_model import CoxPHSurvivalAnalysis, CoxnetSurvivalAnalysis
from sksurv.svm import FastSurvivalSVM
from sksurv.ensemble import RandomSurvivalForest, GradientBoostingSurvivalAnalysis

# Histopathological features

In [None]:
# Load the histopathological feature table and keep the values of its
# 'pathology' column, in file order, as the reference ordering of samples.
df_histopathological_features = pd.read_csv('./histopathological_features.csv')
PATHOLOGY = list(df_histopathological_features.loc[:, 'pathology'])

## preprocess

In [None]:
# tissue: [ADI, BACK, DEB, LYM, MUC, MUS, NORM, STR, TUM]

def datapreprocess(dataframe = None, tissue = (0,0,0,1,1,0,0,1,1), weight = None):
    """Build the feature matrix and the structured survival target.

    For every tissue class flagged with ``1`` in ``tissue``, the 32 feature
    columns of that class (column positions ``12 + j*32`` up to
    ``12 + (j+1)*32``) are taken from ``dataframe``, multiplied by the per-row
    weight of that class, and concatenated in tissue order.

    Parameters
    ----------
    dataframe : pandas.DataFrame, optional
        Table holding the ``'OS'`` and ``'OS.time'`` columns plus the feature
        columns at the positions described above.  When omitted, the
        notebook-level ``df_histopathological_features`` is used; it is looked
        up when the function is called, so a re-loaded table is picked up.
    tissue : sequence of int, optional
        One flag per tissue class, in the order
        ADI, BACK, DEB, LYM, MUC, MUS, NORM, STR, TUM.  Only entries equal to
        ``1`` are selected.
    weight : array-like, optional
        Per-row, per-tissue multipliers; needs at least as many rows as
        ``dataframe`` and a column for every selected tissue.  Extra rows are
        ignored.  When omitted, every weight is 1 and the matrix is sized from
        ``dataframe`` instead of a fixed row count.

    Returns
    -------
    X : numpy.ndarray of float, shape (n_rows, 32 * number of selected tissues)
    y : numpy.ndarray, structured with fields ``'OS'`` (bool) and
        ``'OS_time'`` (float32), one entry per row of ``dataframe``.

    Raises
    ------
    IndexError
        If ``weight`` does not cover every row / selected tissue.
    """
    first_feature_column = 12   # feature blocks start after the leading columns
    features_per_tissue = 32    # width of one tissue's feature block

    if dataframe is None:
        dataframe = df_histopathological_features

    n_rows = len(dataframe)
    selected = [j for j, flag in enumerate(tissue) if flag == 1]

    if weight is None:
        weight_matrix = np.ones((n_rows, len(tissue)))
    else:
        weight_matrix = np.asarray(weight, dtype=float)

    # Weights are only consulted when there is something to scale.
    if selected and n_rows:
        if (weight_matrix.ndim != 2
                or weight_matrix.shape[0] < n_rows
                or weight_matrix.shape[1] <= max(selected)):
            raise IndexError('weight must provide a value for every row and every selected tissue')

    # X: one weighted (n_rows, 32) block per selected tissue, side by side.
    feature_blocks = []
    for j in selected:
        start = first_feature_column + j * features_per_tissue
        block = dataframe.iloc[:, start:start + features_per_tissue].to_numpy(dtype=float)
        feature_blocks.append(block * weight_matrix[:n_rows, j:j + 1])

    if feature_blocks:
        X = np.hstack(feature_blocks)
    else:
        X = np.empty((n_rows, 0))

    # y: read positionally so it stays aligned with X whatever the frame's index is.
    y = np.empty(n_rows, dtype=[('OS', '?'), ('OS_time', 'f')])
    y['OS'] = dataframe['OS'].to_numpy().astype(bool)
    y['OS_time'] = dataframe['OS.time'].to_numpy()

    return X, y

In [None]:
# Per-row tissue weights: the nine values found at column positions 3..11 of
# each row, each divided by their row total.
weight = []

for row_position in range(len(PATHOLOGY)):
    tile_counts = df_histopathological_features.iloc[row_position][3:12].tolist()

    # Accumulate left to right, starting from 0.
    tile_total = 0
    for count in tile_counts[:9]:
        tile_total = tile_total + count

    weight.append([count / tile_total for count in tile_counts])

In [None]:
# Build the weighted feature matrix and the survival target using the
# tissue flags at positions 3, 4, 7 and 8 of the tissue list.
X_hf, y = datapreprocess(
    dataframe=df_histopathological_features,
    tissue=[0, 0, 0, 1, 1, 0, 0, 1, 1],
    weight=weight,
)

# Segmentation features

In [None]:
# Segmentation-derived features, indexed by the 'pathology' column so rows can
# be looked up by identifier.
df_segmentation_features = pd.read_csv(
    './segmentation_features.csv',
    index_col='pathology',
)

## preprocess

In [None]:
# Segmentation feature matrix: one row per entry of PATHOLOGY (same order),
# one column per segmentation feature listed below.
_segmentation_feature_columns = (
    'max_tumor_area',
    'lymphocyte_inside_tumor',
    'lymphocyte_around_tumor',
    'around_inside_ratio',
    'total_stroma_area',
)

X_sf = np.array([
    [df_segmentation_features.at[pathology_id, column] for column in _segmentation_feature_columns]
    for pathology_id in PATHOLOGY
])

# Concatenation

In [None]:
# Final design matrix: histopathological columns followed by segmentation columns.
X = np.concatenate([X_hf, X_sf], axis=1)

# Fold splitting

In [None]:
# Collect, in PATHOLOGY order, the segmentation features and the survival
# columns stored in the segmentation table.
max_tumor_area = [df_segmentation_features.at[p, 'max_tumor_area'] for p in PATHOLOGY]
lymphocyte_inside_tumor = [df_segmentation_features.at[p, 'lymphocyte_inside_tumor'] for p in PATHOLOGY]
lymphocyte_around_tumor = [df_segmentation_features.at[p, 'lymphocyte_around_tumor'] for p in PATHOLOGY]
around_inside_ratio = [df_segmentation_features.at[p, 'around_inside_ratio'] for p in PATHOLOGY]
total_stroma_area = [df_segmentation_features.at[p, 'total_stroma_area'] for p in PATHOLOGY]
OS = [df_segmentation_features.at[p, 'OS'] for p in PATHOLOGY]
OS_time = [df_segmentation_features.at[p, 'OS.time'] for p in PATHOLOGY]

# Working table for the stratification below.  The row count follows PATHOLOGY
# instead of a fixed sample count, so a different cohort size no longer makes
# the constructor fail.  Column order matters: 'index' must stay the 9th
# column (position 8) because it is read positionally further down.
df_clustering = pd.DataFrame({
    'pathology': PATHOLOGY,
    'max_tumor_area': max_tumor_area,
    'lymphocyte_inside_tumor': lymphocyte_inside_tumor,
    'lymphocyte_around_tumor': lymphocyte_around_tumor,
    'around_inside_ratio': around_inside_ratio,
    'total_stroma_area': total_stroma_area,
    'OS': OS,
    'OS.time': OS_time,
    'index': np.arange(len(PATHOLOGY)),
    'label_cluster': np.zeros(len(PATHOLOGY)),
    'kmeans_cluster': np.zeros(len(PATHOLOGY))
})

## label_cluster

In [None]:
# Split the samples into six groups from (OS, OS.time):
#   OS == 0 -> groups 0..3 by the quartiles of OS.time among OS == 0 rows,
#   OS == 1 -> groups 4..5 by the median of OS.time among OS == 1 rows.
censored_rows = df_clustering['OS'] == 0
event_rows = df_clustering['OS'] == 1
follow_up = df_clustering['OS.time']

Q1_0, Q2_0, Q3_0 = np.quantile(np.array(follow_up[censored_rows]), [0.25, 0.5, 0.75])
Q2_1 = np.quantile(np.array(follow_up[event_rows]), 0.5)

# OS == 0: one mask per quartile interval; rows matching no interval
# (e.g. a missing OS.time) keep their current label.
df_clustering.loc[censored_rows & (follow_up <= Q1_0), 'label_cluster'] = 0
df_clustering.loc[censored_rows & (follow_up > Q1_0) & (follow_up <= Q2_0), 'label_cluster'] = 1
df_clustering.loc[censored_rows & (follow_up > Q2_0) & (follow_up <= Q3_0), 'label_cluster'] = 2
df_clustering.loc[censored_rows & (follow_up > Q3_0), 'label_cluster'] = 3

# OS == 1: at or below the median -> 4, everything else -> 5.
below_or_at_median = follow_up <= Q2_1
df_clustering.loc[event_rows & below_or_at_median, 'label_cluster'] = 4
df_clustering.loc[event_rows & ~below_or_at_median, 'label_cluster'] = 5

## kmeans_cluster

In [None]:
_DEFAULT_KMEANS_FEATURES = ('max_tumor_area', 'lymphocyte_inside_tumor', 'lymphocyte_around_tumor',
                            'around_inside_ratio', 'total_stroma_area')
# label_cluster 4 is fitted with 'total_stroma_area' in second position.
# NOTE(review): the order differs from the other groups; confirm whether that is intentional.
_LABEL4_KMEANS_FEATURES = ('max_tumor_area', 'total_stroma_area', 'lymphocyte_inside_tumor',
                           'lymphocyte_around_tumor', 'around_inside_ratio')


def _kmeans_within_label(label, n_clusters, first_id, feature_columns=_DEFAULT_KMEANS_FEATURES):
    """Run KMeans inside one label_cluster group and record the result.

    Rows of ``df_clustering`` whose 'label_cluster' equals ``label`` are
    clustered into ``n_clusters`` groups (``random_state=0``); each row's
    'kmeans_cluster' is set to ``first_id`` plus its KMeans label.

    Returns the group's sub-frame, its feature array, the fitted KMeans
    object and the KMeans labels.
    """
    group = df_clustering[df_clustering['label_cluster'] == label]
    features = np.array(group[list(feature_columns)])
    model = KMeans(n_clusters=n_clusters, random_state=0).fit(features)
    assigned = model.labels_

    # The 'index' column holds each row's label in df_clustering.
    row_labels = group['index'].to_numpy()
    df_clustering.loc[row_labels, 'kmeans_cluster'] = first_id + assigned

    return group, features, model, assigned


# label_cluster0 -> kmeans_cluster 0..3
df_0, X_kmeans_0, kmeans_0, y_kmeans_0 = _kmeans_within_label(0, n_clusters=4, first_id=0)

# label_cluster1 -> kmeans_cluster 4..6
df_1, X_kmeans_1, kmeans_1, y_kmeans_1 = _kmeans_within_label(1, n_clusters=3, first_id=4)

# label_cluster2 -> kmeans_cluster 7..9
df_2, X_kmeans_2, kmeans_2, y_kmeans_2 = _kmeans_within_label(2, n_clusters=3, first_id=7)

# label_cluster3 -> kmeans_cluster 10..11
df_3, X_kmeans_3, kmeans_3, y_kmeans_3 = _kmeans_within_label(3, n_clusters=2, first_id=10)

# label_cluster4 -> kmeans_cluster 12..14
df_4, X_kmeans_4, kmeans_4, y_kmeans_4 = _kmeans_within_label(
    4, n_clusters=3, first_id=12, feature_columns=_LABEL4_KMEANS_FEATURES)

# label_cluster5 -> kmeans_cluster 15..16
df_5, X_kmeans_5, kmeans_5, y_kmeans_5 = _kmeans_within_label(5, n_clusters=2, first_id=15)

# kmeans_cluster
kmeans_cluster = np.array(df_clustering['kmeans_cluster'])

## Bad folds splitting

In [None]:
# 5-fold split stratified on the kmeans_cluster ids.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
split_index = list(skf.split(X, kmeans_cluster))

# Pool the test parts of folds 3 and 4, then split the pool again in two,
# this time stratified on OS, and use the halves as the new folds 3 and 4.
badfolds = np.concatenate((split_index[3][1], split_index[4][1]))
df_badfolds = df_clustering.iloc[badfolds]

skf_badfolds = StratifiedKFold(n_splits=2, shuffle=True, random_state=0)
# All sample positions, taken from X rather than a fixed sample count, so the
# training part of each rebuilt fold is always "every sample not in its test part".
all_samples = np.arange(len(X))
k = 3
for _, newfold_index in skf_badfolds.split(badfolds, df_badfolds['OS']):
    new_test = badfolds[newfold_index]
    split_index[k] = (np.setdiff1d(all_samples, new_test), np.sort(new_test))
    k = k + 1

# Survival model

In [None]:
def lasso_cox(X, y, split_index):
    """Score a CoxnetSurvivalAnalysis model with ``l1_ratio=1.0`` on every fold.

    ``split_index`` is an iterable of ``(train_index, test_index)`` pairs used
    to index ``X`` and ``y``.  A fresh model is fitted on each training part
    and the value of its ``score`` on the matching test part is collected.

    Returns the list of per-fold scores, in fold order.
    """
    fold_scores = []

    for train_rows, test_rows in split_index:
        X_fit, y_fit = X[train_rows], y[train_rows]
        X_eval, y_eval = X[test_rows], y[test_rows]

        model = CoxnetSurvivalAnalysis(l1_ratio=1.0, max_iter=100000)
        model.fit(X_fit, y_fit)
        fold_scores.append(model.score(X_eval, y_eval))

    return fold_scores

In [None]:
def ridge_cox(X, y, split_index):
    """Score a CoxPHSurvivalAnalysis model (``alpha=1e-2``, ``n_iter=100``) on every fold.

    ``split_index`` is an iterable of ``(train_index, test_index)`` pairs used
    to index ``X`` and ``y``.  Returns the list of per-fold test scores, in
    fold order.
    """
    def score_fold(train_rows, test_rows):
        # Slice both parts first, then fit a fresh model and score it.
        X_fit, X_eval = X[train_rows], X[test_rows]
        y_fit, y_eval = y[train_rows], y[test_rows]

        model = CoxPHSurvivalAnalysis(alpha=1e-2, n_iter=100)
        model.fit(X_fit, y_fit)
        return model.score(X_eval, y_eval)

    return [score_fold(train_rows, test_rows) for train_rows, test_rows in split_index]

In [None]:
def en_cox(X, y, split_index):
    """Score a CoxnetSurvivalAnalysis model with ``l1_ratio=0.9`` on every fold.

    Each element of ``split_index`` is a ``(train_index, test_index)`` pair;
    a new model is fitted on the training rows of ``X``/``y`` and scored on
    the test rows.  Returns the per-fold scores as a list.
    """
    collected = []

    for fold in split_index:
        train_rows, test_rows = fold
        X_fit, X_eval = X[train_rows], X[test_rows]
        y_fit, y_eval = y[train_rows], y[test_rows]

        elastic_net = CoxnetSurvivalAnalysis(l1_ratio=0.9, max_iter=100000)
        elastic_net.fit(X_fit, y_fit)

        fold_score = elastic_net.score(X_eval, y_eval)
        collected.append(fold_score)

    return collected

In [None]:
def survival_svm(X, y, split_index):
    """Score a FastSurvivalSVM (``random_state=0``) on every fold.

    ``split_index`` yields ``(train_index, test_index)`` pairs.  For each pair
    a new model is fitted on the training rows and its ``score`` on the test
    rows is recorded.  Returns the recorded scores in fold order.
    """
    scores_per_fold = []

    for train_rows, test_rows in split_index:
        training_part = (X[train_rows], y[train_rows])
        held_out_part = (X[test_rows], y[test_rows])

        svm_model = FastSurvivalSVM(random_state=0)
        svm_model.fit(*training_part)
        scores_per_fold.append(svm_model.score(*held_out_part))

    return scores_per_fold

In [None]:
def random_survival_forest(X, y, split_index):
    """Score a RandomSurvivalForest (``random_state=0``) on every fold.

    ``split_index`` is an iterable of ``(train_index, test_index)`` pairs.
    Returns a list holding, for each pair, the ``score`` on the test rows of a
    forest fitted on the training rows.
    """
    results = []

    for train_rows, test_rows in split_index:
        X_fit = X[train_rows]
        X_eval = X[test_rows]
        y_fit = y[train_rows]
        y_eval = y[test_rows]

        forest = RandomSurvivalForest(random_state=0)
        forest.fit(X_fit, y_fit)

        results += [forest.score(X_eval, y_eval)]

    return results

In [None]:
def gradient_boosting(X, y, split_index):
    """Score a GradientBoostingSurvivalAnalysis model (``random_state=0``) on every fold.

    ``split_index`` is an iterable of ``(train_index, test_index)`` pairs used
    to index ``X`` and ``y``.  Returns the list of per-fold test scores, in
    fold order.
    """
    def evaluate(fold):
        # One fold: slice, fit a fresh model, score on the held-out rows.
        train_rows, test_rows = fold
        X_fit, X_eval = X[train_rows], X[test_rows]
        y_fit, y_eval = y[train_rows], y[test_rows]

        booster = GradientBoostingSurvivalAnalysis(random_state=0)
        booster.fit(X_fit, y_fit)
        return booster.score(X_eval, y_eval)

    return list(map(evaluate, split_index))

## 5-fold cv result

In [None]:
# Evaluate every survival model on the same folds.
LASSO_Cox = lasso_cox(X, y, split_index)
Ridge_Cox = ridge_cox(X, y, split_index)
EN_Cox = en_cox(X, y, split_index)
SSVM = survival_svm(X, y, split_index)
RSF = random_survival_forest(X, y, split_index)
GBRT = gradient_boosting(X, y, split_index)

# Per-fold scores of each model.
print('Folds result:')
print(f'    LASSO_Cox: {LASSO_Cox}')
print(f'    Ridge_Cox: {Ridge_Cox}')
print(f'    EN_Cox:    {EN_Cox}')
print(f'    SSVM:      {SSVM}')
print(f'    RSF:       {RSF}')
print(f'    GBRT:      {GBRT}')
# Mean over the six models for each fold.  Iterating over the score lists
# themselves means the number of folds follows split_index instead of being
# fixed at five.
FOLDS_AVERAGE = [
    (lasso + ridge + en + ssvm + rsf + gbrt) / 6
    for lasso, ridge, en, ssvm, rsf, gbrt in zip(LASSO_Cox, Ridge_Cox, EN_Cox, SSVM, RSF, GBRT)
]
print(f'    Average:   {FOLDS_AVERAGE}')

print()

# Mean over the folds for each model, then the mean of those six means.
print('Folds average:')
print(f'    LASSO_Cox: {statistics.mean(LASSO_Cox)}')
print(f'    Ridge_Cox: {statistics.mean(Ridge_Cox)}')
print(f'    EN_Cox:    {statistics.mean(EN_Cox)}')
print(f'    SSVM:      {statistics.mean(SSVM)}')
print(f'    RSF:       {statistics.mean(RSF)}')
print(f'    GBRT:      {statistics.mean(GBRT)}')
AVERAGE = (statistics.mean(LASSO_Cox)+statistics.mean(Ridge_Cox)+statistics.mean(EN_Cox)+statistics.mean(SSVM)+statistics.mean(RSF)+statistics.mean(GBRT))/6
print(f'    Average:   {AVERAGE}')