Imports

In [None]:
import os
import numpy as np
from collections import Counter
from tqdm import tqdm
import shutil

import tensorflow as tf
from keras import layers
from keras.models import Model, Sequential
from keras.preprocessing.image import load_img 
from keras.applications.xception import preprocess_input 
from keras.applications.xception import Xception 

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from k_means_constrained import KMeansConstrained

In [None]:
# Directory whose sub-folders are iterated by main(); each sub-folder is
# clustered independently.
ROOT_DIR = r"Kather_norm"
# Directory under which the "<source folder>_CLUSTER_<i>" folders are created.
TARGET_DIR = r'Kather_decomposed_v2'
# Input shape handed to Xception in get_model().
INPUT_SHAPE = (299,299,3)

In [None]:
def get_model():
    """Build the ImageNet-pretrained Xception network.

    The network is created with its top included (``include_top=True``)
    and with the module-level ``INPUT_SHAPE`` as input shape.

    Returns:
        The model object produced by ``Xception(...)``.
    """
    xception = Xception(
        include_top=True,
        weights='imagenet',
        input_shape=INPUT_SHAPE,
    )
    return xception

In [None]:
def extract_features(img_path, extractor):
    """Compute the feature vector of a single image.

    The image is loaded at 299x299, passed through Xception's
    ``preprocess_input``, wrapped into a batch of one and fed to
    ``extractor.predict``; the prediction is flattened to one dimension.

    Args:
        img_path: Path of the image file to load.
        extractor: Object exposing ``predict``; its flattened output for
            one image is expected to hold 2048 values.

    Returns:
        A 1-D array of shape ``(2048,)``.

    Raises:
        AssertionError: If the input batch or the flattened prediction
            does not have the expected shape.
    """
    pixels = np.asarray(load_img(img_path, target_size=(299, 299)))
    batch = np.expand_dims(preprocess_input(pixels), axis=0)
    assert batch.shape == (1, 299, 299, 3)

    prediction = extractor.predict(batch)
    features = prediction.reshape(-1)
    assert features.shape == (2048,)
    return features

In [None]:
def get_cluster_labels(folder_path, extractor,
                       n_components, n_clusters, random_state):
    """Cluster the images of one folder and return a label per file.

    Features are extracted for every file with ``extract_features``,
    reduced with PCA and then grouped with KMeans.

    Args:
        folder_path: Folder whose files are clustered.
        extractor: Feature extractor forwarded to ``extract_features``.
        n_components: Forwarded to ``PCA(n_components=...)``.
        n_clusters: Forwarded to ``KMeans(n_clusters=...)``.
        random_state: Seed forwarded to both PCA and KMeans.

    Returns:
        Dict mapping each file name (not the full path) to its integer
        cluster label. Empty if the folder contains no files.
    """
    # os.listdir returns entries in arbitrary order. Sorting keeps the row
    # order of the feature matrix stable, so the seeded PCA/KMeans result is
    # reproducible across runs and machines. Sub-directories are skipped
    # because they cannot be loaded as images.
    fnames = sorted(
        entry for entry in os.listdir(folder_path)
        if os.path.isfile(os.path.join(folder_path, entry))
    )
    if not fnames:
        # Nothing to cluster; PCA/KMeans would raise on an empty matrix.
        return {}

    # Row i of `features` belongs to fnames[i].
    features = np.stack([
        extract_features(os.path.join(folder_path, fname), extractor)
        for fname in tqdm(fnames)
    ])

    pca = PCA(n_components=n_components, random_state=random_state)
    features_t = pca.fit(features).transform(features)

    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    labels = kmeans.fit_predict(features_t)

    # Plain ints so callers can use the labels directly as list indices.
    return {fname: int(label) for fname, label in zip(fnames, labels)}

In [None]:
def write_new_classes(source_folder_path, extractor, 
                     n_components=0.95, n_clusters=2, random_state=123):
    """Split one source folder into per-cluster folders under TARGET_DIR.

    Creates ``<TARGET_DIR>/<source folder name>_CLUSTER_<i>`` for each
    cluster index, clusters the files of ``source_folder_path`` with
    ``get_cluster_labels`` and copies every file into the folder of its
    cluster.

    Files already present in the cluster folders are not removed, so
    copies left over from an earlier run stay in place.

    Args:
        source_folder_path: Folder containing the files to cluster.
        extractor: Feature extractor forwarded to ``get_cluster_labels``.
        n_components: PCA ``n_components`` forwarded to ``get_cluster_labels``.
        n_clusters: Number of clusters, and of target folders created.
        random_state: Seed forwarded to ``get_cluster_labels``.
    """
    # normpath drops a trailing separator and basename takes the last path
    # component using the platform's own separators. Splitting on '\\' only
    # worked for Windows-style paths and returned the whole path elsewhere.
    source_folder = os.path.basename(os.path.normpath(source_folder_path))
    target_folder_path = os.path.join(TARGET_DIR, source_folder)
    new_folders = [f'{target_folder_path}_CLUSTER_{i}'
                   for i in range(n_clusters)]
    for folder in new_folders:
        # Also creates missing parents and tolerates existing folders.
        os.makedirs(folder, exist_ok=True)

    cluster_labels = get_cluster_labels(
        source_folder_path, extractor, n_components, n_clusters, random_state)
    for fname, label in cluster_labels.items():
        src = os.path.join(source_folder_path, fname)
        dst = os.path.join(new_folders[label], fname)
        shutil.copyfile(src, dst)

In [None]:
def main():
    """Cluster every sub-folder of ROOT_DIR into TARGET_DIR.

    Builds the model with ``get_model``, derives a feature extractor whose
    output is the model's second-to-last layer, prints its summary, and
    calls ``write_new_classes`` for each sub-folder of ``ROOT_DIR``.
    """
    model = get_model()
    # Use the output of the layer just before the final one as features.
    extractor = Model(inputs=model.inputs, outputs=model.layers[-2].output)
    extractor.summary()

    os.makedirs(TARGET_DIR, exist_ok=True)

    # Sorted for a stable processing order; os.listdir order is arbitrary.
    for folder in sorted(os.listdir(ROOT_DIR)):
        folder_path = os.path.join(ROOT_DIR, folder)
        if not os.path.isdir(folder_path):
            # Stray files in ROOT_DIR are not class folders; skip them
            # instead of failing when trying to list their contents.
            continue
        print(f"Working folder {folder}")
        write_new_classes(folder_path, extractor)
    print("Finished")

In [None]:
# Run the full pipeline: prints the extractor summary, then one progress bar
# per source folder while its files are processed.
main()

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            [(None, 299, 299, 3) 0                                            
__________________________________________________________________________________________________
block1_conv1 (Conv2D)           (None, 149, 149, 32) 864         input_4[0][0]                    
__________________________________________________________________________________________________
block1_conv1_bn (BatchNormaliza (None, 149, 149, 32) 128         block1_conv1[0][0]               
__________________________________________________________________________________________________
block1_conv1_act (Activation)   (None, 149, 149, 32) 0           block1_conv1_bn[0][0]            
____________________________________________________________________________________________

100%|████████████████████████████████████████████████████████████████████████████████| 625/625 [00:33<00:00, 18.83it/s]


Working folder 02_STROMA


100%|████████████████████████████████████████████████████████████████████████████████| 625/625 [00:31<00:00, 19.74it/s]


Working folder 03_COMPLEX


100%|████████████████████████████████████████████████████████████████████████████████| 625/625 [00:32<00:00, 19.29it/s]


Working folder 04_LYMPHO


100%|████████████████████████████████████████████████████████████████████████████████| 625/625 [00:32<00:00, 19.20it/s]


Working folder 05_DEBRIS


100%|████████████████████████████████████████████████████████████████████████████████| 625/625 [00:33<00:00, 18.78it/s]


Working folder 06_MUCOSA


100%|████████████████████████████████████████████████████████████████████████████████| 625/625 [00:32<00:00, 19.02it/s]


Working folder 07_ADIPOSE


100%|████████████████████████████████████████████████████████████████████████████████| 625/625 [00:21<00:00, 29.32it/s]


Working folder 08_EMPTY


100%|████████████████████████████████████████████████████████████████████████████████| 625/625 [00:21<00:00, 29.54it/s]


Finished


In [None]:
# Sanity check: print the number of entries in each folder under TARGET_DIR.
for folder in os.listdir(TARGET_DIR):
    folder_path = os.path.join(TARGET_DIR, folder)
    # Counts every directory entry, not only image files.
    num_files = len(os.listdir(folder_path))
    print(folder, num_files)

01_TUMOR_CLUSTER_0 260
01_TUMOR_CLUSTER_1 365
02_STROMA_CLUSTER_0 254
02_STROMA_CLUSTER_1 371
03_COMPLEX_CLUSTER_0 284
03_COMPLEX_CLUSTER_1 341
04_LYMPHO_CLUSTER_0 226
04_LYMPHO_CLUSTER_1 399
05_DEBRIS_CLUSTER_0 390
05_DEBRIS_CLUSTER_1 235
06_MUCOSA_CLUSTER_0 264
06_MUCOSA_CLUSTER_1 361
07_ADIPOSE_CLUSTER_0 378
07_ADIPOSE_CLUSTER_1 247
08_EMPTY_CLUSTER_0 307
08_EMPTY_CLUSTER_1 318
