# kmeans_clustering.py
# Forked from ildoonet/data-science-bowl-2018.
import os
import shutil
import numpy as np
from sklearn.cluster import KMeans
from tensorpack.dataflow.common import MapDataComponent
from tensorpack.dataflow import PrefetchData
from tqdm import tqdm
from data_augmentation import random_crop_224
from data_feeder import CellImageDataManagerTrainAll
# Source directory: handed to the data manager for clustering, and its
# immediate sub-directories are what gets copied into the output trees.
master_dir_train = '/data/public/rw/datasets/dsb2018/external_data/crowd_sourced/train_grayall_tnbc'
# Alternative source directory, currently disabled:
# master_dir_train = '/data/public/rw/datasets/dsb2018/extra_data'
# Destination roots for the training and validation parts of the split.
master_dir_origin_ext_train_kmeans = '/data/public/rw/datasets/dsb2018/train_kmeans_grayall_tnbc'
master_dir_origin_ext_valid_kmeans = '/data/public/rw/datasets/dsb2018/valid_kmeans_grayall_tnbc'
# Share of each cluster's members assigned to training; the rest go to validation.
ratio = 0.8
# Number of KMeans clusters the images are grouped into before splitting.
n_clusters = 4
def cluster_features(features, n_clusters=10):
    """Cluster feature vectors with KMeans and return the per-sample labels.

    Args:
        features: Data passed unchanged to ``KMeans.fit``.
        n_clusters: Number of clusters to fit.

    Returns:
        The ``labels_`` attribute of the fitted model.
    """
    # The seed is fixed (1111) so the estimator is initialised identically on every run.
    model = KMeans(n_clusters=n_clusters, random_state=1111).fit(features)
    return model.labels_
def get_test_valid_split_labels():
    """Cluster the training images and split each cluster into train/valid indices.

    Every datapoint of the dataflow is passed through ``random_crop_224``; the
    image of its first component is flattened into one feature vector. The
    vectors are clustered into ``n_clusters`` groups. For each cluster the
    leading ``ratio`` share of its member positions becomes the training part
    and the remainder the validation part; each part is then shuffled in
    place with ``np.random.shuffle``.

    Returns:
        A ``(train_lists, valid_lists)`` tuple. Each element is a list holding
        one 1-D integer array per cluster. The integers are positions in the
        order in which datapoints were read from the dataflow.
    """
    dataflow = CellImageDataManagerTrainAll(master_dir_train)
    dataflow = MapDataComponent(dataflow, random_crop_224)
    dataflow = PrefetchData(dataflow, 1000, 12)

    # NOTE(review): positions below follow dataflow iteration order. With a
    # 12-process PrefetchData that order may not match a directory listing of
    # master_dir_train -- confirm before resolving these positions to files.
    # np.stack needs equally shaped vectors; presumably random_crop_224
    # guarantees that -- TODO confirm.
    features = np.stack(
        [
            np.asarray(dp[0].image(is_gray=False)).flatten()
            for dp in tqdm(dataflow.get_data())
        ],
        axis=0,
    )
    labels = cluster_features(features, n_clusters=n_clusters)

    train_lists = []
    valid_lists = []
    for cluster_id in range(n_clusters):
        # Ascending positions of the samples assigned to this cluster.
        members = np.flatnonzero(labels == cluster_id)
        cut = int(members.shape[0] * ratio)
        train_part = members[:cut]
        valid_part = members[cut:]
        # Shuffling happens after the cut, so it only reorders each part.
        np.random.shuffle(train_part)
        np.random.shuffle(valid_part)
        train_lists.append(train_part)
        valid_lists.append(valid_part)

    return train_lists, valid_lists
def copy_clustered_image(train_lists, valid_lists):
    """Copy sample directories into the train and valid output trees.

    The immediate sub-directories of ``master_dir_train`` are listed once via
    ``os.walk``. For each of the first ``n_clusters`` entries of the two
    arguments, every index is resolved against that listing and the matching
    directory is copied with ``shutil.copytree`` into
    ``master_dir_origin_ext_train_kmeans`` or
    ``master_dir_origin_ext_valid_kmeans`` respectively.

    Args:
        train_lists: Sequence with one iterable of integer indices per cluster
            for the training split.
        valid_lists: Same layout, for the validation split.

    Raises:
        FileExistsError: From ``shutil.copytree`` when a destination directory
            is already present. Output from an earlier run is not removed
            here, so a second run over populated output trees fails.
        IndexError: If an index is outside the directory listing, or either
            argument has fewer than ``n_clusters`` entries.
    """
    train_root = master_dir_origin_ext_train_kmeans
    valid_root = master_dir_origin_ext_valid_kmeans

    # makedirs with exist_ok creates missing parents and avoids the
    # check-then-create race of os.path.exists + os.mkdir.
    os.makedirs(train_root, exist_ok=True)
    os.makedirs(valid_root, exist_ok=True)

    # NOTE(review): indices are resolved against this os.walk listing, whose
    # order is whatever the OS returns. This assumes the caller's indices were
    # produced in the same order -- confirm.
    sample_dirs = next(os.walk(master_dir_train))[1]

    for n in range(n_clusters):
        for indices, dst_root in ((train_lists[n], train_root),
                                  (valid_lists[n], valid_root)):
            for sample_idx in indices:
                name = sample_dirs[sample_idx]
                shutil.copytree(os.path.join(master_dir_train, name),
                                os.path.join(dst_root, name))

    print('=================== DONE ======================')
if __name__ == '__main__':
    # Compute the per-cluster split, print it, then copy the directories.
    train_split, valid_split = get_test_valid_split_labels()
    print("train_list: {}".format(train_split))
    print("valid_list: {}".format(valid_split))
    copy_clustered_image(train_split, valid_split)