# Step 1. Download the CIFAR-100 data and preprocess

**Output of this step:**

* testData: Input data of the TreeTC model ("rawData.csv")
* indices: Corresponding index of each image in the raw data

**Remark:**

* tensorflow==1.14.0
* keras==2.3.1

**Note:**

* Set the working directory to the directory where this code file is located.
* Please first install the required R and Python packages:
    * R: System_preparation.R
    * Python: pip install -r requirements.txt

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import numpy as np
import keras
from keras.datasets import cifar100

Using TensorFlow backend.


In [3]:
import tensorflow

# Report the versions of the two deep-learning packages this notebook relies on.
for pkg_label, pkg in (("tensorflow", tensorflow), ("keras", keras)):
    print(pkg_label + " version:", pkg.__version__)

tensorflow version: 1.14.0
keras version: 2.3.1


In [4]:
# Load CIFAR-100 with the fine-grained labels (label_mode: "fine" / "coarse").
train_split, test_split = cifar100.load_data(label_mode="fine")
X_train, y_train = train_split
X_test, y_test = test_split

# Show the shapes of the training images and of their labels.
for train_array in (X_train, y_train):
    print(train_array.shape)

(50000, 32, 32, 3)
(50000, 1)


## Normalization of cifar100 original data

In [5]:
# Flatten every image into a single row and divide by 255.
# The shape is derived from the array itself instead of being hard-coded to
# (50000, 3072), so the cell keeps working if the number of images changes.
X_train = X_train.reshape(X_train.shape[0], -1) / 255
print(X_train.shape)

(50000, 3072)


## Select 9 classes

Superclasses information:

| Superclass                     | Class                                               |
| :----------------------------- | :----------------------------------------------------------- |
| aquatic mammals                | 30 dolphin                                           |
| fish                           | 67 flatfish, 73 sharks                             |
| fruit and vegetables           | 53 oranges    |
| household furniture            | 20 chair                                             |
| large carnivores               | 43 lion                                              |
| large omnivores and herbivores | 21 chimpanzee                   |
| trees                          | 47 maple, 52 oak                              |

In [6]:
# Fine-label ids of the 9 classes kept for the experiment.
labels_select = [20, 21, 30, 43, 47, 52, 53, 67,  73]

# Boolean mask with one entry per training image: True when the image's label
# is one of the selected classes. np.isin does the membership test in one
# vectorised call; ravel() drops the trailing length-1 label axis and tolist()
# keeps the result a plain list of Python bools, as before.
labels_bool = np.isin(y_train, labels_select).ravel().tolist()

In [7]:
# Keep only the rows (images and labels) flagged by the class mask.
X_train_sub9, y_train_sub9 = X_train[labels_bool], y_train[labels_bool]

for subset in (X_train_sub9, y_train_sub9):
    print(subset.shape)

(4500, 3072)
(4500, 1)


## UMAP: dimension reduction

In [8]:
import umap

# Target dimensionality of the embedding.
n_components = 10

# UMAP configuration; random_state is fixed so the embedding is reproducible.
umap_settings = dict(
    n_neighbors=15,
    min_dist=0.1,
    n_components=n_components,
    metric='euclidean',
    random_state=42,
)
reducer = umap.UMAP(**umap_settings)

# Embed the selected images into n_components dimensions.
X_train_sub9_umap10D = reducer.fit_transform(X_train_sub9)

## Assign 4500 images to 30 sources

In [9]:
X_umap = X_train_sub9_umap10D

numOfSubjects = 30         # number of sources
numOfClasses = 3           # number of image classes for one source
m = 50                     # number of data in one image class for one source
numOfData_eachClass = 500  # total number of images in one image class

# One row per selected class. The first numOfData_eachClass columns hold the
# row positions (within the 9-class subset) of that class's images; the extra
# last column starts at 0 and is used by the assignment loop as a cursor to the
# next image of the class that has not been handed out yet.
indicesMat = np.zeros((len(labels_select), numOfData_eachClass + 1), dtype="int")
for row, label in enumerate(labels_select):
    indicesMat[row, :numOfData_eachClass] = np.where(y_train_sub9 == label)[0]

# Per-source data matrices (filled in later) and the flat list of the image
# positions in the order they are assigned.
DataOrigin = [0 for _ in range(numOfSubjects)]
indices = list()

# The three class triplets; sources are dealt to them in turn.
y1 = [20, 21, 30]
y2 = [43, 47, 52]
y3 = [53, 67, 73]

In [10]:
# Build the data matrix of every source: each source receives m images from
# each of the numOfClasses classes of its triplet (y1, y2 or y3, chosen by
# s % 3). Images of a class are handed out in order, using the last column of
# indicesMat as the per-class cursor.
class_groups = (y1, y2, y3)

for s in range(numOfSubjects):
    classIds = class_groups[s % 3]

    tmpMat = np.zeros((numOfClasses * m, X_umap.shape[1]))
    num = 0

    for tmp in classIds:
        # Scalar row of this class in indicesMat. Using a plain int (instead of
        # the 1-element array returned by np.where) avoids calling int() on an
        # ndarray, which newer NumPy releases deprecate.
        ind = labels_select.index(tmp)
        k = int(indicesMat[ind, numOfData_eachClass])
        selIds = indicesMat[ind, k:(k + m)]            # selIds: (m,)
        tmpMat[num:(num + m), :] = X_umap[selIds, :]
        indicesMat[ind, numOfData_eachClass] = k + m   # advance the cursor
        num = num + m
        indices.extend(selIds)

    DataOrigin[s] = tmpMat

# Attention: the indices stored here are 0-based, i.e. they equal the
# corresponding (1-based) R indices minus 1.
# If these Python indices are used when drawing the images, there is NO need
# to apply "clIds = TopClIds[cl, :] - 1".

In [11]:
# Stack the per-source matrices into one table, one row per assigned image.
testData = np.vstack(DataOrigin)

# Append the image position as an extra last column and write everything to
# a comma-separated file.
index_column = np.array(indices).reshape(-1, 1)
rawData = np.hstack((testData, index_column))
np.savetxt("./rawData.csv", rawData, delimiter=",")

# Step 2. Implement TreeTC and conduct MCMC

This step is carried out in "Step2_TreeTC_mcmc.R".


# Step 3. Draw Figure S9

In [12]:
import numpy as np
import pandas as pd
from matplotlib import pyplot

import PIL
from PIL import Image

import os, re

In [13]:
X = X_train_sub9
y = y_train_sub9

# Undo the /255 scaling and restore the (N, 32, 32, 3) image layout.
# Values are rounded before the cast because the floating-point round trip
# x / 255 * 255 is not guaranteed to be exact, and a plain cast truncates.
# np.uint8 is used directly (the np.int alias no longer exists in recent NumPy).
X_reshape = np.rint(X * 255).reshape(X.shape[0], 32, 32, 3).astype(np.uint8)

# Layout of each combined picture: nrow rows of ncol images.
ncol = 2
nrow = 5

# Names of the temporary single-image files written below (img0.png, img1.png, ...).
tmpImgPattern = re.compile(r"img\d+\.png")

for group_id in [1, 2, 3]:
    # Each row of the CSV lists the image positions of one cluster.
    TopClIds = pd.read_csv('./TopClIds/group' + str(group_id) + '.csv', header=None)
    TopClIds = np.array(TopClIds)

    outDir = "./image_group" + str(group_id)
    os.makedirs(outDir, exist_ok=True)  # imsave does not create directories

    for cl in range(TopClIds.shape[0]):
        clIds = TopClIds[cl, :]

        # Write every image of the cluster to a temporary PNG file.
        imgNames = [outDir + "/" + "img" + str(i) + ".png" for i in range(len(clIds))]
        for imgName, clId in zip(imgNames, clIds):
            pyplot.imsave(imgName, X_reshape[int(clId)])

        # Read the files back as arrays; each file is closed right away so it
        # can be deleted afterwards.
        imgs = []
        for imgName in imgNames:
            with PIL.Image.open(imgName) as img:
                imgs.append(np.array(img))

        # Tile the images: ncol side by side per row, nrow rows stacked.
        # np.hstack receives a list (passing a generator is deprecated).
        imgs_all = [np.hstack(imgs[j * ncol:(j + 1) * ncol]) for j in range(nrow)]
        imgs_comb = PIL.Image.fromarray(np.vstack(imgs_all))
        imgs_comb.save(outDir + "/" + "10Imgs-" + str(cl) + ".png")

    # Remove the temporary single-image files, keeping the combined pictures.
    for f in os.listdir(outDir):
        if tmpImgPattern.fullmatch(f):
            os.remove(os.path.join(outDir, f))

