In [None]:
### Use the FiftyOne library (Voxel51) to compute image embeddings for object detection datasets
import fiftyone as fo
import fiftyone.zoo as foz

In [None]:
# Load the 'resnet50-imagenet-torch' model from the FiftyOne model zoo
# (a ResNet-50 CNN trained on ImageNet). The same `model` object is passed
# to every compute_embeddings() call further down.
model = foz.load_zoo_model('resnet50-imagenet-torch')

### k-center greedy algorithm

In [3]:
import numpy as np
from sklearn.metrics import pairwise_distances

def update_distances(features, cluster_center, min_dist, only_new=True, reset_dist=False):
    """Refresh every sample's distance to its nearest selected center.

    Args:
        features: 2-D NumPy array with one row per sample.
        cluster_center: Row index (or list/array of row indices) into
            ``features`` of the newly added center(s). ``None`` or an empty
            sequence leaves ``min_dist`` untouched.
        min_dist: Current per-sample minimum distances, or ``None`` when no
            center has been processed yet.
        only_new: Unused; kept so existing callers keep working.
        reset_dist: When true, discard ``min_dist`` and start over.

    Returns:
        An ``(n_samples, 1)`` array with the Euclidean distance from each
        sample to its closest center, or ``None`` if no center has been
        supplied so far.
    """
    if reset_dist:
        min_dist = None

    # Normalise to an index array instead of relying on truthiness: a NumPy
    # array such as array([0]) is falsy and longer arrays are ambiguous.
    centers = None if cluster_center is None else np.atleast_1d(cluster_center)
    if centers is not None and centers.size > 0:
        x = features[centers]
        dist = pairwise_distances(features, x, metric='euclidean')
        # Collapse to the nearest *new* center first so the result keeps its
        # (n_samples, 1) shape even when several centers are added at once.
        nearest_new = np.min(dist, axis=1).reshape(-1, 1)

        if min_dist is None:
            min_dist = nearest_new
        else:
            min_dist = np.minimum(np.asarray(min_dist).reshape(-1, 1), nearest_new)

    return min_dist
        
def k_center(features, k, **kwargs):
    """Pick ``k`` centers with the greedy (farthest-first) k-center heuristic.

    The first center is drawn at random; every following center is the sample
    that is currently farthest from all centers chosen so far.

    Args:
        features: 2-D NumPy array with one row per sample.
        k: Number of centers to select.
        **kwargs: Optional ``seed`` (int) makes the random first center
            reproducible. Any other keyword is ignored.

    Returns:
        ``(cluster_centers, cluster_delta)`` where ``cluster_centers`` is the
        list of selected row indices (Python ints) and ``cluster_delta[i]`` is
        the largest sample-to-nearest-center distance (Python float) after
        ``i + 1`` centers have been chosen.
    """
    cluster_centers = []
    cluster_delta = []
    k_delta = 0
    n_obs = features.shape[0]
    min_dist = None
    seed = kwargs.get('seed')

    for _ in range(k):
        if min_dist is None:
            # Initialize centers with a randomly selected datapoint.
            if seed is None:
                ind = np.random.choice(np.arange(n_obs))
            else:
                ind = np.random.RandomState(seed).randint(n_obs)
        else:
            ind = np.argmax(min_dist)
        ind = int(ind)

        min_dist = update_distances(features, [ind], min_dist, only_new=True, reset_dist=False)
        cluster_centers.append(ind)
        # np.max + float() yields a scalar; builtin max() on the (n, 1)
        # column returned a 1-element array instead.
        k_delta = float(np.max(min_dist))
        cluster_delta.append(k_delta)
    print('Maximum distance from cluster centers is %0.2f'
            % k_delta)

    return cluster_centers, cluster_delta

### object detection datasets loading

In [None]:
# Load the validation split of COCO 2017 through the FiftyOne dataset zoo
# (no separate manual download of the dataset is needed).
coco = foz.load_zoo_dataset(
    "coco-2017",
    split="validation",
)

In [None]:
# Load the validation split of BDD100K. The dataset has to be downloaded
# manually, so the local copy in `source_dir` is handed to the zoo loader
# (same pattern as the Cityscapes cell).
source_dir = "../bdd100k/"
bdd = foz.load_zoo_dataset('bdd100k', split='validation', source_dir=source_dir)

In [None]:
# Load the validation split of Cityscapes from a manually downloaded copy.
source_dir = "../cityscapes/"
city = foz.load_zoo_dataset(
    "cityscapes",
    split="validation",
    source_dir=source_dir,
)

In [None]:
# Load the test split of KITTI through the dataset zoo
# (no separate manual download of the dataset is needed).
kitti = foz.load_zoo_dataset(
    "kitti",
    split="test",
)

In [None]:
# Load the validation split of PASCAL VOC 2012 through the dataset zoo
# (no separate manual download of the dataset is needed).
voc = foz.load_zoo_dataset(
    "voc-2012",
    split="validation",
)

In [None]:
# Build a FiftyOne dataset from our real-world driving images, which are
# fetched by data version from our version management system.
proj_name = 'Self Driving Project'
dan_port = '55299'
import sys
# Make the local `dataset` package importable; the guard keeps sys.path from
# growing by one entry every time this cell is re-run.
if 'dataset' not in sys.path:
    sys.path.insert(0,'dataset')
import katech_dataset
from katech_dataset import KATECHDetectionDataset

# Collect one sample per image path of every requested data version.
samples = []
data_versions = ['D0821']
for data_version in data_versions:
    print(data_version)
    dataset = KATECHDetectionDataset(dan_port, proj_name, data_version, 'train',
                                     katech_dataset.get_transform(train=False))
    # Iterate the images directly: the former enumerate() index was unused and
    # was bound to `_`, clobbering the notebook's last-output variable.
    samples.extend(fo.Sample(filepath=image['path']) for image in dataset.images)

# Create the dataset.
# NOTE(review): re-running this cell in the same session presumably fails if a
# dataset named 'D0821-dataset' already exists - confirm against FiftyOne.
katech1 = fo.Dataset('D0821-dataset')
katech1.add_samples(samples)

### classification datasets loading

In [None]:
# Load the test split of MNIST through the dataset zoo.
mnist = foz.load_zoo_dataset(
    "mnist",
    split="test",
)

In [None]:
# Load the test split of CIFAR-10 through the dataset zoo.
cifar10 = foz.load_zoo_dataset(
    "cifar10",
    split="test",
)

In [None]:
# Load the test split of Fashion-MNIST through the dataset zoo.
fashion = foz.load_zoo_dataset(
    "fashion-mnist",
    split="test",
)

### compute dataset embedding

In [None]:
# Embed every loaded dataset with the shared `model`; all calls use the same
# worker count, so it is kept in one place.
embed_kwargs = dict(num_workers=16)

# --- object detection datasets ---
coco_emb = coco.compute_embeddings(model, **embed_kwargs)
bdd_emb = bdd.compute_embeddings(model, **embed_kwargs)
cityscapes_emb = city.compute_embeddings(model, **embed_kwargs)
kitti_emb = kitti.compute_embeddings(model, **embed_kwargs)
katech1_emb = katech1.compute_embeddings(model, **embed_kwargs)

# --- classification datasets ---
cifar10_emb = cifar10.compute_embeddings(model, **embed_kwargs)
mnist_emb = mnist.compute_embeddings(model, **embed_kwargs)
fashion_emb = fashion.compute_embeddings(model, **embed_kwargs)

### compute core set of each dataset as a k-center computation using greedy algorithm (k=10)

In [None]:
# Select a 10-point core set per dataset with the greedy k-center routine.
# Each call returns the chosen row indices and the per-step maximum distance.
# The call order is kept as-is because every call draws a random first center.

# --- object detection datasets ---
coco_cluster, coco_delta = k_center(features=coco_emb, k=10)
kitti_cluster, kitti_delta = k_center(features=kitti_emb, k=10)
bdd_cluster, bdd_delta = k_center(features=bdd_emb, k=10)
cityscapes_cluster, cityscapes_delta = k_center(features=cityscapes_emb, k=10)
katech1_cluster, katech1_delta = k_center(features=katech1_emb, k=10)

# --- classification datasets ---
cifar10_cluster, cifar10_delta = k_center(features=cifar10_emb, k=10)
mnist_cluster, mnist_delta = k_center(features=mnist_emb, k=10)
fashion_cluster, fashion_delta = k_center(features=fashion_emb, k=10)

### compute the mean pairwise distance between each public dataset and our dataset (katech1) using all data samples

In [None]:
# Mean distance over all sample pairs between each public detection dataset
# (COCO, BDD, KITTI, Cityscapes - in that order) and our katech1 dataset.
for other_emb in (coco_emb, bdd_emb, kitti_emb, cityscapes_emb):
    dd = pairwise_distances(other_emb, katech1_emb)
    print(np.mean(dd))

### compute the mean pairwise distance between datasets using their core sets

In [364]:
# Mean distance between the core sets (k-center indices) of consecutive
# detection datasets: COCO -> BDD -> Cityscapes -> KITTI -> VOC -> katech1.

# The embedding and core-set cells above do not cover PASCAL VOC, so its
# embeddings and core set are computed here before they are used.
voc_emb = voc.compute_embeddings(model, num_workers=16)
voc_cluster, voc_delta = k_center(voc_emb, 10)

dd = pairwise_distances(coco_emb[coco_cluster], bdd_emb[bdd_cluster])
print(np.mean(dd))
# The Cityscapes results are stored as cityscapes_emb / cityscapes_cluster by
# the cells above (there is no city_emb / city_cluster).
dd = pairwise_distances(bdd_emb[bdd_cluster], cityscapes_emb[cityscapes_cluster])
print(np.mean(dd))
dd = pairwise_distances(cityscapes_emb[cityscapes_cluster], kitti_emb[kitti_cluster])
print(np.mean(dd))
dd = pairwise_distances(kitti_emb[kitti_cluster], voc_emb[voc_cluster])
print(np.mean(dd))
dd = pairwise_distances(voc_emb[voc_cluster], katech1_emb[katech1_cluster])
print(np.mean(dd))

22.45616293510311
12.568231143192158
10.32389547798818
22.374163230166786
21.872829284294383


In [588]:
# Mean distance between the core sets of the classification datasets:
# MNIST vs SVHN, MNIST vs Fashion-MNIST, MNIST vs CIFAR-10, SVHN vs CIFAR-10.
# NOTE(review): svhn_emb / svhn_cluster are not defined by any cell visible in
# this notebook (no SVHN dataset is loaded, embedded or clustered above) -
# confirm where they come from; as written this cell fails on a fresh kernel.
dd = pairwise_distances(mnist_emb[mnist_cluster], svhn_emb[svhn_cluster])
print(np.mean(dd))
dd = pairwise_distances(mnist_emb[mnist_cluster], fashion_emb[fashion_cluster])
print(np.mean(dd))
dd = pairwise_distances(mnist_emb[mnist_cluster], cifar10_emb[cifar10_cluster])
print(np.mean(dd))
dd = pairwise_distances(svhn_emb[svhn_cluster], cifar10_emb[cifar10_cluster])
print(np.mean(dd))

24.39969271968268
28.172643435389322
32.22887416115161
33.46390624008123
