In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import db_utils
from dataset_utils import load_dataset

%load_ext autoreload
%autoreload 2
# %matplotlib inline

In [2]:
# Known dataset names. The list is informational only in the visible cells:
# the dataset actually analysed is selected through `dataset_name` below.
datasets = [
    'MNIST-SMALL', 'COIL20', 'COUNTRY-2014',
    'BREAST-CANCER95', 'MPI', 'DIABETES',
]

# Dataset used by every following cell, and the folder holding its stored
# embeddings.
dataset_name = 'COIL20'
embedding_dir = './output/{}'.format(dataset_name)

# `load_dataset` comes from the project's dataset_utils module; it yields the
# data matrix, the per-point values used for colouring, and a third `labels`
# value that is not used in the visible cells.
X, y, labels = load_dataset(dataset_name)

Loading dataset: COIL20


In [None]:
# Read every stored result ('*.z' file) found in `embedding_dir`. Each file is
# expected to unpickle to an object exposing `embedding_` and `get_params()`
# (presumably a fitted t-SNE estimator -- confirm against the code that wrote
# the files). One flattened embedding and one perplexity are kept per file.
import joblib
import os

perps, embeddings = [], []

for file in os.listdir(embedding_dir):
    # Anything that is not a stored result is ignored.
    if not file.endswith('.z'):
        continue
    in_name = os.path.join(embedding_dir, file)
    tsne_obj = joblib.load(in_name)
    # Flatten so that all embeddings can be stacked into one 2-D array;
    # they are reshaped back to (n_points, 2) when plotted.
    embeddings.append(tsne_obj.embedding_.ravel())
    perps.append(tsne_obj.get_params()['perplexity'])

# Row i of `embeddings` corresponds to `perps[i]`.
embeddings = np.array(embeddings)

In [None]:
# Sanity check: number of loaded perplexities and shape of the stacked
# embedding array (one row per loaded file).
print(len(perps), embeddings.shape)

In [None]:
# find the most closed perplexity to the given rounded value
# e.g, given `rounded_perp = 5`, return `real_perp = 5.014803659274878`

def _approximated_closed(rounded_perp):
    # always return the closet real_perp
    diff = [abs(real_perp - rounded_perp) for real_perp in perps]
    idx = diff.index(min(diff))
    return perps[idx]

In [None]:
# Quick check: look up the stored perplexity that is nearest to 5.
_approximated_closed(5)

In [None]:
def _scatter(ax, rounded_perp_val):
    """Draw on ``ax`` the stored embedding nearest to ``rounded_perp_val``.

    The embedding is looked up in the notebook-level ``perps`` /
    ``embeddings`` pair and coloured with the notebook-level ``y``.

    Args:
        ax: matplotlib axes to draw on.
        rounded_perp_val: requested perplexity; it is also the value shown in
            the x-axis caption (not the real, matched perplexity).
    """
    nearest_perp = _approximated_closed(rounded_perp_val)
    row = perps.index(nearest_perp)

    # Stored embeddings are flattened; restore the (n_points, 2) layout.
    points = embeddings[row].reshape(-1, 2)
    xs, ys = points[:, 0], points[:, 1]
    ax.scatter(xs, ys, c=y, alpha=0.3, cmap='tab10')

    # The x axis only carries the perplexity caption: no tick labels and no
    # tick marks. The y axis is hidden entirely.
    ax.axes.set_xlabel('perplexity={}'.format(rounded_perp_val))
    ax.set_xticklabels([])
    ax.tick_params('x', length=0)
    ax.get_yaxis().set_visible(False)

In [None]:
# Requested perplexities; each panel shows the stored embedding whose real
# perplexity is nearest to the requested value.
rounded_perp_vals = [1, 5, 20, 50, 1500]

# rcParams are read when text artists are created, so the font size has to be
# set BEFORE the figure and its axis labels exist. Setting it afterwards only
# took effect on a second run of the cell.
plt.rcParams.update({'font.size': 16})

fig, axes = plt.subplots(1, len(rounded_perp_vals), figsize=(20,5))

for i, rounded_perp_val in enumerate(rounded_perp_vals):
    ax = axes[i]
    _scatter(ax, rounded_perp_val)

plt.tight_layout()
# The ./plots directory must already exist.
plt.savefig('./plots/{}_examples.pdf'.format(dataset_name))

### Show the manual constraints in the embedding (update 20181012)

In [3]:
# Load the manually selected constraints for the current dataset.
# NOTE(review): pickle.load can execute arbitrary code while unpickling --
# only use it on constraint files produced by this project.
import pickle

constraint_file = './output/manual_constraints/{}.pkl'.format(dataset_name)
# Use a context manager so the file handle is closed as soon as the data has
# been read (the bare open() call left it open).
with open(constraint_file, 'rb') as constraint_fh:
    constraints = pickle.load(constraint_fh)

# Each entry is a pair of point indices.
must_links = constraints['mustlinks']
cannot_links = constraints['cannotlinks']

In [41]:
# Inspect the must-link constraints (pairs of point indices).
must_links

[[7, 27], [1169, 860], [366, 1043], [1003, 22], [1104, 1348]]

In [10]:
from sklearn.manifold import TSNE
from matplotlib import pyplot as plt
from matplotlib.offsetbox import OffsetImage, AnnotationBbox
from io import BytesIO
import struct
import base64

In [17]:
# t-SNE model with a fixed perplexity and a fixed seed for reproducibility.
tsne = TSNE(perplexity=50, random_state=2018)

In [18]:
# Embed the dataset; X2d is indexed as X2d[:, 0] / X2d[:, 1] when plotted.
X2d = tsne.fit_transform(X)

In [79]:
# Embedded coordinates of both endpoints of every constraint pair:
# entry k holds (position of first point, position of second point).
ml_pos = np.array([(X2d[first], X2d[second]) for first, second in must_links])
cl_pos = np.array([(X2d[first], X2d[second]) for first, second in cannot_links])
cannot_links

[[917, 834], [150, 306], [1391, 1035], [1216, 875], [623, 638]]

In [84]:
# Manually chosen positions for the image thumbnails of the constrained
# points. Each row is one constraint: [position for its first point,
# position for its second point], in the same order as the corresponding
# list of links. The values were collected by clicking on the plot.

# Thumbnail positions for the must-link constraints.
ml_img_pos = np.array([
    [[18, 12], [38, 20]],
    [[33, 40], [-9, 40]],
    [[-38, -18], [-31, -50]],
    [[-5, 25], [18, 25]],
    [[39, -5], [19, -1]],
])

# Thumbnail positions for the cannot-link constraints.
cl_img_pos = np.array([
    [[11, -22], [-33, -25]],
    [[-46, 8], [18, -8]],
    [[-34, 6], [-30, -46]],
    [[32, -15], [-31, 30]],
    [[-46, 37], [19, 42]],
])

In [94]:
# Note the unusual workflow:
# set GET_POSITION_MODE on or off
# ONLY WORK WITH ONE TYPE OF CONSTRAINT AT A TIME
# in the GET_POSITION_MODE (on), show the id of the constrained points
# remember to show the list of constrained point (one type of constraint)
# click to qt-scatter to select the position for image of each point (remember to click in order)
# Must convert the list of clicked points into the list of positions as in `ml_img_pos`

%matplotlib qt
fig, ax = plt.subplots(figsize=(12, 10))
GET_POSITION_MODE = False

# scatter plot for the embedding
ax.scatter(X2d[:,0], X2d[:,1], c=y, alpha=0.1, cmap='tab10')
plt.axis('off')

if GET_POSITION_MODE:
    plt.axis('on')
    info = []
    
    def onclick(event):
        # print(event)
        info.append([int(event.xdata), int(event.ydata)])
    cid = fig.canvas.mpl_connect('button_press_event', onclick)

artists = []

def show_image(idx, x, y, box_x, box_y, zoom=1., img_size=32):
    """Annotate the embedded point ``idx`` located at ``(x, y)``.

    In GET_POSITION_MODE only the point id is written next to the point.
    Otherwise the image of the point is drawn at ``(box_x, box_y)`` with an
    arrow pointing to ``(x, y)``, and the created artist is appended to the
    notebook-level ``artists`` list.

    Args:
        idx: row of ``X`` holding the image data of the point.
        x, y: position of the point in the embedding (data coordinates).
        box_x, box_y: position at which the image box is placed.
        zoom: scaling factor passed to ``OffsetImage``.
        img_size: the row is reshaped to ``img_size x img_size``.
    """
    if GET_POSITION_MODE:
        ax.annotate(str(idx), xy=(x,y), xytext=(x,y))
        return

    # Reshape the flat row to a square image, transpose it and invert the
    # intensities (presumably pixel values lie in 0..255 -- TODO confirm).
    pixels = X[idx].reshape(img_size, img_size).T
    thumbnail = OffsetImage(255.0 - pixels, zoom=zoom, cmap='gray')

    arrow_style = {'arrowstyle': "->", 'linestyle': ':', 'alpha': 0.5}
    annotation = AnnotationBbox(
        thumbnail,
        xy=(x, y),
        xybox=(box_x, box_y),
        xycoords='data',
        frameon=False,
        arrowprops=arrow_style,
    )
    artists.append(ax.add_artist(annotation))

# plot the constraints
def plot_links(links, point_ids, img_pos, style='-', color='black'):
    """Draw every constraint as a line between its two embedded points.

    The three sequences are walked in parallel, one entry per constraint.

    Args:
        links: per constraint, the embedded positions of its two points.
        point_ids: per constraint, the ids of its two points.
        img_pos: per constraint, the positions of the two image boxes.
        style: line style of the connecting line.
        color: colour of the line and of the endpoint markers.
    """
    for endpoints, ids, boxes in zip(links, point_ids, img_pos):
        pt_a, pt_b = endpoints
        id_a, id_b = ids
        box_a, box_b = boxes

        xs = [pt_a[0], pt_b[0]]
        ys = [pt_a[1], pt_b[1]]
        ax.plot(xs, ys, linestyle=style, color=color, lw=1.)
        ax.scatter(x=xs, y=ys, marker='o', color=color)

        # Annotate both endpoints (id or image, depending on the mode).
        for point_id, point, box in ((id_a, pt_a, box_a), (id_b, pt_b, box_b)):
            show_image(point_id, point[0], point[1], box_x=box[0], box_y=box[1])
        
# Draw the must-link constraints (solid blue lines). Only one constraint type
# is drawn per run; the commented call below is the cannot-link variant
# (dotted red lines).
plot_links(links=ml_pos, point_ids=must_links, img_pos=ml_img_pos, style='-', color='blue')
# plot_links(links=cl_pos, point_ids=cannot_links, img_pos=cl_img_pos, style=':', color='red')

plt.tight_layout()
# NOTE(review): the file name is hard-coded for COIL20 / must-links; it
# presumably has to be changed by hand when plotting cannot-links or another
# dataset -- confirm.
plt.savefig('./plots/example_constrains_COIL20_ML.pdf')

In [83]:
# Show the clicked positions. `info` only exists after the plotting cell has
# been run with GET_POSITION_MODE = True.
info

[[11, -22],
 [-33, -25],
 [-46, 8],
 [18, -8],
 [-34, 6],
 [-30, -46],
 [32, -15],
 [-31, 30],
 [-46, 37],
 [19, 42]]