# segmentation

In [None]:
import torch
import sys
import os
import json
import time
import numpy as np
import argparse

from torch.utils.data import DataLoader
from torch.utils.data import WeightedRandomSampler
from umap.umap_ import find_ab_params

from singleVis.custom_weighted_random_sampler import CustomWeightedRandomSampler
from singleVis.SingleVisualizationModel import SingleVisualizationModel
from singleVis.losses import SingleVisLoss, UmapLoss, ReconstructionLoss
from singleVis.edge_dataset import DataHandler
from singleVis.trainer import SingleVisTrainer
from singleVis.data import NormalDataProvider
import singleVis.config as config
from singleVis.eval.evaluator import Evaluator
from singleVis.spatial_edge_constructor import kcSpatialEdgeConstructor
from singleVis.temporal_edge_constructor import GlobalTemporalEdgeConstructor

In [None]:
# Experiment directory used throughout this notebook: it is appended to
# sys.path (to import `config` and `Model.model`) and handed to
# NormalDataProvider. Keep exactly one assignment active; the commented-out
# lines are alternative locations.
# CONTENT_PATH = "/home/xianglin/projects/DVI_data/noisy/pairflip/cifar10"
CONTENT_PATH = "/home/xianglin/projects/DVI_data/resnet18_cifar10"
# CONTENT_PATH = "/home/xianglin/projects/DVI_data/resnet18_mnist"
# CONTENT_PATH = "/home/xianglin/projects/DVI_data/resnet18_fmnist"

In [None]:
# Make the experiment directory importable: the `config` module and the
# `Model` package imported below are both resolved through this entry.
sys.path.append(CONTENT_PATH)
# NOTE: this rebinds the name `config`, which was previously bound by
# `import singleVis.config as config`; from here on `config` is the
# experiment's own configuration mapping.
from config import config

SETTING = config["SETTING"]
CLASSES = config["CLASSES"]
DATASET = config["DATASET"]
PREPROCESS = config["VISUALIZATION"]["PREPROCESS"]
GPU_ID = config["GPU"]
EPOCH_START = config["EPOCH_START"]
EPOCH_END = config["EPOCH_END"]
EPOCH_PERIOD = config["EPOCH_PERIOD"]

# Training parameter (subject model)
TRAINING_PARAMETER = config["TRAINING"]
NET = TRAINING_PARAMETER["NET"]
LEN = TRAINING_PARAMETER["train_num"]

# Training parameter (visualization model)
VISUALIZATION_PARAMETER = config["VISUALIZATION"]
LAMBDA = VISUALIZATION_PARAMETER["LAMBDA"]
B_N_EPOCHS = VISUALIZATION_PARAMETER["BOUNDARY"]["B_N_EPOCHS"]
L_BOUND = VISUALIZATION_PARAMETER["BOUNDARY"]["L_BOUND"]
INIT_NUM = VISUALIZATION_PARAMETER["INIT_NUM"]
ALPHA = VISUALIZATION_PARAMETER["ALPHA"]
BETA = VISUALIZATION_PARAMETER["BETA"]
MAX_HAUSDORFF = VISUALIZATION_PARAMETER["MAX_HAUSDORFF"]
HIDDEN_LAYER = VISUALIZATION_PARAMETER["HIDDEN_LAYER"]
S_N_EPOCHS = VISUALIZATION_PARAMETER["S_N_EPOCHS"]
T_N_EPOCHS = VISUALIZATION_PARAMETER["T_N_EPOCHS"]
N_NEIGHBORS = VISUALIZATION_PARAMETER["N_NEIGHBORS"]
PATIENT = VISUALIZATION_PARAMETER["PATIENT"]
MAX_EPOCH = VISUALIZATION_PARAMETER["MAX_EPOCH"]
SEGMENTS = VISUALIZATION_PARAMETER["SEGMENTS"]
RESUME_SEG = VISUALIZATION_PARAMETER["RESUME_SEG"]

# define hyperparameters
# Falls back to the CPU when CUDA is not available.
DEVICE = torch.device("cuda:{}".format(GPU_ID) if torch.cuda.is_available() else "cpu")

import Model.model as subject_model
# Resolve the architecture by attribute lookup rather than eval()-ing a
# formatted string: the config value can then only name an attribute of
# `subject_model` and can never execute arbitrary code.
net = getattr(subject_model, NET)()

In [None]:
# Data provider for the experiment in CONTENT_PATH, configured with the epoch
# range and period read from the config.
data_provider = NormalDataProvider(
    CONTENT_PATH,
    net,
    EPOCH_START,
    EPOCH_END,
    EPOCH_PERIOD,
    split=-1,
    device=DEVICE,
    classes=CLASSES,
    verbose=1,
)
if PREPROCESS:
    data_provider.initialize(LEN // 10, l_bound=L_BOUND)

# Visualization model plus the combined UMAP + reconstruction criterion.
model = SingleVisualizationModel(
    input_dims=512,
    output_dims=2,
    units=256,
    hidden_layer=HIDDEN_LAYER,
)
negative_sample_rate = 5
min_dist = 0.1
_a, _b = find_ab_params(1.0, min_dist)
umap_loss_fn = UmapLoss(negative_sample_rate, DEVICE, _a, _b, repulsion_strength=1.0)
recon_loss_fn = ReconstructionLoss(beta=1.0)
criterion = SingleVisLoss(umap_loss_fn, recon_loss_fn, lambd=LAMBDA)

# Resume from a check point when RESUME_SEG is a valid index into SEGMENTS;
# otherwise start from a fresh random selection of INIT_NUM training indices.
if RESUME_SEG in range(len(SEGMENTS)):
    prev_epoch = SEGMENTS[RESUME_SEG][0]
    selected_file = os.path.join(
        data_provider.content_path, "selected_idxs", f"selected_{prev_epoch}.json"
    )
    with open(selected_file, "r") as f:
        prev_selected = json.load(f)

    INIT_NUM = len(prev_selected)
    save_model_path = os.path.join(data_provider.model_path, f"tnn_hybrid_{RESUME_SEG}.pth")
    # Load on the CPU first; the state dict is then copied into `model`.
    save_model = torch.load(save_model_path, map_location=torch.device("cpu"))
    model.load_state_dict(save_model["state_dict"])
    start_point = RESUME_SEG - 1
    print("Resume from {}-th segment with {} points...".format(RESUME_SEG, INIT_NUM))
else:
    prev_selected = np.random.choice(np.arange(LEN), size=INIT_NUM, replace=False)
    start_point = len(SEGMENTS) - 1

In [None]:
# NOTE(review): this builds a brand-new `model`; weights loaded into a
# previously created `model` object are not carried over - confirm intended.
model = SingleVisualizationModel(
    input_dims=512,
    output_dims=2,
    units=256,
    hidden_layer=HIDDEN_LAYER,
)

# Loss: UMAP term combined with a reconstruction term, weighted by LAMBDA.
negative_sample_rate = 5
min_dist = 0.1
_a, _b = find_ab_params(1.0, min_dist)
umap_loss_fn = UmapLoss(negative_sample_rate, DEVICE, _a, _b, repulsion_strength=1.0)
recon_loss_fn = ReconstructionLoss(beta=1.0)
criterion = SingleVisLoss(umap_loss_fn, recon_loss_fn, lambd=LAMBDA)

# Adam with a step schedule: learning rate is multiplied by 0.1 every 4 steps
# of the scheduler.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-5)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=4, gamma=0.1)

In [None]:
# Directory that receives the rendered images.
save_dir = os.path.join(data_provider.content_path, "img")
# Create it in-process instead of shelling out to `mkdir -p`: this works for
# paths containing spaces or shell metacharacters, is portable, and raises on
# failure rather than silently returning a non-zero exit status.
os.makedirs(save_dir, exist_ok=True)

In [None]:
# visualize img from one specified segment
from singleVis.visualizer import visualizer

trainer = SingleVisTrainer(
    model,
    criterion=criterion,
    optimizer=optimizer,
    lr_scheduler=lr_scheduler,
    edge_loader=None,
    DEVICE=DEVICE,
)
# Restore the checkpoint saved as tnn_hybrid_4.pth.
trainer.load(file_path=os.path.join(data_provider.model_path, "tnn_hybrid_4.pth"))

vis = visualizer(data_provider, trainer.model, 200, 10, CLASSES)
# One image per epoch index 155..199.
for i in range(155, 200):
    vis.savefig(i, path=os.path.join(save_dir, f"hybrid_{i}.png"))

In [None]:
# evaluate a certain epoch
i = 20
evaluator = Evaluator(data_provider, trainer)
evaluator.save_epoch_eval(
    i,
    15,
    temporal_k=5,
    save_corrs=False,
    file_name="test_evaluation_hybrid",
)

In [None]:
# evaluate hybrid epoch
from singleVis.backend import find_neighbor_preserving_rate
from singleVis.eval.evaluate import evaluate_proj_temporal_perseverance_corr

eval_num = 50
l = 60000

# Row t-1 holds the statistics of the transition epoch t -> epoch t+1.
# NOTE(review): t runs over 1..49 below, so row 49 of these eval_num=50 rows is
# never written and stays zero - confirm this is intended.
alpha = np.zeros((eval_num, l))
delta_x = np.zeros((eval_num, l))

# (checkpoint index, epochs whose transition t -> t+1 is scored with it),
# processed in the same order as the original three loops.
for seg_idx, seg_epochs in ((2, range(9, 50)), (1, range(4, 9)), (0, range(1, 4))):
    trainer.load(file_path=os.path.join(data_provider.model_path, "tnn_hybrid_{}.pth".format(seg_idx)))
    trainer.model.eval()

    for t in seg_epochs:
        prev_data = data_provider.train_representation(t)
        prev_tensor = torch.from_numpy(prev_data).to(dtype=torch.float32, device=trainer.DEVICE)
        prev_embedding = trainer.model.encoder(prev_tensor).cpu().detach().numpy()

        curr_data = data_provider.train_representation(t+1)
        curr_tensor = torch.from_numpy(curr_data).to(dtype=torch.float32, device=trainer.DEVICE)
        curr_embedding = trainer.model.encoder(curr_tensor).cpu().detach().numpy()

        # Neighbour preservation in the high-dimensional space versus how far
        # each point moved in the low-dimensional embedding.
        alpha_ = find_neighbor_preserving_rate(prev_data, curr_data, n_neighbors=15)
        delta_x_ = np.linalg.norm(prev_embedding - curr_embedding, axis=1)

        alpha[t-1] = alpha_
        delta_x[t-1] = delta_x_

val_corr, corr_std = evaluate_proj_temporal_perseverance_corr(alpha, delta_x)
val_corr, corr_std

In [None]:
from pynndescent import NNDescent
def hausdorff_d(curr_data, prev_data):
    """Return the mean nearest-neighbour distance from prev_data to curr_data.

    An NNDescent index is built over ``curr_data``; every row of ``prev_data``
    is then queried for its single nearest neighbour and the resulting
    distances are averaged. Note that this is a one-directional *mean* of
    nearest-neighbour distances, not the classical max-min Hausdorff distance.

    Args:
        curr_data: array whose rows are indexed (only ``shape[0]`` and the
            rows themselves are used).
        prev_data: array whose rows are the query points.

    Returns:
        The mean of the k=1 query distances (Euclidean metric).
    """
    n_points = curr_data.shape[0]
    # number of trees in random projection forest
    forest_size = min(64, 5 + int(round(n_points ** 0.5 / 20.0)))
    # max number of nearest neighbor iters to perform
    descent_iters = max(5, int(round(np.log2(n_points))))

    # get nearest neighbors
    index = NNDescent(
        curr_data,
        n_neighbors=1,
        metric="euclidean",
        n_trees=forest_size,
        n_iters=descent_iters,
        max_candidates=10,
        verbose=False,
    )
    _, nearest_dists = index.query(prev_data, k=1)
    return nearest_dists.mean()

In [None]:
# Distance between the training representations of epochs 199 and 200.
prev_data = data_provider.train_representation(199)
curr_data = data_provider.train_representation(200)
hausdorff_d(curr_data=curr_data, prev_data=prev_data)

In [None]:
# mnist
# d[k] is the distance between the test representations of epoch k+2 and
# epoch k+1, filled from the last epoch backwards.
d = np.zeros(19)
for curr_epoch in range(20, 1, -1):
    prev_data = data_provider.test_representation(curr_epoch - 1)
    curr_data = data_provider.test_representation(curr_epoch)
    d[curr_epoch - 2] = hausdorff_d(curr_data=curr_data, prev_data=prev_data)

In [None]:
# Accumulated distance per slice of d; adjacent slices share their boundary
# index (index 2 is counted in both).
(
    np.sum(d[2:]),
    np.sum(d[:3]),
)

In [None]:
# fmnist
# d[k] is the distance between the test representations of epoch k+2 and
# epoch k+1, filled from the last epoch backwards.
d = np.zeros(49)
for curr_epoch in range(50, 1, -1):
    prev_data = data_provider.test_representation(curr_epoch - 1)
    curr_data = data_provider.test_representation(curr_epoch)
    d[curr_epoch - 2] = hausdorff_d(curr_data=curr_data, prev_data=prev_data)

In [None]:
# Accumulated distance per slice of d; adjacent slices share their boundary
# index (indices 9 and 4 are each counted twice).
(
    np.sum(d[9:]),
    np.sum(d[4:10]),
    np.sum(d[:5]),
)

In [None]:
# cifar10
# d[k] is the distance between the training representations of epoch k+2 and
# epoch k+1. Epochs 2..200 give 199 consecutive pairs, so d has 199 entries
# (the original allocated 200 and left the last one permanently zero, unlike
# the sibling cells for the other datasets).
d = np.zeros(199)
for curr_epoch in range(200, 1, -1):
    curr_data = data_provider.train_representation(curr_epoch)
    prev_data = data_provider.train_representation(curr_epoch-1)
    d[curr_epoch-2] = hausdorff_d(curr_data=curr_data, prev_data=prev_data)

In [None]:
# Accumulated distance per slice of d; adjacent slices share their boundary
# index (indices 6, 15 and 42 are each counted twice).
(
    np.sum(d[:7]),
    np.sum(d[6:16]),
    np.sum(d[15:43]),
    np.sum(d[42:200]),
)

In [None]:
# symmetric cifar10
# d[k] is the distance between the training representations of epoch k+2 and
# epoch k+1, filled from the last epoch backwards.
d = np.zeros(199)
for curr_epoch in range(200, 1, -1):
    prev_data = data_provider.train_representation(curr_epoch - 1)
    curr_data = data_provider.train_representation(curr_epoch)
    d[curr_epoch - 2] = hausdorff_d(curr_data=curr_data, prev_data=prev_data)

In [None]:
# Accumulated distance per slice of d; adjacent slices share their boundary
# index (indices 155 and 60 are each counted twice).
(
    np.sum(d[155:]),
    np.sum(d[60:156]),
    np.sum(d[:61]),
)

In [None]:
# pairflip cifar10
# d[k] is the distance between the training representations of epoch k+2 and
# epoch k+1, filled from the last epoch backwards.
d = np.zeros(199)
for curr_epoch in range(200, 1, -1):
    prev_data = data_provider.train_representation(curr_epoch - 1)
    curr_data = data_provider.train_representation(curr_epoch)
    d[curr_epoch - 2] = hausdorff_d(curr_data=curr_data, prev_data=prev_data)

In [None]:
# Accumulated distance per slice of d; adjacent slices share their boundary
# index (indices 135 and 48 are each counted twice).
(
    np.sum(d[135:]),
    np.sum(d[48:136]),
    np.sum(d[:49]),
)

## tried methods
1. Normed distance between consecutive epochs
2. Jaccard similarity between consecutive epochs

# overlapping

In [None]:
import json

# Load the clean and the noisy label files of the chosen noise setting.
dataset = "symmetric"

path = f"/home/xianglin/projects/DVI_data/noisy/{dataset}/cifar10/clean_label.json"
with open(path, "r") as f:
    clean_label = json.load(f)

path = f"/home/xianglin/projects/DVI_data/noisy/{dataset}/cifar10/noisy_label.json"
with open(path, "r") as f:
    noisy_label = json.load(f)

In [None]:
# Embed every training sample at epochs 41..200 with checkpoint tnn_hybrid_4.pth.
save_model_path = os.path.join(data_provider.model_path, "tnn_hybrid_{}.pth".format(4))
save_model = torch.load(save_model_path, map_location=torch.device("cpu"))
model.load_state_dict(save_model["state_dict"])
# Move the model to DEVICE (which falls back to the CPU) instead of an
# unconditional "cuda:<GPU_ID>", so it always sits on the same device as the
# inputs created below and the cell also runs without CUDA.
# NOTE(review): the model is not switched to eval() here - confirm the encoder
# has no dropout/batch-norm layers.
model.to(device=DEVICE)

# samples[k] is the representation returned for epoch k + 41.
samples = np.zeros((160, 50000, 512))
for i in range(160):
    samples[i] = data_provider.train_representation(i+41)

# embeddings_2d[j] is the trajectory of sample j: one encoder output per epoch.
embeddings_2d = np.zeros((50000, 160, 2))
# Inference only: skip autograd bookkeeping for the 50000 forward passes.
with torch.no_grad():
    for i in range(50000):
        embedding_2d = model.encoder(torch.from_numpy(samples[:,i,:]).to(device=DEVICE, dtype=torch.float)).cpu().detach().numpy()
        embeddings_2d[i] = embedding_2d

In [None]:
# Embed every training sample at epochs 31..40 with checkpoint tnn_hybrid_3.pth.
save_model_path = os.path.join(data_provider.model_path, "tnn_hybrid_{}.pth".format(3))
save_model = torch.load(save_model_path, map_location=torch.device("cpu"))
model.load_state_dict(save_model["state_dict"])
# Move the model to DEVICE (which falls back to the CPU) instead of an
# unconditional "cuda:<GPU_ID>", so it always sits on the same device as the
# inputs created below and the cell also runs without CUDA.
# NOTE(review): the model is not switched to eval() here - confirm the encoder
# has no dropout/batch-norm layers.
model.to(device=DEVICE)

# samples[k] is the representation returned for epoch k + 31.
samples = np.zeros((10, 50000, 512))
for i in range(10):
    samples[i] = data_provider.train_representation(i+31)

# embeddings_2d_2[j] is the trajectory of sample j: one encoder output per epoch.
embeddings_2d_2 = np.zeros((50000, 10, 2))
# Inference only: skip autograd bookkeeping for the 50000 forward passes.
with torch.no_grad():
    for i in range(50000):
        embedding_2d = model.encoder(torch.from_numpy(samples[:,i,:]).to(device=DEVICE, dtype=torch.float)).cpu().detach().numpy()
        embeddings_2d_2[i] = embedding_2d

In [None]:
# Embed every training sample at epochs 21..30 with checkpoint tnn_hybrid_2.pth.
save_model_path = os.path.join(data_provider.model_path, "tnn_hybrid_{}.pth".format(2))
save_model = torch.load(save_model_path, map_location=torch.device("cpu"))
model.load_state_dict(save_model["state_dict"])
# Move the model to DEVICE (which falls back to the CPU) instead of an
# unconditional "cuda:<GPU_ID>", so it always sits on the same device as the
# inputs created below and the cell also runs without CUDA.
# NOTE(review): the model is not switched to eval() here - confirm the encoder
# has no dropout/batch-norm layers.
model.to(device=DEVICE)

# samples[k] is the representation returned for epoch k + 21.
samples = np.zeros((10, 50000, 512))
for i in range(10):
    samples[i] = data_provider.train_representation(i+21)

# embeddings_2d_3[j] is the trajectory of sample j: one encoder output per epoch.
embeddings_2d_3 = np.zeros((50000, 10, 2))
# Inference only: skip autograd bookkeeping for the 50000 forward passes.
with torch.no_grad():
    for i in range(50000):
        embedding_2d = model.encoder(torch.from_numpy(samples[:,i,:]).to(device=DEVICE, dtype=torch.float)).cpu().detach().numpy()
        embeddings_2d_3[i] = embedding_2d

In [None]:
# Embed every training sample at epochs 11..20 with checkpoint tnn_hybrid_1.pth.
save_model_path = os.path.join(data_provider.model_path, "tnn_hybrid_{}.pth".format(1))
save_model = torch.load(save_model_path, map_location=torch.device("cpu"))
model.load_state_dict(save_model["state_dict"])
# Move the model to DEVICE (which falls back to the CPU) instead of an
# unconditional "cuda:<GPU_ID>", so it always sits on the same device as the
# inputs created below and the cell also runs without CUDA.
# NOTE(review): the model is not switched to eval() here - confirm the encoder
# has no dropout/batch-norm layers.
model.to(device=DEVICE)

# samples[k] is the representation returned for epoch k + 11.
samples = np.zeros((10, 50000, 512))
for i in range(10):
    samples[i] = data_provider.train_representation(i+11)

# embeddings_2d_4[j] is the trajectory of sample j: one encoder output per epoch.
embeddings_2d_4 = np.zeros((50000, 10, 2))
# Inference only: skip autograd bookkeeping for the 50000 forward passes.
with torch.no_grad():
    for i in range(50000):
        embedding_2d = model.encoder(torch.from_numpy(samples[:,i,:]).to(device=DEVICE, dtype=torch.float)).cpu().detach().numpy()
        embeddings_2d_4[i] = embedding_2d

In [None]:
# Embed every training sample at epochs 1..10 with checkpoint tnn_hybrid_0.pth.
save_model_path = os.path.join(data_provider.model_path, "tnn_hybrid_{}.pth".format(0))
save_model = torch.load(save_model_path, map_location=torch.device("cpu"))
model.load_state_dict(save_model["state_dict"])
# Move the model to DEVICE (which falls back to the CPU) instead of an
# unconditional "cuda:<GPU_ID>", so it always sits on the same device as the
# inputs created below and the cell also runs without CUDA.
# NOTE(review): the model is not switched to eval() here - confirm the encoder
# has no dropout/batch-norm layers.
model.to(device=DEVICE)

# samples[k] is the representation returned for epoch k + 1.
samples = np.zeros((10, 50000, 512))
for i in range(10):
    samples[i] = data_provider.train_representation(i+1)

# embeddings_2d_5[j] is the trajectory of sample j: one encoder output per epoch.
embeddings_2d_5 = np.zeros((50000, 10, 2))
# Inference only: skip autograd bookkeeping for the 50000 forward passes.
with torch.no_grad():
    for i in range(50000):
        embedding_2d = model.encoder(torch.from_numpy(samples[:,i,:]).to(device=DEVICE, dtype=torch.float)).cpu().detach().numpy()
        embeddings_2d_5[i] = embedding_2d

In [None]:
# Join the per-segment trajectories along the epoch axis (axis 1), earliest
# segment first, in a single concatenation.
embedding = np.concatenate(
    (
        embeddings_2d_5,
        embeddings_2d_4,
        embeddings_2d_3,
        embeddings_2d_2,
        embeddings_2d,
    ),
    axis=1,
)

In [None]:
# Embed every training sample at epochs 156..199 with checkpoint tnn_hybrid_2.pth.
# NOTE(review): the tnn_hybrid_1 cell of this group embeds epochs 61..156, so
# epoch 156 is embedded by both cells and epoch 200 by neither - confirm the
# offset (156 vs. 157) is intended.
save_model_path = os.path.join(data_provider.model_path, "tnn_hybrid_{}.pth".format(2))
save_model = torch.load(save_model_path, map_location=torch.device("cpu"))
model.load_state_dict(save_model["state_dict"])
# Move the model to DEVICE (which falls back to the CPU) instead of an
# unconditional "cuda:<GPU_ID>", so it always sits on the same device as the
# inputs created below and the cell also runs without CUDA.
# NOTE(review): the model is not switched to eval() here - confirm the encoder
# has no dropout/batch-norm layers.
model.to(device=DEVICE)

# samples[k] is the representation returned for epoch k + 156.
samples = np.zeros((44, 50000, 512))
for i in range(44):
    samples[i] = data_provider.train_representation(i+156)

# embeddings_2d[j] is the trajectory of sample j: one encoder output per epoch.
embeddings_2d = np.zeros((50000, 44, 2))
# Inference only: skip autograd bookkeeping for the 50000 forward passes.
with torch.no_grad():
    for i in range(50000):
        embedding_2d = model.encoder(torch.from_numpy(samples[:,i,:]).to(device=DEVICE, dtype=torch.float)).cpu().detach().numpy()
        embeddings_2d[i] = embedding_2d

In [None]:
# Embed every training sample at epochs 61..156 with checkpoint tnn_hybrid_1.pth.
save_model_path = os.path.join(data_provider.model_path, "tnn_hybrid_{}.pth".format(1))
save_model = torch.load(save_model_path, map_location=torch.device("cpu"))
model.load_state_dict(save_model["state_dict"])
# Move the model to DEVICE (which falls back to the CPU) instead of an
# unconditional "cuda:<GPU_ID>", so it always sits on the same device as the
# inputs created below and the cell also runs without CUDA.
# NOTE(review): the model is not switched to eval() here - confirm the encoder
# has no dropout/batch-norm layers.
model.to(device=DEVICE)

# samples[k] is the representation returned for epoch k + 61.
samples = np.zeros((96, 50000, 512))
for i in range(96):
    samples[i] = data_provider.train_representation(i+61)

# embeddings_2d_1[j] is the trajectory of sample j: one encoder output per epoch.
embeddings_2d_1 = np.zeros((50000, 96, 2))
# Inference only: skip autograd bookkeeping for the 50000 forward passes.
with torch.no_grad():
    for i in range(50000):
        embedding_2d = model.encoder(torch.from_numpy(samples[:,i,:]).to(device=DEVICE, dtype=torch.float)).cpu().detach().numpy()
        embeddings_2d_1[i] = embedding_2d

In [None]:
# Embed every training sample at epochs 1..60 with checkpoint tnn_hybrid_0.pth.
save_model_path = os.path.join(data_provider.model_path, "tnn_hybrid_{}.pth".format(0))
save_model = torch.load(save_model_path, map_location=torch.device("cpu"))
model.load_state_dict(save_model["state_dict"])
# Move the model to DEVICE (which falls back to the CPU) instead of an
# unconditional "cuda:<GPU_ID>", so it always sits on the same device as the
# inputs created below and the cell also runs without CUDA.
# NOTE(review): the model is not switched to eval() here - confirm the encoder
# has no dropout/batch-norm layers.
model.to(device=DEVICE)

# samples[k] is the representation returned for epoch k + 1.
samples = np.zeros((60, 50000, 512))
for i in range(60):
    samples[i] = data_provider.train_representation(i+1)

# embeddings_2d_2[j] is the trajectory of sample j: one encoder output per epoch.
embeddings_2d_2 = np.zeros((50000, 60, 2))
# Inference only: skip autograd bookkeeping for the 50000 forward passes.
with torch.no_grad():
    for i in range(50000):
        embedding_2d = model.encoder(torch.from_numpy(samples[:,i,:]).to(device=DEVICE, dtype=torch.float)).cpu().detach().numpy()
        embeddings_2d_2[i] = embedding_2d

In [None]:
# Join the three per-segment trajectories along the epoch axis (axis 1),
# earliest segment first, in a single concatenation.
embedding = np.concatenate(
    (embeddings_2d_2, embeddings_2d_1, embeddings_2d),
    axis=1,
)

In [None]:
# Collapse every axis after the first so that each row is one flat vector.
embedding = embedding.reshape(embedding.shape[0], -1)

## test trajectory

In [None]:
# Turn both label containers into NumPy arrays so they support array indexing.
clean_label = np.array(clean_label)
noisy_label = np.array(noisy_label)

In [None]:
import matplotlib.pyplot as plt
import umap.umap_ as umap
from sklearn.manifold import TSNE 
from sklearn.metrics import silhouette_score, calinski_harabasz_score
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import Birch, KMeans

In [None]:
# For each of the 10 noisy classes: reduce the rows of `embedding` belonging
# to that class to 2-D with UMAP, split them into two Birch clusters, and draw
# two figures (coloured by clean label, then by cluster with centroids).
for cls_num in range(10):
    # reshape(-1) keeps the index array 1-D even when a class has exactly one
    # member; squeeze() would collapse it to a 0-d array and break len().
    cls = np.argwhere(np.array(noisy_label) == cls_num).reshape(-1)
    high_data = embedding[cls].reshape(len(cls), -1)

    reducer = umap.UMAP(n_components=2)
    em_2d = reducer.fit_transform(high_data)
    # (t-SNE with init='random' was tried here as an alternative reducer.)

    brc = Birch(n_clusters=2)
    brc.fit(em_2d)

    labels = brc.labels_
    centroid = brc.subcluster_centers_
    centroid_labels = brc.subcluster_labels_
    # clean 1, noise 0: when cluster 0 is the larger one, swap the two ids.
    # minlength=2 guarantees both counts exist even if one cluster is empty.
    cluster_sizes = np.bincount(labels, minlength=2)
    if cluster_sizes[0] > cluster_sizes[1]:
        centroid_labels = np.abs(centroid_labels - 1)
        labels = np.abs(labels - 1)

    # Figure 1: points coloured by their clean label, sub-cluster centres in black.
    plt.clf()
    plt.scatter(
        em_2d[:, 0],
        em_2d[:, 1],
        s=1,
        c=clean_label[cls],
        cmap="tab10")
    plt.scatter(
        centroid[:, 0],
        centroid[:, 1],
        s=5,
        c='black')
    plt.show()

    # Figure 2: points coloured by the (possibly swapped) cluster id, so the
    # colour of the "clean" cluster is the same for every class.
    plt.clf()
    plt.scatter(
        em_2d[:, 0],
        em_2d[:, 1],
        s=1,
        c=labels,
        cmap="Pastel2")

    cleans = centroid[centroid_labels == 1]
    noises = centroid[centroid_labels == 0]

    # Flag clean centroids whose farthest of 5 neighbours is more than 1.8x as
    # far as the neighbour in column 1 (column 0 is presumably the centroid
    # itself, since the query points are the fitted points).
    nbrs = NearestNeighbors(n_neighbors=5, algorithm='ball_tree').fit(cleans)
    dists, indices = nbrs.kneighbors(cleans)
    suspicious = (dists[:, -1] / dists[:, 1]) > 1.8

    plt.scatter(
        cleans[:, 0],
        cleans[:, 1],
        s=5,
        c='r')
    plt.scatter(
        noises[:, 0],
        noises[:, 1],
        s=5,
        c='black')
    plt.scatter(
        cleans[suspicious][:, 0],
        cleans[suspicious][:, 1],
        s=5,
        c='g')
    plt.show()

In [None]:
# Persist `embedding` as embedding.npy inside CONTENT_PATH.
embedding_file = os.path.join(CONTENT_PATH, "embedding.npy")
np.save(embedding_file, embedding)