# segmentation

In [None]:
import numpy as np
import torch
import sys
import os
import json
sys.path.append("..")
from singleVis.SingleVisualizationModel import SingleVisualizationModel
from singleVis.data import NormalDataProvider
from singleVis.eval.evaluator import Evaluator
from singleVis.projector import Projector

In [None]:
# Root directory of the run to analyse. It is appended to sys.path in the next
# cell so that its `config` module can be imported. Keep exactly one
# CONTENT_PATH assignment active; the commented lines are alternative runs.
CONTENT_PATH = "/home/xianglin/projects/DVI_data/noisy/symmetric/cifar10"
# CONTENT_PATH = "/home/xianglin/projects/DVI_data/resnet18_cifar10"
# CONTENT_PATH = "/home/xianglin/projects/DVI_data/resnet18_mnist"
# CONTENT_PATH = "/home/xianglin/projects/DVI_data/resnet18_fmnist"
# Index of the CUDA device used to build DEVICE when CUDA is available.
GPU_ID = 0

In [None]:
# Make the run directory importable so that its `config` module and `Model`
# package can be loaded. The membership check keeps sys.path free of duplicate
# entries when this cell is executed more than once.
if CONTENT_PATH not in sys.path:
    sys.path.append(CONTENT_PATH)
from config import config

SETTING = config["SETTING"]
CLASSES = config["CLASSES"]
DATASET = config["DATASET"]
PREPROCESS = config["VISUALIZATION"]["PREPROCESS"]
EPOCH_START = config["EPOCH_START"]
EPOCH_END = config["EPOCH_END"]
EPOCH_PERIOD = config["EPOCH_PERIOD"]

# Training parameter (subject model)
TRAINING_PARAMETER = config["TRAINING"]
NET = TRAINING_PARAMETER["NET"]
LEN = TRAINING_PARAMETER["train_num"]

# Training parameter (visualization model)
VISUALIZATION_PARAMETER = config["VISUALIZATION"]
LAMBDA = VISUALIZATION_PARAMETER["LAMBDA"]
S_LAMBDA = VISUALIZATION_PARAMETER["S_LAMBDA"]
B_N_EPOCHS = VISUALIZATION_PARAMETER["BOUNDARY"]["B_N_EPOCHS"]
L_BOUND = VISUALIZATION_PARAMETER["BOUNDARY"]["L_BOUND"]
INIT_NUM = VISUALIZATION_PARAMETER["INIT_NUM"]
ALPHA = VISUALIZATION_PARAMETER["ALPHA"]
BETA = VISUALIZATION_PARAMETER["BETA"]
MAX_HAUSDORFF = VISUALIZATION_PARAMETER["MAX_HAUSDORFF"]
HIDDEN_LAYER = VISUALIZATION_PARAMETER["HIDDEN_LAYER"]
S_N_EPOCHS = VISUALIZATION_PARAMETER["S_N_EPOCHS"]
T_N_EPOCHS = VISUALIZATION_PARAMETER["T_N_EPOCHS"]
N_NEIGHBORS = VISUALIZATION_PARAMETER["N_NEIGHBORS"]
PATIENT = VISUALIZATION_PARAMETER["PATIENT"]
MAX_EPOCH = VISUALIZATION_PARAMETER["MAX_EPOCH"]
SEGMENTS = VISUALIZATION_PARAMETER["SEGMENTS"]
RESUME_SEG = VISUALIZATION_PARAMETER["RESUME_SEG"]

# Device shared by the visualization model and its inputs: the configured GPU
# when CUDA is available, otherwise the CPU.
DEVICE = torch.device("cuda:{}".format(GPU_ID) if torch.cuda.is_available() else "cpu")

# Lower-case alias kept for any code that still refers to `content_path`.
content_path = CONTENT_PATH

import Model.model as subject_model
# Look the network constructor up by name instead of eval()-ing a formatted
# string, so a value read from the config file can never run arbitrary code.
# NET is expected to be the plain name of a callable in Model.model
# (e.g. "resnet18").
net = getattr(subject_model, NET)()
classes = ("airplane", "car", "bird", "cat", "deer", "dog", "frog", "horse", "ship", "truck")

In [None]:
# Data access object for the subject model's checkpoints.
data_provider = NormalDataProvider(
    CONTENT_PATH,
    net,
    EPOCH_START,
    EPOCH_END,
    EPOCH_PERIOD,
    split=-1,
    device=DEVICE,
    classes=CLASSES,
    verbose=1,
)
# Optional preprocessing step, enabled through the run's config.
if PREPROCESS:
    data_provider.initialize(LEN // 10, l_bound=L_BOUND)

# Visualization model (512-d input -> 2-d output) and the projector wrapping it.
model = SingleVisualizationModel(
    input_dims=512,
    output_dims=2,
    units=256,
    hidden_layer=HIDDEN_LAYER,
)
projector = Projector(
    vis_model=model,
    content_path=CONTENT_PATH,
    segments=SEGMENTS,
    device=DEVICE,
)

In [None]:
# Resume from a check point when RESUME_SEG indexes an existing segment;
# otherwise start from a fresh random selection of INIT_NUM training points.
if RESUME_SEG not in range(len(SEGMENTS)):
    prev_selected = np.random.choice(np.arange(LEN), size=INIT_NUM, replace=False)
    start_point = len(SEGMENTS) - 1
else:
    # First entry of the segment being resumed.
    prev_epoch = SEGMENTS[RESUME_SEG][0]
    with open(
        os.path.join(
            data_provider.content_path,
            "selected_idxs",
            "selected_{}.json".format(prev_epoch),
        ),
        "r",
    ) as f:
        prev_selected = json.load(f)

    # The number of points is dictated by the stored selection.
    INIT_NUM = len(prev_selected)

    # Restore the visualization model weights saved for this segment.
    save_model_path = os.path.join(
        data_provider.model_path,
        "tnn_hybrid_{}.pth".format(RESUME_SEG),
    )
    save_model = torch.load(save_model_path, map_location=torch.device("cpu"))
    model.load_state_dict(save_model["state_dict"])

    start_point = RESUME_SEG - 1
    print("Resume from {}-th segment with {} points...".format(RESUME_SEG, INIT_NUM))

In [None]:
from singleVis.visualizer import visualizer

# Visualizer built on the data provider and projector created above.
# NOTE(review): 500 is presumably the rendering resolution — confirm against
# singleVis.visualizer.
vis = visualizer(data_provider, projector, 500)
# Disabled: render one figure per epoch into <content_path>/img.
# save_dir = os.path.join(data_provider.content_path, "img")
# os.system("mkdir -p {}".format(save_dir))

# for i in range(EPOCH_START, EPOCH_END+1, EPOCH_PERIOD):
#     vis.savefig(i, path=os.path.join(save_dir, "{}_{}_tnn.png".format(DATASET, i)))

In [None]:
# Evaluate the projection at a single epoch.
evaluator = Evaluator(data_provider, projector)
eval_epoch = 40
# Saves the evaluation under the name "test_evaluation_hybrid".
# NOTE(review): 15 is presumably the neighbourhood size used by the metrics —
# confirm against Evaluator.save_epoch_eval.
evaluator.save_epoch_eval(eval_epoch, 15, temporal_k=5, save_corrs=False, file_name="test_evaluation_hybrid")
evaluator.eval_proj_invariants_train(eval_epoch)
evaluator.eval_proj_invariants_test(eval_epoch)
# The second argument (5) matches temporal_k passed to save_epoch_eval above.
evaluator.eval_temporal_nn_train(eval_epoch, 5)
# Last expression of the cell: its return value is what the notebook displays.
evaluator.eval_temporal_nn_test(eval_epoch, 5)

In [None]:
from pynndescent import NNDescent
def hausdorff_d(curr_data, prev_data):
    """Return the mean nearest-neighbour distance from ``prev_data`` to ``curr_data``.

    An ``NNDescent`` index with the euclidean metric is built on ``curr_data``.
    Every row of ``prev_data`` is then queried for its single nearest
    neighbour in that index, and the resulting distances are averaged.

    Despite the name, this is a one-directional *mean* of nearest-neighbour
    distances, not a max-based Hausdorff distance.

    Args:
        curr_data: 2-D array the index is built on (rows are points).
        prev_data: 2-D array of query points.

    Returns:
        The mean of the nearest-neighbour distances returned by the query.
    """
    num_points = curr_data.shape[0]
    index = NNDescent(
        curr_data,
        n_neighbors=1,
        metric="euclidean",
        # number of trees in the random projection forest (capped at 64)
        n_trees=min(64, 5 + int(round(num_points ** 0.5 / 20.0))),
        # maximum number of nearest-neighbour iterations (at least 5)
        n_iters=max(5, int(round(np.log2(num_points)))),
        max_candidates=10,
        verbose=False,
    )
    _, nearest_dists = index.query(prev_data, k=1)
    return nearest_dists.mean()

In [None]:
# Distance between the training representations of epochs 200 and 199.
curr_data = data_provider.train_representation(200)
prev_data = data_provider.train_representation(199)
# Last expression of the cell: the returned distance is displayed.
hausdorff_d(curr_data=curr_data, prev_data=prev_data)

In [None]:
# mnist
# d[e - 2] holds the distance between the training representations of epoch e
# and epoch e - 1, for e = 20 down to 2 (19 consecutive pairs).
d = np.zeros(19)
for curr_epoch in reversed(range(2, 21)):
    curr_data = data_provider.train_representation(curr_epoch)
    prev_data = data_provider.train_representation(curr_epoch - 1)
    d[curr_epoch - 2] = hausdorff_d(curr_data=curr_data, prev_data=prev_data)

In [None]:
# Accumulated distance over two index ranges of d; the ranges overlap at index 4.
d[4:].sum(),d[:5].sum()

In [None]:
# fmnist
# d[e - 2] holds the distance between the training representations of epoch e
# and epoch e - 1, for e = 50 down to 2 (49 consecutive pairs).
d = np.zeros(49)
for curr_epoch in reversed(range(2, 51)):
    curr_data = data_provider.train_representation(curr_epoch)
    prev_data = data_provider.train_representation(curr_epoch - 1)
    d[curr_epoch - 2] = hausdorff_d(curr_data=curr_data, prev_data=prev_data)

In [None]:
# Accumulated distance over four index ranges of d; neighbouring ranges share
# their boundary index (3, 6 and 10).
d[:4].sum(), d[3:7].sum(),d[6:11].sum(), d[10:].sum()

In [None]:
# cifar10
# Epochs 1..200 give 199 consecutive pairs, and the loop writes d[0]..d[198].
# The buffer is therefore sized 199, like the other 200-epoch cells; a length
# of 200 would leave a trailing slot that is never filled.
# d[e - 2] holds the distance between the representations of epoch e and e - 1.
d = np.zeros(199)
for curr_epoch in range(200, 1, -1):
    curr_data = data_provider.train_representation(curr_epoch)
    prev_data = data_provider.train_representation(curr_epoch-1)
    d[curr_epoch-2] = hausdorff_d(curr_data=curr_data, prev_data=prev_data)

In [None]:
# Accumulated distance over five index ranges of d; neighbouring ranges share
# their boundary index (3, 9, 17 and 40).
d[:4].sum(), d[3:10].sum(), d[9:18].sum(),d[17:41].sum(),d[40:200].sum()

In [None]:
# symmetric cifar10
# d[e - 2] holds the distance between the training representations of epoch e
# and epoch e - 1, for e = 200 down to 2 (199 consecutive pairs).
d = np.zeros(199)
for curr_epoch in reversed(range(2, 201)):
    curr_data = data_provider.train_representation(curr_epoch)
    prev_data = data_provider.train_representation(curr_epoch - 1)
    d[curr_epoch - 2] = hausdorff_d(curr_data=curr_data, prev_data=prev_data)

In [None]:
# Accumulated distance over three index ranges of d; neighbouring ranges share
# their boundary index (155 and 60).
d[155:].sum(),d[60:156].sum(), d[:61].sum()

In [None]:
# pairflip cifar10
# d[e - 2] holds the distance between the training representations of epoch e
# and epoch e - 1, for e = 200 down to 2 (199 consecutive pairs).
d = np.zeros(199)
for curr_epoch in reversed(range(2, 201)):
    curr_data = data_provider.train_representation(curr_epoch)
    prev_data = data_provider.train_representation(curr_epoch - 1)
    d[curr_epoch - 2] = hausdorff_d(curr_data=curr_data, prev_data=prev_data)

In [None]:
# Accumulated distance over three index ranges of d; neighbouring ranges share
# their boundary index (135 and 48).
d[135:].sum(),d[48:136].sum(), d[:49].sum()

## Tried methods
1. Normed distance between epochs
2. Jaccard similarity between consecutive epochs

# overlapping

In [None]:
# Noise setting whose label files are loaded below.
dataset = "symmetric"

# Labels stored in clean_label.json.
path = "/home/xianglin/projects/DVI_data/noisy/{}/cifar10/clean_label.json".format(dataset)
with open(path) as f:
    clean_label = json.loads(f.read())

# Labels stored in noisy_label.json.
path = "/home/xianglin/projects/DVI_data/noisy/{}/cifar10/noisy_label.json".format(dataset)
with open(path) as f:
    noisy_label = json.loads(f.read())

In [None]:
# Load the visualization model saved as tnn_hybrid_4.pth.
save_model_path = os.path.join(data_provider.model_path, "tnn_hybrid_{}.pth".format(4))
save_model = torch.load(save_model_path, map_location=torch.device("cpu"))
model.load_state_dict(save_model["state_dict"])
# Move the model to DEVICE rather than a hard-coded CUDA device, so the model
# and the inputs created below are always on the same device (also on CPU-only
# machines, where DEVICE falls back to "cpu").
model.to(device=DEVICE)

# Training representations of epochs 41..200, stacked along axis 0.
# float32 halves the size of this buffer; the values are cast to torch.float
# before the forward pass anyway, so the resulting embeddings are unchanged.
samples = np.zeros((160, 50000, 512), dtype=np.float32)
for i in range(160):
    samples[i] = data_provider.train_representation(i+41)

# 2-D embedding of every sample at each of the 160 epochs.
embeddings_2d = np.zeros((50000, 160, 2))
with torch.no_grad():  # inference only: no autograd graph is needed
    for i in range(50000):
        embedding_2d = model.encoder(torch.from_numpy(samples[:,i,:]).to(device=DEVICE, dtype=torch.float)).cpu().numpy()
        embeddings_2d[i] = embedding_2d

In [None]:
# Load the visualization model saved as tnn_hybrid_3.pth.
save_model_path = os.path.join(data_provider.model_path, "tnn_hybrid_{}.pth".format(3))
save_model = torch.load(save_model_path, map_location=torch.device("cpu"))
model.load_state_dict(save_model["state_dict"])
# Move the model to DEVICE rather than a hard-coded CUDA device, so the model
# and the inputs created below are always on the same device (also on CPU-only
# machines, where DEVICE falls back to "cpu").
model.to(device=DEVICE)

# Training representations of epochs 31..40, stacked along axis 0.
# float32 halves the size of this buffer; the values are cast to torch.float
# before the forward pass anyway, so the resulting embeddings are unchanged.
samples = np.zeros((10, 50000, 512), dtype=np.float32)
for i in range(10):
    samples[i] = data_provider.train_representation(i+31)

# 2-D embedding of every sample at each of the 10 epochs.
embeddings_2d_2 = np.zeros((50000, 10, 2))
with torch.no_grad():  # inference only: no autograd graph is needed
    for i in range(50000):
        embedding_2d = model.encoder(torch.from_numpy(samples[:,i,:]).to(device=DEVICE, dtype=torch.float)).cpu().numpy()
        embeddings_2d_2[i] = embedding_2d

In [None]:
# Load the visualization model saved as tnn_hybrid_2.pth.
save_model_path = os.path.join(data_provider.model_path, "tnn_hybrid_{}.pth".format(2))
save_model = torch.load(save_model_path, map_location=torch.device("cpu"))
model.load_state_dict(save_model["state_dict"])
# Move the model to DEVICE rather than a hard-coded CUDA device, so the model
# and the inputs created below are always on the same device (also on CPU-only
# machines, where DEVICE falls back to "cpu").
model.to(device=DEVICE)

# Training representations of epochs 21..30, stacked along axis 0.
# float32 halves the size of this buffer; the values are cast to torch.float
# before the forward pass anyway, so the resulting embeddings are unchanged.
samples = np.zeros((10, 50000, 512), dtype=np.float32)
for i in range(10):
    samples[i] = data_provider.train_representation(i+21)

# 2-D embedding of every sample at each of the 10 epochs.
embeddings_2d_3 = np.zeros((50000, 10, 2))
with torch.no_grad():  # inference only: no autograd graph is needed
    for i in range(50000):
        embedding_2d = model.encoder(torch.from_numpy(samples[:,i,:]).to(device=DEVICE, dtype=torch.float)).cpu().numpy()
        embeddings_2d_3[i] = embedding_2d

In [None]:
# Load the visualization model saved as tnn_hybrid_1.pth.
save_model_path = os.path.join(data_provider.model_path, "tnn_hybrid_{}.pth".format(1))
save_model = torch.load(save_model_path, map_location=torch.device("cpu"))
model.load_state_dict(save_model["state_dict"])
# Move the model to DEVICE rather than a hard-coded CUDA device, so the model
# and the inputs created below are always on the same device (also on CPU-only
# machines, where DEVICE falls back to "cpu").
model.to(device=DEVICE)

# Training representations of epochs 11..20, stacked along axis 0.
# float32 halves the size of this buffer; the values are cast to torch.float
# before the forward pass anyway, so the resulting embeddings are unchanged.
samples = np.zeros((10, 50000, 512), dtype=np.float32)
for i in range(10):
    samples[i] = data_provider.train_representation(i+11)

# 2-D embedding of every sample at each of the 10 epochs.
embeddings_2d_4 = np.zeros((50000, 10, 2))
with torch.no_grad():  # inference only: no autograd graph is needed
    for i in range(50000):
        embedding_2d = model.encoder(torch.from_numpy(samples[:,i,:]).to(device=DEVICE, dtype=torch.float)).cpu().numpy()
        embeddings_2d_4[i] = embedding_2d

In [None]:
# Load the visualization model saved as tnn_hybrid_0.pth.
save_model_path = os.path.join(data_provider.model_path, "tnn_hybrid_{}.pth".format(0))
save_model = torch.load(save_model_path, map_location=torch.device("cpu"))
model.load_state_dict(save_model["state_dict"])
# Move the model to DEVICE rather than a hard-coded CUDA device, so the model
# and the inputs created below are always on the same device (also on CPU-only
# machines, where DEVICE falls back to "cpu").
model.to(device=DEVICE)

# Training representations of epochs 1..10, stacked along axis 0.
# float32 halves the size of this buffer; the values are cast to torch.float
# before the forward pass anyway, so the resulting embeddings are unchanged.
samples = np.zeros((10, 50000, 512), dtype=np.float32)
for i in range(10):
    samples[i] = data_provider.train_representation(i+1)

# 2-D embedding of every sample at each of the 10 epochs.
embeddings_2d_5 = np.zeros((50000, 10, 2))
with torch.no_grad():  # inference only: no autograd graph is needed
    for i in range(50000):
        embedding_2d = model.encoder(torch.from_numpy(samples[:,i,:]).to(device=DEVICE, dtype=torch.float)).cpu().numpy()
        embeddings_2d_5[i] = embedding_2d

In [None]:
# Join the per-segment embeddings along the epoch axis (axis 1) in a single
# call, in the same order as before: _5, _4, _3, _2, then embeddings_2d.
embedding = np.concatenate(
    (
        embeddings_2d_5,
        embeddings_2d_4,
        embeddings_2d_3,
        embeddings_2d_2,
        embeddings_2d,
    ),
    axis=1,
)

In [None]:
# Load the visualization model saved as tnn_hybrid_2.pth.
save_model_path = os.path.join(data_provider.model_path, "tnn_hybrid_{}.pth".format(2))
save_model = torch.load(save_model_path, map_location=torch.device("cpu"))
model.load_state_dict(save_model["state_dict"])
# Move the model to DEVICE rather than a hard-coded CUDA device, so the model
# and the inputs created below are always on the same device (also on CPU-only
# machines, where DEVICE falls back to "cpu").
model.to(device=DEVICE)

# Training representations of epochs 156..199, stacked along axis 0.
# NOTE(review): epoch 156 is also embedded by the tnn_hybrid_1 cell (epochs
# 61..156) and epoch 200 is never embedded. Confirm whether this range should
# be 157..200.
# float32 halves the size of this buffer; the values are cast to torch.float
# before the forward pass anyway, so the resulting embeddings are unchanged.
samples = np.zeros((44, 50000, 512), dtype=np.float32)
for i in range(44):
    samples[i] = data_provider.train_representation(i+156)

# 2-D embedding of every sample at each of the 44 epochs.
embeddings_2d = np.zeros((50000, 44, 2))
with torch.no_grad():  # inference only: no autograd graph is needed
    for i in range(50000):
        embedding_2d = model.encoder(torch.from_numpy(samples[:,i,:]).to(device=DEVICE, dtype=torch.float)).cpu().numpy()
        embeddings_2d[i] = embedding_2d

In [None]:
# Load the visualization model saved as tnn_hybrid_1.pth.
save_model_path = os.path.join(data_provider.model_path, "tnn_hybrid_{}.pth".format(1))
save_model = torch.load(save_model_path, map_location=torch.device("cpu"))
model.load_state_dict(save_model["state_dict"])
# Move the model to DEVICE rather than a hard-coded CUDA device, so the model
# and the inputs created below are always on the same device (also on CPU-only
# machines, where DEVICE falls back to "cpu").
model.to(device=DEVICE)

# Training representations of epochs 61..156, stacked along axis 0.
# float32 halves the size of this buffer; the values are cast to torch.float
# before the forward pass anyway, so the resulting embeddings are unchanged.
samples = np.zeros((96, 50000, 512), dtype=np.float32)
for i in range(96):
    samples[i] = data_provider.train_representation(i+61)

# 2-D embedding of every sample at each of the 96 epochs.
embeddings_2d_1 = np.zeros((50000, 96, 2))
with torch.no_grad():  # inference only: no autograd graph is needed
    for i in range(50000):
        embedding_2d = model.encoder(torch.from_numpy(samples[:,i,:]).to(device=DEVICE, dtype=torch.float)).cpu().numpy()
        embeddings_2d_1[i] = embedding_2d

In [None]:
# Load the visualization model saved as tnn_hybrid_0.pth.
save_model_path = os.path.join(data_provider.model_path, "tnn_hybrid_{}.pth".format(0))
save_model = torch.load(save_model_path, map_location=torch.device("cpu"))
model.load_state_dict(save_model["state_dict"])
# Move the model to DEVICE rather than a hard-coded CUDA device, so the model
# and the inputs created below are always on the same device (also on CPU-only
# machines, where DEVICE falls back to "cpu").
model.to(device=DEVICE)

# Training representations of epochs 1..60, stacked along axis 0.
# float32 halves the size of this buffer; the values are cast to torch.float
# before the forward pass anyway, so the resulting embeddings are unchanged.
samples = np.zeros((60, 50000, 512), dtype=np.float32)
for i in range(60):
    samples[i] = data_provider.train_representation(i+1)

# 2-D embedding of every sample at each of the 60 epochs.
embeddings_2d_2 = np.zeros((50000, 60, 2))
with torch.no_grad():  # inference only: no autograd graph is needed
    for i in range(50000):
        embedding_2d = model.encoder(torch.from_numpy(samples[:,i,:]).to(device=DEVICE, dtype=torch.float)).cpu().numpy()
        embeddings_2d_2[i] = embedding_2d

In [None]:
# Join the three per-segment embeddings along the epoch axis (axis 1) in a
# single call, in the same order as before: _2, _1, then embeddings_2d.
embedding = np.concatenate(
    (embeddings_2d_2, embeddings_2d_1, embeddings_2d),
    axis=1,
)

In [None]:
# Flatten all trailing axes so that each sample becomes a single row vector.
embedding = embedding.reshape(embedding.shape[0], -1)

## test trajectory

In [None]:
# Convert both label collections to NumPy arrays so they support element-wise
# comparison and indexing with index arrays.
noisy_label, clean_label = np.array(noisy_label), np.array(clean_label)

In [None]:
import matplotlib.pyplot as plt
import umap.umap_ as umap
from sklearn.manifold import TSNE 
from sklearn.metrics import silhouette_score, calinski_harabasz_score
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import Birch, KMeans

In [None]:
# For every (noisy) class: reduce the trajectory embeddings of its samples to
# 2-D, split them into two Birch clusters, and plot the result together with
# the sub-cluster centroids.
for cls_num in range(10):
    # Indices of the samples whose noisy label equals cls_num. flatnonzero
    # always returns a 1-D array, whereas argwhere(...).squeeze() collapses to
    # a 0-d array (which breaks len()) when exactly one sample matches.
    cls = np.flatnonzero(np.asarray(noisy_label) == cls_num)
    if len(cls) == 0:
        # Nothing to project or cluster for a label that never occurs.
        continue
    high_data = embedding[cls].reshape(len(cls), -1)

    # Alternative reducer that was tried:
    # TSNE(n_components=2, learning_rate='auto', init='random').fit_transform(high_data)
    reducer = umap.UMAP(n_components=2)
    em_2d = reducer.fit_transform(high_data)

    brc = Birch(n_clusters=2)
    brc.fit(em_2d)

    labels = brc.labels_
    centroid = brc.subcluster_centers_
    centroid_labels = brc.subcluster_labels_
    # Convention: clean 1, noise 0. The larger cluster is taken to be the clean
    # one, so the labels are flipped when cluster 0 is the larger.
    # minlength=2 keeps the comparison valid if only one label value occurs.
    cluster_sizes = np.bincount(labels, minlength=2)
    if cluster_sizes[0] > cluster_sizes[1]:
        centroid_labels = 1 - centroid_labels
        labels = 1 - labels

    # Figure 1: samples coloured by their clean label, with all sub-cluster
    # centroids in black.
    plt.clf()
    plt.scatter(
        em_2d[:, 0],
        em_2d[:, 1],
        s=1,
        c=clean_label[cls],
        cmap="tab10")
    plt.scatter(
        centroid[:, 0],
        centroid[:, 1],
        s=5,
        c='black')
    plt.title("class {}: clean labels".format(cls_num))
    plt.show()

    # Figure 2: samples coloured by the normalised cluster label (clean 1,
    # noise 0), so colours mean the same thing for every class. Colouring by
    # brc.labels_ would ignore the flip applied above.
    plt.clf()
    plt.scatter(
        em_2d[:, 0],
        em_2d[:, 1],
        s=1,
        c=labels,
        cmap="Pastel2")

    cleans = centroid[centroid_labels==1]
    noises = centroid[centroid_labels==0]

    # A clean centroid is flagged as suspicious when its farthest queried
    # neighbour is more than 1.8x as far away as its nearest other centroid
    # (column 0 is the centroid itself). n_neighbors is capped so that the
    # query also works when fewer than 5 clean centroids exist.
    n_neighbors = min(5, len(cleans))
    if n_neighbors >= 2:
        nbrs = NearestNeighbors(n_neighbors=n_neighbors, algorithm='ball_tree').fit(cleans)
        dists, indices = nbrs.kneighbors(cleans)
        suspicious = (dists[:, -1]/ dists[:, 1])>1.8
    else:
        suspicious = np.zeros(len(cleans), dtype=bool)

    # Clean centroids in red, noise centroids in black, suspicious ones in green.
    plt.scatter(
        cleans[:, 0],
        cleans[:, 1],
        s=5,
        c='r')
    plt.scatter(
        noises[:, 0],
        noises[:, 1],
        s=5,
        c='black')
    plt.scatter(
        cleans[suspicious][:, 0],
        cleans[suspicious][:, 1],
        s=5,
        c='g')
    plt.title("class {}: Birch clusters".format(cls_num))
    plt.show()

In [None]:
# Persist `embedding` as embedding.npy inside CONTENT_PATH.
embedding_path = os.path.join(CONTENT_PATH, "embedding.npy")
np.save(embedding_path, embedding)