In [1]:
import torch
import torchvision
import plotly.express as px
import numpy as np
import pandas as pd

from tqdm import tqdm
from torch.utils.data import random_split
from torchvision import transforms
from sklearn.semi_supervised import LabelSpreading
from sklearn.manifold import TSNE
from PIL import Image
from sklearn.model_selection import StratifiedShuffleSplit

from modules import np_image_to_base64
from vae import VariationalAutoencoder

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

model = VariationalAutoencoder(3, 64, device)
model.load_state_dict(torch.load("model.pt"))
model.to(device)

mnist_testset = torchvision.datasets.CIFAR10(root="../datasets", train=True, download=True, transform=None)
test_transform = transforms.Compose([
    transforms.ToTensor(),
])


Files already downloaded and verified


# Stratified Split

In [59]:
encoded_samples = []
true_labels = []
imgs = []

m=0.005

features = mnist_testset.data
labels = mnist_testset.targets

stratified_split = StratifiedShuffleSplit(n_splits=1, test_size=m, random_state=42)

train_indices, test_indices = next(stratified_split.split(features, labels))

unlabeled_features = features[train_indices]
unlabeled_labels = labels[train_indices]
labeled_features = features[test_indices]
labeled_labels = labels[test_indices]


unlabeled_features.transform = test_transform
labeled_features.transform = test_transform
print("Len of labeled: ", len(labeled_features), " Len of unlabeled: ", len(unlabeled_features))


Len of labeled:  50  Len of unlabeled:  9950


In [60]:
hist1 = px.histogram(unlabeled_labels, title="Unlabeled")
hist1.show()
hist2 = px.histogram(labeled_labels, title="Labeled")
hist2.show()

In [61]:
model.eval()
transform = transforms.ToTensor()
for index, sample in tqdm(enumerate(labeled_features)):
    img = sample.unsqueeze(0).to(device)
    imgs.append({"image": sample})
    label = labeled_labels[index]
    with torch.no_grad():
        encoded_img  = model.encoder(img.unsqueeze(0).float())
    encoded_img = encoded_img.flatten().cpu().numpy()
    encoded_sample = {f"Enc. Variable {i}": enc for i, enc in enumerate(encoded_img)}
    encoded_sample['label'] = label
    encoded_samples.append(encoded_sample)
    true_labels.append(label)

for index, sample in tqdm(enumerate(unlabeled_features)):
    img = sample.unsqueeze(0).to(device)
    imgs.append({"image": sample})
    true_labels.append(unlabeled_labels[index])
    label = -1
    with torch.no_grad():
        encoded_img  = model.encoder(img.unsqueeze(0).float())
    encoded_img = encoded_img.flatten().cpu().numpy()
    encoded_sample = {f"Enc. Variable {i}": enc for i, enc in enumerate(encoded_img)}
    encoded_sample['label'] = label
    encoded_samples.append(encoded_sample)

encoded_samples = pd.DataFrame(encoded_samples)

50it [00:00, 1133.36it/s]
9950it [00:07, 1392.76it/s]


In [62]:
tsne = TSNE(n_components=2)
tsne_results = tsne.fit_transform(encoded_samples.drop(['label'],axis=1))

In [63]:
label_prop_model = LabelSpreading()
label_prop_model.fit(tsne_results, encoded_samples["label"].astype("int"))

labels = label_prop_model.predict(tsne_results)


invalid value encountered in divide



In [64]:
from sklearn.metrics import accuracy_score
accuracy_score(true_labels, labels)

0.4544

In [65]:
fig = px.scatter(tsne_results, x=0, y=1, color=labels.astype(str),labels={'0': 'tsne-2d-one', '1': 'tsne-2d-two'},
                 color_discrete_map= {'-1': "black"})
fig.show()

# Random Split

In [66]:
encoded_samples = []
true_labels = []
imgs = []

mnist_testset = torchvision.datasets.MNIST(root="", train=False, download=True, transform=None)
test_transform = transforms.Compose([
    transforms.ToTensor(),
])
mnist_testset.transform = test_transform

m=len(mnist_testset)
mnist_testset_label, mnist_testset_unlabel = random_split(mnist_testset, [int(m*0.005), int(m*0.995)], torch.Generator().manual_seed(42))
print("Len of labeled: ", len(mnist_testset_label), " Len of unlabeled: ", len(mnist_testset_unlabel))

Len of labeled:  50  Len of unlabeled:  9950


In [67]:
labels_label = [label for _, label in mnist_testset_label]
labels_unlabel = [label for _, label in mnist_testset_unlabel]
hist1 = px.histogram(labels_unlabel, title="Unlabeled")
hist1.show()
hist2 = px.histogram(labels_label, title="Labeled")
hist2.show()

In [68]:
model.eval()
for sample in tqdm(mnist_testset_label):
    img = sample[0].unsqueeze(0).to(device)
    imgs.append({"image": sample[0]})
    label = sample[1]
    with torch.no_grad():
        encoded_img  = model.encoder(img)
    encoded_img = encoded_img.flatten().cpu().numpy()
    encoded_sample = {f"Enc. Variable {i}": enc for i, enc in enumerate(encoded_img)}
    encoded_sample['label'] = label
    encoded_samples.append(encoded_sample)
    true_labels.append(label)

for sample in tqdm(mnist_testset_unlabel):
    img = sample[0].unsqueeze(0).to(device)
    imgs.append({"image": sample[0]})
    true_labels.append(sample[1])
    label = -1
    with torch.no_grad():
        encoded_img  = model.encoder(img)
    encoded_img = encoded_img.flatten().cpu().numpy()
    encoded_sample = {f"Enc. Variable {i}": enc for i, enc in enumerate(encoded_img)}
    encoded_sample['label'] = label
    encoded_samples.append(encoded_sample)

encoded_samples = pd.DataFrame(encoded_samples)

100%|██████████| 50/50 [00:00<00:00, 771.95it/s]
100%|██████████| 9950/9950 [00:08<00:00, 1185.69it/s]


In [69]:
tsne = TSNE(n_components=2)
tsne_results = tsne.fit_transform(encoded_samples.drop(['label'],axis=1))

In [70]:
label_prop_model = LabelSpreading()
label_prop_model.fit(tsne_results, encoded_samples["label"])
labels = label_prop_model.predict(tsne_results)


invalid value encountered in divide



In [71]:
accuracy_score(true_labels, labels)

0.4846

In [72]:
fig = px.scatter(tsne_results, x=0, y=1, color=labels.astype(str),labels={'0': 'tsne-2d-one', '1': 'tsne-2d-two'},
                 color_discrete_map= {'-1': "black"})
fig.show()

# TSNE trick from https://arxiv.org/pdf/1712.09005.pdf

In [82]:
tsne = TSNE(n_components=2, perplexity=50, early_exaggeration=30, n_iter=2000, random_state=42)
tsne_results = tsne.fit_transform(encoded_samples.drop(['label'],axis=1))

In [83]:
true_labels_str = [str(label) for label in true_labels]
fig = px.scatter(tsne_results, x=0, y=1, color=true_labels_str,labels={'0': 'tsne-2d-one', '1': 'tsne-2d-two'},
                 color_discrete_map= {'-1': "black"})
fig.show()

# Clustering and selection

In [1]:
import torch
import torchvision
import plotly.express as px
import numpy as np
import pandas as pd

from tqdm import tqdm
from torch.utils.data import random_split
from torchvision import transforms
from sklearn.semi_supervised import LabelSpreading
from sklearn.manifold import TSNE
from PIL import Image
from sklearn.model_selection import StratifiedShuffleSplit

from modules import np_image_to_base64
from vae import VariationalAutoencoder
from sklearn.cluster import DBSCAN, MeanShift

  from .autonotebook import tqdm as notebook_tqdm


In [33]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

model = VariationalAutoencoder(3, 64, device)
model.load_state_dict(torch.load("model.pt"))
model.to(device)

test_transform = transforms.Compose([
    transforms.ToTensor(),
])
mnist_testset = torchvision.datasets.CIFAR10(root="../datasets", train=False, download=True, transform=test_transform)

Files already downloaded and verified


In [34]:
encoded_samples = []
true_labels = []

In [35]:
model.eval()
for sample in tqdm(mnist_testset):
    img = sample[0].unsqueeze(0).to(device)
    label = sample[1]
    with torch.no_grad():
        encoded_img  = model.encoder.encode(img)
    encoded_img = encoded_img.flatten().cpu().numpy()
    encoded_sample = {f"Enc. Variable {i}": enc for i, enc in enumerate(encoded_img)}
    encoded_samples.append(encoded_sample)
    true_labels.append(label)

encoded_samples = pd.DataFrame(encoded_samples)

100%|██████████| 10000/10000 [00:06<00:00, 1662.52it/s]


In [36]:
# tsne = TSNE(n_components=2, perplexity=50, early_exaggeration=8,n_iter=250, n_iter_without_progress=750)
# tsne_results = tsne.fit_transform(encoded_samples)

In [37]:
# tsne_2 = TSNE(n_components=2, n_iter=250,n_iter_without_progress=0, early_exaggeration=2)
# tsne_results = tsne_2.fit_transform(tsne_results)

In [38]:
tsne = TSNE(n_components=2, perplexity=50)
tsne_results = tsne.fit_transform(encoded_samples)

In [46]:
ms = DBSCAN(eps=3, min_samples=20).fit(tsne_results)
labels = ms.labels_
labels_unique = np.unique(labels)
print(len(labels_unique))

2


In [47]:
cluster_labels_str = [str(label) for label in labels]
fig = px.scatter(tsne_results, x=0, y=1, color=cluster_labels_str,labels={'0': 'tsne-2d-one', '1': 'tsne-2d-two'},
                 color_discrete_map= {'-1': "black"})
fig.show()

In [41]:
import random
from collections import defaultdict
label_indices = defaultdict(list)
for idx, label in enumerate(labels):
    label_indices[label].append(idx)
    
num_samples_per_label = 10

selected_indices = []
for label in label_indices.keys():
    selected_indices.extend(random.sample(label_indices[label], num_samples_per_label))
print(selected_indices)

ValueError: Sample larger than population or is negative

In [None]:
selected = []
for idx, label in enumerate(labels):
    if idx in selected_indices:
        selected.append(str(1))
    else:
        selected.append(str(-1))

In [None]:
fig = px.scatter(tsne_results, x=0, y=1, color=selected,labels={'0': 'tsne-2d-one', '1': 'tsne-2d-two'},
                 color_discrete_map= {'-1': "black"})
fig.show()

In [None]:
labels = []

for idx, label in enumerate(true_labels):
    if idx in selected_indices:
        labels.append(label)
    else:
        labels.append(-1)

In [None]:
labels_str = [str(label) for label in labels]
fig = px.scatter(tsne_results, x=0, y=1, color=labels_str,labels={'0': 'tsne-2d-one', '1': 'tsne-2d-two'},
                 color_discrete_map= {'-1': "black"})
fig.show()

In [None]:
label_prop_model = LabelSpreading()
label_prop_model.fit(tsne_results, labels)
labels = label_prop_model.predict(tsne_results)


invalid value encountered in divide



In [None]:
labels_str = [str(label) for label in labels]
fig = px.scatter(tsne_results, x=0, y=1, color=labels_str,labels={'0': 'tsne-2d-one', '1': 'tsne-2d-two'},
                 color_discrete_map= {'-1': "black"})
fig.show()

In [None]:
accuracy_score(true_labels, labels)

NameError: name 'accuracy_score' is not defined

# Other