In [None]:
import sys
sys.path.insert(0, "/vectorizer")

import numpy as np 
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.ticker import (MultipleLocator, FormatStrFormatter,
                               AutoMinorLocator)
                               
from scipy.spatial.distance import cosine
from sklearn.manifold import TSNE

from vectorizer.utils import load_embeddings_from_hdf5, load_labels_from_npy

In [None]:
# Directory that contains the embedding and label files used below.
EMBEDDINGS_DIR = "/datasets/probing/rc_probing/v0.3/balanced/embeddings/sentence-embeddings"

# Embeddings file to visualise. Alternatives that can be swapped in instead:
#   f"{EMBEDDINGS_DIR}/dev.tsv_bert-base-cased_layer=0_pooler=cls.hdf5"
#   f"{EMBEDDINGS_DIR}/dev.tsv_bert-base-cased_layer=5_pooler=mean.hdf5"
EMBEDDINGS_FILE = f"{EMBEDDINGS_DIR}/dev.tsv_glove_layer=0_pooler=mean.hdf5"

# Labels file read alongside the embeddings.
LABELS_FILE = f"{EMBEDDINGS_DIR}/dev_labels.npy"

In [None]:
# Load the embeddings and their labels from disk.
# NOTE(review): both loaders come from vectorizer.utils and their return types are
# not visible here; later cells rely on `.shape` for both objects and on
# `.flatten()` for the labels -- confirm against the loader implementations.
embeddings = load_embeddings_from_hdf5(EMBEDDINGS_FILE)
labels = load_labels_from_npy(LABELS_FILE)

In [None]:
# Sanity check: show both shapes (one per line), then the flattened labels.
print(embeddings.shape, labels.shape, sep='\n')
print(labels.flatten())

In [None]:
# Peek at the first 10 rows of the embeddings, all columns.
print(embeddings[:10, :])

In [None]:
# Split the sample positions by label value: 1 -> positive, 0 -> negative.
# The labels are flattened once and reused for both comparisons.
flat_labels = labels.flatten()
positive_sample_indices = np.flatnonzero(flat_labels == 1)
negative_sample_indices = np.flatnonzero(flat_labels == 0)

# Report how many samples fall into each class.
print(positive_sample_indices.size)
print(negative_sample_indices.size)

In [None]:
# Overlay one histogram of embedding values per sample, for the first N samples.
N = 100
fig, axes = plt.subplots(1, 1, figsize=(7, 4), dpi=100)

for idx, sample_embeddings in enumerate(embeddings[:N]):
    # density=False keeps raw counts on the y-axis; the low alpha lets the
    # overlapping per-sample histograms remain visible.
    axes.hist(sample_embeddings, alpha=0.25, bins=50, density=False, label=f'sample: {idx}')

# No legend is drawn: with up to N entries it would cover the plot. The
# per-sample labels are kept so `axes.legend(loc='best')` can be enabled for small N.
axes.set(
    title=f'Embedding value distributions (first {N} samples)',
    xlabel='Embedding value',
    ylabel='Count',
);

In [None]:
# Project the embeddings to two dimensions with t-SNE.
# `ppl` and `init` are kept as named settings because the plotting cell
# below shows them in the figure title.
ppl, init = 30, 'random'

tsne = TSNE(
    n_components=2,
    init=init,
    perplexity=ppl,
    random_state=123,  # fixed seed passed to TSNE
)

# The explicit [:] is kept as-is; presumably it materialises the full array if
# `embeddings` is a lazily-read dataset -- TODO confirm against the loader.
tsne_embedded = tsne.fit_transform(embeddings[:])
print(tsne_embedded.shape)

In [None]:
# Scatter plot of the 2-D t-SNE projection, one series per label value.
fig, axes = plt.subplots(1, 1, figsize=(7, 7), dpi=100)

# Positive samples are drawn first and negative samples second, so the
# series order (and therefore colours and legend order) stays the same.
for class_label, class_indices in (('1', positive_sample_indices),
                                   ('0', negative_sample_indices)):
    x, y = tsne_embedded[class_indices, 0], tsne_embedded[class_indices, 1]
    axes.scatter(x, y, marker='.', linewidths=2, alpha=0.5, label=class_label)

# Record the t-SNE settings in the title so the figure is self-describing.
axes.set_title(f'init={init} -- ppl={ppl}')
axes.legend(loc='best')

# Minor ticks on both axes plus a light dashed grid.
for axis in (axes.xaxis, axes.yaxis):
    axis.set_minor_locator(AutoMinorLocator())
axes.tick_params(which='minor', length=3, color='black')
axes.grid(color='gray', linestyle='--', linewidth=.25)

plt.tight_layout();