In [64]:
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rcParams['text.usetex'] = False

import string
from sklearn.decomposition import PCA
from umap import UMAP
import numpy as np
from TextEmbedding import WordEmbedding

In [65]:
# Instantiate the project-local WordEmbedding helper. The captured output of this
# cell shows a BertModel being initialised from the `bert-large-cased` checkpoint;
# the "weights not used" warning concerns the pre-training heads (cls.*) only.
we = WordEmbedding()

Some weights of the model checkpoint at bert-large-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [66]:
# Human-readable hover labels, one per character of string.printable (same order).
# The invisible control characters are replaced by their escape spelling. The
# mapping is keyed by the character itself rather than by list position, because
# string.printable ends with ' \t\n\r\x0b\x0c' (TAB comes *before* NEWLINE) and
# positional assignment makes it easy to swap the two labels.
WHITESPACE_LABELS = {
    "\t": "\\t",
    "\n": "\\n",
    "\r": "\\r",
    "\v": "\\v",
    "\f": "\\f",
}
chars = [WHITESPACE_LABELS.get(c, c) for c in string.printable]

# Words whose embeddings are projected alongside the per-character embeddings.
words = ['one']
char_embeddings = we.embedding_chars
word_embeddings = we.embed_words(words=words)
print(char_embeddings.shape)
print(word_embeddings.shape)

# Number of character rows; used to split the stacked array back apart after the
# reduction instead of relying on a hard-coded 100.
n_chars = char_embeddings.shape[0]
# The hover labels in `chars` are applied row-by-row to the character embeddings,
# so the two must have the same length.
assert len(chars) == n_chars, "expected one label in `chars` per character embedding"

# Stack characters first, then words, so both are reduced in the same 2-D space.
embeddings = np.concatenate([char_embeddings, word_embeddings], axis=0)

mapper = UMAP(
    n_components=2,
    metric='cosine',
    n_neighbors=15,
    min_dist=0.2,
    random_state=42)
# mapper = PCA(n_components=2)
embeddings_reduced = mapper.fit_transform(embeddings)

# Undo the concatenation: the first n_chars rows are characters, the rest are words.
char_embeddings_reduced = embeddings_reduced[:n_chars]
word_embeddings_reduced = embeddings_reduced[n_chars:]


(100, 1024)
(1, 1024)


In [67]:
# Define a function to get the type of a character
def get_char_type(char):
    """Classify a character label into one of four display categories.

    Args:
        char: The string to classify. Usually a single character, but
            multi-character labels such as the two-character text ``\\n``
            are accepted as well.

    Returns:
        ``'lowercase'`` or ``'uppercase'`` for alphabetic input with a single
        case, ``'numeric'`` for digits, and ``'punctuation'`` for everything
        else. ``'punctuation'`` is the catch-all bucket: it also covers
        whitespace, escape labels, the empty string and alphabetic input that
        is neither lower nor upper case (caseless or titlecase letters,
        mixed-case strings). The result is always a valid category name,
        never ``None``.
    """
    if char.isalpha():
        if char.islower():
            return 'lowercase'
        if char.isupper():
            return 'uppercase'
        # Alphabetic but neither lower nor upper: fall through to the
        # catch-all instead of implicitly returning None.
    elif char.isdigit():
        return 'numeric'
    return 'punctuation'


# Marker colour for each category name returned by get_char_type
colors = dict(
    lowercase='blue',
    uppercase='green',
    numeric='orange',
    punctuation='red',
)

# Colour of every character label, in the same order as `chars`
char_colors = [colors[kind] for kind in map(get_char_type, chars)]

In [68]:
# Euclidean norm of each row of the *reduced* character coordinates (one value
# per character), i.e. the distance of each projected point from the origin.
# NOTE(review): this is not the norm of the original embeddings returned by
# `we.embedding_chars`; confirm that the reduced coordinates are the intended input.
norms = np.linalg.norm(char_embeddings_reduced, axis=1)


In [69]:
import plotly.graph_objects as go
import plotly.io as pio
import plotly.express as px

# Empty figure; the traces below are layered onto it one by one.
fig = go.Figure()

# One marker per character row, labelled with its entry in `chars` and coloured
# with the matching entry of `char_colors`.
char_trace = go.Scatter(
    name='Char Embeddings',
    mode='markers',
    x=char_embeddings_reduced[:, 0],
    y=char_embeddings_reduced[:, 1],
    text=chars,
    marker={'size': 10, 'color': char_colors},
    opacity=0.8,
)
fig.add_trace(char_trace)

# Highlight the characters that make up the embedded words, one trace per
# distinct character (first-occurrence order).
#
# The row of a character is looked up by its position in string.printable, the
# same ordering the `chars` labels are built from. Using the position of the
# character *inside the word* (0, 1, 2, ...) would instead pick the rows of
# '0', '1', '2', ... and highlight the wrong points.
# NOTE(review): assumes the rows of `we.embedding_chars` follow string.printable
# order, as the `chars` labels already do -- confirm against WordEmbedding.
for char in dict.fromkeys("".join(words)):
    row = string.printable.find(char)
    if row < 0:
        # Character has no per-character embedding row; nothing to highlight.
        continue
    x, y = char_embeddings_reduced[row]
    size = norms[row] * 2   # marker size scales with the norm computed in `norms`
    trace = go.Scatter(x=[x], y=[y],
                       name=char,
                       mode="markers",
                       opacity=0.5,
                       marker=dict(size=size, line=dict(width=2, color='white'))
                )
    fig.add_trace(trace)


# Larger pink markers for the projected word embeddings, labelled with `words`
word_trace = go.Scatter(
    name='Word Embeddings',
    mode='markers',
    x=word_embeddings_reduced[:, 0],
    y=word_embeddings_reduced[:, 1],
    text=words,
    marker={'size': 20, 'color': 'pink'},
    opacity=0.8,
)
fig.add_trace(word_trace)

# # Connect the characters of each word with lines
# char_indices = np.arange(0, len(string.printable))
# for i, word in enumerate(words):
#     # Get the indices of the characters in the correct order
#     char_order = [string.printable.index(c) for c in word]
#     fig.add_trace(
#         px.line(
#             x=char_embeddings_reduced[char_indices[char_order], 0],
#             y=char_embeddings_reduced[char_indices[char_order], 1],
#         ).data[0]
#         # go.Scatter3d(x=char_embeddings_reduced[char_indices[char_order],0],
#         #            y=char_embeddings_reduced[char_indices[char_order],1],
#         #            z=char_embeddings_reduced[char_indices[char_order],2],
#         #            mode='lines', line=dict(dash='dash'), name=word)
#     )


# Figure-level settings: title, fixed canvas size, nearest-point hover and a
# white plotting area. The commented-out keys are kept as optional extras.
layout_options = {
    'title': 'Reduced Embeddings',
    # 'xaxis_title': 'A',
    # 'yaxis_title': 'B',
    'width': 800,
    'height': 600,
    'hovermode': 'closest',
    'plot_bgcolor': 'white',
    # 'paper_bgcolor': 'white',
}
fig.update_layout(**layout_options)

fig.show()
