In [111]:
import torch
import pandas as pd
import numpy as np

from typing import List
from torch_geometric.data import InMemoryDataset

In [112]:
doc0 = ["The", "quick", "brown", "fox", "jumped", "over", "the", "lazy", "dog"]
doc1 = ["Welcome", "to", "the", "dog", "days", "of", "summer"]
docs = [doc0, doc1]
vocab = set(doc0 + doc1)

In [113]:
vocab_to_int = {term: token for token, term in enumerate(vocab)}
int_to_vocab = {token: term for term, token in vocab_to_int.items()}

In [114]:
def tokenize(sent):
    """Map sentence to tokens"""
    return [vocab_to_int[word] for word in sent]


def untokenize(sent):
    """Map tokens to sentence"""
    return [int_to_vocab[token] for token in sent]


def one_hot_encode(sent):
    """One hot encode a sentence"""
    return np.stack([token_to_one_hot(x) for x in sent])


def decode_one_hot(encoding) -> List[str]:
    """Decode a one hot encoding back to sentence"""
    return [one_hot_word_to_token(x) for x in encoding]


def token_to_one_hot(token):
    """One hot encode a token"""
    word = np.zeros(len(vocab), dtype=int)
    word[token] = 1
    return word


def one_hot_word_to_token(one_hot_word):
    """Convert one hot encoded word to token"""
    return int_to_vocab[np.argmax(one_hot_word)]

In [115]:
tokenized = [tokenize(x) for x in docs]
sentences = [untokenize(x) for x in tokenized]

In [116]:
sentences

[['The', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog'],
 ['Welcome', 'to', 'the', 'dog', 'days', 'of', 'summer']]

In [122]:
onehot_encoded = one_hot_encode(tokenized[1])

In [123]:
onehot_encoded

array([[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]])

In [124]:
sentence = decode_one_hot(onehot_encoded)

In [125]:
sentence

['Welcome', 'to', 'the', 'dog', 'days', 'of', 'summer']

In [126]:
onehot_encoded.shape

(7, 14)