In [1]:
from nltk import sent_tokenize, wordpunct_tokenize
from collections import Counter
from itertools import chain
import torch
import numpy as np

class Vocab:
    '''Word-level vocabulary built from a text corpus.

    Maps tokens (as produced by ``wordpunct_tokenize``) to integer ids and back,
    and converts sequences of ids to and from one-hot torch tensors.

    The corpus words are stored in sorted order, followed by the three special
    tokens 'UNK' (unknown word), '_BEGIN_' (beginning of a sentence) and
    '_END_' (end of a sentence), which therefore always own the last three ids.

    Attributes:
        id2word: list of words; the position of a word in the list is its id.
        word2id: dictionary of (word, id) pairs.
        V: size of the vocabulary, i.e. ``len(id2word)``.
    '''

    # Special tokens, in the order in which they are appended after the corpus words.
    SPECIAL_TOKENS = ('UNK', '_BEGIN_', '_END_')

    def __init__(self, corpus, top_k = None, case_sensitive = False):
        '''
        Builds a vocabulary, using the words in the corpus

        Args:
            corpus: string of text.
            top_k: the number of words to be included in the vocabulary, including the special tokens:
            "UNK(unknown)", "_BEGIN_(beginning of a sentence)", "_END_(end of a sentence)".
            None (default) keeps every word of the corpus; values below 3 keep the
            special tokens only.
            case_sensitive: when False (default) the corpus is lowercased before counting.
        '''
        if top_k is not None:
            # Three slots are reserved for the special tokens; never ask for a
            # negative number of corpus words.
            top_k = max(int(top_k) - 3, 0)
        if not case_sensitive:
            corpus = corpus.lower()

        word_counts = Counter(wordpunct_tokenize(corpus)).most_common(top_k)

        # A case-sensitive corpus may itself contain e.g. 'UNK'; drop such words
        # so that every special token appears exactly once in id2word.
        specials = self.SPECIAL_TOKENS
        words = sorted(word for word, _ in word_counts if word not in specials)
        id2word = words + list(specials)
        word2id = {word: i for i, word in enumerate(id2word)}

        self.id2word = id2word
        self.word2id = word2id
        self.V = len(id2word)

    def sents2id(self, text, top_k = None, case_sensitive = False):
        '''Tokenizes a text into sentences, mapping the words to corresponding indices.

        Words missing from the vocabulary are mapped to the id of 'UNK', and every
        sentence is wrapped in the ids of '_BEGIN_' and '_END_'.

        Args:
            text: string.
            top_k: unused; kept so that existing calls passing it keep working.
            case_sensitive: when False (default) the text is lowercased before
            tokenizing. Should match the setting the vocabulary was built with,
            otherwise capitalised words are looked up in a lowercased vocabulary
            (or vice versa) and fall back to 'UNK'.

        Returns:
            sents_list: List of sentences, where each sentence is a list of word indices.
        '''
        word2id = self.word2id
        unk, begin, end = word2id['UNK'], word2id['_BEGIN_'], word2id['_END_']

        if not case_sensitive:
            text = text.lower()

        sents_list = []
        for sent in sent_tokenize(text):
            ids = [word2id.get(word, unk) for word in wordpunct_tokenize(sent)]
            sents_list.append([begin] + ids + [end])

        return sents_list

    def id2sents(self, sents):
        '''Returns the string representation of the sentences.

        Args:
            sents: a list of sentences, where each sentence is a list of word ids
            in the dictionary.

        Returns:
            sents_str: a single string with the words of all the sentences, in
            order, joined by single spaces (special tokens included).
        '''

        return ' '.join(self.id2word[i_word] for i_word in chain(*sents))

    def sent2onehot(self, tokens):
        '''
        Converts the list of word indices into the corresponding list of one-hot vectors

        Args:
            tokens: a sequence of word indices

        Returns:
            onehots: a float tensor of shape (len(tokens), V) whose i-th row is the
            one-hot vector of tokens[i]. An empty sequence gives a (0, V) tensor.
        '''
        if torch.is_tensor(tokens):
            tokens = tokens.tolist()
        indices = np.asarray(list(tokens), dtype=np.int64)

        # Allocate the whole matrix at once and set one entry per row, instead of
        # stacking per-token vectors (np.vstack raises on an empty list).
        onehots = np.zeros((len(indices), self.V), dtype=np.float32)
        onehots[np.arange(len(indices)), indices] = 1.0

        return torch.from_numpy(onehots)

    def onehot2sent(self, vecs):
        '''
        Converts a sequence of one-hot vectors into the corresponding sequence of words

        Args:
            vecs: a sequence of one-hot vectors.
            Should be a 2-D torch tensor where each row corresponds to a one-hot vector of a word.

        Returns:
            sent: a list of words (strings), one per row of vecs, given by the
            position of the maximum of each row.
        '''
        if vecs.shape[0] == 0:
            return []

        _, argmaxs = torch.max(vecs, dim = 1) # dim: axis to get argmaxs

        # tolist() yields plain Python ints; iterating the tensor directly yields
        # 0-dim tensors, which are not valid list indices for the int check in
        # __getitem__.
        return [self.id2word[i] for i in argmaxs.tolist()]

    def __str__(self):
        '''Returns the string form of the word2id dictionary.'''
        return str(self.word2id)

    def __getitem__(self, key):
        '''Looks up a word by id, or an id by word.

        Args:
            key: an integer id (Python int, NumPy integer or 0-dim integer torch
            tensor) or a word (str).

        Returns:
            The word for an integer key, the id for a string key. For any other
            type of key 'Wrong type' is printed and None is returned.

        Raises:
            IndexError: if an integer key is out of range.
            KeyError: if a string key is not in the vocabulary.
        '''
        if isinstance(key, str):
            return self.word2id[key]

        if torch.is_tensor(key) and key.dim() == 0:
            key = key.item()

        # bool is a subclass of int but is not a meaningful id.
        if isinstance(key, (int, np.integer)) and not isinstance(key, bool):
            return self.id2word[key]

        print('Wrong type')
        return None

In [2]:
# Sample text used by the cells below to build and exercise the vocabulary.
# The trailing backslashes sit inside the string literal, so the value is one
# single-line string; the leading space on each continued line is part of the text.
# NOTE(review): "design .where" looks like a typo (probably "design, where") —
# confirm before changing it, since the text determines the tokens and ids printed below.
corpus = 'Deep learning (also known as deep structured learning or hierarchical learning) \
is part of a broader family of machine learning methods based on learning data representations, \
as opposed to task-specific algorithms. Learning can be supervised, partially supervised or unsupervised.\
 Some representations are loosely based on interpretation of information processing and communication patterns \
 in a biological nervous system, such as neural coding that attempts to define a relationship between various \
 stimuli and associated neuronal responses in the brain. Research attempts to create efficient systems to \
 learn these representations from large-scale, unlabeled data sets. Deep learning architectures such as \
 deep neural networks, deep belief networks and recurrent neural networks have been applied to fields \
 including computer vision, speech recognition, natural language processing, audio recognition, social \
 network filtering, machine translation, bioinformatics and drug design .where they produced results \
 comparable to and in some cases superior to human experts.'

In [3]:
# Build the vocabulary from the whole corpus with the defaults
# (top_k=None keeps every word; case_sensitive=False lowercases the text first).
vocab = Vocab(corpus)

In [4]:
# Encode the corpus as one list of word ids per sentence (each wrapped in the
# _BEGIN_/_END_ ids), then print the raw ids and the text decoded back from them.
sents = vocab.sents2id(corpus)
print(sents)
print(vocab.id2sents(sents))

[[107, 33, 55, 0, 7, 51, 12, 33, 88, 55, 68, 44, 55, 1, 50, 69, 65, 5, 24, 39, 65, 57, 55, 58, 16, 66, 55, 32, 77, 2, 12, 67, 99, 94, 3, 85, 6, 4, 108], [107, 55, 25, 17, 91, 2, 70, 91, 68, 102, 4, 108], [107, 84, 77, 11, 56, 16, 66, 49, 65, 48, 72, 8, 28, 71, 46, 5, 22, 60, 92, 2, 89, 12, 63, 27, 95, 14, 99, 34, 5, 76, 20, 103, 87, 8, 13, 64, 79, 46, 96, 23, 4, 108], [107, 78, 14, 99, 31, 37, 93, 99, 54, 97, 77, 42, 53, 3, 81, 2, 101, 32, 82, 4, 108], [107, 33, 55, 10, 89, 12, 33, 63, 62, 2, 33, 19, 62, 8, 75, 63, 62, 43, 18, 9, 99, 40, 47, 30, 104, 2, 86, 74, 2, 59, 52, 72, 2, 15, 74, 2, 83, 61, 41, 2, 57, 100, 2, 21, 8, 36, 35, 4, 105, 98, 73, 80, 29, 99, 8, 46, 84, 26, 90, 99, 45, 38, 4, 108]]
_BEGIN_ deep learning ( also known as deep structured learning or hierarchical learning ) is part of a broader family of machine learning methods based on learning data representations , as opposed to task - specific algorithms . _END_ _BEGIN_ learning can be supervised , partially supervised

In [5]:
# An int key returns the word stored at that id.
vocab[1]

')'

In [6]:
# A str key returns the id of that word.
vocab[')']

1

In [7]:
# One-hot encode the first sentence (one row per token, one column per vocabulary
# entry), print the tensor size, then map the rows back through onehot2sent.
onehot = vocab.sent2onehot(sents[0])
print(onehot.size())
print(vocab.onehot2sent(onehot))

torch.Size([39, 109])
['_BEGIN_', 'deep', 'learning', '(', 'also', 'known', 'as', 'deep', 'structured', 'learning', 'or', 'hierarchical', 'learning', ')', 'is', 'part', 'of', 'a', 'broader', 'family', 'of', 'machine', 'learning', 'methods', 'based', 'on', 'learning', 'data', 'representations', ',', 'as', 'opposed', 'to', 'task', '-', 'specific', 'algorithms', '.', '_END_']
