In [62]:
from nltk import sent_tokenize, wordpunct_tokenize
from collections import Counter
from itertools import chain

def build_vocab(corpus, top_k=None):
    '''
    Builds a vocabulary from the most frequent tokens of the corpus.

    The corpus is split with ``wordpunct_tokenize``; the kept tokens are sorted
    and the three special tokens are appended after them, so the special
    tokens always occupy the last three ids.

    Args:
        corpus: string of text.
        top_k: total size of the vocabulary, including the special tokens
            "UNK" (unknown word), "_BEGIN_" (beginning of a sentence) and
            "_END_" (end of a sentence). ``None`` keeps every token of the
            corpus. Values below 3 yield a vocabulary that contains only the
            three special tokens.

    Returns:
        word2id: dictionary of (word, id) pairs
        id2word: list of words in the vocabulary, indexed by id
    '''
    special_tokens = ['UNK', '_BEGIN_', '_END_']

    # Reserve room for the special tokens. Testing against None (instead of
    # type(top_k) == int) makes integer-like values such as numpy ints reserve
    # the slots too, and the clamp avoids handing a negative count to
    # most_common().
    if top_k is None:
        n_words = None
    else:
        n_words = max(top_k - len(special_tokens), 0)

    # A corpus token that is spelled like a special token must not be counted
    # as a regular word: it would appear twice in id2word and leave id2word
    # and word2id inconsistent with each other.
    tokens = (
        token for token in wordpunct_tokenize(corpus)
        if token not in special_tokens
    )
    word_counts = Counter(tokens).most_common(n_words)

    id2word = sorted(word for word, _ in word_counts) + special_tokens
    word2id = {word: i for i, word in enumerate(id2word)}

    return word2id, id2word

def sents2id(corpus, top_k = None, case_sensitive = False):
    '''Splits the corpus into sentences and encodes every sentence as word ids.

    Each encoded sentence starts with the id of "_BEGIN_" and finishes with the
    id of "_END_"; tokens missing from the vocabulary are replaced by the id
    of "UNK".

    Args:
        corpus: string.
        top_k: forwarded to build_vocab as the vocabulary size limit.
        case_sensitive: when False (the default) the corpus is lower-cased
            before the vocabulary is built and the sentences are tokenized.

    Returns:
        sents_list: list of sentences, each one a list of word indices.
        word2id: dictionary of (word, id) pairs
        id2word: list of words in the vocabulary
    '''
    text = corpus if case_sensitive else corpus.lower()
    word2id, id2word = build_vocab(text, top_k)

    # Look the marker ids up once instead of once per sentence / token.
    unk_id = word2id['UNK']
    begin_id = word2id['_BEGIN_']
    end_id = word2id['_END_']

    sents_list = [
        [
            begin_id,
            *(word2id.get(token, unk_id) for token in wordpunct_tokenize(sentence)),
            end_id,
        ]
        for sentence in sent_tokenize(text)
    ]

    return sents_list, word2id, id2word

def id2sents(sents, vocab=None):
    '''Returns the string representation of encoded sentences.

    All sentences are flattened into one sequence of ids, every id is mapped
    back to its word, and the words are joined with single spaces.

    Args:
        sents: a list of sentences, where each sentence is a list of word ids.
        vocab: list of words indexed by id. When omitted, the module-level
            ``id2word`` list is used, which keeps existing one-argument calls
            working.

    Returns:
        sents_str: string representation of the sentences ('' for no input).
    '''
    if vocab is None:
        # Backward-compatible fallback: older callers rely on the global
        # vocabulary created by sents2id() at module level.
        vocab = id2word

    return ' '.join(vocab[i_word] for i_word in chain.from_iterable(sents))

In [63]:
# Sample text used to demonstrate the vocabulary / encoding helpers.
# Adjacent literals are concatenated into one single-line string; the leading
# space on most fragments is part of the text and is kept on purpose.
corpus = (
    "Deep learning (also known as deep structured learning or hierarchical learning) "
    "is part of a broader family of machine learning methods based on learning data representations, "
    "as opposed to task-specific algorithms. Learning can be supervised, partially supervised or unsupervised."
    " Some representations are loosely based on interpretation of information processing and communication patterns "
    " in a biological nervous system, such as neural coding that attempts to define a relationship between various "
    " stimuli and associated neuronal responses in the brain. Research attempts to create efficient systems to "
    " learn these representations from large-scale, unlabeled data sets. Deep learning architectures such as "
    " deep neural networks, deep belief networks and recurrent neural networks have been applied to fields "
    " including computer vision, speech recognition, natural language processing, audio recognition, social "
    " network filtering, machine translation, bioinformatics and drug design .where they produced results "
    " comparable to and in some cases superior to human experts."
)

In [64]:
# Encode the sample corpus with a vocabulary limited to 50 entries.
sents_list, word2id, id2word = sents2id(corpus, top_k=50)

# Show the encoded sentences, then decode them back to text as a sanity check.
print(sents_list)
print(id2sents(sents_list))

# Dump both lookup tables, each one followed by an empty line.
print(word2id, end='\n\n')
print(id2word, end='\n\n')

[[48, 16, 21, 0, 5, 47, 8, 16, 39, 21, 28, 47, 21, 47, 47, 47, 25, 4, 47, 47, 25, 22, 21, 47, 10, 26, 21, 15, 33, 1, 8, 27, 43, 47, 2, 37, 47, 3, 49], [48, 21, 13, 47, 41, 1, 47, 41, 28, 47, 3, 49], [48, 36, 33, 47, 47, 10, 26, 47, 25, 18, 30, 6, 14, 29, 17, 4, 47, 47, 42, 1, 40, 8, 24, 47, 47, 9, 43, 47, 4, 32, 47, 47, 38, 6, 47, 47, 34, 17, 47, 47, 3, 49], [48, 47, 9, 43, 47, 47, 47, 43, 20, 47, 33, 47, 19, 2, 35, 1, 45, 15, 47, 3, 49], [48, 16, 21, 7, 40, 8, 16, 24, 23, 1, 16, 12, 23, 6, 47, 24, 23, 47, 11, 47, 43, 47, 47, 47, 47, 1, 47, 31, 1, 47, 47, 30, 1, 47, 31, 1, 47, 47, 47, 1, 22, 44, 1, 47, 6, 47, 47, 3, 46, 47, 47, 47, 47, 43, 6, 17, 36, 47, 47, 43, 47, 47, 3, 49]]
_BEGIN_ deep learning ( also UNK as deep structured learning or UNK learning UNK UNK UNK of a UNK UNK of machine learning UNK based on learning data representations , as opposed to UNK - specific UNK . _END_ _BEGIN_ learning can UNK supervised , UNK supervised or UNK . _END_ _BEGIN_ some representations UNK UNK 