http://pytorch.org/tutorials/beginner/data_loading_tutorial.html#dataset-class

In [4]:
from torch.utils.data import Dataset, DataLoader
import os
from os.path import join
from nltk import sent_tokenize, wordpunct_tokenize
from collections import Counter
from itertools import chain

class DocumentDataset(Dataset):
    '''
    Documents dataset.
    '''
    
    def __init__(self, filename, pretrained = None, to_lowercase = True):
        '''
        Args:
            filename (string)
        '''
        with open(filename) as f:
            raw = f.read()
        if to_lowercase:
            raw = raw.lower()
        
        sentences = [wordpunct_tokenize(sent)
                     for sent in sent_tokenize(raw)]
        
        wordcounts = Counter(chain(*[sent for sent in sentences]))
        
        self.sentences = sentences
        self.wordcounts = wordcounts
        
    def __getitem__(self, idx):
        return self.sentences[idx]
    
    def __len__(self):
        return len(self.sentences)

In [11]:
dataset[0]

['donald',
 'john',
 'trump',
 '(',
 'born',
 'june',
 '14',
 ',',
 '1946',
 ')',
 'is',
 'the',
 '45th',
 'and',
 'current',
 'president',
 'of',
 'the',
 'united',
 'states',
 ',',
 'in',
 'office',
 'since',
 'january',
 '20',
 ',',
 '2017',
 '.',
 'before',
 'entering',
 'politics',
 ',',
 'he',
 'was',
 'a',
 'businessman',
 'and',
 'television',
 'personality',
 '.']

In [13]:
dataset = DocumentDataset('./data/Trump.txt')
train_loader = DataLoader(dataset = dataset,
                         batch_size = 3,
                         shuffle = True)

for batch_idx, input in enumerate(train_loader):
    print(batch_idx, input)
#     input = Variable(input)
    

0 [('as', 'his', 'he'), ('of', 'campaign', 'became'), ('2017', 'received', 'the'), (',', 'extensive', 'oldest'), ('he', 'free', 'and'), ('was', 'media', 'wealthiest'), ('the', 'coverage', 'person'), ('544th', ';', 'ever'), ('richest', 'many', 'to'), ('person', 'of', 'assume'), ('in', 'his', 'the'), ('the', 'public', 'presidency'), ('world', 'statements', ','), (',', 'were', 'the'), ('with', 'controversial', 'first'), ('an', 'or', 'without'), ('estimated', 'false', 'prior'), ('net', '.', 'military')]
1 [('after', 'trump', 'trump'), ('trump', 'won', 'was'), ('dismissed', 'the', 'born'), ('fbi', 'general', 'in'), ('director', 'election', 'the'), ('james', 'on', 'new'), ('comey', 'november', 'york'), (',', '8', 'city'), ('the', ',', 'borough'), ('justice', '2016', 'of'), ('department', ',', 'queens'), ('appointed', 'in', '.')]
2 [('in', 'commentators', 'a'), ('foreign', 'described', 'third'), ('policy', 'his', '-'), (',', 'political', 'generation'), ('he', 'positions', 'businessman'), ('wi