http://pytorch.org/tutorials/beginner/data_loading_tutorial.html#dataset-class

In [4]:
from torch.utils.data import Dataset, DataLoader
import os
from os.path import join
from nltk import sent_tokenize, wordpunct_tokenize
from collections import Counter
from itertools import chain

class DocumentDataset(Dataset):
    '''
    Documents dataset.
    '''
    
    def __init__(self, filename, pretrained = None, to_lowercase = True):
        '''
        Args:
            filename (string)
        '''
        with open(filename) as f:
            raw = f.read()
        if to_lowercase:
            raw = raw.lower()
        
        sentences = [wordpunct_tokenize(sent)
                     for sent in sent_tokenize(raw)]
        
        wordcounts = Counter(chain(*[sent for sent in sentences]))
        
        self.sentences = sentences
        self.wordcounts = wordcounts
        
    def __getitem__(self, idx):
        return self.sentences[idx]
    
    def __len__(self):
        return len(self.sentences)

In [11]:
dataset[0]

['donald',
 'john',
 'trump',
 '(',
 'born',
 'june',
 '14',
 ',',
 '1946',
 ')',
 'is',
 'the',
 '45th',
 'and',
 'current',
 'president',
 'of',
 'the',
 'united',
 'states',
 ',',
 'in',
 'office',
 'since',
 'january',
 '20',
 ',',
 '2017',
 '.',
 'before',
 'entering',
 'politics',
 ',',
 'he',
 'was',
 'a',
 'businessman',
 'and',
 'television',
 'personality',
 '.']

In [15]:
dataset = DocumentDataset('./data/Trump.txt')
train_loader = DataLoader(dataset = dataset,
                         batch_size = 2,
                         shuffle = False)

for batch_idx, input in enumerate(train_loader):
    print(batch_idx, input)
#     input = Variable(input)
    

0 [('donald', 'trump'), ('john', 'was'), ('trump', 'born'), ('(', 'in'), ('born', 'the'), ('june', 'new'), ('14', 'york'), (',', 'city'), ('1946', 'borough'), (')', 'of'), ('is', 'queens'), ('the', '.')]
1 [('he', 'a'), ('earned', 'third'), ('an', '-'), ('economics', 'generation'), ('degree', 'businessman'), ('from', ','), ('the', 'trump'), ('wharton', 'followed'), ('school', 'in'), ('of', 'the'), ('the', 'footsteps'), ('university', 'of'), ('of', 'his'), ('pennsylvania', 'grandmother'), ('.', 'elizabeth')]
2 [('he', 'trump'), ('served', "'"), ('as', 's'), ('chairman', 'business'), ('and', 'career'), ('president', 'primarily'), ('of', 'focused'), ('the', 'on'), ('trump', 'building'), ('organization', 'or'), ('from', 'renovating'), ('1971', 'office'), ('until', 'towers'), ('his', ','), ('inauguration', 'hotels'), ('as', ','), ('president', 'casinos'), ('in', ','), ('january', 'and'), ('2017', 'golf'), (',', 'courses'), ('when', '.')]
3 [('aside', 'he'), ('from', 'has'), ('that', 'writte