Load the pretrained vector into the memory

In [1]:
from modules.texts import GloVeLoader
import os

path_glove = os.path.join(os.path.expanduser('~'),
             'data/NLP/word_embeddings/GloVe/glove.6B.300d.txt')
glove = GloVeLoader(path_glove)

The pretrained vector file to use: /home/yhs/data/NLP/word_embeddings/GloVe/glove.6B.300d.txt
The number of words in the pretrained vector: 400000
The dimension of the pretrained vector: 300


Load the dataset

In [2]:
from modules.texts import Vocab

file = './data/Trump.txt'

with open(file) as f:
    vocab = Vocab(f.read())

# Load the word embeddings
import torch
import torch.nn as nn

d = 300
emb = nn.Embedding(vocab.V, d)
for word in vocab.word2id:
    try:
        emb.weight.data[vocab[word]] = torch.from_numpy(glove[word])
    except KeyError as e:
        # Case when pretrained embedding for a word does not exist
        pass

# emb.weight.requires_grad = False # suppress updates

In [13]:
import os
from os.path import join
from nltk import sent_tokenize, wordpunct_tokenize
from collections import Counter
from itertools import chain
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable
import numpy as np

class DocumentDataset(Dataset):
    '''
    Documents dataset.
    '''
    
    def __init__(self, filename, vocab, case_sensitive = False):
        '''
        Args:
            filename (string): full path of the document file
            vocab (Vocab): Vocabulary class that contains the vocabulary for a corpus
            emb (nn.Embedding): word embeddings corresponding to the words in words_dict
            case_sensitive (bool): whether lower/uppercase letters differ
        '''
        
        with open(filename) as f:
            raw = f.read()
        if not case_sensitive:
            raw = raw.lower()
        
        self.vocab = vocab
        # input sentences
        self.inputs = vocab.sents2id(raw, case_sensitive)
        np.random.seed(0)
        self.targets = [np.random.randint(2) for sent in self.inputs]
        
    def __getitem__(self, idx):
        inputs = torch.LongTensor(self.inputs[idx]) 
        targets = torch.LongTensor(self.targets[idx])
        
        return inputs, targets
    
    def __len__(self):
        return len(self.inputs)   
    
doc = DocumentDataset(file, vocab)
docloader = DataLoader(doc, batch_size=1, shuffle=False)

for i,t in docloader:
    print(emb(Variable(i)))

TypeError: 'int' object is not subscriptable

For the CNN sentence encoder, I have modified [this code](https://github.com/Shawn1993/cnn-text-classification-pytorch).

In [4]:
import torch.nn.functional as F

class SentenceEncoder(nn.Module):
    
    def __init__(self, vocab_size, emb_size, n_kernels, kernel_sizes, pretrained = None, static = False):
        '''
        Args:
            vocab_size (int): size of the vocabulary
            emb_size (int): dimension of word embeddings
            n_kernels (int): the number of filters
            kernel_sizes (int): a list of sliding windows to be used
            static (bool): whether you want the embeddings to be updated or not
        '''
        super().__init__()
        in_channels = 1
        self.vocab_size = vocab_size
        self.n_kernels = n_kernels
        self.kernel_sizes = kernel_sizes

        self.emb = nn.Embedding(vocab_size, emb_size)
        self.init_emb(pretrained)
        if static:
            self.emb.weight.requires_grad = False
        self.convs = nn.ModuleList(
            [nn.Conv2d(in_channels, n_kernels, (h, emb_size))
             for h in kernel_sizes])
        
        if torch.cuda.is_available():
            self.cuda()
    
    def init_emb(self, emb_pretrained):
        if emb_pretrained == None:
            return
        else:
            self.emb.weight = nn.Parameter(emb_pretrained.weight.data)

    def forward(self, s):
        '''
        Args:
            s (seq_len): a sentence of type torch.LongTensor.
            Each entries represent a word index.
        '''
        # (batch_size = 1, in_channel, seq_len, emb_size)
        s = self.emb(s).unsqueeze(1)
        
        feature_map = [F.relu(conv(s)).squeeze(3)
                       for conv in self.convs]
        feature_pooled = [F.max_pool1d(c, c.size(2)).squeeze(2)
                          for c in feature_map]
        feature_pooled = torch.cat(feature_pooled, 1)
        
        return feature_pooled

In [9]:
vocab_size = emb.weight.data.size(0)
emb_size = emb.weight.data.size(1)
n_kernels = 50
kernel_sizes = [1,2,3,4,5]
sentence_encoder = SentenceEncoder(vocab_size,
                                   emb_size,
                                   n_kernels,
                                   kernel_sizes,
                                   emb)

sents = []
for input, target in docloader:
    ####WARNING: the elements of the kernel_sizes should be larger
    #### than the minimum length of a sentence.
    input = Variable(input).cuda()
    sents.append(sentence_encoder(input))

In [10]:
torch.cat(sents, dim = 0)

Variable containing:
 0.6579  0.5226  1.6746  ...   0.2967  0.8536  0.6642
 0.4898  0.5226  1.6746  ...   0.1773  0.4255  0.7531
 0.4898  0.5226  1.6746  ...   0.2306  0.6965  0.6155
          ...             ⋱             ...          
 0.4898  0.5226  1.6746  ...   0.2634  0.5600  0.5050
 0.5785  0.5226  1.6746  ...   0.1486  0.6727  0.5038
 0.4898  0.5226  1.6746  ...   0.1210  0.9029  0.4325
[torch.cuda.FloatTensor of size 21x250 (GPU 0)]