# Doc2vec from scratch in PyTorch
We will implement the Paragraph Vector model from the paper "Distributed Representations of Sentences and Documents" (Le & Mikolov): https://arxiv.org/abs/1405.4053

In [31]:
import numpy as np
import torch 
import torch.autograd as autograd 
import torch.nn as nn 
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

### Subjectivity Dataset
The subjectivity dataset has 5000 subjective and 5000 objective processed sentences. To get the data:
```
wget http://www.cs.cornell.edu/people/pabo/movie-review-data/rotten_imdb.tar.gz
```
Download this archive and extract it into a directory named `data`.

In [2]:
!ls data

plot.tok.gt9.5000   quote.tok.gt9.5000  subjdata.README.1.0


In [3]:
def read_file(path):
    """Load a text file and return its lines as a list of strings.

    The file is decoded as ISO-8859-1. Every returned line keeps its
    trailing newline character (if it had one).
    """
    with open(path, encoding="ISO-8859-1") as handle:
        # Iterating a text file yields exactly what readlines() returns.
        return list(handle)

In [13]:
# Load the raw lines of both classes: objective sentences from the "plot"
# file, subjective sentences from the "quote" file.
obj_content, sub_content = (
    read_file(path)
    for path in ("data/plot.tok.gt9.5000", "data/quote.tok.gt9.5000")
)

In [14]:
len(obj_content), len(sub_content)

(5000, 5000)

### Compute vocabulary

In [8]:
from collections import defaultdict
def get_vocab(content):
    """Build a document-frequency table for a collection of texts.

    Every element of ``content`` is treated as one document and is split
    on whitespace. A word contributes at most 1 per document, no matter
    how many times it is repeated inside that document.

    Returns a ``defaultdict(float)`` mapping word -> number of documents
    that contain it.
    """
    doc_freq = defaultdict(float)
    for document in content:
        # De-duplicate first so repeats inside one document count once.
        for token in set(document.split()):
            doc_freq[token] += 1
    return doc_freq

In [17]:
# Normalise every sentence (strip surrounding whitespace, lowercase) and
# turn each class into a NumPy array of strings.
sub_content, obj_content = (
    np.array([sentence.strip().lower() for sentence in raw_lines])
    for raw_lines in (sub_content, obj_content)
)
# Full corpus: subjective sentences first, objective sentences after.
content = np.concatenate((sub_content, obj_content))

In [18]:
content[0]

'smart and alert , thirteen conversations about one thing is a small gem .'

In [20]:
# Document frequency of every word in the corpus; show how many distinct
# words there are before any filtering.
word_count = get_vocab(content)
len(word_count)

23908

In [21]:
# let's delete words that are very infrequent
for word in list(word_count):
    if word_count[word] < 5:
        del word_count[word]
len(word_count.keys())

4836

In [22]:
# Index 0 is reserved for padding and index 1 for unknown words; the kept
# vocabulary follows in `word_count` order. `words` is the inverse lookup
# (index -> word) of `vocab2index` (word -> index).
words = ["<PAD>", "UNK"] + list(word_count)
vocab2index = {token: index for index, token in enumerate(words)}

## Sentence encoding

In [23]:
def encode_sentence(s):
    """Convert a sentence into a NumPy array of vocabulary indices.

    The sentence is split on whitespace and each token is looked up in the
    module-level ``vocab2index``; tokens that are not in the vocabulary are
    mapped to the index of ``"UNK"``.
    """
    ids = []
    for token in s.split():
        ids.append(vocab2index.get(token, vocab2index["UNK"]))
    return np.array(ids)

In [24]:
encode_sentence(content[0])

array([13,  2, 14, 11,  1,  9,  3,  7,  6, 12,  8,  5,  4, 10])

### Dataset 

In [35]:
# Encoded ids of the first sentence, kept around to try out sampling below.
text = encode_sentence(content[0])
text

array([13,  2, 14, 11,  1,  9,  3,  7,  6, 12,  8,  5,  4, 10])

We need to be able to sample $k$ consecutive words in order to predict the $(k+1)$-th word.

In [38]:
def sample(text, k=3):
    """Draw one random training example from an encoded document.

    A start position is chosen uniformly at random among all positions
    that leave room for ``k`` context ids plus one target id. Returns the
    ``k`` consecutive ids starting there and the id that directly follows
    them.
    """
    num_starts = len(text) - k  # exclusive upper bound for the start index
    start = np.random.randint(num_starts, size=1)[0]
    stop = start + k
    return text[start:stop], text[stop]

In [44]:
# Draw a few random (context, next word) pairs from the first sentence.
for attempt in range(3):
    drawn = sample(text)
    print(drawn)

(array([ 7,  6, 12]), 8)
(array([ 6, 12,  8]), 5)
(array([14, 11,  1]), 9)


In [48]:
class Wor2VecDataset(Dataset):
    """Dataset that yields one random training example per document.

    Each item is a ``(doc_id, context, next_word)`` triple, where the
    context window and its following word are drawn by ``sample``.

    Args:
        content: indexable collection of encoded documents (each one a
            sequence of word ids that ``sample`` can slice).
        k: number of consecutive context words drawn for every item.
    """

    def __init__(self, content, k=3):
        self.content = content
        # Remember the window size so __getitem__ honours it; the argument
        # used to be accepted and then silently ignored.
        self.k = k

    def __len__(self):
        """Return the number of documents."""
        return len(self.content)

    def __getitem__(self, idx):
        """Return ``(doc_id, context, next_word)`` for document ``idx``.

        The document id is the index itself. A new window is sampled on
        every access, so repeated reads of the same index differ.
        """
        text = self.content[idx]
        doc_id = idx
        context, next_word = sample(text, k=self.k)
        return doc_id, context, next_word

In [55]:
# Encode the whole corpus once, up front; peek at the first sentence.
encoded_text = list(map(encode_sentence, content))
encoded_text[0]

array([13,  2, 14, 11,  1,  9,  3,  7,  6, 12,  8,  5,  4, 10])

In [56]:
# One dataset item per encoded sentence (default context size).
train_ds = Wor2VecDataset(encoded_text)

In [57]:
train_ds[0]

(0, array([12,  8,  5]), 4)

In [71]:
# Tiny batch size, only to inspect what a batch from the loader looks like.
train_dl =  DataLoader(train_ds, batch_size=2, shuffle=True)
doc_id, context, next_word = next(iter(train_dl))

In [77]:
doc_id, context, next_word 

(tensor([9363, 7432]),
 tensor([[2332, 1404,   28],
         [1314, 4546,  109]]),
 tensor([  12, 2622]))

### Model

In [78]:
len(vocab2index), len(train_ds)

(4838, 10000)

In [79]:
# Sizes of the two lookup tables: one row per vocabulary entry and one row
# per document in the training set.
vocab_size = len(vocab2index)
num_docs =  len(train_ds)
emb_size = 100  # dimensionality shared by word and document vectors

word_emb = nn.Embedding(vocab_size, emb_size)
doc_emb = nn.Embedding(num_docs, emb_size)

In [80]:
# Look up one vector per document id in the batch.
# NOTE: this rebinds `doc_id` from the integer ids to their embeddings, so
# re-running the cell would feed embeddings back into the lookup; fetch a
# fresh batch before running it again.
doc_id = doc_emb(doc_id)
doc_id.shape

torch.Size([2, 100])

In [81]:
# Embed every context word: one emb_size vector per word per example.
context_emb = word_emb(context)
context_emb.shape

torch.Size([2, 3, 100])

In [82]:
# Lay the context word vectors of each example side by side in one row
# (everything after the batch dimension is flattened).
context_emb_flat = context_emb.flatten(1)
context_emb_flat.shape

torch.Size([2, 300])

In [84]:
# Join the document vector (`doc_id` holds embeddings at this point) with
# the flattened context vectors along the feature dimension.
x = torch.cat((doc_id, context_emb_flat), dim=1)
x.shape

torch.Size([2, 400])

In [85]:
# Output layer: the input width is 4*emb_size (one document vector plus the
# three context word vectors) and there is one logit per vocabulary word.
linear = nn.Linear(4*emb_size, vocab_size)

In [86]:
linear(x).shape

torch.Size([2, 4838])

In [88]:
F.cross_entropy(linear(x), next_word )

tensor(8.8092, grad_fn=<NllLossBackward0>)

In [106]:
class Doc2Vec(nn.Module):
    """Predict the next word from a document vector and k context words.

    The document embedding and the embeddings of the context words are
    concatenated and passed through a single linear layer that produces
    one logit per vocabulary word.

    Args:
        vocab_size: number of rows in the word embedding table and number
            of output logits.
        num_docs: number of rows in the document embedding table.
        emb_size: dimensionality of both word and document vectors.
        context_size: number of context words per example; it fixes the
            input width of the output layer.
    """

    def __init__(self, vocab_size, num_docs, emb_size=100, context_size=3):
        super(Doc2Vec, self).__init__()
        self.word_emb = nn.Embedding(vocab_size, emb_size)
        self.doc_emb = nn.Embedding(num_docs, emb_size)
        # One document vector plus `context_size` word vectors, concatenated.
        self.linear = nn.Linear((context_size + 1) * emb_size, vocab_size)

    def forward(self, x_doc, x_words):
        """Return next-word logits.

        Args:
            x_doc: integer tensor of document ids, shape ``(batch,)``.
            x_words: integer tensor of context word ids, shape
                ``(batch, context_size)``.

        Returns:
            Float tensor of unnormalised scores, shape ``(batch, vocab_size)``.
        """
        doc_vec = self.doc_emb(x_doc)
        # (batch, context_size, emb_size) -> (batch, context_size * emb_size)
        context_vec = self.word_emb(x_words).flatten(1)
        features = torch.cat((doc_vec, context_vec), dim=1)
        return self.linear(features)

In [107]:
# Instantiate the model with the default embedding size.
model = Doc2Vec(vocab_size, num_docs)

In [120]:
def train_epocs(model, train_dl, optimizer, epochs=10000):
    """Train ``model`` with cross-entropy on next-word prediction.

    Args:
        model: callable as ``model(doc_id, context)`` returning one logit
            per vocabulary word for every example.
        train_dl: iterable of ``(doc_id, context, next_word)`` batches.
        optimizer: optimizer already bound to ``model``'s parameters.
        epochs: number of full passes over ``train_dl``.

    The mean training loss of the epoch is printed every 1000 epochs,
    starting with the first one.
    """
    for i in range(epochs):
        model.train()
        total_loss, total_count = 0.0, 0
        for doc_id, context, next_word in train_dl:
            logits = model(doc_id, context)
            loss = F.cross_entropy(logits, next_word)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # Weight by batch size so a smaller last batch does not skew
            # the epoch average.
            batch_size = next_word.shape[0]
            total_loss += loss.item() * batch_size
            total_count += batch_size
        train_loss = total_loss / total_count if total_count else float("nan")
        # `i % 1000` alone is truthy for every epoch except multiples of
        # 1000, which is the opposite of the intended reporting interval.
        if i % 1000 == 0:
            print("train_loss %.3f" % (train_loss))

In [121]:
# Fresh model, a loader with batches of 1000 for actual training, and an
# Adam optimizer over all model parameters.
model = Doc2Vec(vocab_size, num_docs)
train_dl =  DataLoader(train_ds, batch_size=1000, shuffle=True)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [122]:
# Pull a single batch to sanity-check the tensor shapes before training.
doc_id, context, next_word = next(iter(train_dl))
doc_id.shape, context.shape, next_word.shape

(torch.Size([1000]), torch.Size([1000, 3]), torch.Size([1000]))

In [None]:
#train_epocs(model, train_dl, optimizer)

### Lab:
1. Finish writing the model and the training loop.
2. Can you use the doc embeddings learned by the model to predict the original label (0 or 1, i.e. subjective or objective) of each sentence?