In [122]:
import torch
import re
import pandas as pd
from torch.autograd import Variable
import numpy as np
import torch.nn.functional as F
from collections import Counter
from torch.utils.data import Dataset, DataLoader

In [5]:
with open("Squad_text") as fp:
    corpus = fp.read()

In [6]:
word_counter = Counter()
i = 0
for word in corpus.split(" "):
    word_counter[word] += 1

In [7]:
word_counter["history"]

41

In [8]:
total_words = len(word_counter)

## Data Munging ##

We go through the corpus, and create a data frame with 5 words. The middle word will be y, and the other 4 context words will be the x values.

In [39]:
df = pd.DataFrame(columns=['context','word'])

In [None]:
# Take the corpus and split it into continuous windoes of 5 words, picking
# the middle word as the outcome and the surrounding 4 as context words
text_data = [word for word in corpus.split(" ")]
data_set = []
current = 0
idx = 0
while total_words - current >= 5:
    end = current + 5
    ci = current + 2
    window = text_data[current:end]
    current += 1 
    count = 0
    string = ""
    for word in window:
        if count == 2:
            context_word = window[count]
        elif count == 4:
            string += word
            entry = {}
            df.loc[idx] = [string, context_word]
            entry[context_word] = string
            string = ""
            data_set.append(entry)
            count = 0
            idx += 1
        else:
            string += word
        count += 1
        string += ' '

In [110]:
len(data_set)

42497

In [111]:
data = df.copy(deep=True)

In [112]:
# sample the data to shuffle it
data = data.sample(frac=1).reset_index(drop=True)

In [113]:
# remove all non text symbols from the words
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"([.,!?])", r" \1 ", text)
    text = re.sub(r"[^a-zA-Z]+", r" ", text)
    return text
data.context = data.context.apply(preprocess_text)

In [115]:
data['split'] = ''
data.split.iloc[:29747] = 'train'
data.split.iloc[29747:36122] = 'val'
data.split.iloc[36122:] = 'test'

In [141]:
data.head()

Unnamed: 0,context,word,split
0,my body is for,which,train
1,school in moved to,Tesla,train
2,significant buildings st john s,are,train
3,on milutin,April,train
4,won the twice as,cup,train


In [142]:
data.tail()

Unnamed: 0,context,word,split
42492,does it to solve,take,test
42493,last semester in tesla,December,test
42494,required by most efficient,the,test
42495,warsaw is edge of,the,test
42496,of england nobility of,The,test


In [102]:
data.to_csv("squad11.csv")

In [198]:
class Vocabulary(object):
    def __init__(self, unk_token="<UNK>", token_to_idx=None):
        
        if token_to_idx is None:
            self._token_to_idx = {}
        
        
        self._idx_to_token = {idx: token
                              for token, idx in self._token_to_idx.items()}
        
        self.unk_token = unk_token
        self.unk_index = 0
        self._token_to_idx[self.unk_token] = self.unk_index
        self._idx_to_token[self.unk_index] = self.unk_token
    
    def add_token(self, token):
        if token not in self._token_to_idx:
            index = len(self._idx_to_token)
            self._idx_to_token[index] = token
            self._token_to_idx[token] = index
    
    def add_many(self, text):
        for word in text.split(" "):
            self.add_token(word)
            
    def lookup_token(self, token):
        index = self.unk_index
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        return index

    def to_serializable(self):
        """ returns a dictionary that can be serialized """
        return {'token_to_idx': self._token_to_idx, 
                'unk_token': self._unk_token} 
 
    @classmethod
    def from_serializable(cls, contents):
        """ instantiates the Vocabulary from a serialized dictionary """
        return cls(**contents)
    
    def lookup_index(self, index):
        if index not in self._idx_to_token:
            raise KeyError("the index (%d) is not in the Vocabulary" % index)
        return self._idx_to_token[index]  
    
    def __str__(self):
        return "<Vocabulary(size=%d)>" % len(self)

    def __len__(self):
        return len(self._token_to_idx)

In [None]:
class Vectorizer(object):
    def __init__(self, corpus_vocab, vector_size):
        self._corpus_vocab = corpus_vocab
        self._token_to_vector = {}
        self._vector_size = vector_size
        
    def vectorize(self, token, vector=None):
        if token not in self._token_to_vector:
            vector = Variable(torch.randn(self._vector_size).float(), requires_grad=True)
        self._token_to_vector[token] = vector

        return vector
    
    def update_vector(self, vector):
        if token not self._token_to_vector:
            self._token_to_vector[token] = vector
    
    @classmethod
    def from_dataframe(cls, cbow_df):
        """Instantiate the vectorizer from the dataset dataframe
        
        Args:
            cbow_df (pandas.DataFrame): the target dataset
        Returns:
            an instance of the CBOWVectorizer
        """
        cbow_vocab = Vocabulary()
        for index, row in cbow_df.iterrows():
            for token in row.context.split(' '):
                cbow_vocab.add_token(token)
            cbow_vocab.add_token(row.target)
            
        return cls(cbow_vocab)

    @classmethod
    def from_serializable(cls, contents):
        cbow_vocab = \
            Vocabulary.from_serializable(contents['cbow_vocab'])
        return cls(cbow_vocab=cbow_vocab)

    def to_serializable(self):
        return {'cbow_vocab': self.cbow_vocab.to_serializable()}

In [123]:
class CBOWDataset(Dataset):
    def __init__(self, cbow_df, vectorizer):
        """
        Args:
            cbow_df (pandas.DataFrame): the dataset
            vectorizer (CBOWVectorizer): vectorizer instatiated from dataset
        """
        self.cbow_df = cbow_df
        self._vectorizer = vectorizer
        
        measure_len = lambda context: len(context.split(" "))
        self._max_seq_length = max(map(measure_len, cbow_df.context))
        
        self.train_df = self.cbow_df[self.cbow_df.split=='train']
        self.train_size = len(self.train_df)

        self.val_df = self.cbow_df[self.cbow_df.split=='val']
        self.validation_size = len(self.val_df)

        self.test_df = self.cbow_df[self.cbow_df.split=='test']
        self.test_size = len(self.test_df)

        self._lookup_dict = {'train': (self.train_df, self.train_size),
                             'val': (self.val_df, self.validation_size),
                             'test': (self.test_df, self.test_size)}

        self.set_split('train')

    @classmethod
    def load_dataset_and_make_vectorizer(cls, corpus_file):
        cbow_df = pd.read_csv(cbow_csv)
        train_cbow_df = cbow_df[cbow_df.split=='train']
        return cls(cbow_df, Vectorizer.from_dataframe(train_cbow_df))

    def get_vectorizer(self):
        """ returns the vectorizer """
        return self._vectorizer
        
    def __len__(self):
        return self._target_size

    def __getitem__(self, index):
        """the primary entry point method for PyTorch datasets
        
        Args:
            index (int): the index to the data point 
        Returns:
            a dictionary holding the data point's features (x_data) and label (y_target)
        """
        row = self._target_df.iloc[index]

        context_vector = \
            self._vectorizer.vectorize(row.context, self._max_seq_length)
        target_index = self._vectorizer.cbow_vocab.lookup_token(row.target)

        return {'x_data': context_vector,
                'y_target': target_index}

    def get_num_batches(self, batch_size):
        """Given a batch size, return the number of batches in the dataset
        
        Args:
            batch_size (int)
        Returns:
            number of batches in the dataset
        """
        return len(self) // batch_size
    
def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=True, device="cpu"): 
    """
    A generator function which wraps the PyTorch DataLoader. It will 
      ensure each tensor is on the write device location.
    """
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size,
                            shuffle=shuffle, drop_last=drop_last)

    for data_dict in dataloader:
        out_data_dict = {}
        for name, tensor in data_dict.items():
            out_data_dict[name] = data_dict[name].to(device)
        yield out_data_dict

In [None]:
class CBOWClassifier(nn.Module):
    """ a simple perceptron based classifier """
    def __init__(self, num_features):
        """
        Args:
            num_features (int): the size of the input feature vector
        """
        super(CBOWClassifier, self).__init__()
        self.fc1 = nn.Linear(in_features=num_features, out_features=num_features)
        
    def forward(self, x_in, apply_sigmoid=False):
        """The forward pass of the classifier
        
        Args:
            x_in (torch.Tensor): an input data tensor. 
                x_in.shape should be (batch, num_features)
            apply_sigmoid (bool): a flag for the sigmoid activation
                should be false if used with the Cross Entropy losses
        Returns:
            the resulting tensor. tensor.shape should be (batch,)
        """
        y_out = self.fc1(x_in).squeeze()
        if apply_sigmoid:
            y_out = torch.sigmoid(y_out)
        
        return y_out

In [169]:
train_df = data.context[data.split == 'train']

In [199]:
v = Vocabulary()

In [200]:
for (i,x) in train_df.items():
    v.add_many(x)

In [204]:
len(v)

7340

In [258]:
def get_input_layer(v):
    i = int(torch.randint(high=100, size=(1,)))
    phrase = df.iloc[i].context
    word = df.iloc[62].word
    X = torch.zeros(len(v)).float()
    for w in phrase.split(" "):
        word_idx = v.lookup_token(w)
        X[word_idx] = 1.0
    Y = v.lookup_token(word)
    return X, Y

In [259]:
X, Y = get_input_layer(v)
x = Variable(X).float()
y_true = Variable(torch.from_numpy(np.array([Y])).long())


#z2 = torch.matmul(W2, z1)
    
#log_softmax = F.log_softmax(z2, dim=0)

#loss = F.nll_loss(log_softmax.view(1,-1), y_true)
#loss_val += loss.data[0]
#loss.backward()
#W1.data -= learning_rate * W1.grad.data
#W2.data -= learning_rate * W2.grad.data

In [260]:
x.shape

torch.Size([7340])

In [261]:
y_true.shape

torch.Size([1])

In [262]:
W1 = Variable(torch.randn(embedding_dims, vocabulary_size).float(), requires_grad=True)

In [263]:
W1.shape

torch.Size([5, 7340])

In [264]:
z1 = torch.matmul(W1, x)

In [265]:
z1

tensor([-2.1203,  2.1124, -0.7214, -2.8684,  0.5607], grad_fn=<MvBackward>)

In [266]:
z1.shape

torch.Size([5])

In [267]:
W2 = Variable(torch.randn(vocabulary_size, embedding_dims).float(), requires_grad=True)

In [268]:
z2 = torch.matmul(W2, z1)

In [269]:
z2

tensor([ 3.1406, -3.9307, -6.8063,  ..., -1.9966, -2.1564, -7.0815],
       grad_fn=<MvBackward>)

In [270]:
z2.dim()

1

In [271]:
log_softmax = F.log_softmax(z2, dim=0)

In [272]:
log_softmax

tensor([-16.5182, -23.5896, -26.4651,  ..., -21.6555, -21.8153, -26.7403],
       grad_fn=<LogSoftmaxBackward>)

In [273]:
log_softmax.view(1, -1).dim()

2

In [274]:
loss = F.nll_loss(log_softmax.view(1, -1), y_true)

In [278]:
loss.item()

25.11407470703125

In [280]:
embedding_dims = 5
batch_size = 32
vocabulary_size = len(v)
W1 = Variable(torch.randn(embedding_dims, vocabulary_size).float(), requires_grad=True)
W2 = Variable(torch.randn(vocabulary_size, embedding_dims).float(), requires_grad=True)
num_epochs = 101
learning_rate = 0.001

for epo in range(num_epochs):
    loss_val = 0
    for i in range(batch_size):
        X, Y = get_input_layer(v)
        x = Variable(X).float()
        y_true = Variable(torch.from_numpy(np.array([Y])).long())

        z1 = torch.matmul(W1, x)
        z2 = torch.matmul(W2, z1)
    
        log_softmax = F.log_softmax(z2, dim=0)

        loss = F.nll_loss(log_softmax.view(1,-1), y_true)
        loss_val += loss.item()
        loss.backward()
        W1.data -= learning_rate * W1.grad.data
        W2.data -= learning_rate * W2.grad.data

        W1.grad.data.zero_()
        W2.grad.data.zero_()
    if epo % 10 == 0:    
        print(f'Loss at epo {epo}: {loss_val/len(v)}')

Loss at epo 0: 0.10345082672805163
Loss at epo 10: 0.040456038209982725
Loss at epo 20: 0.01397100560143793
Loss at epo 30: 0.009662988818471373
Loss at epo 40: 0.002566625122444029
Loss at epo 50: 0.0030050863971088286
Loss at epo 60: 0.0009152318101935236
Loss at epo 70: 0.0034810426831972524
Loss at epo 80: 0.0020849846075726253
Loss at epo 90: 0.0009164769114021723
Loss at epo 100: 0.0008574926538351187


In [287]:
W1[:,1].shape

torch.Size([5])