In [1]:
import os
import time
from tqdm import tqdm
import datetime
import pickle as pkl
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset
import datasets
from transformers import AutoTokenizer, AutoModelForSequenceClassification, \
    AutoModelForQuestionAnswering, Trainer, TrainingArguments, HfArgumentParser

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

  from .autonotebook import tqdm as notebook_tqdm


device(type='cpu')

## model

In [2]:
## ENCODER CLASS
class BOWEncoder(nn.Module):
    def __init__(self, vocab_size, emb_dim, class_in, is_pretrained):
        super(BOWEncoder, self).__init__()
        self.embed = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.linear = nn.Linear(emb_dim,class_in)
    
    def forward(self, data, length):
        """
        
        @param data: matrix of size (batch_size, max_sentence_length). Each row in data represents a 
            review that is represented using n-gram index. Note that they are padded to have same length.
        @param length: an int tensor of size (batch_size), which represents the non-trivial (excludes padding)
            length of each sentences in the data.
        """
        out = self.embed(data)
        out = torch.sum(out, dim=1)
        out /= length.view(length.size()[0],1).expand_as(out).float()
     
        out = self.linear(out.float())
        return out
    
    def create_weights(self, vectors, id2token):
        '''Create weights metrics from vectors and id2token
        Returns:
        weights_matrix: torch.Tensor, dimension of (vocab size x embedding dim)
        '''
        weights_matrix = torch.from_numpy(np.array([vectors[id2token[i]] for i in range(2, len(id2token))]))
        zero = torch.zeros(2, weights_matrix.size()[1], dtype=torch.float64)
        weights_matrix = torch.cat([zero, weights_matrix])
        return weights_matrix

    
    def create_emb_layer(self, weights_matrix, non_trainable=False):
        '''Create embedding layer that's used in a PyTorch model
        Returns:
        emb_layer: nn.Embedding()
        num_embeddings: int
        embedding_dim: int
        '''
        num_embeddings, embedding_dim = weights_matrix.size()
        emb_layer = nn.Embedding(num_embeddings, embedding_dim)
        emb_layer.load_state_dict({'weight': weights_matrix})
        if non_trainable:
            emb_layer.weight.requires_grad = False
        return emb_layer

In [21]:

class NNClassifier(nn.Module):
    def __init__(self, n_in, h_s, n_out):
        super().__init__()
#         self.linear1 = nn.Linear(n_in,h_s)
#         self.linear2 = nn.Linear(h_s,h_s)
#         self.linear3 = nn.Linear(h_s,n_out)
    
    def forward(self, x):
#         x = F.relu(self.linear1(x))
#         x = self.linear2(x)
#         x = F.relu(x)
#         x = self.linear3(x)
        return F.log_softmax(x)
    

class NNModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, n_in, h_s, n_out, combine_mode, is_pretrained):
        super().__init__()
        self.encoder = BOWEncoder(vocab_size, embed_dim, n_in, is_pretrained)
        self.combine_mode = combine_mode
        if combine_mode == 'DIRECT':
            n_in = n_in * 2;
        self.classifier = NNClassifier(n_in,h_s, n_out)
    
    def forward(self, premise, len_premise, ):  #hypothesis, len_hypo
        premise = self.encoder(premise, )  #len_premise
        x = premise
#         hypothesis = self.encoder(hypothesis, len_hypo)
#         if self.combine_mode == 'DIRECT':
#             x = torch.cat((premise, hypothesis),1)
#         elif self.combine_mode == 'MUL':
#             x = torch.mul(premise, hypothesis)
#         elif self.combine_mode == 'SUB':
#             x = torch.sub(premise, hypothesis)
        x = self.classifier(x)
        return x

### BOW

In [323]:
## ENCODER CLASS
class BOWEncoder(nn.Module):
    def __init__(self, vocab_size, emb_dim, class_in):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.linear = nn.Linear(emb_dim,class_in)
        self.logits = None
        self.sm = nn.LogSoftmax(dim=-1)
    
    def forward(self, data, length):
        """
        @param data: matrix of size (batch_size, max_sentence_length). Each row in data represents a 
            review that is represented using n-gram index. Note that they are padded to have same length.
        @param length: an int tensor of size (batch_size), which represents the non-trivial (excludes padding)
            length of each sentences in the data.
        """
        out = self.embed(data)  ## batch x maxlen x emb
        print("emb", out.shape)
        out = out.sum(dim=1)  ## batch x emb
        print("sum", out.shape)
        out /= length.view(length.size()[0],1).expand_as(out).float()
        print("avg", out.shape)
     
        out = self.linear(out.float())
        print("out", out.shape)
        self.logits = out.detach()
    
        out = self.sm(out)
        return out

In [83]:
m = BOWEncoder(tokenizer.vocab_size, 128, 3)

In [86]:
x = torch.tensor(np.random.rand(2,100)).int()
l = torch.tensor([1,2]).int()

x.shape, l.shape

(torch.Size([2, 100]), torch.Size([2]))

In [89]:
o = m.embed(x)
print(o.shape)
o = o.sum(dim=1)
print(o.shape)
o /= l.view(l.size()[0],1).expand_as(o).float()
o.shape

torch.Size([2, 100, 128])
torch.Size([2, 128])


torch.Size([2, 128])

In [64]:
e = m(x, l)

In [67]:
e.detach()

tensor([[0.3208, 0.3112, 0.3680]])

In [68]:
e

tensor([[0.3208, 0.3112, 0.3680]], grad_fn=<SoftmaxBackward0>)

## data

In [3]:
from helpers import prepare_dataset_nli

tokenizer = AutoTokenizer.from_pretrained('google/electra-small-discriminator', use_fast=True)

prepare_train_dataset = prepare_eval_dataset = \
    lambda exs: prepare_dataset_nli(exs, tokenizer, 128)


In [4]:

dataset = datasets.load_dataset('snli')
dataset = dataset.filter(lambda ex: ex['label'] != -1)


train_dataset = dataset['train']
train_dataset_featurized = train_dataset.map(
    prepare_train_dataset,
    batched=True,
    num_proc=2,
    remove_columns=train_dataset.column_names
)

eval_dataset = dataset['validation']
eval_dataset_featurized = eval_dataset.map(
    prepare_eval_dataset,
    batched=True,
    num_proc=2,
    remove_columns=eval_dataset.column_names
)


Found cached dataset parquet (/home/sunaguo/.cache/huggingface/datasets/parquet/plain_text-ac8034d49bf805ac/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)
100%|██████████| 3/3 [00:00<00:00, 107.67it/s]
Loading cached processed dataset at /home/sunaguo/.cache/huggingface/datasets/parquet/plain_text-ac8034d49bf805ac/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7/cache-9a809bd4db7edce2.arrow
Loading cached processed dataset at /home/sunaguo/.cache/huggingface/datasets/parquet/plain_text-ac8034d49bf805ac/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7/cache-70ef6e5e2f5d1d70.arrow
Loading cached processed dataset at /home/sunaguo/.cache/huggingface/datasets/parquet/plain_text-ac8034d49bf805ac/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7/cache-0262eb9dc529825e.arrow
Loading cached processed dataset at /home/sunaguo/.cache/huggingface/datasets/parquet/plain_text-ac8034d49bf805ac/0.0.0/14a00e99c0d

In [5]:
len(train_dataset_featurized), len(eval_dataset_featurized)

(549367, 9842)

In [53]:
len(train_dataset_featurized), len(eval_dataset_featurized)

(550152, 10000)

## prep

In [144]:
## getting pretrained embedding
from transformers import AutoModelForSequenceClassification

modelname = 'google/electra-small-discriminator'
task_kwargs = {'num_labels': 3} 
basemodel = AutoModelForSequenceClassification.from_pretrained(modelname, **task_kwargs)
emb = basemodel.electra.embeddings.word_embeddings.weight.detach()

EMBEDDING_DIM =  emb.shape[1]

emb[tokenizer.all_special_ids] = torch.zeros(EMBEDDING_DIM)

basemodel.eval()

Some weights of the model checkpoint at google/electra-small-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier

In [147]:
vocab_size

30522

In [7]:
vocab_size = emb.shape[0]
emb_dim = 128
nclass = 3

In [304]:
## vocab_size, emb_dim, class_in
model = BOWEncoder(vocab_size, emb_dim, nclass)
for p in model.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)   

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.NLLLoss()
lsm = nn.LogSoftmax(dim=1)

save_path = 'bow.pt'

In [124]:
model.embed.weight

Parameter containing:
tensor([[ 0.0079, -0.0102, -0.0112,  ..., -0.0088, -0.0026, -0.0034],
        [ 0.0023, -0.0035, -0.0013,  ..., -0.0008,  0.0026, -0.0093],
        [ 0.0094, -0.0061, -0.0060,  ...,  0.0082,  0.0028, -0.0112],
        ...,
        [ 0.0012, -0.0039, -0.0129,  ...,  0.0040, -0.0040,  0.0005],
        [ 0.0027,  0.0030,  0.0068,  ..., -0.0020,  0.0061,  0.0076],
        [ 0.0110,  0.0119, -0.0117,  ..., -0.0039,  0.0134,  0.0121]],
       requires_grad=True)

In [119]:
model.linear.weight

Parameter containing:
tensor([[ 0.0780,  0.0105, -0.1745, -0.2116,  0.2044,  0.0911,  0.0927,  0.1407,
          0.2012, -0.2002, -0.1130,  0.1948, -0.1500, -0.1252, -0.0950,  0.0979,
         -0.0300, -0.0279, -0.0903,  0.1281,  0.0245, -0.2033,  0.1830,  0.0037,
          0.1137,  0.1143,  0.1439, -0.1793, -0.0927,  0.1774,  0.0944, -0.1786,
          0.0226, -0.1434,  0.1764,  0.0614, -0.1071, -0.0979, -0.0329, -0.1652,
         -0.0544,  0.0581, -0.0399, -0.0442, -0.0979, -0.1933, -0.1237,  0.1137,
          0.0281, -0.0073, -0.1676,  0.0246, -0.1775, -0.0671,  0.1329,  0.0459,
         -0.1396, -0.0957, -0.1236, -0.0892,  0.0220,  0.1763,  0.2124,  0.0200,
         -0.1425, -0.1426, -0.1543, -0.1153, -0.0663, -0.0267, -0.0763, -0.0009,
          0.0726, -0.0704, -0.1711,  0.0104,  0.1853, -0.1232, -0.1810, -0.0519,
          0.2046,  0.0765,  0.0894,  0.0077, -0.1509, -0.0995, -0.0006, -0.0833,
          0.0570, -0.1613,  0.1782, -0.0965,  0.0696, -0.1400,  0.2038, -0.0241,
      

In [120]:

model.embed.weight.data.copy_(emb)

model.embed.weight.requires_grad = False

In [121]:
model.embed.weight

Parameter containing:
tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0271,  0.0583, -0.0169,  ..., -0.0071,  0.0726,  0.0497],
        [ 0.0272,  0.0583, -0.0171,  ..., -0.0070,  0.0728,  0.0496],
        ...,
        [ 0.0267,  0.0577, -0.0167,  ..., -0.0071,  0.0724,  0.0492],
        [ 0.0275,  0.0583, -0.0170,  ..., -0.0070,  0.0731,  0.0496],
        [ 0.0263,  0.0573, -0.0160,  ..., -0.0063,  0.0715,  0.0492]])

## train

In [32]:
xs.shape

torch.Size([32, 128])

In [150]:
num_epochs = 1
best_accuracy = 0
batch_size = 32

for epoch in range(num_epochs):
    for bii in tqdm(range(len(train_dataset_featurized)//batch_size+1)):
        batchd = train_dataset_featurized[bii*batch_size:(bii+1)*batch_size]
        
        xs = torch.tensor(batchd["input_ids"])
#         print("xs", xs.shape)
        lens = (xs != 0).sum(1)
#         print("lens", lens.shape)
        labels = torch.tensor(batchd['label'])
#         print("labels", labels.shape)
        
        model.train()
        optimizer.zero_grad()
        logprobs = model(xs, lens) 
#         print("logits", labels.shape)
#         logprobs = lsm(logits)
#         print("logprobs", labels.shape)
        loss = criterion(logprobs, labels)
        loss.backward()
        optimizer.step()
        
        if bii % 1000 == 0:  
            train_loss = loss.item()
            
            model.eval()
            correct = 0
            val_loss = 0.

            for evbii in range(len(eval_dataset_featurized)//batch_size+1):
                batchd = eval_dataset_featurized[evbii*batch_size:(evbii+1)*batch_size]

                xs = torch.tensor(batchd["input_ids"])
    #             print("xs", xs.shape)
                lens = (xs != 0).sum(1)
    #             print("lens", lens.shape)
                labels = torch.tensor(batchd['label'])
    #             print("labels", labels.shape)

                logprobs = model(xs, lens) 
    #             print("logits", logits.shape)
#                 logprobs = lsm(logits)
    #             print("logprobs", logprobs.shape)

                loss = criterion(logprobs, labels)
                val_loss += loss.item()

                preds = torch.argmax(logprobs, dim=-1)
    #             print("preds", preds.shape)
                correct += (preds == labels).sum().numpy()

            accuracy = correct / len(eval_dataset_featurized)
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                torch.save(model.state_dict(), save_path)

            print('Batch: {} | Train Loss: {} | Val Loss: {} | Val Accuracy: {}'.format(
                (bii+1), train_loss, val_loss, round(accuracy, 3)
            ))

#     print('Epoch: {} | Train Loss: {} | Val Loss: {} | Val Accuracy: {}'.format(
#         (epoch+1), train_loss, val_loss, round(accuracy, 3)
#     ))

  0%|          | 5/17168 [00:03<2:35:21,  1.84it/s] 

Batch: 1 | Train Loss: 1.0991970300674438 | Val Loss: 338.31070363521576 | Val Accuracy: 0.348


  6%|▌         | 1008/17168 [00:31<46:35,  5.78it/s]  

Batch: 1001 | Train Loss: 1.0844221115112305 | Val Loss: 334.4391733407974 | Val Accuracy: 0.438


 12%|█▏        | 2005/17168 [00:59<59:44,  4.23it/s]

Batch: 2001 | Train Loss: 1.068668246269226 | Val Loss: 329.36107552051544 | Val Accuracy: 0.461


 18%|█▊        | 3009/17168 [01:26<40:34,  5.81it/s]

Batch: 3001 | Train Loss: 1.0415328741073608 | Val Loss: 324.40881329774857 | Val Accuracy: 0.472


 23%|██▎       | 4009/17168 [01:54<37:42,  5.82it/s]

Batch: 4001 | Train Loss: 1.0505361557006836 | Val Loss: 320.2888207435608 | Val Accuracy: 0.483


 29%|██▉       | 5009/17168 [02:22<34:53,  5.81it/s]

Batch: 5001 | Train Loss: 1.0537341833114624 | Val Loss: 316.96708446741104 | Val Accuracy: 0.497


 35%|███▌      | 6009/17168 [02:49<31:58,  5.82it/s]

Batch: 6001 | Train Loss: 0.9865809679031372 | Val Loss: 314.1910751461983 | Val Accuracy: 0.504


 41%|████      | 7009/17168 [03:17<29:17,  5.78it/s]

Batch: 7001 | Train Loss: 1.1000862121582031 | Val Loss: 312.05656093358994 | Val Accuracy: 0.512


 47%|████▋     | 8007/17168 [03:45<26:26,  5.77it/s]

Batch: 8001 | Train Loss: 1.001775860786438 | Val Loss: 310.1687545776367 | Val Accuracy: 0.518


 52%|█████▏    | 9006/17168 [04:13<23:25,  5.81it/s]

Batch: 9001 | Train Loss: 0.9694463610649109 | Val Loss: 308.5297416448593 | Val Accuracy: 0.522


 58%|█████▊    | 10008/17168 [04:41<20:40,  5.77it/s]

Batch: 10001 | Train Loss: 0.9584894180297852 | Val Loss: 307.05324923992157 | Val Accuracy: 0.524


 64%|██████▍   | 11005/17168 [05:09<24:14,  4.24it/s]

Batch: 11001 | Train Loss: 1.0768154859542847 | Val Loss: 305.9373205304146 | Val Accuracy: 0.527


 70%|██████▉   | 12005/17168 [05:38<20:20,  4.23it/s]

Batch: 12001 | Train Loss: 0.9610840678215027 | Val Loss: 304.8757339119911 | Val Accuracy: 0.533


 76%|███████▌  | 13005/17168 [06:06<12:38,  5.49it/s]

Batch: 13001 | Train Loss: 0.9883594512939453 | Val Loss: 303.7991444468498 | Val Accuracy: 0.533


 82%|████████▏ | 14007/17168 [06:34<09:36,  5.49it/s]

Batch: 14001 | Train Loss: 1.1926300525665283 | Val Loss: 302.9394243955612 | Val Accuracy: 0.534


 87%|████████▋ | 15008/17168 [07:02<06:14,  5.77it/s]

Batch: 15001 | Train Loss: 0.9064332842826843 | Val Loss: 302.23910760879517 | Val Accuracy: 0.534


 93%|█████████▎| 16008/17168 [07:30<03:30,  5.50it/s]

Batch: 16001 | Train Loss: 0.8755434155464172 | Val Loss: 301.5549948811531 | Val Accuracy: 0.538


 99%|█████████▉| 17005/17168 [07:58<00:38,  4.23it/s]

Batch: 17001 | Train Loss: 0.9976648092269897 | Val Loss: 300.94450026750565 | Val Accuracy: 0.54


100%|██████████| 17168/17168 [08:02<00:00, 35.56it/s]


In [347]:
train_losses = [1.09919703, 1.084422112, 1.068668246, 1.041532874, 1.050536156, 1.053734183, 0.9865809679, 1.100086212, 1.001775861, 0.9694463611, 0.958489418, 1.076815486, 0.9610840678, 0.9883594513, 1.192630053, 0.9064332843, 0.8755434155]
val_losses = [338.3107036, 334.4391733, 329.3610755, 324.4088133, 320.2888207, 316.9670845, 314.1910751, 312.0565609, 310.1687546, 308.5297416, 307.0532492, 305.9373205, 304.8757339, 303.7991444, 302.9394244, 302.2391076, 301.5549949]
val_accs = [0.348, 0.438, 0.461, 0.472, 0.483, 0.497, 0.504, 0.512, 0.518, 0.522, 0.524, 0.527, 0.533, 0.533, 0.534, 0.534, 0.538]

np.savez("data/fasttext_train_stats", train_losses=train_losses, val_losses=val_losses, val_accs=val_accs)

In [319]:

torch.save(model.state_dict(), "bow_best.pt")

In [19]:
d = np.array(eval_dataset_featurized["label"])
perc = [(d == ii).sum()/len(d) for ii in range(3)]
perc

[0.3382442592968909, 0.3286933550091445, 0.3330623856939646]

In [16]:
model.embed.weight

Parameter containing:
tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0271,  0.0583, -0.0169,  ..., -0.0071,  0.0726,  0.0497],
        [ 0.0272,  0.0583, -0.0171,  ..., -0.0070,  0.0728,  0.0496],
        ...,
        [ 0.0267,  0.0577, -0.0167,  ..., -0.0071,  0.0724,  0.0492],
        [ 0.0275,  0.0583, -0.0170,  ..., -0.0070,  0.0731,  0.0496],
        [ 0.0263,  0.0573, -0.0160,  ..., -0.0063,  0.0715,  0.0492]])

## get electra logit

In [305]:
model = torch.load("bow_best.pt")
model

BOWEncoder(
  (embed): Embedding(30522, 128, padding_idx=0)
  (linear): Linear(in_features=128, out_features=3, bias=True)
  (sm): LogSoftmax(dim=-1)
)

In [152]:
model.eval()
basemodel.eval()

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (embeddings_project): Linear(in_features=128, out_features=256, bias=True)
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=256, out_features=256, bias=True)
              (key): Linear(in_features=256, out_features=256, bias=True)
              (value): Linear(in_features=256, out_features=256, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_

In [241]:
xs.shape, lens.shape, type(xs), type(lens)

(torch.Size([23, 128]), torch.Size([23]), torch.Tensor, torch.Tensor)

In [329]:
x = torch.tensor(np.random.randint(0,30000, (1,128)))
l = torch.tensor([20]).int()
x.shape, l.shape, type(x), type(l)

(torch.Size([1, 128]), torch.Size([1]), torch.Tensor, torch.Tensor)

In [324]:
m = BOWEncoder(30522, 128, 3)
m.load_state_dict(torch.load("bow_best.pt"))
m.eval()

BOWEncoder(
  (embed): Embedding(30522, 128, padding_idx=0)
  (linear): Linear(in_features=128, out_features=3, bias=True)
  (sm): LogSoftmax(dim=-1)
)

In [330]:
l

tensor([20], dtype=torch.int32)

In [331]:
l.view(l.size()[0],1)

tensor([[20]], dtype=torch.int32)

In [None]:
length.view(length.size()[0],1).expand_as(out).float()

In [None]:
aa

In [332]:
m(x,l)

emb torch.Size([1, 128, 128])
sum torch.Size([1, 128])
avg torch.Size([1, 128])
out torch.Size([1, 3])


tensor([[-2.7361, -0.3920, -1.3492]], grad_fn=<LogSoftmaxBackward0>)

In [317]:
_ = model(xs, lens)
fl = model.logits
fl.shape

torch.Size([23, 3])

In [260]:
bl = basemodel(xs).logits
bl.shape

torch.Size([23, 3])

In [167]:
loss2 = nn.NLLLoss(reduce=False, reduction=None)

In [279]:

logprobs = model(xs, lens)
loss2(logprobs, labels)

tensor([1.7051, 0.3321, 1.0375, 1.2840, 1.3471, 0.8026, 0.8340, 0.6849, 1.1850,
        1.5402, 0.5908, 0.7205, 0.9859, 0.9000, 1.3907, 0.1224, 1.4801, 0.4347,
        1.8268, 0.4064, 0.7410, 1.1999, 0.6720], grad_fn=<NllLossBackward0>)

In [261]:
y_onehot = torch.zeros(23, 3)
# scatter will write the value of 1 into the position of y_onehot given by y
y_onehot.scatter_(1, labels.unsqueeze(1), 1)

# y_onehot = torch.tensor([0,1,0])

tensor([[0., 0., 1.],
        [0., 1., 0.],
        [1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.],
        [0., 0., 1.],
        [0., 1., 0.],
        [1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.],
        [1., 0., 0.],
        [1., 0., 0.],
        [0., 0., 1.],
        [0., 1., 0.],
        [0., 0., 1.],
        [1., 0., 0.],
        [0., 1., 0.],
        [1., 0., 0.],
        [0., 0., 1.],
        [0., 1., 0.],
        [0., 1., 0.],
        [0., 0., 1.],
        [1., 0., 0.]])

In [291]:
bl

tensor([[ 0.0477,  0.0331, -0.1686],
        [ 0.0454,  0.0332, -0.1667],
        [ 0.0319,  0.0352, -0.1552],
        [ 0.0310,  0.0385, -0.1478],
        [ 0.0309,  0.0374, -0.1514],
        [ 0.0362,  0.0275, -0.1516],
        [ 0.0364,  0.0275, -0.1554],
        [ 0.0350,  0.0363, -0.1464],
        [ 0.0339,  0.0423, -0.1424],
        [ 0.0338,  0.0428, -0.1424],
        [ 0.0392,  0.0442, -0.1516],
        [ 0.0288,  0.0444, -0.1412],
        [ 0.0308,  0.0429, -0.1403],
        [ 0.0309,  0.0467, -0.1418],
        [ 0.0291,  0.0372, -0.1387],
        [ 0.0307,  0.0376, -0.1426],
        [ 0.0306,  0.0367, -0.1454],
        [ 0.0300,  0.0350, -0.1465],
        [ 0.0397,  0.0401, -0.1579],
        [ 0.0351,  0.0330, -0.1586],
        [ 0.0381,  0.0307, -0.1674],
        [ 0.0423,  0.0359, -0.1709],
        [ 0.0375,  0.0262, -0.1623]], grad_fn=<AddmmBackward0>)

In [306]:
l = lsm(bl).mul(y_onehot).sum(-1)
l

tensor([-1.2427, -1.0407, -1.0412, -1.0376, -1.2261, -1.2246, -1.0444, -1.0422,
        -1.0378, -1.2227, -1.0407, -1.0506, -1.2202, -1.0341, -1.2164, -1.0465,
        -1.0394, -1.0450, -1.2347, -1.0395, -1.0395, -1.2434, -1.0324],
       grad_fn=<SumBackward1>)

In [310]:

r = torch.log(torch.softmax(fl,-1).mul(torch.softmax(bl,-1)).sum(-1))
r

tensor([-1.0726, -1.0612, -1.0753, -1.0754, -1.0855, -1.1199, -1.1059, -1.0947,
        -1.0790, -1.0793, -1.0770, -1.0778, -1.1069, -1.0835, -1.0863, -1.0516,
        -1.0658, -1.0653, -1.0661, -1.0631, -1.0797, -1.0929, -1.0824],
       grad_fn=<LogBackward0>)

In [280]:
(l.sub(r))

tensor([-0.7114, -0.6468, -0.6470, -0.6457, -0.7066, -0.7061, -0.6481, -0.6473,
        -0.6458, -0.7056, -0.6468, -0.6503, -0.7048, -0.6444, -0.7037, -0.6488,
        -0.6463, -0.6483, -0.7091, -0.6464, -0.6464, -0.7116, -0.6438],
       grad_fn=<SubBackward0>)

In [334]:
(l.sub(r)).sum()

tensor(484.8472, grad_fn=<SumBackward0>)

In [283]:

logprobs = lsm(basemodel(xs).logits)
loss2(logprobs, labels)

tensor([-0.2886, -0.3532, -0.3530, -0.3543, -0.2934, -0.2939, -0.3519, -0.3527,
        -0.3542, -0.2944, -0.3532, -0.3497, -0.2952, -0.3556, -0.2963, -0.3512,
        -0.3537, -0.3517, -0.2909, -0.3536, -0.3536, -0.2884, -0.3562],
       grad_fn=<NllLossBackward0>)

## eval base electra wrong sets

In [336]:
model.eval()

BOWEncoder(
  (embed): Embedding(30522, 128, padding_idx=0)
  (linear): Linear(in_features=128, out_features=3, bias=True)
  (sm): LogSoftmax(dim=-1)
)

In [343]:
allpreds = []
for evbii in range(len(eval_dataset_featurized)//batch_size+1):
    batchd = eval_dataset_featurized[evbii*batch_size:(evbii+1)*batch_size]

    xs = torch.tensor(batchd["input_ids"])
#             print("xs", xs.shape)
    lens = (xs != 0).sum(1)
#             print("lens", lens.shape)
#     labels = torch.tensor(batchd['label'])
#             print("labels", labels.shape)

    logprobs = model(xs, lens) 
#             print("logits", logits.shape)
#                 logprobs = lsm(logits)
#             print("logprobs", logprobs.shape)

    preds = torch.argmax(logprobs, dim=-1)
    
    allpreds += preds.tolist()
    
allpreds = np.array(allpreds)
allpreds.shape

(9842,)

In [345]:
labels = eval_dataset_featurized["label"]

In [346]:
np.save("data/fasttext_eval_preds", allpreds)