In [16]:

# code from https://colab.research.google.com/drive/1wEPiGXToKfuNFGEjjoJsqlssuSbV8EU8#scrollTo=tCEDXLxq628O
# Sentiment classification with a BERT-based model
import transformers, tokenizers
# !conda install -c conda-forge pytorch-lightning
# !git clone https://github.com/davidtvs/pytorch-lr-finder.git && cd pytorch-lr-finder && python setup.py install

In [18]:
import logging
import os
from argparse import Namespace
from functools import lru_cache
from typing import List

import pandas as pd
import pytorch_lightning as pl
import torch
import torch.nn.functional as F
from sklearn.metrics import classification_report
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import (
    AdamW,
    AutoModelForMaskedLM,
    AutoModelWithLMHead,
    AutoTokenizer,
    DistilBertForSequenceClassification,
    DistilBertTokenizer,
    get_linear_schedule_with_warmup,
)


In [19]:
# Load the pretrained DistilRoBERTa tokenizer and checkpoint.
# AutoModelForMaskedLM is the supported replacement for the deprecated
# AutoModelWithLMHead; for this checkpoint both resolve to the same
# masked-LM architecture, so `base_model` below is unchanged.
tokenizer = AutoTokenizer.from_pretrained('distilroberta-base')
model = AutoModelForMaskedLM.from_pretrained("distilroberta-base")
# Keep a handle on the underlying backbone; the cells below call it directly
# to obtain per-token hidden states.
base_model = model.base_model

Downloading:   0%|          | 0.00/480 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



Downloading:   0%|          | 0.00/331M [00:00<?, ?B/s]

In [20]:
# Tokenize one example sentence and inspect which fields the tokenizer returns.
text = "Elvis is the king of rock!"
# Calling the tokenizer directly is the documented replacement for the
# deprecated `encode_plus`; for a single string it returns the same encoding.
enc = tokenizer(text)
enc.keys()


dict_keys(['input_ids', 'attention_mask'])

In [32]:
# Run the encoder on the single tokenized sentence and inspect the output.
# The ids are kept in `input_ids` rather than `input` so the builtin `input()`
# is not shadowed for the rest of the notebook session.
print(enc["input_ids"])  # raw token ids as a plain Python list

# unsqueeze(0) adds a leading dimension of size 1 (a batch of one sentence).
input_ids = torch.tensor(enc["input_ids"]).unsqueeze(0)
print(input_ids)
mask = torch.tensor(enc['attention_mask']).unsqueeze(0)
print(mask)

# Pass the mask by keyword, matching how EmoModel.forward calls the encoder.
out = base_model(input_ids, attention_mask=mask)
print(out)
print(out[0].shape)

[0, 9682, 9578, 16, 5, 8453, 9, 3152, 328, 2]
tensor([[   0, 9682, 9578,   16,    5, 8453,    9, 3152,  328,    2]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.0384,  0.0482, -0.0128,  ...,  0.0148, -0.0885,  0.0388],
         [ 0.0312,  0.2362, -0.0594,  ..., -0.0862, -0.0779,  0.0986],
         [-0.0213,  0.2983, -0.0144,  ...,  0.1338, -0.0386,  0.1351],
         ...,
         [-0.1052,  0.0377, -0.0083,  ...,  0.1869, -0.0304,  0.0325],
         [-0.0163, -0.0976,  0.0186,  ..., -0.2109, -0.1637,  0.1759],
         [-0.0542,  0.0595, -0.0695,  ..., -0.0484, -0.1007,  0.0362]]],
       grad_fn=<NativeLayerNormBackward>), pooler_output=None, hidden_states=None, past_key_values=None, attentions=None, cross_attentions=None)
torch.Size([1, 10, 768])


In [45]:
# Encode a second sentence and look at the per-token vectors from the encoder.
t = "Elvis is the king of rock"
# Calling the tokenizer directly replaces the deprecated `encode_plus`.
enc = tokenizer(t)
# [0] takes the first element of the model output, the next [0] takes the
# only sentence in the batch, leaving one row per token.
token_representations = base_model(torch.tensor(enc['input_ids']).unsqueeze(0))[0][0]
# Fixed: this line previously printed the misspelled `token_reprensentations`,
# which raises NameError when the notebook is run on a fresh kernel.
print(token_representations)
print(tokenizer.decode(enc['input_ids']))
print(f"Length: {len(enc['input_ids'])}")
print(token_representations.shape)

tensor([[-0.0337,  0.0782, -0.0158,  ...,  0.0062, -0.0636,  0.0063],
        [ 0.0288,  0.2495, -0.0943,  ..., -0.1287, -0.0692,  0.1227],
        [-0.0370,  0.3241, -0.0406,  ...,  0.1083, -0.0169,  0.1451],
        ...,
        [-0.0277, -0.0062,  0.1047,  ...,  0.1294,  0.0563,  0.0373],
        [-0.1182,  0.0516, -0.0148,  ...,  0.1692, -0.0189,  0.0419],
        [-0.0517,  0.1022, -0.0675,  ..., -0.0594, -0.0697, -0.0191]],
       grad_fn=<SelectBackward>)
<s>Elvis is the king of rock</s>
Length: 9
torch.Size([9, 768])


In [48]:
def mish(input):
    """Mish activation: returns ``input * tanh(softplus(input))``."""
    gate = torch.tanh(F.softplus(input))
    return input * gate
  
class Mish(nn.Module):
    """``nn.Module`` form of the ``mish`` function, usable as a layer."""

    def forward(self, input):
        """Apply ``mish`` to ``input`` and return the result."""
        activated = mish(input)
        return activated

In [49]:
class EmoModel(nn.Module):
    """Classification head stacked on top of a pretrained encoder.

    Args:
        base_model: Encoder module. It is called as
            ``base_model(X, attention_mask=attention_mask)`` and the first
            element of its output is indexed as ``[:, 0, :]``.
        n_classes: Output size of the final linear layer.
        base_model_output_size: Width of the vector fed into the head
            (defaults to 768).
        dropout: Probability used by both dropout layers in the head.
    """

    def __init__(self, base_model, n_classes, base_model_output_size=768, dropout=0.05):
        super().__init__()
        self.base_model = base_model

        width = base_model_output_size
        # Head: dropout -> linear -> Mish -> dropout -> linear.
        self.classifier = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(width, width),
            Mish(),
            nn.Dropout(dropout),
            nn.Linear(width, n_classes),
        )

        # Re-initialise only the linear layers of the head:
        # weights ~ N(0, 0.02), biases set to zero.
        linear_layers = (m for m in self.classifier if isinstance(m, nn.Linear))
        for linear in linear_layers:
            nn.init.normal_(linear.weight, mean=0.0, std=0.02)
            if linear.bias is not None:
                nn.init.zeros_(linear.bias)

    def forward(self, input_, *args):
        """Classify a batch from the first-position representation.

        Args:
            input_: A pair ``(X, attention_mask)`` that is unpacked and passed
                on to the encoder.
            *args: Accepted but not used.

        Returns:
            The output of ``self.classifier`` applied to the position-0 vector
            of every sequence.
        """
        token_ids, attention_mask = input_
        encoder_out = self.base_model(token_ids, attention_mask=attention_mask)

        # Only the vector at position 0 is used (the <s> token, which plays the
        # role of BERT's [CLS] here). Other options would be pooling over more
        # tokens or running an RNN over the sequence.
        sequence_output = encoder_out[0]
        first_token = sequence_output[:, 0, :]
        return self.classifier(first_token)


In [51]:
# Smoke test: build a 3-class EmoModel on a fresh DistilRoBERTa backbone and
# run it on the most recently tokenized sentence (`enc` from the cell above).
# AutoModelForMaskedLM replaces the deprecated AutoModelWithLMHead; the
# `.base_model` taken from it is the same backbone as before.
classifier = EmoModel(AutoModelForMaskedLM.from_pretrained("distilroberta-base").base_model, 3)
X = torch.tensor(enc["input_ids"]).unsqueeze(0).to('cpu')
attn = torch.tensor(enc["attention_mask"]).unsqueeze(0).to('cpu')
print(classifier((X, attn)))

tensor([[-0.0147, -0.0957, -0.0712]], grad_fn=<AddmmBackward>)
