In [9]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
import sys
# sys.path.append('../../transformers/src')

import math
import pandas as pd
import numpy as np

import torch
import torch.nn as nn

from transformers import AdamW, pipeline, PegasusForConditionalGeneration, PegasusTokenizer
from transformers import BartConfig
from transformers import AutoConfig
from transformers.models.bart.modeling_bart import EncoderLayer, SinusoidalPositionalEmbedding, LayerNorm

In [2]:
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("torch_device:",torch_device)

torch_device: cuda


In [3]:
model_name = 'google/pegasus-xsum'
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)
# batch = tokenizer.prepare_seq2seq_batch(src_text, truncation=True, padding='longest').to(torch_device)
if torch_device == 'cuda':
    model = torch.nn.DataParallel(model).to(torch_device)
else:
    model = model.to(torch_device)


In [22]:
for i in range(len(tokenizer)):
    token = tokenizer.convert_ids_to_tokens(i)
    if token.startswith(('<')):
        print(token)

<pad>
</s>
<unk>
<n>


In [17]:
print(tokenizer.convert_tokens_to_ids('<sep>'))
print(tokenizer.convert_tokens_to_ids('<n>'))

105
106


In [28]:
tokenizer.additional_special_tokens

['[SAYS]', '[EOU]', '[EOT]']

In [31]:
special_tokens = tokenizer.additional_special_tokens
[tokenizer.convert_tokens_to_ids(token) for token in special_tokens]

[96103, 96104, 96105]

In [30]:
math.sqrt(len(tokenizer))

310.0096772683072

In [8]:
sample_text = [
    "Yuji [SAYS] Hi, John! [EOU] How are you? [EOU] [EOT] John [says] I'm good. Thanks. [EOU]",
    "Naraki [SAYS] Good evening, Mr.Kim. [EOU] How was your today? [EOU] [EOT] Kim [says] It is a pleasant day. [EOU]"
]
batch = tokenizer.prepare_seq2seq_batch(sample_text, truncation=True, padding='longest')
print('  '.join([tokenizer.convert_ids_to_tokens(i) for i in batch['input_ids'][0]]))

special_tokens_dict = {'additional_special_tokens': ['[SAYS]','[EOU]','[EOT]']}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
model.module.resize_token_embeddings(len(tokenizer))

batch = tokenizer.prepare_seq2seq_batch(sample_text, truncation=True, padding='longest')
print('  '.join([tokenizer.convert_ids_to_tokens(i) for i in batch['input_ids'][0]]))

▁Yu  ji  ▁[  SA  YS  ]  ▁Hi  ,  ▁John  !  ▁[  E  OU  ]  ▁How  ▁are  ▁you  ?  ▁[  E  OU  ]  ▁[  E  OT  ]  ▁John  ▁[  say  s  ]  ▁I  '  m  ▁good  .  ▁Thanks  .  ▁[  E  OU  ]  </s>  <pad>  <pad>  <pad>
▁Yu  ji  [SAYS]  ▁Hi  ,  ▁John  !  [EOU]  ▁How  ▁are  ▁you  ?  [EOU]  [EOT]  ▁John  ▁[  say  s  ]  ▁I  '  m  ▁good  .  ▁Thanks  .  [EOU]  </s>  <pad>  <pad>  <pad>


In [84]:
tokenizer.convert_tokens_to_ids('[EOT]')

96105

In [85]:
class SpeakerConverter():
    def __init__(self, speaker_num, eot_idx):
        self.speaker_num = speaker_num
        self.eot_idx=eot_idx
        self.current_speaker_id=1
    
    def init_speaker_id(self):
        self.current_speaker_id=1
    
    def change_speaker_id(self):
        if self.current_speaker_id==1:
            self.current_speaker_id = 2
        elif self.current_speaker_id==2:
            self.current_speaker_id = 1
    
    def convert_id_to_speaker_id(self, w_id):
        if w_id==0:
            return 0
        elif w_id==self.eot_idx:
            self.change_speaker_id()
        return self.current_speaker_id

    def convert_batch(self, input_ids):
        batch_speaker_ids = []
        for text_ids in input_ids:
            speaker_ids = []
            sc.init_speaker_id()
            for w_id in text_ids:
                speaker_ids.append(sc.convert_id_to_speaker_id(w_id.item()))
            batch_speaker_ids.append(speaker_ids)
        return torch.tensor(batch_speaker_ids)

# Operation Check
sc = SpeakerConverter(speaker_num = 2, eot_idx = tokenizer.convert_tokens_to_ids('[EOT]'))
batch_speaker_ids = sc.convert_batch(batch['input_ids'])
embed_speaker = nn.Embedding(3, 10, padding_idx=0)
embed_spk = embed_speaker(batch_speaker_ids)

In [86]:
class BartEncoderWithSpeakerEmbedding(nn.Module):
    """
    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
    :class:`EncoderLayer`.
    Args:
        config: BartConfig
    """

    def __init__(self, config: BartConfig, embed_tokens):
        super().__init__()

        self.dropout = config.dropout
        self.layerdrop = config.encoder_layerdrop

        embed_dim = embed_tokens.embedding_dim
        self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
        self.padding_idx = embed_tokens.padding_idx
        self.max_source_positions = config.max_position_embeddings

        self.embed_tokens = embed_tokens

        # speaker embedding setup
        self.eot_idx = embed_tokens.num_embeddings - 1
        speaker_num = 2 #TODO
        self.speaker_converter = SpeakerConverter(speaker_num = speaker_num, eot_idx = self.eot_idx)
        self.speaker_embed_scale = 0.1
        self.embed_speaker = nn.Embedding(speaker_num+1, embed_dim, padding_idx=0)
        
        if config.static_position_embeddings:
            self.embed_positions = SinusoidalPositionalEmbedding(
                config.max_position_embeddings, embed_dim, self.padding_idx
            )
        else:
            self.embed_positions = LearnedPositionalEmbedding(
                config.max_position_embeddings,
                embed_dim,
                self.padding_idx,
                config.extra_pos_embeddings,
            )
        self.layers = nn.ModuleList([EncoderLayer(config) for _ in range(config.encoder_layers)])
        self.layernorm_embedding = LayerNorm(embed_dim) if config.normalize_embedding else nn.Identity()
        # mbart has one extra layer_norm
        self.layer_norm = LayerNorm(config.d_model) if config.add_final_layer_norm else None

    def forward(
        self, input_ids, attention_mask=None, output_attentions=False, output_hidden_states=False, return_dict=True
    ):
        """
        Args:
            input_ids (LongTensor): tokens in the source language of shape
                `(batch, src_len)`
            attention_mask (torch.LongTensor): indicating which indices are padding tokens
        Returns:
            BaseModelOutput or Tuple comprised of:
                - **x** (Tensor): the last encoder layer's output of shape `(src_len, batch, embed_dim)`
                - **encoder_states** (tuple(torch.FloatTensor)): all intermediate hidden states of shape `(src_len,
                  batch, embed_dim)`. Only populated if *output_hidden_states:* is True.
                - **all_attentions** (tuple(torch.FloatTensor)): Attention weights for each layer.
                During training might not be of length n_layers because of layer dropout.
        """
        # check attention mask and invert
        if attention_mask is not None:
            attention_mask = invert_mask(attention_mask)

        inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
        embed_pos = self.embed_positions(input_ids)
        
        batch_speaker_ids = self.speaker_converter.convert_batch(input_ids)
        embed_spk = self.embed_speaker(batch_speaker_ids) * self.speaker_embed_scale
        
        # x = inputs_embeds + embed_pos
        x = inputs_embeds + embed_pos + embed_spk
        x = self.layernorm_embedding(x)
        x = F.dropout(x, p=self.dropout, training=self.training)

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)

        encoder_states = [] if output_hidden_states else None
        all_attentions = () if output_attentions else None
        for encoder_layer in self.layers:
            if output_hidden_states:
                encoder_states.append(x)
            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
            dropout_probability = random.uniform(0, 1)
            if self.training and (dropout_probability < self.layerdrop):  # skip the layer
                attn = None
            else:
                x, attn = encoder_layer(x, attention_mask, output_attentions=output_attentions)

            if output_attentions:
                all_attentions = all_attentions + (attn,)

        if self.layer_norm:
            x = self.layer_norm(x)
        if output_hidden_states:
            encoder_states.append(x)
            # T x B x C -> B x T x C
            encoder_states = tuple(hidden_state.transpose(0, 1) for hidden_state in encoder_states)

        # T x B x C -> B x T x C
        x = x.transpose(0, 1)

        if not return_dict:
            return tuple(v for v in [x, encoder_states, all_attentions] if v is not None)
        return BaseModelOutput(last_hidden_state=x, hidden_states=encoder_states, attentions=all_attentions)


In [124]:
model_name = 'google/pegasus-xsum'
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)
# batch = tokenizer.prepare_seq2seq_batch(src_text, truncation=True, padding='longest').to(torch_device)
# if torch_device == 'cuda':
#     model = torch.nn.DataParallel(model).to(torch_device)
# else:
#     model = model.to(torch_device)

special_tokens_dict = {'additional_special_tokens': ['[SAYS]','[EOU]','[EOT]']}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
model.resize_token_embeddings(len(tokenizer))

Embedding(96106, 1024)

In [126]:
original_encoder = model.model.encoder

In [127]:
config = AutoConfig.from_pretrained("google/pegasus-xsum")
model.model.encoder = BartEncoderWithSpeakerEmbedding(config, model.model.shared)

In [128]:
model.model.encoder.layers[0].fc1.weight

Parameter containing:
tensor([[ 0.0048,  0.0194,  0.0144,  ..., -0.0228, -0.0309, -0.0052],
        [-0.0075,  0.0200, -0.0099,  ...,  0.0241, -0.0309, -0.0011],
        [-0.0077, -0.0009, -0.0259,  ..., -0.0113,  0.0046,  0.0265],
        ...,
        [ 0.0131,  0.0139,  0.0084,  ...,  0.0091, -0.0060,  0.0289],
        [-0.0276, -0.0245, -0.0044,  ..., -0.0249,  0.0062,  0.0243],
        [ 0.0029,  0.0141,  0.0160,  ..., -0.0208, -0.0162, -0.0165]],
       requires_grad=True)

In [129]:
original_encoder.layers[0].fc1.weight

Parameter containing:
tensor([[ 0.1794, -0.0928,  0.1256,  ..., -0.3336,  0.6347, -0.1645],
        [-0.1243,  0.0026, -0.0529,  ..., -0.1189, -0.2767,  0.2300],
        [-0.0132, -0.0356,  0.0921,  ...,  0.0906, -0.3452,  0.1085],
        ...,
        [-0.1842, -0.0378,  0.0911,  ..., -0.4140, -0.3656, -0.1045],
        [ 0.1888, -0.0156,  0.0107,  ..., -0.3721,  0.0658,  0.1703],
        [-0.4935, -0.0103,  0.1714,  ...,  0.0078, -0.2624,  0.0964]],
       requires_grad=True)

In [130]:
model.model.encoder.layers[0].self_attn.k_proj.bias

Parameter containing:
tensor([-0.0190, -0.0234, -0.0032,  ..., -0.0156, -0.0280, -0.0148],
       requires_grad=True)

In [131]:
original_encoder.layers[0].self_attn.k_proj.bias

Parameter containing:
tensor([0., 0., 0.,  ..., 0., 0., 0.], requires_grad=True)

In [132]:
param2 = model.model.encoder.state_dict()
for name, param in original_encoder.named_parameters():
    param2[name] = original_encoder.state_dict()[name]
model.model.encoder.load_state_dict(param2)

<All keys matched successfully>

In [133]:
if torch_device == 'cuda':
    model = torch.nn.DataParallel(model).to(torch_device)
else:
    model = model.to(torch_device)


In [134]:
model.module.model.encoder.layers[0].self_attn.k_proj.bias

Parameter containing:
tensor([0., 0., 0.,  ..., 0., 0., 0.], device='cuda:0', requires_grad=True)

In [98]:
model.module.model.encoder.layers[0].fc1.weight

Parameter containing:
tensor([[ 2.8663e-02, -1.9442e-02,  7.4206e-03,  ..., -1.5709e-02,
         -1.6828e-02, -2.5521e-02],
        [ 1.0093e-02,  2.6218e-02,  2.4884e-02,  ..., -1.2983e-02,
         -1.2506e-02, -1.2657e-02],
        [-2.9881e-02, -3.0453e-02, -6.0546e-03,  ..., -4.6832e-03,
         -1.9767e-02, -9.9810e-03],
        ...,
        [ 1.7503e-02, -2.7974e-02, -2.9739e-02,  ...,  1.1636e-02,
          2.1632e-02,  6.9369e-03],
        [-4.0149e-03,  2.1383e-02,  2.8010e-02,  ..., -2.1716e-03,
          1.7844e-02,  2.3698e-03],
        [-5.2091e-05, -2.3913e-02, -2.1844e-02,  ...,  2.4226e-02,
          7.8678e-03, -7.3340e-03]], requires_grad=True)

In [76]:
model.module.model.encoder

BartEncoderWithSpeakerEmbedding(
  (embed_tokens): Embedding(96103, 1024, padding_idx=0)
  (embed_speaker): Embedding(3, 1024, padding_idx=0)
  (embed_positions): SinusoidalPositionalEmbedding(512, 1024)
  (layers): ModuleList(
    (0): EncoderLayer(
      (self_attn): Attention(
        (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
        (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
        (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
        (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
      )
      (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (fc1): Linear(in_features=1024, out_features=4096, bias=True)
      (fc2): Linear(in_features=4096, out_features=1024, bias=True)
      (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    )
    (1): EncoderLayer(
      (self_attn): Attention(
        (k_proj): Linear(in_features=1024, out_features=

In [141]:
import pickle
with open('../../Expt_DialogSum/summarizer/args.dat', 'rb') as fp:
    args = pickle.load(fp)


In [142]:
sys.path.append('../../Expt_DialogSum/summarizer')
from finetune import SummarizationModule
model: SummarizationModule = SummarizationModule(args)

ContextualVersionConflict: (tensorboard 1.15.0 (/home/naraki/.local/lib/python3.6/site-packages), Requirement.parse('tensorboard>=2.2.0'), {'pytorch-lightning'})

In [77]:
src_text = [
    """ PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow."""
]
batch = tokenizer.prepare_seq2seq_batch(src_text, truncation=True, padding='longest')

In [6]:
batch = batch.to(torch_device)
translated = model.module.generate(**batch)
tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
assert tgt_text[0] == "California's largest electricity provider has turned off power to hundreds of thousands of customers."

In [7]:
corpus_dir = "/home/naraki/dialogsum/corpus"
df_train = pd.read_table(os.path.join(corpus_dir,"train.tsv"), index_col=0)
df_train.head()

Unnamed: 0,id,summary,dialogue,speaker_num,dialogue_len
0,13818513,Amanda baked cookies and will bring Jerry some...,Amanda: I baked cookies. Do you want some?\r\...,2,94
1,13728867,Olivia and Olivier are voting for liberals in ...,Olivia: Who are you voting for in this electio...,2,111
2,13681000,Kim may try the pomodoro technique recommended...,"Tim: Hi, what's up?\r\nKim: Bad mood tbh, I wa...",2,528
3,13730747,Edward thinks he is in love with Bella. Rachel...,"Edward: Rachel, I think I'm in ove with Bella....",2,155
4,13728094,"Sam is confused, because he overheard Rick com...",Sam: hey overheard rick say something\r\nSam:...,2,909


In [8]:
dialogues = list(df_train['dialogue'][:4].values)
summaries = list(df_train['summary'][:4].values)

In [9]:
batch = tokenizer.prepare_seq2seq_batch(dialogues, truncation=True, padding='longest').to(torch_device)
# batch = tokenizer.prepare_seq2seq_batch(dialogues, truncation=True, max_length=256).to(torch_device)
translated = model.module.generate(**batch)
tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
tgt_text

['Jerry: Hello, Amanda.',
 "Olivia: Hi, I'm Olivia from Newsround and I'm here to answer your questions.",
 "Kim: Hi Tim, what's up?",
 "Rachel: I'm outside."]

In [10]:
model.train()
optimizer = AdamW(model.parameters(), lr=1e-5)
# no_decay = ['bias', 'LayerNorm.weight']
# optimizer_grouped_parameters = [
#     {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
#     {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
# ]
# optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5)
encoding = tokenizer(dialogues, return_tensors='pt', padding=True, truncation=True)
# input_ids = encoding['input_ids'].to(torch_device)
# attention_mask = encoding['attention_mask'].to(torch_device)
input_ids = encoding['input_ids']
attention_mask = encoding['attention_mask']

In [11]:
outputs = model(input_ids, attention_mask=attention_mask)


In [12]:
outputs[0].shape

torch.Size([4, 131, 96103])

In [15]:
outputs.logits

AttributeError: 'tuple' object has no attribute 'logits'

In [13]:
loss = outputs.loss
loss.backward()
optimizer.step()

AttributeError: 'tuple' object has no attribute 'loss'

In [14]:
outputs

(tensor([[[ 0.0000e+00,  1.2714e+01,  6.7395e-01,  ..., -3.6678e+00,
            2.4565e-01, -3.3426e+00],
          [ 0.0000e+00,  1.3197e+01,  7.1254e-01,  ..., -4.1114e+00,
           -1.5395e+00, -1.0908e+00],
          [ 0.0000e+00,  1.3785e+01,  6.9809e-01,  ..., -6.1680e+00,
           -1.4458e+00, -1.1608e+00],
          ...,
          [ 0.0000e+00,  1.3297e+01,  8.8518e-02,  ..., -4.8198e+00,
           -3.4773e-01, -9.2595e-01],
          [ 0.0000e+00,  1.2996e+01,  3.6114e-01,  ..., -4.4285e+00,
            1.8546e+00, -2.1107e+00],
          [ 0.0000e+00,  1.3119e+01,  7.2225e-01,  ..., -3.4602e+00,
            1.8937e+00, -6.2107e+00]],
 
         [[ 0.0000e+00,  1.3079e+01,  1.3642e+00,  ..., -3.3401e+00,
           -2.5243e+00, -4.6213e+00],
          [ 0.0000e+00,  1.0686e+01,  3.9215e-01,  ..., -2.0992e+00,
           -1.0811e+00, -5.1900e+00],
          [ 0.0000e+00,  1.0144e+01,  1.0483e+00,  ..., -1.7541e+00,
           -1.7547e+00, -3.3855e+00],
          ...,
    