In [None]:
import json
from transformers import AutoTokenizer
from torch.utils.data import Dataset, DataLoader
import torch

## Load Abstrasts and Facets

In [184]:
def read_facet_results(file_name):
    with open(file_name, "r") as f:
        lines = f.readlines()
    facet_results = []
    for line in lines:
        facet_results.append(json.loads(line))
    return facet_results

facet_results = read_facet_results('annotation/gpt_annotation/cs5_abstract-tag.json')


In [192]:
abstracts = [] 
labels = []
for annotation in facet_results:
    abstracts.append(annotation['sent'])
    labels.append(annotation['facets'] )

## Preprocess data

In [97]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [196]:
class FacetedSummaryDataset(Dataset):
    def __init__(self, abstracts, labels, facet2idx = {"Background": 0, "Method": 1, "Result": 2,  "Value": 3, "Others": 4}):
        self.abstracts = abstracts
        self.labels = labels
        self.label_ids = []
        for sublist in labels:
            self.label_ids.append([facet2idx[facet] for facet in sublist])
        
    def __len__(self):
        return len(self.abstracts)
    
    def __getitem__(self, idx):
       
        model_input = {'input_ids': None, 'attention_mask': None, 'token_type_ids': None, 'labels': None}

        # tokenize a list of sentences
        # add the sep token
        tokenized_sentences = [tokenizer.tokenize(sent) + [tokenizer.sep_token] for sent in self.abstracts[idx]]

        # token type ids
        sent_lens = [len(sent_tokens) for sent_tokens in tokenized_sentences]
        token_type_ids = [[i]*length for i, length in enumerate(sent_lens)]
        token_type_ids = [item for sublist in token_type_ids for item in sublist] # flatten the list
        
        # add the cls token
        tokenized_sentences = [tokenizer.cls_token] + [token for sent in tokenized_sentences for token in sent]
        token_type_ids = [0] + token_type_ids

        # convert the tokens to indices
        # model_input["tokenized_sentences"] = tokenized_sentences
        model_input["input_ids"] = tokenizer.convert_tokens_to_ids(tokenized_sentences)
        model_input['attention_mask'] = [1] * len(model_input['input_ids'])
        model_input['token_type_ids'] = token_type_ids
        
        # convert the facets to indices
        model_input['labels'] = labels[idx]

        return model_input['input_ids'], model_input['attention_mask'], model_input['token_type_ids'], model_input['labels']
    
dataset = FacetedSummaryDataset(abstracts, labels)
print(f'The size of dataset: {len(dataset)}')

The size of dataset: 14


In [158]:
# collate function for padding the input
def collate_fn(batch):
    input_ids, attention_mask, token_type_ids, lst_labels = zip(*batch)
    
    max_len = max([len(ids) for ids in input_ids])
    input_ids = [ids + [0] * (max_len - len(ids)) for ids in input_ids]
    attention_mask = [mask + [0] * (max_len - len(mask)) for mask in attention_mask]
    token_type_ids = [ids + [0] * (max_len - len(ids)) for ids in token_type_ids]
    lst_labels = [item for sublist in lst_labels for item in sublist]
    return input_ids, attention_mask, token_type_ids, lst_labels

dataloader = DataLoader(dataset, batch_size=3, collate_fn=collate_fn)


## Building Local Models

In [168]:

from transformers import DistilBertModel, BertModel
import torch.nn as nn
from torch.nn import CrossEntropyLoss
import torch

# init
num_facets = 5 # four facets and "others"
model_name = "bert-base-uncased"
plm = BertModel.from_pretrained(model_name)
dropout = nn.Dropout(0.1) # 0.1 is the dropout rate in the implementation of DistilBertForQuestionAnswering
classifier = nn.Linear(plm.config.hidden_size, num_facets) 



### Train the local model

In [161]:
for batch in dataloader:
    input_ids = torch.tensor(batch[0], dtype=torch.long)
    attention_mask = torch.tensor(batch[1], dtype=torch.long)
    token_type_ids = torch.tensor(batch[2], dtype=torch.long)
    labels = torch.tensor(batch[3], dtype=torch.long)

    break

    

In [171]:
# forward pass
lm_output = plm(
    input_ids=input_ids,
    attention_mask=attention_mask,
    token_type_ids=token_type_ids
)


IndexError: index out of range in self

In [None]:
hidden_states = lm_output[0]  # (bs, max_len, dim)
hidden_states = dropout(hidden_states)  # (bs, max_len, dim)
logits = classifier(hidden_states)  # (bs, max_len, num_facets)

# only calculate the loss on [SEP] tokens inserted at the end of each sentence
sep_indices = (inputs["input_ids"] == tokenizer.sep_token_id).nonzero()
# mask all the other tokens except [SEP] tokens
sep_mask = torch.zeros_like(inputs["input_ids"]).bool()
sep_mask[sep_indices[:, 0], sep_indices[:, 1]] = True


# gather logits for [SEP] tokens
sep_logits = logits[sep_mask].view(-1, num_facets)  # (nu_sep1+num_sep2+..., num_facets)
labels = None