In [None]:
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
import re
from collections import defaultdict
import torch
from torch import nn
from torch.nn import CrossEntropyLoss
from torch.utils.data import Dataset, DataLoader
from transformers import BertConfig, BertTokenizer, BertPreTrainedModel, BertModel, AutoTokenizer, AutoModelForTokenClassification
from datasets import load_dataset
from torch.nn import Embedding
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from datasets import load_dataset
from tabulate import tabulate
from keras.utils import pad_sequences
from sklearn.metrics.pairwise import cosine_similarity
import ast
import math

2024-12-12 12:38:47.513459: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-12 12:38:47.513523: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-12 12:38:47.513547: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-12 12:38:47.519847: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Pos Tagging

In [3]:
class BertForWordClassification(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        subword_to_word_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
    ):
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )

        sequence_output = outputs[0]

        # average the token-level outputs to compute word-level representations
        max_seq_len = subword_to_word_ids.max() + 1
        word_latents = []
        for i in range(max_seq_len):
            mask = (subword_to_word_ids == i).unsqueeze(dim=-1)
            word_latents.append((sequence_output * mask).sum(dim=1) / mask.sum())
        word_batch = torch.stack(word_latents, dim=1)

        sequence_output = self.dropout(word_batch)
        logits = self.classifier(sequence_output)

        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            outputs = (loss,) + outputs

        return outputs  # (loss), scores, (hidden_states), (attentions)


In [4]:
class PosTagProsaDataset(Dataset):
    # Static constant variable
    LABEL2INDEX = {'B-PPO': 0, 'B-KUA': 1, 'B-ADV': 2, 'B-PRN': 3, 'B-VBI': 4, 'B-PAR': 5, 'B-VBP': 6, 'B-NNP': 7, 'B-UNS': 8, 'B-VBT': 9, 'B-VBL': 10, 'B-NNO': 11, 'B-ADJ': 12, 'B-PRR': 13, 'B-PRK': 14, 'B-CCN': 15, 'B-$$$': 16, 'B-ADK': 17, 'B-ART': 18, 'B-CSN': 19, 'B-NUM': 20, 'B-SYM': 21, 'B-INT': 22, 'B-NEG': 23, 'B-PRI': 24, 'B-VBE': 25}
    INDEX2LABEL = {0: 'B-PPO', 1: 'B-KUA', 2: 'B-ADV', 3: 'B-PRN', 4: 'B-VBI', 5: 'B-PAR', 6: 'B-VBP', 7: 'B-NNP', 8: 'B-UNS', 9: 'B-VBT', 10: 'B-VBL', 11: 'B-NNO', 12: 'B-ADJ', 13: 'B-PRR', 14: 'B-PRK', 15: 'B-CCN', 16: 'B-$$$', 17: 'B-ADK', 18: 'B-ART', 19: 'B-CSN', 20: 'B-NUM', 21: 'B-SYM', 22: 'B-INT', 23: 'B-NEG', 24: 'B-PRI', 25: 'B-VBE'}
    NUM_LABELS = 26
    
    def load_dataset(self, data):
        # Prepare buffer
        dataset = []
        sentence = []
        seq_label = []
        for i in range (len(data)):
            for j in range (len(data[i]['tokens'])):
                sentence.append(data[i]['tokens'][j])
                seq_label.append(self.LABEL2INDEX[data[i]['pos_tags'][j]])
            dataset.append({
                    'sentence': sentence,
                    'seq_label': seq_label
                })
            sentence = []
            seq_label = []
        return dataset
    
    def __init__(self, dataset_path, tokenizer, *args, **kwargs):
        self.data = self.load_dataset(dataset_path)
        self.tokenizer = tokenizer
        
    def __getitem__(self, index):
        data = self.data[index]
        sentence, seq_label = data['sentence'], data['seq_label']
        
        # Add CLS token
        subwords = [self.tokenizer.cls_token_id]
        subword_to_word_indices = [-1] # For CLS
        
        # Add subwords
        for word_idx, word in enumerate(sentence):
            subword_list = self.tokenizer.encode(word, add_special_tokens=False)
            subword_to_word_indices += [word_idx for i in range(len(subword_list))]
            subwords += subword_list
            
        # Add last SEP token
        subwords += [self.tokenizer.sep_token_id]
        subword_to_word_indices += [-1]
        
        return np.array(subwords), np.array(subword_to_word_indices), np.array(seq_label), data['sentence']
    
    def __len__(self):
        return len(self.data)

class PosTagDataLoader(DataLoader):
    def __init__(self, max_seq_len=512, *args, **kwargs):
        super(PosTagDataLoader, self).__init__(*args, **kwargs)
        self.collate_fn = self._collate_fn
        self.max_seq_len = max_seq_len
        
    def _collate_fn(self, batch):
        batch_size = len(batch)
        max_seq_len = max(map(lambda x: len(x[0]), batch))
        max_seq_len = min(self.max_seq_len, max_seq_len)
        max_tgt_len = max(map(lambda x: len(x[2]), batch))
        
        subword_batch = np.zeros((batch_size, max_seq_len), dtype=np.int64)
        mask_batch = np.zeros((batch_size, max_seq_len), dtype=np.float32)
        subword_to_word_indices_batch = np.full((batch_size, max_seq_len), -1, dtype=np.int64)
        seq_label_batch = np.full((batch_size, max_tgt_len), -100, dtype=np.int64)

        seq_list = []
        for i, (subwords, subword_to_word_indices, seq_label, raw_seq) in enumerate(batch):
            subwords = subwords[:max_seq_len]
            subword_to_word_indices = subword_to_word_indices[:max_seq_len]

            subword_batch[i,:len(subwords)] = subwords
            mask_batch[i,:len(subwords)] = 1
            subword_to_word_indices_batch[i,:len(subwords)] = subword_to_word_indices
            seq_label_batch[i,:len(seq_label)] = seq_label

            seq_list.append(raw_seq)
            
        return subword_batch, mask_batch, subword_to_word_indices_batch, seq_label_batch, seq_list


In [13]:
indosum_data = load_dataset("maryantocinn/indosum", trust_remote_code=True)

In [14]:
tokenized_indosum = []
sentence_data = []
for i in range (len(indosum_data['train'])):
    sentences = indosum_data['train'][i]['document'].split('. ')
    for j in range (len(sentences)):
        tokens = re.findall(r'\w+|[^\w\s]', sentences[j].lower())
        tokens.append('.')
        pos_tags = ['B-NNP' for token in range (len(tokens))]
        tokenized_indosum.append({'tokens': tokens, 'pos_tags': pos_tags})
        sentence_data.append({'sentence': sentences[j].lower(), 'article_id': i, 'sentence_id': j})
sentence_data[:5]

[{'sentence': 'jakarta, cnn indonesia - - dokter ryan thamrin, yang terkenal lewat acara dokter oz indonesia, meninggal dunia pada jumat (4 / 8) dini hari',
  'article_id': 0,
  'sentence_id': 0},
 {'sentence': 'dokter lula kamal yang merupakan selebriti sekaligus rekan kerja ryan menyebut kawannya itu sudah sakit sejak setahun yang lalu',
  'article_id': 0,
  'sentence_id': 1},
 {'sentence': 'lula menuturkan, sakit itu membuat ryan mesti vakum dari semua kegiatannya, termasuk menjadi pembawa acara dokter oz indonesia',
  'article_id': 0,
  'sentence_id': 2},
 {'sentence': 'kondisi itu membuat ryan harus kembali ke kampung halamannya di pekanbaru, riau untuk menjalani istirahat',
  'article_id': 0,
  'sentence_id': 3},
 {'sentence': '" setahu saya dia orangnya sehat, tapi tahun lalu saya dengar dia sakit',
  'article_id': 0,
  'sentence_id': 4}]

In [15]:
tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-large-p1')
config = BertConfig.from_pretrained('indobenchmark/indobert-large-p1')
config.num_labels = PosTagProsaDataset.NUM_LABELS
w2i, i2w = PosTagProsaDataset.LABEL2INDEX, PosTagProsaDataset.INDEX2LABEL

model = BertForWordClassification.from_pretrained('indobenchmark/indobert-large-p1', config=config)

Some weights of BertForWordClassification were not initialized from the model checkpoint at indobenchmark/indobert-large-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
test_indosum = PosTagProsaDataset(tokenized_indosum, tokenizer, lowercase=True)
test_indosum_loader = PosTagDataLoader(dataset=test_indosum, max_seq_len=512, batch_size=8, shuffle=False)
model.load_state_dict(torch.load('model/postag/postagger_indobert.pth'))

  model.load_state_dict(torch.load('model/postag/postagger_indobert.pth'))


<All keys matched successfully>

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

## Semantic Role Labeling

In [167]:
configurations = {
    "default": {
        "srl_labels": ["ARG0","ARG1","ARG2","AM-MOD","AM-ADV","REL","AM-TMP","AM-CAU","AM-LOC","AM-DIR", "AM-MNR","AM-DIS","AM-PRD","ARG3","ARG4","AM-LVB","AM-PRP","AM-COM","AM-GOL","AM-EXT","AM-REC","AM-NEG","AM-ADJ"],
        "verb_labels": ["B-VBE", "B-VBI", "B-VBL", "B-VBP", "B-VBT"],
        "seed_val": 43,
        "max_len": 512,
        "info_every": 30,
        "gradient_clip": 1.0,
        "gozali_data_location": "data/srl_corpus_gojali.txt"
    },
    "xlmr_32": {
      "model": "FacebookAI/xlm-roberta-large",
      "batch_size": 32,
      "epochs": 8,
      "learning_rate": 1e-4,
      "model_location": "model/srl",
      "load_model_location": "model/srl",
    },
}

In [168]:
seed_val = configurations["default"]["seed_val"]
device = "cuda:7"
LongTensor = torch.LongTensor
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [169]:
class MatrixModel():
    def __init__(self):
        self.arg_excess = defaultdict(int)
        self.arg_missed = defaultdict(int)
        self.arg_match = defaultdict(int)

    def filter_label(self, text, gold_labels, pred_labels):
        new_gold_labels, new_pred_labels = [], []
        for i in range(len(text)):
            if text[i].startswith("▁"):
                new_gold_labels.append(gold_labels[i])
                new_pred_labels.append(pred_labels[i])
            else:
                continue
        return new_gold_labels, new_pred_labels

    def evaluate_tagset(self, text, gold_labels, pred_labels):
        new_gold_labels, new_pred_labels = self.filter_label(text, gold_labels, pred_labels)
        label_filter = ["O"]
        gld = set([f"{i}_{y}" for i, y in enumerate(new_gold_labels) if y not in label_filter])
        sys = set([f"{i}_{y}" for i, y in enumerate(new_pred_labels) if y not in label_filter])

        excess = sys - gld  # False Positives
        missed = gld - sys  # False Negatives
        true_pos = sys.intersection(gld)

        eval_obj = {"excess": [x.split("_")[1][2:] for x in excess],
                    "missed": [x.split("_")[1][2:] for x in missed],
                    "match": [x.split("_")[1][2:] for x in true_pos]}
        self.add_to_eval_dicts(eval_obj)

    def add_to_eval_dicts(self, eval_metrics):
        for arg in eval_metrics["excess"]:
            self.arg_excess[arg] += 1
        for arg in eval_metrics["missed"]:
            self.arg_missed[arg] += 1
        for arg in eval_metrics["match"]:
            self.arg_match[arg] += 1

    def get_metrics(self, false_pos, false_neg, true_pos):
        _denom1 = true_pos + false_pos
        precision = true_pos / _denom1 if _denom1 else 0
        _denom2 = true_pos + false_neg
        recall = true_pos / _denom2 if _denom2 else 0
        _denom3 = precision + recall
        F1 = 2 * ((precision * recall) / _denom3) if _denom3 else 0
        return precision*100, recall*100, F1*100

    def show_overall_metrics(self, save_to_file=None, print_metrics=True):
        processed_args = set()
        results = []
        tot_excess, tot_missed, tot_match = 0, 0, 0
        for arg, count in self.arg_match.items():
            excess = self.arg_excess.get(arg, 0)
            missed = self.arg_missed.get(arg, 0)
            p,r,f = self.get_metrics(false_pos=excess, false_neg=missed, true_pos=count)
            processed_args.add(arg)
            results.append((arg, count, excess, missed, p, r, f))
            tot_excess += excess
            tot_missed += missed
            tot_match += count
        for arg, count in self.arg_excess.items():
            if arg not in processed_args:
                excess = count
                missed = self.arg_missed.get(arg, 0)
                correct = self.arg_match.get(arg, 0)
                p, r, f = self.get_metrics(false_pos=excess, false_neg=missed, true_pos=correct) # p,r,f = 0,0,0
                processed_args.add(arg)
                results.append((arg, correct, excess, missed, p, r, f))
                tot_excess += excess
                tot_missed += missed
                tot_match += correct
        for arg, count in self.arg_missed.items():
            if arg not in processed_args:
                excess = self.arg_excess.get(arg, 0)
                correct = self.arg_match.get(arg, 0)
                missed = count
                p, r, f = self.get_metrics(false_pos=excess, false_neg=missed, true_pos=correct) # p,r,f = 0,0,0
                results.append((arg, correct, excess, missed, p, r, f))
                tot_excess += excess
                tot_missed += missed
                tot_match += correct
        results = sorted(results, key= lambda x: x[0])

        prec, rec, F1 = self.get_metrics(false_pos=tot_excess, false_neg=tot_missed, true_pos=tot_match)

        if print_metrics:
            print("\n--- OVERALL ---\nCorrect: {0}\tExcess: {1}\tMissed: {2}\nPrecision: {3:.2f}\t\tRecall: {4:.2f}\nF1: {5:.2f}\n".format(tot_match, tot_excess, tot_missed, prec, rec, F1))
            print(tabulate(results, headers=["corr.", "excess", "missed", "prec.", "rec.", "F1"], floatfmt=".2f"))
        if save_to_file:
            fout = open(save_to_file, "w")
            fout.write("\n--- OVERALL ---\nCorrect: {0}\tExcess: {1}\tMissed: {2}\nPrecision: {3:.2f}\t\tRecall: {4:.2f}\nF1: {5:.2f}\n".format(tot_match, tot_excess, tot_missed, prec, rec, F1))
            fout.write(tabulate(results, headers=["corr.", "excess", "missed", "prec.", "rec.", "F1"], floatfmt=".2f"))

In [170]:
class TransformerModel():
    def __init__(self, config):
        self.name = config["model"]
        self.batch_size = config["batch_size"]
        self.epochs = config["epochs"]
        self.learning_rate = config["learning_rate"]
        self.save_location_model = config["model_location"]
        self.tokenizer: AutoTokenizer = None
        self.model: AutoModelForTokenClassification = None

    def build_label_vocab(self) -> dict:
        label2index = {"O": 0}
        for j in range(2):
          tag = "B-" if j==0 else "I-"
          for labelset in configurations["default"]["srl_labels"]:
            if labelset not in label2index:
              label2index[tag+labelset] = len(label2index)
        return label2index

    def create_model(self):
        label2index = self.build_label_vocab()
        index2label = {v: k for k, v in label2index.items()}
        self.tokenizer = AutoTokenizer.from_pretrained(self.name, cache_dir="./cache")
        self.model = AutoModelForTokenClassification.from_pretrained(self.name, num_labels=len(label2index), cache_dir="./cache")
        self.model.config.finetuning_task = 'token-classification'
        self.model.config.id2label = index2label
        self.model.config.label2id = label2index
        self.model.config.type_vocab_size = 2
        # Create a new Embeddings layer, with 2 possible segments IDs instead of 1
        self.model.roberta.embeddings.token_type_embeddings = Embedding(2, self.model.config.hidden_size)
        self.model.roberta.embeddings.token_type_embeddings.weight.data.normal_(mean=0.0, std=self.model.config.initializer_range)
        self.model.to(device)

    def load_model(self, model_dir):
        self.model = AutoModelForTokenClassification.from_pretrained(model_dir)
        self.tokenizer = AutoTokenizer.from_pretrained(model_dir)
        self.model.to(device)

    def expand_label_token(self, original_sentence, original_labels):
        tmp_labels = []
        txt_sentences = " ".join(original_sentence)
        tokens = self.tokenizer(txt_sentences, padding="max_length", truncation=True, max_length=configurations["default"]["max_len"])
        sentence = self.tokenizer.tokenize(txt_sentences)
        for i, word in enumerate(original_sentence):
            word_pieces = self.tokenizer.tokenize(word)
            if len(word_pieces) == 1:
                tmp_labels.append(original_labels[i])
            else:
                tmp_labels.append(original_labels[i])
                for _ in range (1, len(word_pieces)):
                    if original_labels[i] == "O":
                        tmp_labels.append("O")
                    else:
                        tmp_labels.append("I" + original_labels[i][1:])
        labels = ["O"] + tmp_labels + ["O"]
        length = len(sentence)+2
        return tokens, labels, length

    def get_data(self, data):
        list_input_ids, verb_indicators, all_labels, attention_masks, length_sentences = [], [], [], [], []
        for _, obj in enumerate(data):
            tokens, labelset, length_sentence = self.expand_label_token(obj["words"], obj["arguments"])
            input_ids = tokens["input_ids"]
            attent_mask = tokens["attention_mask"]
            list_input_ids.append(input_ids)
            attention_masks.append(attent_mask)
            length_sentences.append(length_sentence)
            # Verb Indicator (which predicate to label)
            bio_verb = [1 if label[2:] == "REL" else 0 for label in labelset]
            verb_indicators.append(bio_verb)
            all_labels.append(labelset)

        return list_input_ids, verb_indicators, all_labels, attention_masks, length_sentences

    def load_srl_dataset(self, data):
        input_ids, verb_indicators, labels, attention_masks, seq_lengths = self.get_data(data)
        label_ixs = []
        label2index = self.build_label_vocab()
        # Convert label to their indices
        for i, labelset in enumerate(labels):
            label_ixs.append([label2index.get(l, 1) for l in labelset])
        # pad label and verb consequence
        input_is_pred = pad_sequences(verb_indicators, maxlen=configurations["default"]["max_len"], dtype="long", value=0, truncating="post", padding="post")
        label_ids = pad_sequences(label_ixs, maxlen=configurations["default"]["max_len"], dtype="long", value=0, truncating="post", padding="post")
        label_ids = LongTensor(label_ids)
        return LongTensor(input_ids), LongTensor(attention_masks), label_ids,  LongTensor(seq_lengths), LongTensor(input_is_pred)

    def create_train_val_dataloader(self, train_data, val_data):
        train_inputs, train_masks, train_labels, _, train_preds = self.load_srl_dataset(train_data)
        # Create the DataLoader for training set.
        train_data = TensorDataset(train_inputs, train_masks, train_labels, train_preds)
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=self.batch_size)
        val_inputs, val_masks, val_labels, _, val_preds = self.load_srl_dataset(val_data)
        # Create the DataLoader for validation set
        val_data = TensorDataset(val_inputs, val_masks, val_labels, val_preds)
        val_sampler = RandomSampler(val_data)
        val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=self.batch_size)
        return train_dataloader, val_dataloader

    def create_pred_dataloader(self, data):
        prediction_inputs, prediction_masks, gold_labels, seq_lens, gold_predicates = self.load_srl_dataset(data)
        # Create the DataLoader.
        prediction_data = TensorDataset(prediction_inputs, prediction_masks, gold_labels, seq_lens, gold_predicates)
        prediction_sampler = SequentialSampler(prediction_data)
        prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=self.batch_size)
        return prediction_dataloader


In [171]:
train_data = pd.read_csv("data/indosum_verb_train.csv")
val_data = pd.read_csv("data/indosum_verb_validation.csv")
test_data = pd.read_csv("data/indosum_verb_test.csv")

In [172]:
sentence = []
for i, text in test_data.iterrows():
    new_text = text["sentence"].split(",")
    new_text = " ,".join(new_text)
    new_text = new_text.split(".")
    new_text = " .".join(new_text)
    new_text = new_text.split("?")
    new_text = " ?".join(new_text)
    sentence.append(new_text)

In [173]:
test_data["sentence"] = sentence

In [174]:
correct = []
for i, text in test_data.iterrows():
    words = text["sentence"].split(" ")
    try:
        index = words.index(text["verb"])
        correct.append(True)
    except ValueError:
        correct.append(None)

In [175]:
test_data["correct"] = correct

In [176]:
test_data = test_data.dropna()

In [178]:
class PredictModel():
    def __init__(self, config):
        self.model_dir = config["model_location"]
        self.pad_token_label_id = CrossEntropyLoss().ignore_index
        self.transformer_model: TransformerModel = TransformerModel(config)
        self.matrix_model = None

    def preprocess_text(self,data):
        list_data = []
        for i, text in data.iterrows():
            words = text["sentence"].split(" ")
            labels = ["O" for i in range (len(words))]
            index = words.index(text["verb"])
            labels[index] = "B-REL"
            srl = {
            "words" : words,
            "arguments" : labels,
            "predicate" : text["verb"]
            }
            list_data.append(srl)
        return list_data

    def generate_label(self, data_text):
        data = self.preprocess_text(data_text)
        self.matrix_model = MatrixModel()
        # Load Saved Model
        self.transformer_model.load_model(self.model_dir)
        model = self.transformer_model.model
        tokenizer =self.transformer_model.tokenizer
        label2index = self.transformer_model.build_label_vocab()
        index2label = {v: k for k, v in label2index.items()}
        list_pred_labels = []
        # Load File for Predictions
        pred_dataloader = self.transformer_model.create_pred_dataloader(data)
        model.eval()
        total_sents = 0

        for batch in pred_dataloader:
            # Add batch to GPU
            batch = tuple(t.to(device) for t in batch)

            # Unpack the inputs from our dataloader
            b_input_ids, b_input_mask, b_labels, b_lengths, b_preds = batch
            with torch.no_grad():
                outputs = model(b_input_ids, token_type_ids=b_preds, attention_mask=b_input_mask)
            logits = outputs[0]
            class_probabilities = torch.softmax(logits, dim=-1)

            # Move class_probabilities and labels to CPU
            class_probabilities = class_probabilities.detach().cpu().numpy()
            argmax_indices = np.argmax(class_probabilities, axis=-1)

            label_ids = b_labels.to('cpu').numpy()
            seq_lengths = b_lengths.to('cpu').numpy()

            for ix in range(len(label_ids)):
                total_sents += 1
                text = tokenizer.convert_ids_to_tokens(b_input_ids[ix])
                # Store predictions and true labels
                pred_labels = [index2label[p] for p in argmax_indices[ix][:seq_lengths[ix]]]
                gold_labels = [index2label[g] for g in label_ids[ix]]
                # Delete unnecessary label
                idx_pad = text.index("<pad>")
                pred_labels = pred_labels[1:len(pred_labels)-1]
                gold_labels = gold_labels[1:idx_pad-1]
                text = text[1:idx_pad-1]
                _, pred_labels = self.matrix_model.filter_label(text, gold_labels, pred_labels)
                list_pred_labels.append(pred_labels)
                # print(f"\n----- {total_sents} -----\n{teks}\n{pred_labels}")
        new_data = data_text
        new_data["srl"] = list_pred_labels
        new_data.to_csv("indosum_srl_test.csv")


In [179]:
# large english model with gozali data 32 batch
model_predict = PredictModel(configurations["xlmr_32"])
model_predict.generate_label(test_data)

## Summarization

In [3]:
data = pd.read_csv("indosum_srl_test.csv")
indosum_data = load_dataset("maryantocinn/indosum", trust_remote_code=True)

In [None]:
# split punctuation in every word
sentence_summary = []
for text in indosum_data["test"]:
    new_text = text["summary"].split(",")
    new_text = " ,".join(new_text)
    new_text = new_text.split(".")
    new_text = " .".join(new_text)
    new_text = new_text.split("?")
    new_text = " ?".join(new_text)
    new_text = new_text.lower()
    sentence_summary.append(new_text)

In [None]:
device = "cuda:7"
tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-base-p1')
model = BertModel.from_pretrained('indobenchmark/indobert-base-p1')
model.to(device)

def get_word_embeddings(tokens):
    inputs = tokenizer(tokens, return_tensors="pt", padding="max_length", truncation=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.squeeze(0)
    return embeddings.cpu().numpy()

def filter_labels_if_not_in_common(embed1, embed2, labels1, labels2):
    common_labels = set(labels1).intersection(set(labels2))

    embed1 = [embed for embed, label in zip(embed1, labels1) if label in common_labels]
    label1 = [label for label in labels1 if label in common_labels]
    embed2 = [embed for embed, label in zip(embed2, labels2) if label in common_labels]
    label2 = [label for label in labels2 if label in common_labels]

    all_elements = set(labels1).union(set(labels2))

    return embed1, embed2, label1, label2, len(all_elements)

def filter_words_labels(srl_tag):
    filtered_labels = [label for label in srl_tag if label != 'O']
    filtered_labels = [label[2:] for label in filtered_labels]
    return filtered_labels

def filter_words_token(row):
    tokens = row["sentence"].split()
    labels = row["srl"]
    filtered_tokens = [token for token, label in zip(tokens, labels) if label != 'O']
    return filtered_tokens

def sentence_similarity(embeddings1, embeddings2, labels1, labels2, count):
    max_similarities = {}

    for i, (emb1, label1) in enumerate(zip(embeddings1, labels1)):
        for j, (emb2, label2) in enumerate(zip(embeddings2, labels2)):
            if label1 == label2:
                # Calculate similarity
                similarity = cosine_similarity(emb1.reshape(1, -1), emb2.reshape(1, -1))[0][0]

                if label1 not in max_similarities or similarity > max_similarities[label1]:
                    max_similarities[label1] = similarity

    # Sum up the maximum similarities for each label
    total_max_similarity = sum(max_similarities.values())

    return total_max_similarity / count

def count_o_labels(srl_tags):
    return srl_tags.count('O')

def reduce_same_sentence(data):
    data['o_label_count'] = data['srl'].apply(count_o_labels)
    df_sorted = data.sort_values(by='o_label_count', ascending=True)
    df_reduced = df_sorted.drop_duplicates(subset='sentence', keep='first')
    return df_reduced.sort_index()

def calculate_sentence_scores(df):
    sentence_scores = []
    similarity_cache = {}  # Cache to store previously computed similarities
    for i, row1 in df.iterrows():
        embeddings1, labels1 = row1['embeddings'], row1['srl']
        sentence_score = 0
        
        for j, row2 in df.iterrows():
            if i != j:
                # Check if this similarity has been computed before
                if (i, j) in similarity_cache:
                    similarity = similarity_cache[(i, j)]
                elif (j, i) in similarity_cache:
                    similarity = similarity_cache[(j, i)]
                else:
                    embeddings2, labels2 = row2['embeddings'], row2['srl']
                    embeddings1, embeddings2, labels1, labels2, count = filter_labels_if_not_in_common(embeddings1, embeddings2, labels1, labels2)
                    similarity = sentence_similarity(embeddings1, embeddings2, labels1, labels2, count)
                    similarity_cache[(i, j)] = similarity
                    similarity_cache[(j, i)] = similarity
                
                sentence_score += similarity
        
        sentence_scores.append(sentence_score)
    
    return sentence_scores

In [None]:
length_article = len(data["article_id"].value_counts())
list_sentence_hyp = []
for i in tqdm(range(10), desc="Processing articles"):
    df = data[data["article_id"] == i]
    df["srl"] = df["srl"].apply(ast.literal_eval)
    df = reduce_same_sentence(df)
    df["labels"] =  df['srl'].apply(filter_words_labels)
    df["token"] = df.apply(filter_words_token, axis=1)
    df['embeddings'] = df["token"].apply(get_word_embeddings)
    score = calculate_sentence_scores(df)
    df["score"] = score
    df.sort_values(by="score", ascending=False, inplace=True)
    top_count = math.ceil(len(df) / 4)
    top = df.head(top_count)
    top.sort_values("sentence_id", inplace=True)
    sentences = ""
    for i, sentence in top.iterrows():
        sentences += " " + sentence["sentence"] + " ."
    list_sentence_hyp.append(sentences.strip())

Processing articles: 100%|██████████| 10/10 [00:52<00:00,  5.25s/it]


In [None]:
from rouge import Rouge

# Create a Rouge object
rouge = Rouge()
f1_scores = []

# Calculate ROUGE scores
for system, reference in zip(list_sentence_hyp, sentence_summary):
    scores = rouge.get_scores(system, reference)[0]  # get_scores returns a list of results
    f1_rouge1 = scores['rouge-1']['f']
    f1_rouge2 = scores['rouge-2']['f']
    f1_rougeL = scores['rouge-l']['f']
    
    # Collect F1 scores for all three ROUGE metrics
    f1_scores.append((f1_rouge1, f1_rouge2, f1_rougeL))

# Convert list of tuples to a NumPy array for easy averaging
f1_scores_array = np.array(f1_scores)

# Calculate average F1 scores for ROUGE-1, ROUGE-2, and ROUGE-L
average_f1_scores = np.mean(f1_scores_array, axis=0)
print(f"Average F1 Scores: ROUGE-1: {average_f1_scores[0]:.4f}, ROUGE-2: {average_f1_scores[1]:.4f}, ROUGE-L: {average_f1_scores[2]:.4f}")

Average F1 Scores: ROUGE-1: 0.4062, ROUGE-2: 0.2530, ROUGE-L: 0.3769
