In [1]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModelForTokenClassification, BertPreTrainedModel, BertConfig
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
from keras.utils import pad_sequences
from sklearn.metrics.pairwise import cosine_similarity
import math
import os
from torch import nn
from torch.nn import CrossEntropyLoss
import re

# Suppress pandas' SettingWithCopyWarning for the whole notebook.
# NOTE(review): this also hides genuine chained-assignment mistakes in later cells.
pd.options.mode.chained_assignment = None  # default='warn'
# Set before any tokenizer is created; presumably silences the Hugging Face
# tokenizers parallelism/fork warning -- TODO confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

2024-12-12 15:38:54.759927: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-12 15:38:54.759987: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-12 15:38:54.760002: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-12 15:38:54.766408: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## POS Tagging

In [2]:
class BertForWordClassification(BertPreTrainedModel):
    """BERT encoder with a linear head that classifies words instead of subwords.

    Subword hidden states are pooled into one vector per word (driven by
    ``subword_to_word_ids``) before dropout and the linear classifier are applied.
    """

    def __init__(self, config):
        """Build encoder, dropout and classifier from ``config``.

        Reads ``config.num_labels``, ``config.hidden_dropout_prob`` and
        ``config.hidden_size``.
        """
        super().__init__(config)
        self.num_labels = config.num_labels

        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        subword_to_word_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
    ):
        """Encode subwords, pool them per word and classify every word.

        Args:
            input_ids, attention_mask, token_type_ids, position_ids, head_mask,
                inputs_embeds: Forwarded unchanged to the wrapped ``BertModel``.
            subword_to_word_ids: Integer tensor giving, for each subword position,
                the index of the word it belongs to. Positions whose id is below 0
                are never pooled because the loop only visits ids ``0..max``.
                Required despite the ``None`` default (``.max()`` is called on it).
            labels: Optional word-level label ids; when given, a cross-entropy
                loss is prepended to the returned tuple.

        Returns:
            Tuple ``(loss?, logits, *outputs[2:])`` where ``loss`` is only present
            when ``labels`` is not ``None``.
        """
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )

        # assumes outputs[0] is the (batch, subword_seq, hidden) hidden-state tensor -- TODO confirm
        sequence_output = outputs[0]

        # average the token-level outputs to compute word-level representations
        # Number of word slots = largest word index in the batch + 1.
        max_seq_len = subword_to_word_ids.max() + 1
        word_latents = []
        for i in range(max_seq_len):
            # Boolean mask of the subword positions belonging to word i, with a
            # trailing axis so it broadcasts against the hidden dimension.
            mask = (subword_to_word_ids == i).unsqueeze(dim=-1)
            # NOTE(review): `mask.sum()` has no `dim`, so the divisor is the number
            # of matching subwords in the WHOLE batch, not per sentence; the result
            # is therefore a batch-dependent scaling rather than a true mean. Any
            # checkpoint trained with this forward pass depends on that scaling --
            # confirm before changing it to a per-sentence `mask.sum(dim=1)`.
            word_latents.append((sequence_output * mask).sum(dim=1) / mask.sum())
        # Stack the per-word vectors along a new word axis (dim=1).
        word_batch = torch.stack(word_latents, dim=1)

        sequence_output = self.dropout(word_batch)
        logits = self.classifier(sequence_output)

        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
        if labels is not None:
            # CrossEntropyLoss is built with its defaults; presumably label
            # positions padded with -100 are skipped via the default
            # ignore_index -- TODO confirm.
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            outputs = (loss,) + outputs

        return outputs  # (loss), scores, (hidden_states), (attentions)


In [3]:
def forward_word_classification(model, batch_data, i2w, is_test=False, device='cpu', **kwargs):
    """Run one batch through a word-classification model and decode the result.

    Args:
        model: Callable invoked as ``model(subwords, subword_to_word_ids,
            attention_mask=..., token_type_ids=..., labels=...)``. Its first two
            outputs are taken as ``(loss, logits)``.
        batch_data: Either a 4-tuple ``(subwords, mask, subword_to_word_ids,
            labels)`` or a 5-tuple with ``token_type_ids`` in third position.
        i2w: Mapping from label index to label string.
        is_test: Unused; kept so existing callers keep working.
        device: Torch device (string or ``torch.device``) the batch is moved to.
        **kwargs: Ignored.

    Returns:
        ``(loss, list_hyps, list_labels)``: the loss returned by the model and,
        per sentence, the predicted and gold label strings. Decoding of a sentence
        stops at the first gold label equal to ``-100`` (the padding value).

    Raises:
        ValueError: If ``batch_data`` has neither 4 nor 5 elements.
    """
    # Unpack batch data
    if len(batch_data) == 4:
        subword_batch, mask_batch, subword_to_word_indices_batch, label_batch = batch_data
        token_type_batch = None
    elif len(batch_data) == 5:
        (subword_batch, mask_batch, token_type_batch,
         subword_to_word_indices_batch, label_batch) = batch_data
    else:
        # Previously this fell through and failed later with an unbound-variable error.
        raise ValueError(
            "batch_data must contain 4 or 5 elements, got {}".format(len(batch_data))
        )

    # Prepare input & label directly on the requested device. Using `device`
    # as-is (instead of comparing with the literal "cuda") also covers values
    # such as "cuda:0" or a torch.device instance.
    subword_batch = torch.as_tensor(subword_batch, dtype=torch.long, device=device)
    mask_batch = torch.as_tensor(mask_batch, dtype=torch.float, device=device)
    if token_type_batch is not None:
        token_type_batch = torch.as_tensor(token_type_batch, dtype=torch.long, device=device)
    subword_to_word_indices_batch = torch.as_tensor(
        subword_to_word_indices_batch, dtype=torch.long, device=device
    )
    label_batch = torch.as_tensor(label_batch, dtype=torch.long, device=device)

    # Forward model
    outputs = model(
        subword_batch,
        subword_to_word_indices_batch,
        attention_mask=mask_batch,
        token_type_ids=token_type_batch,
        labels=label_batch,
    )
    loss, logits = outputs[:2]

    # generate prediction & label list
    list_hyps = []
    list_labels = []
    predicted_ids = logits.argmax(dim=-1)
    for hyp_row, label_row in zip(predicted_ids.tolist(), label_batch.tolist()):
        list_hyp, list_label = [], []
        # zip stops at the shorter row, so a logits row that is wider than the
        # label row can no longer index past the end of the labels.
        for hyp_idx, label_idx in zip(hyp_row, label_row):
            if label_idx == -100:
                break
            list_hyp.append(i2w[hyp_idx])
            list_label.append(i2w[label_idx])
        list_hyps.append(list_hyp)
        list_labels.append(list_label)

    return loss, list_hyps, list_labels

In [4]:
class PosTagProsaDataset(Dataset):
    """Dataset of pre-tokenised sentences with one POS label per word.

    Each item is returned as ``(subword_ids, subword_to_word_indices,
    label_ids, words)`` where the subword sequence is wrapped in CLS/SEP ids and
    those two special positions carry word index ``-1``.
    """

    # Static constant variable
    LABEL2INDEX = {'B-PPO': 0, 'B-KUA': 1, 'B-ADV': 2, 'B-PRN': 3, 'B-VBI': 4, 'B-PAR': 5, 'B-VBP': 6, 'B-NNP': 7, 'B-UNS': 8, 'B-VBT': 9, 'B-VBL': 10, 'B-NNO': 11, 'B-ADJ': 12, 'B-PRR': 13, 'B-PRK': 14, 'B-CCN': 15, 'B-$$$': 16, 'B-ADK': 17, 'B-ART': 18, 'B-CSN': 19, 'B-NUM': 20, 'B-SYM': 21, 'B-INT': 22, 'B-NEG': 23, 'B-PRI': 24, 'B-VBE': 25}
    INDEX2LABEL = {0: 'B-PPO', 1: 'B-KUA', 2: 'B-ADV', 3: 'B-PRN', 4: 'B-VBI', 5: 'B-PAR', 6: 'B-VBP', 7: 'B-NNP', 8: 'B-UNS', 9: 'B-VBT', 10: 'B-VBL', 11: 'B-NNO', 12: 'B-ADJ', 13: 'B-PRR', 14: 'B-PRK', 15: 'B-CCN', 16: 'B-$$$', 17: 'B-ADK', 18: 'B-ART', 19: 'B-CSN', 20: 'B-NUM', 21: 'B-SYM', 22: 'B-INT', 23: 'B-NEG', 24: 'B-PRI', 25: 'B-VBE'}
    NUM_LABELS = 26

    def __init__(self, dataset_path, tokenizer, *args, **kwargs):
        """Store the tokenizer and convert ``dataset_path`` into examples.

        Despite its name, ``dataset_path`` is handed straight to
        ``load_dataset`` and is indexed like a sequence of records there.
        Extra positional and keyword arguments are accepted and ignored.
        """
        self.data = self.load_dataset(dataset_path)
        self.tokenizer = tokenizer

    def load_dataset(self, data):
        """Turn records with ``'tokens'``/``'pos_tags'`` into example dicts.

        Returns a list of ``{'sentence': [...], 'seq_label': [...]}`` where the
        labels are the ``LABEL2INDEX`` ids of the record's POS tags.
        """
        examples = []
        for row_idx in range(len(data)):
            record = data[row_idx]
            positions = range(len(record['tokens']))
            examples.append({
                'sentence': [record['tokens'][pos] for pos in positions],
                'seq_label': [self.LABEL2INDEX[record['pos_tags'][pos]] for pos in positions],
            })
        return examples

    def __len__(self):
        """Number of sentences in the dataset."""
        return len(self.data)

    def __getitem__(self, index):
        """Encode the sentence at ``index`` into subword ids and alignments."""
        entry = self.data[index]
        words, word_labels = entry['sentence'], entry['seq_label']

        # Encode every word on its own so the subword -> word mapping is known.
        pieces_per_word = [
            self.tokenizer.encode(word, add_special_tokens=False) for word in words
        ]

        # CLS ... subwords ... SEP; the two special tokens map to word index -1.
        subword_ids = [self.tokenizer.cls_token_id]
        word_index_of_subword = [-1]
        for word_idx, pieces in enumerate(pieces_per_word):
            subword_ids.extend(pieces)
            word_index_of_subword.extend([word_idx] * len(pieces))
        subword_ids.append(self.tokenizer.sep_token_id)
        word_index_of_subword.append(-1)

        return (
            np.array(subword_ids),
            np.array(word_index_of_subword),
            np.array(word_labels),
            entry['sentence'],
        )


In [5]:
class PosTagDataLoader(DataLoader):
    """DataLoader that pads POS-tagging examples into fixed-size numpy batches."""

    def __init__(self, max_seq_len=512, *args, **kwargs):
        """Create the loader; all other arguments go to ``DataLoader``.

        ``max_seq_len`` caps the subword length of every batch. The collate
        function is always replaced by ``_collate_fn``.
        """
        super().__init__(*args, **kwargs)
        self.max_seq_len = max_seq_len
        self.collate_fn = self._collate_fn

    def _collate_fn(self, batch):
        """Pad a list of dataset items into batch arrays.

        Returns ``(subwords, mask, subword_to_word_indices, labels, raw_sentences)``.
        Subwords are padded with 0, the mask with 0, word indices with -1 and
        labels with -100. Only the subword-level arrays are truncated to the
        batch width; labels keep their full length.
        """
        n_items = len(batch)
        longest_input = max(len(item[0]) for item in batch)
        width = min(self.max_seq_len, longest_input)
        label_width = max(len(item[2]) for item in batch)

        subword_batch = np.zeros((n_items, width), dtype=np.int64)
        mask_batch = np.zeros((n_items, width), dtype=np.float32)
        subword_to_word_indices_batch = np.full((n_items, width), -1, dtype=np.int64)
        seq_label_batch = np.full((n_items, label_width), -100, dtype=np.int64)

        raw_sentences = []
        for row, item in enumerate(batch):
            subwords, word_indices, word_labels, raw_seq = item
            subwords = subwords[:width]
            word_indices = word_indices[:width]
            used = len(subwords)

            subword_batch[row, :used] = subwords
            mask_batch[row, :used] = 1
            subword_to_word_indices_batch[row, :used] = word_indices
            seq_label_batch[row, :len(word_labels)] = word_labels
            raw_sentences.append(raw_seq)

        return subword_batch, mask_batch, subword_to_word_indices_batch, seq_label_batch, raw_sentences


In [6]:
class POSTagModel():
    """POS-tag a raw text with a fine-tuned IndoBERT checkpoint and extract verbs."""

    # Pretrained encoder the tokenizer, config and model skeleton are loaded from.
    PRETRAINED_NAME = 'indobenchmark/indobert-large-p1'
    # Tags counted as verbs; these are the labels with ids 4, 6, 9, 10 and 25
    # in PosTagProsaDataset.LABEL2INDEX.
    VERB_TAGS = frozenset({'B-VBI', 'B-VBP', 'B-VBT', 'B-VBL', 'B-VBE'})
    # Column order of the frame returned by verb_extraction.
    VERB_COLUMNS = ['sentence', 'sentence_id', 'verb']

    def __init__(self, model_dir):
        """Remember the path of the fine-tuned state-dict checkpoint."""
        self.model_dir = model_dir

    def preprocess_text(self, data):
        """Split ``data`` into lower-cased, tokenised sentences.

        The text is split on ``'. '``; every fragment is tokenised into word and
        punctuation tokens and terminated with exactly one ``'.'``. Fragments
        without any token (e.g. after a trailing ``'. '``) are skipped.

        Returns:
            ``(tokenized_data, sentence_data)``: parallel lists. The first holds
            ``{'tokens', 'pos_tags'}`` dicts (tags are ``'B-NNP'`` placeholders,
            only there because the dataset class needs a label per token); the
            second holds ``{'sentence', 'sentence_id'}`` dicts with the tokens
            re-joined by single spaces.
        """
        tokenized_data = []
        sentence_data = []
        for fragment in data.split('. '):
            tokens = re.findall(r'\w+|[^\w\s]', fragment.lower())
            if not tokens:
                continue
            # The last fragment normally still carries its own full stop; adding
            # another one would produce a spurious ". ." ending.
            if tokens[-1] != '.':
                tokens.append('.')
            sentence_id = len(tokenized_data)
            tokenized_data.append({'tokens': tokens, 'pos_tags': ['B-NNP'] * len(tokens)})
            sentence_data.append({'sentence': " ".join(tokens), 'sentence_id': sentence_id})
        return tokenized_data, sentence_data

    def verb_extraction(self, data_text):
        """Tag ``data_text`` and return one row per token predicted as a verb.

        Returns:
            ``pandas.DataFrame`` with columns ``sentence``, ``sentence_id`` and
            ``verb`` (empty, but with those columns, when nothing is found).
        """
        tokenized_data, sentence_data = self.preprocess_text(data_text)
        if not tokenized_data:
            return pd.DataFrame(columns=self.VERB_COLUMNS)

        # Same device selection as the SRL and summarisation models, so the
        # notebook also runs on CPU-only machines.
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

        tokenizer = BertTokenizer.from_pretrained(self.PRETRAINED_NAME)
        data = PosTagProsaDataset(tokenized_data, tokenizer, lowercase=True)
        data_loader = PosTagDataLoader(dataset=data, max_seq_len=512, batch_size=8, shuffle=False)

        config = BertConfig.from_pretrained(self.PRETRAINED_NAME)
        config.num_labels = PosTagProsaDataset.NUM_LABELS
        i2w = PosTagProsaDataset.INDEX2LABEL
        model = BertForWordClassification.from_pretrained(self.PRETRAINED_NAME, config=config)
        # NOTE(review): torch.load unpickles the file -- only load trusted
        # checkpoints (consider weights_only=True where the torch version allows).
        model.load_state_dict(torch.load(self.model_dir, map_location=device))
        model.to(device)
        model.eval()

        list_hyp = []
        # Scoped no_grad instead of torch.set_grad_enabled(False), which switched
        # autograd off for the rest of the process.
        with torch.no_grad():
            for batch_data in data_loader:
                # The last batch element is the raw sentence list, not a model input.
                _, batch_hyp, _ = forward_word_classification(
                    model, batch_data[:-1], i2w=i2w, device=device
                )
                list_hyp += batch_hyp

        verb_rows = []
        for sentence_info, sentence_tokens, tags in zip(sentence_data, tokenized_data, list_hyp):
            for token, tag in zip(sentence_tokens['tokens'], tags):
                if tag in self.VERB_TAGS:
                    verb_rows.append({
                        'sentence': sentence_info['sentence'],
                        'sentence_id': sentence_info['sentence_id'],
                        'verb': token,
                    })
        return pd.DataFrame(verb_rows, columns=self.VERB_COLUMNS)

## Semantic Role Labeling

In [7]:
# Settings read by the SRL classes in the following cells.
configurations = {
    # Shared settings.
    "default": {
        # Role inventory; build_label_vocab derives the B-/I- label ids from the
        # ORDER of this list, so reordering it changes the label <-> id mapping.
        "srl_labels": ["ARG0","ARG1","ARG2","AM-MOD","AM-ADV","REL","AM-TMP","AM-CAU","AM-LOC","AM-DIR", "AM-MNR","AM-DIS","AM-PRD","ARG3","ARG4","AM-LVB","AM-PRP","AM-COM","AM-GOL","AM-EXT","AM-REC","AM-NEG","AM-ADJ"],
        # Length inputs, labels and predicate indicators are padded/truncated to.
        "max_len": 512,
    },
    # Settings passed to SRLModel / TransformerSRLModel.
    "xlmr": {
      "model": "FacebookAI/xlm-roberta-large",  # stored as TransformerSRLModel.name
      "batch_size": 32,  # batch size of the prediction DataLoader
      "model_location": "model/srl",  # directory the model and tokenizer are loaded from
    },
}

In [8]:
class TransformerSRLModel():
    """Tokenisation and batching helpers around a token-classification SRL model."""

    def __init__(self, config):
        """Read ``config["model"]`` and ``config["batch_size"]``; pick the device.

        The tokenizer and model stay ``None`` until ``load_model`` is called.
        """
        self.name = config["model"]
        self.batch_size = config["batch_size"]
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.tokenizer: AutoTokenizer = None
        self.model: AutoModelForTokenClassification = None

    def build_label_vocab(self) -> dict:
        """Return the label -> index mapping.

        ``"O"`` is 0, followed by every ``B-<role>`` and then every ``I-<role>``
        in the order of ``configurations["default"]["srl_labels"]``.
        """
        label2index = {"O": 0}
        for prefix in ("B-", "I-"):
            for role in configurations["default"]["srl_labels"]:
                tag = prefix + role
                # The guard must look at the prefixed tag: checking the bare
                # role name never matched, so a role listed twice would have
                # had its index silently overwritten.
                if tag not in label2index:
                    label2index[tag] = len(label2index)
        return label2index

    def load_model(self, model_dir):
        """Load model and tokenizer from ``model_dir`` and move the model to the device."""
        self.model = AutoModelForTokenClassification.from_pretrained(model_dir)
        self.tokenizer = AutoTokenizer.from_pretrained(model_dir)
        self.model.to(self.device)

    def expand_label_token(self, original_sentence, original_labels):
        """Encode a sentence and spread its word labels over the word pieces.

        The first piece of a word keeps the word's label; further pieces get
        ``"O"`` for an ``"O"`` word and the ``I-`` variant of the label otherwise.
        An ``"O"`` is added at both ends for the special tokens.

        Returns:
            ``(tokens, labels, length)``: the tokenizer output padded/truncated
            to ``max_len``, the expanded label list (neither padded nor
            truncated) and the piece count of the whole sentence plus 2.
        """
        max_len = configurations["default"]["max_len"]
        txt_sentences = " ".join(original_sentence)
        tokens = self.tokenizer(txt_sentences, padding="max_length", truncation=True, max_length=max_len)
        sentence_pieces = self.tokenizer.tokenize(txt_sentences)

        expanded = []
        for i, word in enumerate(original_sentence):
            label = original_labels[i]
            n_pieces = len(self.tokenizer.tokenize(word))
            continuation = "O" if label == "O" else "I" + label[1:]
            expanded.append(label)
            # A non-positive repeat count yields no continuation labels.
            expanded.extend([continuation] * (n_pieces - 1))

        labels = ["O"] + expanded + ["O"]
        length = len(sentence_pieces) + 2
        return tokens, labels, length

    def get_data(self, data):
        """Encode every example (dicts with ``"words"`` and ``"arguments"``).

        Returns five parallel lists: input ids, predicate indicators (1 where the
        label is a ``REL`` label, else 0), expanded labels, attention masks and
        sequence lengths.
        """
        list_input_ids, verb_indicators, all_labels, attention_masks, length_sentences = [], [], [], [], []
        for obj in data:
            tokens, labelset, length_sentence = self.expand_label_token(obj["words"], obj["arguments"])
            list_input_ids.append(tokens["input_ids"])
            attention_masks.append(tokens["attention_mask"])
            length_sentences.append(length_sentence)
            # Verb Indicator (which predicate to label)
            verb_indicators.append([1 if label[2:] == "REL" else 0 for label in labelset])
            all_labels.append(labelset)

        return list_input_ids, verb_indicators, all_labels, attention_masks, length_sentences

    def load_srl_dataset(self, data):
        """Convert examples into padded ``LongTensor`` inputs.

        Returns ``(input_ids, attention_masks, label_ids, seq_lengths,
        predicate_indicators)``. Labels and indicators are post-padded/truncated
        to ``max_len`` with 0; labels missing from the vocabulary map to index 1.
        """
        max_len = configurations["default"]["max_len"]
        input_ids, verb_indicators, labels, attention_masks, seq_lengths = self.get_data(data)
        label2index = self.build_label_vocab()
        # Convert label to their indices
        label_ixs = [[label2index.get(l, 1) for l in labelset] for labelset in labels]
        # pad label and verb sequences
        input_is_pred = pad_sequences(verb_indicators, maxlen=max_len, dtype="long", value=0, truncating="post", padding="post")
        label_ids = pad_sequences(label_ixs, maxlen=max_len, dtype="long", value=0, truncating="post", padding="post")
        label_ids = torch.LongTensor(label_ids)
        return torch.LongTensor(input_ids), torch.LongTensor(attention_masks), label_ids,  torch.LongTensor(seq_lengths), torch.LongTensor(input_is_pred)

    def create_pred_dataloader(self, data):
        """Wrap the encoded examples in a sequential (order-preserving) DataLoader."""
        prediction_inputs, prediction_masks, gold_labels, seq_lens, gold_predicates = self.load_srl_dataset(data)
        # Create the DataLoader.
        prediction_data = TensorDataset(prediction_inputs, prediction_masks, gold_labels, seq_lens, gold_predicates)
        prediction_sampler = SequentialSampler(prediction_data)
        prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=self.batch_size)
        return prediction_dataloader


In [9]:
class SRLModel():
    """Predict semantic-role labels for (sentence, verb) rows."""

    def __init__(self, config):
        """Read ``config["model_location"]`` and create the transformer wrapper."""
        self.model_dir = config["model_location"]
        self.transformer_model: TransformerSRLModel = TransformerSRLModel(config)

    def preprocess_text(self, data):
        """Turn rows with ``sentence`` and ``verb`` into SRL input examples.

        Every example has ``words`` (the sentence split on single spaces),
        ``arguments`` (all ``"O"`` except ``"B-REL"`` on the predicate) and
        ``predicate``.

        Raises:
            ValueError: If the verb does not occur among the sentence's words.
        """
        list_data = []
        for _, row in data.iterrows():
            words = row["sentence"].split(" ")
            labels = ["O"] * len(words)
            # NOTE(review): index() marks the FIRST occurrence, so a verb that
            # appears twice in one sentence is always marked at the same place.
            labels[words.index(row["verb"])] = "B-REL"
            list_data.append({
                "words": words,
                "arguments": labels,
                "predicate": row["verb"],
            })
        return list_data

    def filter_label(self, text, gold_labels, pred_labels):
        """Keep only the labels sitting on pieces that start with ``"▁"``.

        Returns ``(gold, pred)`` restricted to those positions.
        """
        new_gold_labels, new_pred_labels = [], []
        for i, piece in enumerate(text):
            if piece.startswith("▁"):
                new_gold_labels.append(gold_labels[i])
                new_pred_labels.append(pred_labels[i])
        return new_gold_labels, new_pred_labels

    @staticmethod
    def _strip_special_positions(text, gold_labels, pred_labels, pad_token="<pad>"):
        """Drop the leading/trailing special positions (and padding) of one example.

        ``text`` and ``gold_labels`` are cut to the span between the first and
        last non-padding position; ``pred_labels`` only loses its first and last
        entry. When ``text`` contains no ``pad_token`` (the sequence fills the
        whole window) its full length is used instead of raising ``ValueError``.
        """
        end = text.index(pad_token) if pad_token in text else len(text)
        return (
            text[1:end - 1],
            gold_labels[1:end - 1],
            pred_labels[1:len(pred_labels) - 1],
        )

    def generate_label(self, data_text):
        """Predict one label list per row of ``data_text``.

        Returns a list (same order as the rows) of per-word label lists; an empty
        list when ``data_text`` has no rows.
        """
        data = self.preprocess_text(data_text)
        if not data:
            # Nothing to label; avoid loading the model and batching empty input.
            return []

        # Load Saved Model
        self.transformer_model.load_model(self.model_dir)
        model = self.transformer_model.model
        tokenizer = self.transformer_model.tokenizer
        label2index = self.transformer_model.build_label_vocab()
        index2label = {v: k for k, v in label2index.items()}
        # Load File for Predictions
        pred_dataloader = self.transformer_model.create_pred_dataloader(data)
        model.eval()
        list_pred_labels = []

        for batch in pred_dataloader:
            # Move the batch to the model's device
            batch = tuple(t.to(self.transformer_model.device) for t in batch)

            # Unpack the inputs from our dataloader
            b_input_ids, b_input_mask, b_labels, b_lengths, b_preds = batch
            with torch.no_grad():
                # The predicate indicator is fed through the token_type_ids input.
                outputs = model(b_input_ids, token_type_ids=b_preds, attention_mask=b_input_mask)
            logits = outputs[0]
            class_probabilities = torch.softmax(logits, dim=-1)

            # Move class_probabilities and labels to CPU
            class_probabilities = class_probabilities.detach().cpu().numpy()
            argmax_indices = np.argmax(class_probabilities, axis=-1)

            label_ids = b_labels.to('cpu').numpy()
            seq_lengths = b_lengths.to('cpu').numpy()

            for ix in range(len(label_ids)):
                text = tokenizer.convert_ids_to_tokens(b_input_ids[ix])
                # Store predictions and true labels
                pred_labels = [index2label[p] for p in argmax_indices[ix][:seq_lengths[ix]]]
                gold_labels = [index2label[g] for g in label_ids[ix]]
                # Delete unnecessary label
                text, gold_labels, pred_labels = self._strip_special_positions(
                    text, gold_labels, pred_labels
                )
                _, pred_labels = self.filter_label(text, gold_labels, pred_labels)
                list_pred_labels.append(pred_labels)
        return list_pred_labels

## Summarization

In [10]:
class SummarizationModel():
    """Extractive summariser that ranks sentences by role-wise embedding similarity."""

    def __init__(self):
        """Load the IndoBERT base tokenizer and encoder onto the available device."""
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-base-p1')
        self.model = BertModel.from_pretrained('indobenchmark/indobert-base-p1')
        self.model.to(self.device)

    def get_word_embeddings(self, tokens):
        """Encode ``tokens`` and return the encoder's last hidden state as numpy.

        For a list of tokens the leading axis always has one entry per token
        (an empty list yields an empty array). A single string keeps the
        previous behaviour of having its leading batch axis squeezed away.

        NOTE(review): a list is presumably handled by the tokenizer as a batch
        of separate texts, each padded to 512 positions, so every "word
        embedding" is a whole padded sequence matrix that ``sentence_similarity``
        flattens -- confirm this is intended before relying on the scores.
        """
        is_single_text = isinstance(tokens, str)
        if not is_single_text and len(tokens) == 0:
            return np.empty((0, 0), dtype=np.float32)
        inputs = self.tokenizer(tokens, return_tensors="pt", padding="max_length", truncation=True, max_length=512)
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = self.model(**inputs)
        embeddings = outputs.last_hidden_state
        # Squeezing unconditionally dropped the token axis for one-token lists,
        # giving those sentences embeddings of a different rank than the rest.
        if is_single_text:
            embeddings = embeddings.squeeze(0)
        return embeddings.cpu().numpy()

    def filter_labels_if_not_in_common(self, embed1, embed2, labels1, labels2):
        """Keep only entries whose label occurs in both sentences.

        Returns ``(embed1, embed2, labels1, labels2, n_union)`` where ``n_union``
        is the number of distinct labels across both inputs.
        """
        common_labels = set(labels1).intersection(set(labels2))

        embed1 = [embed for embed, label in zip(embed1, labels1) if label in common_labels]
        label1 = [label for label in labels1 if label in common_labels]
        embed2 = [embed for embed, label in zip(embed2, labels2) if label in common_labels]
        label2 = [label for label in labels2 if label in common_labels]

        all_elements = set(labels1).union(set(labels2))

        return embed1, embed2, label1, label2, len(all_elements)

    def filter_words_labels(self, srl_tag):
        """Drop ``'O'`` labels and strip the two-character ``B-``/``I-`` prefix."""
        return [label[2:] for label in srl_tag if label != 'O']

    def filter_words_token(self, row):
        """Return the words of ``row["sentence"]`` whose SRL label is not ``'O'``."""
        tokens = row["sentence"].split()
        labels = row["srl"]
        return [token for token, label in zip(tokens, labels) if label != 'O']

    def sentence_similarity(self, embeddings1, embeddings2, labels1, labels2, count):
        """Sum, per shared label, the best cosine similarity and divide by ``count``.

        Returns ``0.0`` when ``count`` is 0 (neither sentence has any label)
        instead of dividing by zero.
        """
        if count == 0:
            return 0.0

        max_similarities = {}
        for emb1, label1 in zip(embeddings1, labels1):
            for emb2, label2 in zip(embeddings2, labels2):
                if label1 != label2:
                    continue
                # Calculate similarity on the flattened embeddings
                similarity = cosine_similarity(emb1.reshape(1, -1), emb2.reshape(1, -1))[0][0]
                if label1 not in max_similarities or similarity > max_similarities[label1]:
                    max_similarities[label1] = similarity

        # Sum up the maximum similarities for each label
        total_max_similarity = sum(max_similarities.values())

        return total_max_similarity / count

    def count_o_labels(self, srl_tags):
        """Number of ``'O'`` entries in ``srl_tags``."""
        return srl_tags.count('O')

    def reduce_same_sentence(self, data):
        """Keep, per distinct sentence, the row with the fewest ``'O'`` labels.

        Works on a copy, so the caller's frame does not gain the helper column;
        the returned frame carries ``o_label_count`` and is ordered by index.
        """
        counted = data.assign(o_label_count=data['srl'].apply(self.count_o_labels))
        # Stable sort: rows with equal counts keep their original order, which
        # makes the row that survives drop_duplicates deterministic.
        ordered = counted.sort_values(by='o_label_count', ascending=True, kind='stable')
        reduced = ordered.drop_duplicates(subset='sentence', keep='first')
        return reduced.sort_index()

    def calculate_sentence_scores(self, data):
        """Score each row by the sum of its similarities to every other row.

        Reads the ``embeddings`` and ``labels`` columns. ``labels`` (the
        non-``'O'`` labels without prefix) lines up one-to-one with the
        embeddings of the non-``'O'`` tokens, which the raw ``srl`` column does
        not. Similarities are symmetric and cached per pair of index labels.

        Returns:
            List of scores in row order.
        """
        rows = [(idx, row['embeddings'], row['labels']) for idx, row in data.iterrows()]
        similarity_cache = {}  # (i, j) -> similarity, stored for both orders
        sentence_scores = []
        for i, embeddings_i, labels_i in rows:
            sentence_score = 0
            for j, embeddings_j, labels_j in rows:
                if i == j:
                    continue
                if (i, j) not in similarity_cache:
                    # Filter into fresh names: overwriting the row's own
                    # embeddings/labels here made every later comparison start
                    # from an already-filtered sentence.
                    emb_a, emb_b, lab_a, lab_b, count = self.filter_labels_if_not_in_common(
                        embeddings_i, embeddings_j, labels_i, labels_j
                    )
                    similarity = self.sentence_similarity(emb_a, emb_b, lab_a, lab_b, count)
                    similarity_cache[(i, j)] = similarity
                    similarity_cache[(j, i)] = similarity
                sentence_score += similarity_cache[(i, j)]
            sentence_scores.append(sentence_score)

        return sentence_scores

    def summarize_data(self, data):
        """Return the top quarter of sentences (by score) in document order.

        ``data`` needs the columns ``sentence``, ``sentence_id`` and ``srl``.
        Every selected sentence is prefixed with a single space; an empty frame
        yields ``""``.
        """
        if len(data) == 0:
            return ""
        data = self.reduce_same_sentence(data)
        data["labels"] = data['srl'].apply(self.filter_words_labels)
        data["token"] = data.apply(self.filter_words_token, axis=1)
        data['embeddings'] = data["token"].apply(self.get_word_embeddings)
        data["score"] = self.calculate_sentence_scores(data)
        ranked = data.sort_values(by="score", ascending=False)
        top_count = math.ceil(len(ranked) / 4)
        # Sort the selection itself rather than an in-place sort on a slice.
        top = ranked.head(top_count).sort_values("sentence_id")
        return "".join(" " + sentence for sentence in top["sentence"])

In [11]:
def summary_article(article):
    """Summarise ``article`` with the POS-tag -> SRL -> sentence-ranking pipeline.

    Steps: extract one row per predicted verb, predict SRL labels for every
    (sentence, verb) row, then let the summarisation model rank the sentences.

    Returns:
        The selected sentences joined by spaces, or ``""`` when no verb was
        found (there is then nothing to label or rank).
    """
    # pos tag model
    postag_model = POSTagModel('model/postag/postagger_indobert.pth')
    verb_rows = postag_model.verb_extraction(article)
    if verb_rows.empty:
        # Without predicates the SRL and ranking stages have no input.
        return ""

    # srl model
    srl_model = SRLModel(configurations["xlmr"])
    srl_labels = srl_model.generate_label(verb_rows)

    # summarization model
    summarization_model = SummarizationModel()
    verb_rows["srl"] = srl_labels
    summary = summarization_model.summarize_data(verb_rows)
    return summary.strip()

# Sample Indonesian article used to demonstrate the pipeline. The pieces are
# joined by implicit string concatenation, so every piece that ends mid-sentence
# must end with a space.
article = (
    "Pendidikan merupakan pilar utama dalam pembangunan suatu bangsa. Pendidikan dipahami "
    "secara luas sebagai proses belajar terus menerus sepanjang hayat, maka pendidikan menjadi "
    "komponen penting. Melalui pengalaman hidup sehari-hari, proses ini alami, langsung atau "
    "tidak langsung. "
    "Pendidikan bertujuan untuk menggali dan memanfaatkan potensi keunikan individu dan "
    "menjadikannya berguna bagi diri sendiri dan lingkungan. Hal ini juga berarti bahwa pendidikan "
    "membantu manusia menemukan potensi dan bakatnya sendiri, serta mengembangkannya "
    "sesuai dengan keunikan dan keahliannya. Oleh karena itu, dapat dikatakan bahwa pendidikan "
    "adalah hak setiap orang. "
    "Pendidikan tidak hanya sebatas belajar di sekolah. Demikian pula sistem pendidikan tidak "
    "hanya eksis dalam bentuk formal yang dikenal dan berkembang di masyarakat. Artikel ini akan "
    "membahas mengenai pentingnya pendidikan dalam membentuk masyarakat yang cerdas, "
    "produktif, dan berdaya saing, serta peranannya dalam menciptakan masa depan yang lebih "
    "baik bagi generasi mendatang."
)

# Run the whole pipeline on the sample article; as the last expression of the
# cell, the returned summary string is what the cell displays.
summary_article(article)

Some weights of BertForWordClassification were not initialized from the model checkpoint at indobenchmark/indobert-large-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load(self.model_dir))


'pendidikan bertujuan untuk menggali dan memanfaatkan potensi keunikan individu dan menjadikannya berguna bagi diri sendiri dan lingkungan . hal ini juga berarti bahwa pendidikan membantu manusia menemukan potensi dan bakatnya sendiri , serta mengembangkannya sesuai dengan keunikan dan keahliannya .'