# Summarizer


In [1]:
import nltk, re
import pandas as pd
import src.utils as utils
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from transformers import T5Tokenizer, T5ForConditionalGeneration
from tqdm import tqdm
import torch
import numpy as np

# Load Config
CONFIG_DATA = utils.config_load()

In [5]:
import re
import pandas as pd
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import src.utils as utils

# Read Data
def read_data(return_file=False):
    data = pd.read_excel(CONFIG_DATA['raw_dataset_path'])

    # Print data
    print('ready read file, data shape   :', data.shape)

    # Dump data
    utils.dump_json(data, CONFIG_DATA['data_set_path'])

    # Return data
    if return_file:
        return data
    
# Cleaning Data
def clean_data(data, return_file=True):
    # Remove multiple used of '='
    data[CONFIG_DATA['text_column']] = data[CONFIG_DATA['text_column']].dropna()
    data[CONFIG_DATA['text_column']] = data[CONFIG_DATA['text_column']].replace('=', ' ')
    
    # Remove '\n', '\r', and '\t'
    data[CONFIG_DATA['text_column']] = data[CONFIG_DATA['text_column']].replace('\n', ' ')
    data[CONFIG_DATA['text_column']] = data[CONFIG_DATA['text_column']].replace('\r', ' ')
    data[CONFIG_DATA['text_column']] = data[CONFIG_DATA['text_column']].replace('\t', ' ')
    
    # Remove Non ASCII
    data[CONFIG_DATA['text_column']] = data[CONFIG_DATA['text_column']].str.encode('ascii', 'ignore').str.decode('ascii')

    # Remove Multiple Space
    data[CONFIG_DATA['text_column']] = data[CONFIG_DATA['text_column']].replace(r'\s+', ' ', regex=True)
    
    # Remove Whitespace in Start and End
    data[CONFIG_DATA['text_column']] = data[CONFIG_DATA['text_column']].str.strip()
    
    # Tokenizing 
    data['tokens'] = data[CONFIG_DATA['text_column']].replace(r'[^0-9a-zA-Z ]', '', regex=True).replace(r'\s+', ' ', regex=True).astype(str).apply(word_tokenize)
    data['tokens'] = data['tokens'].apply(lambda x: [len(token) for token in x]).apply(sum)
    
    if return_file:
        return data

# Generate Preprocessor
def generate_preprocessor(return_file=False):
    # Load Data
    data = utils.load_json(CONFIG_DATA['data_set_path'])
    data = clean_data(data)
    
    # Print Data
    print('ready processed, data shape   :', data.shape)
    
    # Dump Data
    utils.dump_json(data, CONFIG_DATA['data_clean_path'])    
    
    if return_file:
        return data


[nltk_data] Downloading package punkt to /Users/ralali/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
data = generate_preprocessor(return_file=True)
data

ready processed, data shape   : (99, 2)


Unnamed: 0,FullText,tokens
0,Hasil jajak pendapat yang diselenggarakan Litb...,761
1,"JAKARTA, KOMPAS.com - Pemerintah menargetkan p...",1837
10,Sejumlah nama disebut-sebut Presiden Joko Wido...,733
11,PDI-Perjuangan memberikan tanggapannya mengena...,642
12,Presiden Joko Widodo menyinggung sejumlah figu...,733
...,...,...
94,"JAKARTA, KOMPAS.com - Otoritas Jasa Keuangan (...",1567
95,Wakil Ketua Umum Gerindra Fadli Zon mengungkap...,819
96,Ketua Umum Partai Gerindra Prabowo Subianto ke...,493
97,KOMPAS.com - Salah satu faktor penyebaran hoak...,953


In [6]:
articles = 'asil jajak pendapat yang diselenggarakan Litbang Kompas pada 25 Januari 2023 menunjukkan bahwa Partai Nasdem mendapatkan efek ekor jas dari pencalonan Anies Baswedan sebagai bakal calon presiden. Pencapaian Nasdem itu lantas mengakibatkan turunnya elektabilitas parpol lain yang menjadi basis pemilih Anies, termasuk Partai Demokrat dan Partai Keadilan Sejahtera (PKS). Sebagai informasi, Nasdem, Demokrat, dan PKS adalah partai yang menjatuhkan dukungan kepada Anies sebagai calon presiden pada Pilpres 2024 mendatang. Mereka tengah menjajaki kerja sama politik yang diberi nama Koalisi Perubahan. Simak selengkapnya dalam video berikut. Video Jurnalis: HAM, YOI, TAL Penulis: Ardito Ramadhan Penulis Naskah: Fathira Deiza A Narator: Fathira Deiza A Video Editor: Fathira Deiza A Produser: Khairun Alfi Syahri MJ Music: Get Up - Coyote Hearing #JernihkanHarapan #ElektabilitasNasdem #SurveiLitbangKompas'

In [7]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
t5_model = T5ForConditionalGeneration.from_pretrained(CONFIG_DATA['sum_pretrained'])
model = t5_model.to(device)
tokenizer = T5Tokenizer.from_pretrained(CONFIG_DATA['sum_pretrained'])
# generate summary
input_ids = tokenizer(articles, return_tensors='pt', padding=True)["input_ids"].to(device)
summary_ids = model.generate(input_ids,
            max_length=100, 
            num_beams=2,
            repetition_penalty=2.5, 
            length_penalty=1.0, 
            early_stopping=True,
            no_repeat_ngram_size=2,
            use_cache=True)
summary_text = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print(summary_text)

Partai Nasdem mendapatkan efek ekor jas dari pencalonan Anies Baswedan sebagai bakal calon presiden. Pencapaian Nasdem itu lantas mengakibatkan turunnya elektabilitas parpol lain yang menjadi basis pemilih Anies, termasuk Partai Demokrat dan Partai Keadilan Sejahtera (PKS ).


In [7]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
t5_model = T5ForConditionalGeneration.from_pretrained(CONFIG_DATA['sum_pretrained'])
model = t5_model.to(device)
tokenizer = T5Tokenizer.from_pretrained(CONFIG_DATA['sum_pretrained'])

def generate_summarization(data):
    input_ids = tokenizer(data, return_tensors='pt', padding=True).to(device)
    input_token = input_ids["input_ids"]
    
    with torch.no_grad():
        summary_ids = model.generate(
            input_token,
            min_length=CONFIG_DATA['min_length'],
            max_length=CONFIG_DATA['max_length'],
            num_beams=2,
            repetition_penalty=2.5, 
            length_penalty=1.0, 
            early_stopping=True,
            no_repeat_ngram_size=2,
            use_cache=True
        )

    summary_text = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary_text

In [8]:
def load_data_to_array(data, key=[CONFIG_DATA['text_column'], CONFIG_DATA['sum_token_column']]):
    data_key = data[key]
    data_key = data_key.dropna()
    data_i_key = data_key.reset_index()
    arr_i_key = data_i_key[['index'] + key].values
    return arr_i_key

In [9]:
def run_summarization(arr):
    arr_generated = []
    errors = []
    
    i = 0
    for row in tqdm(arr):
        index = row[0]
        text = row[1]
        try:
            sg = generate_summarization(text)
            generated = [[index, sg]]
            arr_generated = [*arr_generated, *generated]
        except Exception as e:
            errors = [*errors, *[index]]
                
    print(f'Generated {len(arr_generated)}, Errors {len(errors)}')
    return np.array(arr_generated), np.array(errors)

In [10]:
def save_summarization(sum_arr, err_arr, df_ori):
    df_gen = pd.DataFrame(sum_arr)
    df_gen = df_gen.rename(columns={
        0: 'index',
        1: 'summarized'
    })
    df_gen['index'] = df_gen['index'].astype(int)
    df_gen = df_gen[df_gen['index'] != -1]

    df_r_index = df_ori.reset_index()

    df_merge = df_r_index.merge(df_gen, left_on='index', right_on='index', how='left')
    df_merge = df_merge.set_index('index')
    
    # save error
    df_err = pd.DataFrame(err_arr)
    utils.dump_json(df_err,CONFIG_DATA['data_error_path'])
    
    return df_merge

In [11]:
arr = load_data_to_array(data)
sum_arr, err_arr = run_summarization(arr)

save_summarization(sum_arr, err_arr, data).head(5)

  0%|          | 0/99 [00:00<?, ?it/s]

100%|██████████| 99/99 [37:31<00:00, 22.74s/it]

Generated 99, Errors 0





Unnamed: 0_level_0,FullText,tokens,summarized
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Hasil jajak pendapat yang diselenggarakan Litb...,761,Partai Nasdem mendapatkan efek ekor jas dari p...
1,"JAKARTA, KOMPAS.com - Pemerintah menargetkan p...",1837,Pemerintah menargetkan pertumbuhan ekonomi di ...
10,Sejumlah nama disebut-sebut Presiden Joko Wido...,733,Presiden Joko Widodo sebagai kandidat potensia...
11,PDI-Perjuangan memberikan tanggapannya mengena...,642,Sekretaris Jenderal PDI-P Hasto Kristiyanto me...
12,Presiden Joko Widodo menyinggung sejumlah figu...,733,Jokowi menyinggung sejumlah figur yang menurut...


In [9]:
data = utils.load_json(CONFIG_DATA['data_summarized_path'])
data


Unnamed: 0,FullText,tokens,summarized
0,Hasil jajak pendapat yang diselenggarakan Litb...,761,Partai Nasdem mendapatkan efek ekor jas dari p...
1,"JAKARTA, KOMPAS.com - Pemerintah menargetkan p...",1837,Pemerintah menargetkan pertumbuhan ekonomi di ...
10,Sejumlah nama disebut-sebut Presiden Joko Wido...,733,Presiden Joko Widodo sebagai kandidat potensia...
11,PDI-Perjuangan memberikan tanggapannya mengena...,642,Sekretaris Jenderal PDI-P Hasto Kristiyanto me...
12,Presiden Joko Widodo menyinggung sejumlah figu...,733,Jokowi menyinggung sejumlah figur yang menurut...
...,...,...,...
94,"JAKARTA, KOMPAS.com - Otoritas Jasa Keuangan (...",1567,Otoritas Jasa Keuangan ( OJK ) melihat pemulih...
95,Wakil Ketua Umum Gerindra Fadli Zon mengungkap...,819,Wakil Ketua Umum Gerindra Fadli Zon mengungkap...
96,Ketua Umum Partai Gerindra Prabowo Subianto ke...,493,Prabowo Subianto kembali melempar pujian terha...
97,KOMPAS.com - Salah satu faktor penyebaran hoak...,953,Salah satu faktor penyebaran hoaks yang marak ...


# NER 

In [23]:
import math
import time
import gensim
import torch
from torch import nn
import TorchCRF
from torch.optim import Adam
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torchtext.data import Field, NestedField, BucketIterator
from torchtext.datasets import SequenceTaggingDataset
from torchtext.vocab import Vocab
from collections import Counter
from spacy.lang.id import Indonesian
from sklearn.metrics import f1_score, classification_report
from ner.lrfinder import LRFinder
import src.utils as utils

CONFIG_DATA = utils.config_load()

In [24]:
available_gpu = torch.cuda.is_available()
if available_gpu:
    print(f"GPU is available: {torch.cuda.get_device_name(0)}")
    use_device = torch.device("cuda")
else:
    use_device = torch.device("cpu")

In [39]:
class Corpus(object):

    def __init__(self, input_folder, min_word_freq, batch_size, wv_file=None):
        # list all the fields
        self.word_field = Field(lower=True)  # [sent len, batch_size]
        self.tag_field = Field(unk_token=None)  # [sent len, batch_size]
        # Character-level input
        self.char_nesting_field = Field(tokenize=list)
        self.char_field = NestedField(self.char_nesting_field)  # [batch_size, sent len, max len char]
        # create dataset using built-in parser from torchtext
        self.train_dataset, self.val_dataset, self.test_dataset = SequenceTaggingDataset.splits(
            path=input_folder,
            train="train.tsv",
            validation="val.tsv",
            test="test.tsv",
            fields=(
                (("word", "char"), (self.word_field, self.char_field)),
                ("tag", self.tag_field)
            )
        )
        # convert fields to vocabulary list
        if wv_file:
            self.wv_model = gensim.models.word2vec.Word2Vec.load(wv_file)
            self.embedding_dim = self.wv_model.vector_size
            word_freq = {word: self.wv_model.wv.vocab[word].count for word in self.wv_model.wv.vocab}
            word_counter = Counter(word_freq)
            self.word_field.vocab = Vocab(word_counter, min_freq=min_word_freq)
            vectors = []
            for word, idx in self.word_field.vocab.stoi.items():
                if word in self.wv_model.wv.vocab:
                    vectors.append(torch.as_tensor(self.wv_model.wv[word].tolist()))
                else:
                    vectors.append(torch.zeros(self.embedding_dim))
            self.word_field.vocab.set_vectors(
                stoi=self.word_field.vocab.stoi,
                vectors=vectors,
                dim=self.embedding_dim
            )
        else:
            self.word_field.build_vocab(self.train_dataset.word, min_freq=min_word_freq)
        # build vocab for tag and characters
        self.char_field.build_vocab(self.train_dataset.char)
        self.tag_field.build_vocab(self.train_dataset.tag)
        # create iterator for batch input
        self.train_iter, self.val_iter, self.test_iter = BucketIterator.splits(
            datasets=(self.train_dataset, self.val_dataset, self.test_dataset),
            batch_size=batch_size
        )
        # prepare padding index to be ignored during model training/evaluation
        self.word_pad_idx = self.word_field.vocab.stoi[self.word_field.pad_token]
        self.char_pad_idx = self.char_field.vocab.stoi[self.char_field.pad_token]
        self.tag_pad_idx = self.tag_field.vocab.stoi[self.tag_field.pad_token]

In [41]:
corpus = Corpus(
    input_folder=CONFIG_DATA['input_folder'],
    min_word_freq=3,
    batch_size=64,
    wv_file=CONFIG_DATA['wv_file_path']
)
print(f"Train set: {len(corpus.train_dataset)} sentences")
print(f"Val set: {len(corpus.val_dataset)} sentences")
print(f"Test set: {len(corpus.test_dataset)} sentences")

Train set: 3535 sentences
Val set: 470 sentences
Test set: 468 sentences


In [74]:
class Embeddings(nn.Module):

    def __init__(self,
                 word_input_dim,
                 word_emb_dim,
                 word_emb_pretrained,
                 word_emb_dropout,
                 word_emb_froze,
                 use_char_emb,
                 char_input_dim,
                 char_emb_dim,
                 char_emb_dropout,
                 char_cnn_filter_num,
                 char_cnn_kernel_size,
                 char_cnn_dropout,
                 word_pad_idx,
                 char_pad_idx,
                 device
                 ):
        super().__init__()
        self.device = device
        self.word_pad_idx = word_pad_idx
        self.char_pad_idx = char_pad_idx
        # Word Embedding
        # initialize embedding with pretrained weights if given
        if word_emb_pretrained is not None:
            self.word_emb = nn.Embedding.from_pretrained(
                embeddings=torch.as_tensor(word_emb_pretrained),
                padding_idx=self.word_pad_idx,
                freeze=word_emb_froze
            )
        else:
            self.word_emb = nn.Embedding(
                num_embeddings=word_input_dim,
                embedding_dim=word_emb_dim,
                padding_idx=self.word_pad_idx
            )
            self.word_emb.weight.data[self.word_pad_idx] = torch.zeros(word_emb_dim)
        self.word_emb_dropout = nn.Dropout(word_emb_dropout)
        self.output_dim = word_emb_dim
        # Char Embedding
        self.use_char_emb = use_char_emb
        if self.use_char_emb:
            self.char_emb_dim = char_emb_dim
            self.char_emb = nn.Embedding(
                num_embeddings=char_input_dim,
                embedding_dim=char_emb_dim,
                padding_idx=char_pad_idx
            )
            # initialize embedding for char padding as zero
            self.char_emb.weight.data[self.char_pad_idx] = torch.zeros(self.char_emb_dim)
            self.char_emb_dropout = nn.Dropout(char_emb_dropout)
            # Char CNN
            self.char_cnn = nn.Conv1d(
                in_channels=char_emb_dim,
                out_channels=char_emb_dim * char_cnn_filter_num,
                kernel_size=char_cnn_kernel_size,
                groups=char_emb_dim  # different 1d conv for each embedding dim
            )
            self.char_cnn_dropout = nn.Dropout(char_cnn_dropout)
            self.output_dim += char_emb_dim * char_cnn_filter_num

    def forward(self, words, chars):
        # words = [sentence length, batch size]
        # chars = [batch size, sentence length, word length)
        # tags = [sentence length, batch size]
        # embedding_out = [sentence length, batch size, embedding dim]
        embedding_out = self.word_emb_dropout(self.word_emb(words))
        if not self.use_char_emb: return embedding_out
        # character cnn layer forward
        # reference: https://github.com/achernodub/targer/blob/master/src/layers/layer_char_cnn.py
        # char_emb_out = [batch size, sentence length, word length, char emb dim]
        char_emb_out = self.char_emb_dropout(self.char_emb(chars))
        batch_size, sent_len, word_len, char_emb_dim = char_emb_out.shape
        char_cnn_max_out = torch.zeros(batch_size, sent_len, self.char_cnn.out_channels, device=self.device)
        for sent_i in range(sent_len):
            # sent_char_emb = [batch size, word length, char emb dim]
            sent_char_emb = char_emb_out[:, sent_i, :, :]
            # sent_char_emb_p = [batch size, char emb dim, word length]
            sent_char_emb_p = sent_char_emb.permute(0, 2, 1)
            # char_cnn_sent_out = [batch size, out channels * char emb dim, word length - kernel size + 1]
            char_cnn_sent_out = self.char_cnn(sent_char_emb_p)
            char_cnn_max_out[:, sent_i, :], _ = torch.max(char_cnn_sent_out, dim=2)
        char_cnn = self.char_cnn_dropout(char_cnn_max_out)
        # concat word and char embedding
        # char_cnn_p = [sentence length, batch size, char emb dim * num filter]
        char_cnn_p = char_cnn.permute(1, 0, 2)
        word_features = torch.cat((embedding_out, char_cnn_p), dim=2)
        return word_features


class LSTMAttn(nn.Module):

    def __init__(self,
                 input_dim,
                 lstm_hidden_dim,
                 lstm_layers,
                 lstm_dropout,
                 word_pad_idx,
                 attn_heads=None,
                 attn_dropout=None
                 ):
        super().__init__()
        self.word_pad_idx = word_pad_idx
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=lstm_hidden_dim,
            num_layers=lstm_layers,
            bidirectional=True,
            dropout=lstm_dropout if lstm_layers > 1 else 0
        )
        self.attn_heads = attn_heads
        if self.attn_heads:
            self.attn = nn.MultiheadAttention(
                embed_dim=lstm_hidden_dim * 2,
                num_heads=attn_heads,
                dropout=attn_dropout
            )

    def forward(self, words, word_features):
        lstm_out, _ = self.lstm(word_features)
        if not self.attn_heads: return lstm_out
        # create masking for paddings
        key_padding_mask = torch.as_tensor(words == self.word_pad_idx).permute(1, 0)
        attn_out, _ = self.attn(lstm_out, lstm_out, lstm_out, key_padding_mask=key_padding_mask)
        return attn_out


class CRF(nn.Module):

    def __init__(self,
                 input_dim,
                 fc_dropout,
                 word_pad_idx,
                 tag_names,
                 ):
        super().__init__()
        self.word_pad_idx = word_pad_idx
        # Fully-connected
        self.fc_dropout = nn.Dropout(fc_dropout)
        self.fc = nn.Linear(input_dim, len(tag_names))
        # CRF
        self.crf = TorchCRF.CRF(num_tags=len(tag_names))
        self.init_crf_transitions(tag_names)

    def forward(self, words, word_features, tags):
        # fc_out = [sentence length, batch size, output dim]
        fc_out = self.fc(self.fc_dropout(word_features))
        crf_mask = words != self.word_pad_idx
        crf_out = self.crf.decode(fc_out, mask=crf_mask)
        crf_loss = -self.crf(fc_out, tags=tags, mask=crf_mask) if tags is not None else None
        return crf_out, crf_loss

    def init_crf_transitions(self, tag_names, imp_value=-100):
        num_tags = len(tag_names)
        for i in range(num_tags):
            tag_name = tag_names[i]
            # I and L and <pad> impossible as a start
            if tag_name[0] in ("I", "L") or tag_name == "<pad>":
                torch.nn.init.constant_(self.crf.start_transitions[i], imp_value)
            # B and I impossible as an end
            if tag_name[0] in ("B", "I"):
                torch.nn.init.constant_(self.crf.end_transitions[i], imp_value)
        # init impossible transitions between positions
        tag_is = {}
        for tag_position in ("B", "I", "O", "U", "L"):
            tag_is[tag_position] = [i for i, tag in enumerate(tag_names) if tag[0] == tag_position]
        tag_is["P"] = [i for i, tag in enumerate(tag_names) if tag == "tag"]
        impossible_transitions_position = {
            "B": "BOUP",
            "I": "BOUP",
            "O": "IL",
            "U": "IL"
        }
        for from_tag, to_tag_list in impossible_transitions_position.items():
            to_tags = list(to_tag_list)
            for from_tag_i in tag_is[from_tag]:
                for to_tag in to_tags:
                    for to_tag_i in tag_is[to_tag]:
                        torch.nn.init.constant_(
                            self.crf.transitions[from_tag_i, to_tag_i], imp_value
                        )
        # init impossible B and I transitions to different entity types
        impossible_transitions_tags = {
            "B": "IL",
            "I": "IL"
        }
        for from_tag, to_tag_list in impossible_transitions_tags.items():
            to_tags = list(to_tag_list)
            for from_tag_i in tag_is[from_tag]:
                for to_tag in to_tags:
                    for to_tag_i in tag_is[to_tag]:
                        if tag_names[from_tag_i].split("-")[1] != tag_names[to_tag_i].split("-")[1]:
                            torch.nn.init.constant_(
                                self.crf.transitions[from_tag_i, to_tag_i], imp_value
                            )


class PositionalEncoding(nn.Module):

    def __init__(self, d_model, dropout=0.1, max_len=500):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

class NERModel(nn.Module):

    def __init__(self,
                 word_input_dim,
                 word_pad_idx,
                 char_pad_idx,
                 tag_names,
                 device,
                 model_arch="bilstm",
                 word_emb_dim=300,
                 word_emb_pretrained=None,
                 word_emb_dropout=0.5,
                 word_emb_froze=False,
                 use_char_emb=False,
                 char_input_dim=None,
                 char_emb_dim=None,
                 char_emb_dropout=None,
                 char_cnn_filter_num=None,
                 char_cnn_kernel_size=None,
                 char_cnn_dropout=None,
                 lstm_hidden_dim=64,
                 lstm_layers=2,
                 lstm_dropout=0.1,
                 attn_heads=None,
                 attn_dropout=None,
                 trf_layers=None,
                 fc_hidden=None,
                 fc_dropout=0.25
                 ):
        super().__init__()
        # Embeddings
        self.embeddings = Embeddings(
            word_input_dim=word_input_dim,
            word_emb_dim=word_emb_dim,
            word_emb_pretrained=word_emb_pretrained,
            word_emb_dropout=word_emb_dropout,
            word_emb_froze=word_emb_froze,
            use_char_emb=use_char_emb,
            char_input_dim=char_input_dim,
            char_emb_dim=char_emb_dim,
            char_emb_dropout=char_emb_dropout,
            char_cnn_filter_num=char_cnn_filter_num,
            char_cnn_kernel_size=char_cnn_kernel_size,
            char_cnn_dropout=char_cnn_dropout,
            word_pad_idx=word_pad_idx,
            char_pad_idx=char_pad_idx,
            device=device
        )
        if model_arch.lower() == "bilstm":
            # LSTM-Attention
            self.encoder = LSTMAttn(
                 input_dim=self.embeddings.output_dim,
                 lstm_hidden_dim=lstm_hidden_dim,
                 lstm_layers=lstm_layers,
                 lstm_dropout=lstm_dropout,
                 word_pad_idx=word_pad_idx,
                 attn_heads=attn_heads,
                 attn_dropout=attn_dropout
            )
            encoder_output_dim = lstm_hidden_dim * 2
        else:
            raise ValueError("param `model_arch` must be either 'bilstm' or 'transformer'")
        # CRF
        self.crf = CRF(
            input_dim=encoder_output_dim,
            fc_dropout=fc_dropout,
            word_pad_idx=word_pad_idx,
            tag_names=tag_names
        )

    def forward(self, words, chars, tags=None):
        word_features = self.embeddings(words, chars)
        # lstm_out = [sentence length, batch size, hidden dim * 2]
        encoder_out = self.encoder(words, word_features)
        # fc_out = [sentence length, batch size, output dim]
        crf_out, crf_loss = self.crf(words, encoder_out, tags)
        return crf_out, crf_loss

    def save_state(self, path):
        torch.save(self.state_dict(), path)

    def load_state(self, path):
        self.load_state_dict(torch.load(path))

    def count_parameters(self):
        return sum(p.numel() for p in self.parameters() if p.requires_grad)

In [75]:
class Trainer(object):

    def __init__(self, model, data, optimizer, device, checkpoint_path=None):
        self.device = device
        self.model = model.to(self.device)
        self.data = data
        self.optimizer = optimizer
        self.checkpoint_path = checkpoint_path

    @staticmethod
    def epoch_time(start_time, end_time):
        elapsed_time = end_time - start_time
        elapsed_mins = int(elapsed_time / 60)
        elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
        return elapsed_mins, elapsed_secs

    def f1_positive(self, preds, y, full_report=False):
        index_o = self.data.tag_field.vocab.stoi["O"]
        # take all labels except padding and "O"
        positive_labels = [i for i in range(len(self.data.tag_field.vocab.itos))
                           if i not in (self.data.tag_pad_idx, index_o)]
        # make the prediction one dimensional to follow sklearn f1 score input param
        flatten_preds = [pred for sent_pred in preds for pred in sent_pred]
        # remove prediction for padding and "O"
        positive_preds = [pred for pred in flatten_preds
                          if pred not in (self.data.tag_pad_idx, index_o)]
        # make the true tags one dimensional to follow sklearn f1 score input param
        flatten_y = [tag for sent_tag in y for tag in sent_tag]
        if full_report:
            # take all names except padding and "O"
            positive_names = [self.data.tag_field.vocab.itos[i]
                              for i in range(len(self.data.tag_field.vocab.itos))
                              if i not in (self.data.tag_pad_idx, index_o)]
            print(classification_report(
                y_true=flatten_y,
                y_pred=flatten_preds,
                labels=positive_labels,
                target_names=positive_names
            ))
        # average "micro" means we take weighted average of the class f1 score
        # weighted based on the number of support
        return f1_score(
            y_true=flatten_y,
            y_pred=flatten_preds,
            labels=positive_labels,
            average="micro"
        ) if len(positive_preds) > 0 else 0

    def epoch(self):
        epoch_loss = 0
        true_tags_epoch = []
        pred_tags_epoch = []
        self.model.train()
        for batch in self.data.train_iter:
            # words = [sent len, batch size]
            words = batch.word.to(self.device)
            # chars = [batch size, sent len, char len]
            chars = batch.char.to(self.device)
            # tags = [sent len, batch size]
            true_tags = batch.tag.to(self.device)
            self.optimizer.zero_grad()
            pred_tags_list, batch_loss = self.model(words, chars, true_tags)
            pred_tags_epoch += pred_tags_list
            # to calculate the loss and f1, we flatten true tags
            true_tags_epoch += [
                [tag for tag in sent_tag if tag != self.data.tag_pad_idx]
                for sent_tag in true_tags.permute(1, 0).tolist()
            ]
            batch_loss.backward()
            self.optimizer.step()
            epoch_loss += batch_loss.item()
        epoch_score = self.f1_positive(pred_tags_epoch, true_tags_epoch)
        return epoch_loss / len(self.data.train_iter), epoch_score

    def evaluate(self, iterator, full_report=False):
        epoch_loss = 0
        true_tags_epoch = []
        pred_tags_epoch = []
        self.model.eval()
        with torch.no_grad():
            # similar to epoch() but model is in evaluation mode and no backprop
            for batch in iterator:
                words = batch.word.to(self.device)
                chars = batch.char.to(self.device)
                true_tags = batch.tag.to(self.device)
                pred_tags, batch_loss = self.model(words, chars, true_tags)
                pred_tags_epoch += pred_tags
                true_tags_epoch += [
                    [tag for tag in sent_tag if tag != self.data.tag_pad_idx]
                    for sent_tag in true_tags.permute(1, 0).tolist()
                ]
                epoch_loss += batch_loss.item()
        epoch_score = self.f1_positive(pred_tags_epoch, true_tags_epoch, full_report)
        return epoch_loss / len(iterator), epoch_score

    ### BEGIN MODIFIED SECTION: LEARNING RATE ###
    def train(self, max_epochs, no_improvement=None):
        history = {
            "num_params": self.model.count_parameters(),
            "train_loss": [],
            "train_f1": [],
            "val_loss": [],
            "val_f1": [],
        }
        elapsed_train_time = 0
        best_val_f1 = 0
        best_epoch = None
        # scheduler object from pytorch
        # reduce learning rate by a factor of 0.3 if there is no performance
        # improvement after 3 epochs
        lr_scheduler = ReduceLROnPlateau(
            optimizer=self.optimizer,
            patience=3,
            factor=0.3,
            mode="max",
            verbose=True
        )
        epoch = 1
        n_stagnant = 0  # preparation for early stopping
        stop = False
        while not stop:
            start_time = time.time()
            train_loss, train_f1 = self.epoch()
            end_time = time.time()
            elapsed_train_time += end_time - start_time
            history["train_loss"].append(train_loss)
            history["train_f1"].append(train_f1)
            val_loss, val_f1 = self.evaluate(self.data.val_iter)
            lr_scheduler.step(val_f1)  # inform the scheduler
            # take the current model if it is at least 1% better than the previous best F1
            if self.checkpoint_path and val_f1 > (1.01 * best_val_f1):
                print(f"Epoch {epoch:5d}: found better Val F1: {val_f1:.4f} (Train F1: {train_f1:.4f}), saving model...")
                self.model.save_state(self.checkpoint_path)
                best_val_f1 = val_f1
                best_epoch = epoch
                n_stagnant = 0
            else:
                n_stagnant += 1
            history["val_loss"].append(val_loss)
            history["val_f1"].append(val_f1)
            if epoch >= max_epochs:
                print(f"Reach maximum number of epoch: {epoch}, stop training.")
                stop = True
            elif no_improvement is not None and n_stagnant >= no_improvement:
                print(f"No improvement after {n_stagnant} epochs, stop training.")
                stop = True
            else:
                epoch += 1
        if self.checkpoint_path and best_val_f1 > 0:
            self.model.load_state(self.checkpoint_path)
        test_loss, test_f1 = self.evaluate(self.data.test_iter)
        history["best_val_f1"] = best_val_f1
        history["best_epoch"] = best_epoch
        history["test_loss"] = test_loss
        history["test_f1"] = test_f1
        history["elapsed_train_time"] = elapsed_train_time
        return history
    ### END MODIFIED SECTION ###

    def infer(self, sentence, true_tags=None):
        self.model.eval()
        # tokenize sentence
        nlp = Indonesian()
        tokens = [token.text for token in nlp(sentence)]
        max_word_len = max([len(token) for token in tokens])
        # transform to indices based on corpus vocab
        numericalized_tokens = [self.data.word_field.vocab.stoi[token.lower()] for token in tokens]
        numericalized_chars = []
        char_pad_id = self.data.char_pad_idx
        for token in tokens:
            numericalized_chars.append(
                [self.data.char_field.vocab.stoi[char] for char in token]
                + [char_pad_id for _ in range(max_word_len - len(token))]
            )
        # find unknown words
        unk_idx = self.data.word_field.vocab.stoi[self.data.word_field.unk_token]
        unks = [t for t, n in zip(tokens, numericalized_tokens) if n == unk_idx]
        # begin prediction
        token_tensor = torch.as_tensor(numericalized_tokens)
        token_tensor = token_tensor.unsqueeze(-1).to(self.device)
        char_tensor = torch.as_tensor(numericalized_chars)
        char_tensor = char_tensor.unsqueeze(0).to(self.device)
        predictions, _ = self.model(token_tensor, char_tensor)
        # convert results to tags
        predicted_tags = [self.data.tag_field.vocab.itos[t] for t in predictions[0]]
        # print inferred tags
        max_len_token = max([len(token) for token in tokens] + [len('word')])
        max_len_tag = max([len(tag) for tag in predicted_tags] + [len('pred')])
        print(
            f"{'word'.ljust(max_len_token)}\t{'unk'.ljust(max_len_token)}\t{'pred tag'.ljust(max_len_tag)}"
            + ("\ttrue tag" if true_tags else "")
        )
        for i, token in enumerate(tokens):
            is_unk = "✓" if token in unks else ""
            print(
                f"{token.ljust(max_len_token)}\t{is_unk.ljust(max_len_token)}\t{predicted_tags[i].ljust(max_len_tag)}"
                + (f"\t{true_tags[i]}" if true_tags else "")
            )
        return tokens, predicted_tags, unks

In [76]:
# configurations building block
base = {
    "word_input_dim": len(corpus.word_field.vocab),
    "char_pad_idx": corpus.char_pad_idx,
    "word_pad_idx": corpus.word_pad_idx,
    "tag_names": corpus.tag_field.vocab.itos,
    "device": use_device
}
w2v = {
    "word_emb_pretrained": corpus.word_field.vocab.vectors if corpus.wv_model else None
}
cnn = {
    "use_char_emb": True,
    "char_input_dim": len(corpus.char_field.vocab),
    "char_emb_dim": 37,
    "char_emb_dropout": 0.25,
    "char_cnn_filter_num": 4,
    "char_cnn_kernel_size": 3,
    "char_cnn_dropout": 0.25
}
configs = {
    "bilstm+w2v+cnn": {**base, **w2v, **cnn}
}
search_space = {
    "bilstm+w2v+cnn": (1e-5, 2)
}
for model_name in configs:
    print(f"Begin LR Finder for model: {model_name}")
    model = NERModel(**configs[model_name])
    start_lr, end_lr = search_space[model_name]
    lr_finder = LRFinder(model, Adam(model.parameters(), lr=start_lr, weight_decay=1e-2), device=use_device)
    lr_finder.range_test(corpus.train_iter, corpus.val_iter, end_lr=end_lr, num_iter=55, step_mode="exp", diverge_th=3, disable_progress_bar=True)

Begin LR Finder for model: bilstm+w2v+cnn
Learning rate search finished. See the graph with {finder_name}.plot()


In [78]:
# New initial learning rate
lrs = {
    "bilstm": 1e-2,
    "bilstm+w2v": 1e-2,
    "bilstm+w2v+cnn": 3e-3,
    "bilstm+w2v+cnn+attn": 3e-3,
    "transformer+w2v+cnn": 4e-4
}
max_epochs = 50
no_improvement = 10
histories = {}
for model_name in configs:
    print(f"Start Training: {model_name}")
    model = NERModel(**configs[model_name])
    trainer = Trainer(
        model=model,
        data=corpus,
        optimizer=Adam(model.parameters(), lr=lrs[model_name], weight_decay=1e-2),  # add weight decay for Adam
        device=use_device,
        checkpoint_path=CONFIG_DATA['best_model_path']
    )
    histories[model_name] = trainer.train(max_epochs, no_improvement)
    print(f"Done Training: {model_name}")
    print()

Start Training: bilstm+w2v+cnn
Epoch     2: found better Val F1: 0.3155 (Train F1: 0.1523), saving model...
Epoch     3: found better Val F1: 0.5876 (Train F1: 0.4533), saving model...
Epoch     4: found better Val F1: 0.6141 (Train F1: 0.5906), saving model...
Epoch     5: found better Val F1: 0.6211 (Train F1: 0.6691), saving model...
Epoch     6: found better Val F1: 0.6763 (Train F1: 0.7041), saving model...
Epoch     7: found better Val F1: 0.6835 (Train F1: 0.7307), saving model...
Epoch    11: found better Val F1: 0.6948 (Train F1: 0.8030), saving model...
Epoch    12: found better Val F1: 0.7105 (Train F1: 0.8176), saving model...
Epoch 00016: reducing learning rate of group 0 to 9.0000e-04.
Epoch 00020: reducing learning rate of group 0 to 2.7000e-04.
No improvement after 10 epochs, stop training.
Done Training: bilstm+w2v+cnn



In [81]:
for model_name in configs:
    print(f"Sample inferences for model: {model_name}")
    trainer.model = NERModel(**configs[model_name]).to(use_device)
    trainer.model.load_state(CONFIG_DATA['best_model_path'])
 
    # https://regional.kompas.com/read/2020/07/15/16583081/banjir-bandang-di-masamba-19-korban-meninggal-23-hilang-15000-mengungsi
    sentence = "Sementara itu, Kepala Pelaksana BPBD Luwu Utara Muslim Muchtar mengatakan, terdapat 15.000 jiwa mengungsi akibat banjir bandang."
    tags = ["O", "O", "O", "O", "O", "B-ORGANIZATION", "I-ORGANIZATION", "L-ORGANIZATION", "B-PERSON", "L-PERSON", "O", "O", "O", "U-QUANTITY", "O", "O", "O", "O", "O", "O"]
    words, infer_tags, unknown_tokens = trainer.infer(sentence=sentence, true_tags=tags)
    print()

Sample inferences for model: bilstm+w2v+cnn
word      	unk       	pred tag  	true tag
Sementara 	          	O         	O
itu       	          	O         	O
,         	✓         	O         	O
Kepala    	          	O         	O
Pelaksana 	          	O         	O
BPBD      	✓         	O         	B-ORGANIZATION
Luwu      	          	O         	I-ORGANIZATION
Utara     	          	B-LOCATION	L-ORGANIZATION
Muslim    	          	I-LOCATION	B-PERSON
Muchtar   	          	L-LOCATION	L-PERSON
mengatakan	          	O         	O
,         	✓         	O         	O
terdapat  	          	O         	O
15.000    	✓         	B-QUANTITY	U-QUANTITY
jiwa      	          	L-QUANTITY	O
mengungsi 	          	O         	O
akibat    	          	O         	O
banjir    	          	O         	O
bandang   	          	O         	O
.         	          	O         	O



# NER 2


In [2]:
import pandas as pd
import spacy
from IPython.display import HTML
import src.utils as utils

CONFIG_DATA=utils.config_load()

# Load the NER model
nlp = spacy.load('xx_ent_wiki_sm')

# Load your NER dataset (adjust the file path as needed)
ner_df = pd.read_csv(CONFIG_DATA['ner_data_path'], sep="\t")

# Create a dictionary mapping words to their corresponding tags
word_tag_dict = dict(zip(ner_df['word'], ner_df['tag']))

# Function to implement NER on the testing dataset
def implement_ner(text):
    doc = nlp(text)
    entities = [(token.text, word_tag_dict.get(token.text, "O")) for token in doc]
    return entities

# Load your testing dataset 
test_df = utils.load_json(CONFIG_DATA['data_summarized_path'])

# Apply the NER function to create a new column with NER results
test_df['ner_results'] = test_df['summarized'].apply(implement_ner)

# Dictionary to map entity labels to fixed colors
entity_colors = {
    'O': None,  # No color for 'O' entities
    'ORGANIZATION': '#FFD700',  # Gold color for 'ORGANIZATION' entities
    'PERSON': '#00FF00',        # Green color for 'PERSON' entities
    'ORGANIZATION': '#FFD700',  # Gold color for 'ORGANIZATION' entities
    'TIME': '#FFA500',          # Orange color for 'TIME' entities
    'LOCATION': '#6495ED',      # Cornflower Blue color for 'LOCATION' entities
    'EVENT': '#9932CC',      # Dark Orchid color for 'EVENT' entities
}

# Function to visualize NER results as HTML with fixed colors for each entity label
def visualize_ner(text, entities):
    for ent_text, ent_label in entities:
        if isinstance(ent_label, str):
            ent_color = entity_colors.get(ent_label, None)
            if ent_color is not None:
                text = text.replace(ent_text, f'<span style="background-color: {ent_color};">{ent_text}</span>')
    return f'<p>{text}</p>'

# Display the DataFrame with NER results and visualization
pd.set_option('display.max_colwidth', None)
test_df['ner_visualization'] = test_df.apply(lambda row: visualize_ner(row['summarized'], row['ner_results']), axis=1)
HTML(test_df[['ner_visualization']].to_html(escape=False, header=False, index=False))

0
"Partai Nasdem mendapatkan efek ekor jas dari pencalonan Anies Baswedan sebagai bakal calon presiden. Pencapaian Nasdem itu lantas mengakibatkan turunnya elektabilitas parpol lain yang menjadi basis pemilih Anies, termasuk Partai Demokrat dan Partai Keadilan Sejahtera (PKS ). Sebagai informasi, Nasdem, Demokrat, dan PKS adalah partai yang menjatuhkan dukungan kepada Anies sebagai calon Presiden pada Pilpres 2024 mendatang. Mereka tengah menjajaki kerja sama politik yang diberi nama Koalisi Perubahan. Klik di sini untuk maklumat lanjut. Sementara itu, Politico melaporkan bahwa"
"Pemerintah menargetkan pertumbuhan ekonomi di 2024 bisa di kisaran 5,3 persen-5,7 persen. Namun, ekonom menilai target tersebut cukup menantang, bahkan sulit untuk bisa mencapai batas atas di tengah kondisi tahun politik. Seperti diketahui, pemilihan presiden (pilpres) akan berlangsung pada UTHM, yang sekaligus menjadi tahun terakhir pemerintahan Presiden Joko Widodo. Klik untuk maklumat lanjut mengenai pilihan raya 2016. atau klik di sini untuk melihat siapa terpilih pemimpin baru. dan kabinet."
"Presiden Joko Widodo sebagai kandidat potensial calon presiden dan calon wakil presiden Pemilu 2024. Nama-Nama tersebut meliputi beberapa menteri dan ketua umum partai politik, yang hadir dalam acara Hari Lahir Partai Persatuan pembangunan ke-50 pada Jumat (17/2/2023 ). Presiden lalu menyebutkan sosok yang berulang kali disinggung Jokowi, seperti Ketum Partai Gerindra Prabowo Subianto dan Ketucuan Partai Demokrat Agus Harimurti Yudhoyono. Klik untuk maklumat lanjut. atau lihat bagaimana perasaan Presiden Jokowi."
"Sekretaris Jenderal PDI-P Hasto Kristiyanto menganggap bahwa Partai Ummat tidak memahami aspek fundamental pembentukan bangsa Indonesia. Ia menyebut bahwa partai politik seharusnya memahami sejarah kemerdekaan dan memahami ideologi Pancasila. Hastos pun meyakini bahwa politik identitas yang diusung oleh Partai UGMmat tak akan mendapatkan tempat. Simak informasinya dalam video berikut. (Sementara itu, Pemilu 2019 di Florida mulai memasuki hari kedelapannya tahun ini. Klik untuk maklumat lanjut. atau baca mengenai pemenang pemilihan semula Obama pada Selasa."
"Jokowi menyinggung sejumlah figur yang menurutnya bakal menjadi calon presiden di Pilpres 2024. Momen itu terjadi dalam perayaan Hari Lahir ke-50 Partai Persatuan Pembangunan (PPP) di ICE BSD, Tangerang, Jumat [17/2/2023]. Jokowi turut menyebut nama Ketua Umum Partai Demokrat Agus Harimurti Yudhoyono sebagai kandidat capres. Atas hal ini, Direktur Lembaga Kajian Politik Nusakom Pratama, Ari Junaedi menilai, Jokowi tengah menyampaikan politik damai dengan ikut menyebut AHY sebagai calon calon kepresidenan."
"Direktur Lembaga Kajian Politik Nusakom Pratama Ari Junaedi menilai, Presiden Joko Widodo tengah menyampaikan politik damai dengan turut menyebut Ketua Umum Partai Demokrat Agus Harimurti Yudhoyono (AHY) sebagai kandidat calon presiden. Padahal, Demokrat merupakan partai politik -parpol oposisi pemerintahan Jokowi sejak 2014. Jokowi kerap menunjukan politik itu berwajah humanis, bukan kekuasaan semata, ujar Ari pada Kompas.com, Senin ""20/2/2023"". Dia menambahkan bahwa rivalitas tidak selamanya harus terus dikorbankan."
"Jajak pendapat Litbang Kompas Februari 2023 menunjukkan kepuasan publik pada pemerintah yang tertinggi terletak pada bidang politik dan keamanan. Dikutip dari Harian Kompas, Senin (20/2/2023), angka kepuasan umum pada pemerintahan Presiden Joko Widodo, dan Wakil Presiden Maruf Amin di bidang Politik dan Keamanan mencapai 79,2 persen. Sementara itu, sebanyak 20,8 persen responden yang menyatakan belum puas. Kemudian, sektor kedua yang memiliki tingkat kepuasan tertinggi adalah kesejahteraan sosial. Klik untuk maklumat lanjut. ["
"Ketua Badan Pengawas Pemilu (Bawaslu ) Rahmat Bagja berharap agar peserta pemilu 2024 tidak menggunakan politik identitas, seperti pada Pemilu 2019 lalu. Ia menilai bahwa politik identiti merupakan permasalahan yang besar. Apalagi, jika melakukan kampanye atau sosialisasi di tempat ibadah, hal itu akan menimbulkan politisasi berbasis SARA yang dapat menyebabkan perpecahan antar masyarakat untuk ke depannya. Untuk itu, anda boleh berspekulasi dengan cara ini. Anda juga dapat mendengarkan orang-orang dari berbagai sudut. dan gunakan kekerasan."
"Nama presenter Uya Kuya dan pesinetron Verrel Bramasta yang resmi jadi kader PAN. Ketua Umum PAN Zulkifli Hasan mengatakan, artis-artis tersebut merupakan rombongan Eko Patrio, yang merupakan kaderPAN. Zulhas menuturkan telah kalah pamor dengan pelatih PAN yang berasal dari kalangan artis. Dia mencontohkan pengikut akun Instagramnya yang kalah dengan Verrell. Simak selengkapnya dalam video berikut. Klik untuk maklumat lanjut mengenai skandal Vp PAN ini atau klik di sini untuk mengetahui lebih lanjut."
"Wakil Ketua Dewan Pembina PSI, Grace Natalie mengkritik soal kampanye politik identitas yang dimainkan partai politik pada pemilu yang akan datang. Ia menyayangkan adanya praktik politik identifikasi yang membuat masyarakat terbuai sehingga melupakan aspek penting untuk membangun bangsa. Menurut Grace, saat ini aspek solidaritas mulai menurun dalam kontestasi politik. Menurutnya, ketika ini partai politiknya lebih menonjolkan perbedaan. Simak selengkapnya dalam video tersebut.. [2] Jakarta / Makassar. Ini adalah hari yang sangat penting bagi Indonesia."


# 5W+1H

In [1]:
import src.utils as utils
CONFIG_DATA = utils.config_load()

import torch
from transformers import BertTokenizer, BertModel
import pandas as pd
import re

# Function to extract 5W+1H points from the given text
def extract_5w1h(text):
    sentences = [re.split(r'[.!?]', text) for text in text_summaries]
    model_name = 'bert-base-uncased'
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertModel.from_pretrained(model_name)

    with torch.no_grad():
        inputs = tokenizer(sentences, return_tensors="pt", padding=True, truncation=True, max_length=512)
        outputs = model(**inputs)
        embeddings = torch.mean(outputs.last_hidden_state, dim=1)

    # Define question keywords
    questions = {
        "Who": ["person", "people", "man", "woman", "he", "she"],
        "What": ["what", "thing"],
        "When": ["when", "time", "date", "year"],
        "Where": ["where", "place", "location"],
        "Why": ["why", "reason", "cause"],
        "How": ["how", "way", "method"],
    }

    results = []
    for emb, text_summary in zip(embeddings, text_summaries):
        result = {}
        for key, keywords in questions.items():
            relevant_sentences = [sentence for sentence in sentences if any(kw in emb.lower() for kw in keywords)]
            result[key] = ". ".join(relevant_sentences)
        results.append(result)

    return results

def main():
    # Assuming you have a CSV file named "dataset.csv" with a column "text summary"
    # Adjust the file path and column name accordingly
    df = CONFIG_DATA['data_summarized_path']

    text_summaries = df["summarized"].tolist()
    points_list = extract_5w1h(text_summaries)

    for text_summary, points in zip(text_summaries, points_list):
        print("Text Summary:")
        print(text_summary)
        print("5W+1H Points:")
        for key, value in points.items():
            print(f"{key}: {value}")
        print("=" * 30)



In [2]:
main()

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/Users/ralali/Library/Python/3.8/lib/python/site-packages/IPython/core/interactiveshell.py", line 3442, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/_x/973zq6t143b441n_5wg3xvrw0000gn/T/ipykernel_7194/451043146.py", line 1, in <module>
    main()
  File "/var/folders/_x/973zq6t143b441n_5wg3xvrw0000gn/T/ipykernel_7194/1162447966.py", line 46, in main
    text_summaries = df["summarized"].tolist()
TypeError: string indices must be integers

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/ralali/Library/Python/3.8/lib/python/site-packages/IPython/core/interactiveshell.py", line 2057, in showtraceback
    stb = self.InteractiveTB.structured_traceback(
  File "/Users/ralali/Library/Python/3.8/lib/python/site-packages/IPython/core/ultratb.py", line 1118, in structured_traceback
    return FormattedTB.structured_traceback(
  File "/Users/ra