In [None]:
import pandas as pd
import torch
import numpy as np
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
import torch
import datetime
import numpy as np
import random, time, os
from torch.nn import Embedding
from transformers import AutoTokenizer, AutoModelForTokenClassification
from torch.nn import CrossEntropyLoss
from torch.optim import AdamW
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from tabulate import tabulate
from transformers import get_linear_schedule_with_warmup, pipeline
from keras.utils import pad_sequences
from collections import defaultdict
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import itertools
from tqdm import tqdm
from rouge import Rouge
import ast
import math

In [47]:
# Turn off pandas' chained-assignment warning for the rest of the session.
pd.set_option("mode.chained_assignment", None)

# Per-sentence SRL annotations (relative path, resolved against the working directory).
data = pd.read_csv("indosum_srl_test.csv")

# IndoSum splits from the Hugging Face Hub; trust_remote_code allows the
# dataset repository's own loading script to run.
indosum_data = load_dataset("maryantocinn/indosum", trust_remote_code=True)

In [None]:
# Detach ',', '.' and '?' from the preceding word by inserting a space in
# front of each one, then lowercase the reference summary.
sentence_summary = []
for text in indosum_data["validation"]:
    new_text = text["summary"]
    for mark in (",", ".", "?"):
        new_text = new_text.replace(mark, " " + mark)
    new_text = new_text.lower()
    sentence_summary.append(new_text)

In [None]:
# Preferred accelerator index. GPU 7 only exists on hosts with at least
# eight CUDA devices, so fall back to GPU 0, or to the CPU when CUDA is not
# available, instead of failing in `model.to(device)`.
PREFERRED_GPU_INDEX = 7
if torch.cuda.is_available():
    gpu_index = PREFERRED_GPU_INDEX if torch.cuda.device_count() > PREFERRED_GPU_INDEX else 0
    device = f"cuda:{gpu_index}"
else:
    device = "cpu"

# IndoBERT tokenizer and encoder; the encoder is only used for inference
# (see get_word_embeddings, which runs it under torch.no_grad()).
tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-base-p1')
model = BertModel.from_pretrained('indobenchmark/indobert-base-p1')
model.to(device)

def get_word_embeddings(tokens):
    """Encode every entry of ``tokens`` with the module-level BERT model.

    ``tokens`` is passed to the tokenizer as a list, so each entry is encoded
    as its own sequence, padded/truncated to 512 positions.

    Args:
        tokens: list of word strings (the non-'O' tokens of one sentence).

    Returns:
        numpy array whose first axis has one item per entry of ``tokens``;
        each item is that entry's full ``last_hidden_state`` (all 512 padded
        positions). An empty ``tokens`` yields an array with a zero-length
        first axis.

    The leading axis is always kept. The previous ``squeeze(0)`` removed it
    only when a single token was passed, so one-token sentences produced
    per-item vectors of a different size than every other sentence.
    """
    max_length = 512

    if len(tokens) == 0:
        # Nothing to encode; return an empty batch instead of calling the model.
        # NOTE(review): dtype assumes the model runs in float32 -- confirm.
        return np.zeros((0, max_length, model.config.hidden_size), dtype=np.float32)

    inputs = tokenizer(tokens, return_tensors="pt", padding="max_length", truncation=True, max_length=max_length)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)

    # NOTE(review): every item carries all 512 padded positions, which is
    # memory-heavy and lets padding positions take part in the similarity.
    # Pooling over real tokens may be what was intended -- confirm before changing.
    return outputs.last_hidden_state.cpu().numpy()

def filter_labels_if_not_in_common(embed1, embed2, labels1, labels2):
    """Keep only the entries whose label occurs in both label sequences.

    Args:
        embed1, embed2: sequences of embeddings, paired positionally with
            ``labels1`` / ``labels2``.
        labels1, labels2: sequences of hashable labels.

    Returns:
        Tuple ``(embed1, embed2, label1, label2, n_labels)`` where the first
        four items are lists restricted to the shared labels and ``n_labels``
        is the number of distinct labels across both inputs.
    """
    shared = set(labels1) & set(labels2)
    label_universe = set(labels1) | set(labels2)

    def _restrict(embeds, labels):
        # Embeddings are paired with labels by position; labels are filtered on their own.
        kept_embeds = [vec for vec, tag in zip(embeds, labels) if tag in shared]
        kept_labels = list(filter(shared.__contains__, labels))
        return kept_embeds, kept_labels

    kept_embed1, kept_label1 = _restrict(embed1, labels1)
    kept_embed2, kept_label2 = _restrict(embed2, labels2)

    return kept_embed1, kept_embed2, kept_label1, kept_label2, len(label_universe)

def filter_words_labels(srl_tag):
    """Drop 'O' tags and strip the first two characters of every remaining tag.

    Args:
        srl_tag: iterable of tag strings (e.g. 'B-ARG0', 'O').

    Returns:
        List of the non-'O' tags with their two-character prefix removed.
    """
    return [label[2:] for label in srl_tag if label != 'O']

def filter_words_token(row):
    """Return the whitespace-split words of a row whose SRL tag is not 'O'.

    Args:
        row: mapping with a "sentence" string and an "srl" tag sequence;
            words and tags are paired by position.

    Returns:
        List of the words whose paired tag differs from 'O'.
    """
    words = row["sentence"].split()
    kept_words = []
    for word, tag in zip(words, row["srl"]):
        if tag != 'O':
            kept_words.append(word)
    return kept_words

def sentence_similarity(embeddings1, embeddings2, labels1, labels2, count):
    """Label-wise similarity between two sentences.

    For every label present in both sentences, the highest cosine similarity
    over all same-label embedding pairs is kept; those maxima are summed and
    divided by ``count``.

    Args:
        embeddings1, embeddings2: sequences of numpy arrays, paired
            positionally with ``labels1`` / ``labels2``.
        labels1, labels2: label sequences.
        count: normalising constant (the caller passes the number of distinct
            labels across both sentences).

    Returns:
        The normalised similarity, or ``0.0`` when ``count`` is zero (neither
        sentence has any label), which previously raised ZeroDivisionError.
    """
    if not count:
        return 0.0

    max_similarities = {}

    for emb1, label1 in zip(embeddings1, labels1):
        for emb2, label2 in zip(embeddings2, labels2):
            if label1 != label2:
                continue

            # Each embedding is flattened to a single row vector before comparison.
            similarity = cosine_similarity(emb1.reshape(1, -1), emb2.reshape(1, -1))[0][0]

            if label1 not in max_similarities or similarity > max_similarities[label1]:
                max_similarities[label1] = similarity

    # Sum up the maximum similarities for each label
    total_max_similarity = sum(max_similarities.values())

    return total_max_similarity / count

def count_o_labels(srl_tags):
    """Return how many entries of ``srl_tags`` are exactly 'O'."""
    return sum(1 for tag in srl_tags if tag == 'O')

def reduce_same_sentence(data):
    """Keep one row per distinct sentence: the one with the fewest 'O' tags.

    Adds an 'o_label_count' column to ``data`` (the input frame is modified),
    then, among rows sharing the same 'sentence', keeps the first row after
    sorting by that count in ascending order.

    Args:
        data: DataFrame with 'sentence' and 'srl' columns; 'srl' holds tag sequences.

    Returns:
        The de-duplicated rows, ordered by their original index.
    """
    data['o_label_count'] = data['srl'].apply(count_o_labels)
    deduplicated = (
        data
        .sort_values(by='o_label_count', ascending=True)
        .drop_duplicates(subset='sentence', keep='first')
    )
    return deduplicated.sort_index()

def calculate_sentence_scores(df):
    """Score each sentence by its summed similarity to every other sentence.

    Args:
        df: DataFrame with an 'embeddings' column and a label column. The
            'labels' column is used when present, because it is filtered the
            same way as the tokens that were embedded; otherwise the raw
            'srl' column is used, as before.

    Returns:
        List with one score per row of ``df``, in row order.

    Fixes over the previous version:
      * the current row's embeddings/labels are no longer overwritten by the
        filtered result of each comparison, so later comparisons no longer
        run on progressively shrunken data;
      * labels are read from the column aligned with the embeddings;
      * rows are addressed by position, so a non-unique index cannot make
        two different rows look like the same sentence.
    """
    label_column = 'labels' if 'labels' in df.columns else 'srl'
    all_embeddings = df['embeddings'].tolist()
    all_labels = df[label_column].tolist()
    n_sentences = len(all_embeddings)

    sentence_scores = [0] * n_sentences

    for first in tqdm(range(n_sentences), total=n_sentences, desc="Processing sentences"):
        # The similarity is symmetric, so each unordered pair is computed
        # once and credited to both sentences.
        for second in range(first + 1, n_sentences):
            emb_a, emb_b, lab_a, lab_b, count = filter_labels_if_not_in_common(
                all_embeddings[first], all_embeddings[second],
                all_labels[first], all_labels[second],
            )
            # Two sentences without any label have nothing to compare.
            similarity = sentence_similarity(emb_a, emb_b, lab_a, lab_b, count) if count else 0.0

            sentence_scores[first] += similarity
            sentence_scores[second] += similarity

    return sentence_scores


In [None]:
# Build one extractive summary per article: score every sentence, keep the
# top quarter (rounded up) and emit the kept sentences in document order.
length_article = data["article_id"].nunique()
list_sentence_hyp = []

# NOTE(review): this assumes article ids are exactly 0..length_article-1 and
# that position k here lines up with position k of the reference summaries
# used for ROUGE -- confirm against the CSV.
# `tqdm` needs an iterable: passing the integer count itself raised TypeError.
for article_id in tqdm(range(length_article), desc="Processing articles"):
    # Work on an explicit copy so the column assignments below never touch `data`.
    df = data[data["article_id"] == article_id].copy()
    df["srl"] = df["srl"].apply(ast.literal_eval)  # tags are stored as list literals
    df = reduce_same_sentence(df)
    df["labels"] = df['srl'].apply(filter_words_labels)  # role names of the kept tokens
    df["token"] = df.apply(filter_words_token, axis=1)
    df['embeddings'] = df["token"].apply(get_word_embeddings)

    # Calculate sentence scores
    score = calculate_sentence_scores(df)
    df["score"] = score
    df = df.sort_values(by="score", ascending=False)

    top_count = math.ceil(len(df) / 4)
    top = df.head(top_count).sort_values("sentence_id")

    # Same text as before: "s1 . s2 ." with surrounding whitespace stripped.
    sentences = " ".join(sentence + " ." for sentence in top["sentence"])
    list_sentence_hyp.append(sentences.strip())

In [None]:
# ROUGE scorer shared by all hypothesis/reference pairs.
rouge = Rouge()

# One (ROUGE-1, ROUGE-2, ROUGE-L) F1 triple per pair; pairs are matched by position.
f1_scores = []
for system, reference in zip(list_sentence_hyp, sentence_summary):
    # get_scores returns a list of results; a single pair gives a single entry.
    scores = rouge.get_scores(system, reference)[0]
    f1_rouge1, f1_rouge2, f1_rougeL = (
        scores[metric]['f'] for metric in ('rouge-1', 'rouge-2', 'rouge-l')
    )
    f1_scores.append((f1_rouge1, f1_rouge2, f1_rougeL))

# Rows are pairs, columns are the three metrics; average down the rows.
f1_scores_array = np.array(f1_scores)
average_f1_scores = np.mean(f1_scores_array, axis=0)

print(f"Average F1 Scores: ROUGE-1: {average_f1_scores[0]:.4f}, ROUGE-2: {average_f1_scores[1]:.4f}, ROUGE-L: {average_f1_scores[2]:.4f}")