## BERTScore , BARTScore , EM/F1 Score - Exact Match (%) , F1 Score (%)

In [1]:
! pip install transformers
! pip install bert-score
#! !pip install scipy transformers pandas torch



### 임의의 csv 2개 생성해서 테스트 해봄 (Please Modify this)

In [7]:
import pandas as pd
import numpy as np

# 랜덤 텍스트 생성 함수
def generate_random_text(num_samples):
    random_texts = []
    for _ in range(num_samples):
        length = np.random.randint(5, 15)  # 문장의 길이는 5~15 단어 사이로 랜덤 설정
        text = ' '.join(np.random.choice(['이것은', '랜덤', '텍스트', '생성', '예제입니다', '테스트', '데이터', '입니다', '여기에', '다양한', '단어가', '포함됩니다'], length))
        random_texts.append(text)
    return random_texts

# 데이터 생성
num_samples = 50
reference_texts = generate_random_text(num_samples)
predicted_texts = generate_random_text(num_samples)

# 데이터프레임 생성
reference_df = pd.DataFrame({'id': range(1, num_samples + 1), 'reference': reference_texts})
predicted_df = pd.DataFrame({'id': range(1, num_samples + 1), 'predicted': predicted_texts})

# CSV 파일로 저장
reference_df.to_csv('reference.csv', index=False)
predicted_df.to_csv('predicted.csv', index=False)

print("CSV files have been created.")


CSV files have been created.


In [18]:
import pandas as pd
import numpy as np
import re
import string
import time
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel
from transformers import BartTokenizer, BartForConditionalGeneration
from scipy.special import softmax
import torch


# KoBERT 모델과 토크나이저 로드
bert_model_name = "monologg/kobert"
bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
bert_model = AutoModel.from_pretrained(bert_model_name).to('cuda' if torch.cuda.is_available() else 'cpu')

# KoBART 모델과 토크나이저 로드
bart_model_name = "gogamza/kobart-base-v2"
bart_tokenizer = BartTokenizer.from_pretrained(bart_model_name)
bart_model = BartForConditionalGeneration.from_pretrained(bart_model_name).to('cuda' if torch.cuda.is_available() else 'cpu')


# BARTScore 계산 함수 정의
def bart_score(candidate, reference):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # 토크나이징 및 입력 길이 제한
    candidate_ids = bart_tokenizer(candidate, return_tensors="pt", max_length=512, truncation=True).input_ids.to(device)
    reference_ids = bart_tokenizer(reference, return_tensors="pt", max_length=512, truncation=True).input_ids.to(device)

    with torch.no_grad():
        # 입력된 텍스트의 logits 계산
        candidate_logits = bart_model(candidate_ids).logits
        reference_logits = bart_model(reference_ids).logits

    # softmax를 사용하여 확률 계산
    candidate_probs = softmax(candidate_logits.cpu().numpy(), axis=-1)
    reference_probs = softmax(reference_logits.cpu().numpy(), axis=-1)

    # BARTScore 계산 (로그 확률의 합)
    bart_score_value = 0
    for i in range(min(candidate_ids.shape[1], reference_ids.shape[1])):
        bart_score_value += reference_probs[0, i, candidate_ids[0, i].item()]

    return bart_score_value

# 텍스트 정규화 함수 정의
def normalize_answer(s):
    """소문자 변환, 불필요한 기호 제거 등을 통해 텍스트를 정규화"""
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        return ''.join(ch for ch in text if ch not in set(string.punctuation))

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

# F1 점수 계산 함수 정의
def f1_score(prediction, ground_truth):
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = set(prediction_tokens) & set(ground_truth_tokens)
    if len(common) == 0:
        return 0
    precision = len(common) / len(prediction_tokens)
    recall = len(common) / len(ground_truth_tokens)
    f1 = 2 * (precision * recall) / (precision + recall)
    return f1

# Exact Match 점수 계산 함수 정의
def exact_match_score(prediction, ground_truth):
    return normalize_answer(prediction) == normalize_answer(ground_truth)

# 전체 데이터셋에 대해 F1 및 Exact Match 점수를 계산하는 함수 정의
def evaluate(predictions, references):
    f1 = exact_match = total = 0
    for pred, ref in zip(predictions, references):
        total += 1
        exact_match += exact_match_score(pred, ref)
        f1 += f1_score(pred, ref)
    exact_match = 100.0 * exact_match / total
    f1 = 100.0 * f1 / total
    return {'exact_match': exact_match, 'f1': f1}


# KoBERT를 사용하여 BERTScore 계산
def bert_score_kobert(predictions, references):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    P, R, F1 = [], [], []
    for pred, ref in zip(predictions, references):
        # 토크나이징
        pred_tokens = bert_tokenizer(pred, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
        ref_tokens = bert_tokenizer(ref, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)

        with torch.no_grad():
            pred_embeddings = bert_model(**pred_tokens).last_hidden_state.mean(dim=1).cpu().numpy()
            ref_embeddings = bert_model(**ref_tokens).last_hidden_state.mean(dim=1).cpu().numpy()

        # 코사인 유사도 계산
        cosine_sim = cosine_similarity(pred_embeddings, ref_embeddings).item()
        P.append(cosine_sim)
        R.append(cosine_sim)
        F1.append(cosine_sim)

    return np.mean(P), np.mean(R), np.mean(F1)

# CSV 파일로부터 데이터 불러오기
reference_df = pd.read_csv("reference.csv")
predicted_df = pd.read_csv("predicted.csv")

# 정답과 예측값 리스트 생성
references = reference_df['reference'].tolist()
predictions = predicted_df['predicted'].tolist()

# 평가 시간 측정 시작
start_time = time.time()


# F1 및 Exact Match 점수 계산
results = evaluate(predictions, references)
print("Exact Match:", results['exact_match'])
print("F1 Score:", results['f1'])

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'PreTrainedTokenizerFast'. 
The class this function is called from is 'BartTokenizer'.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


Exact Match: 0.0
F1 Score: 35.14734201436181


In [19]:
# BERTScore 계산 (KoBERT 사용)
P, R, F1 = bert_score_kobert(predictions, references)
print("BERTScore (Precision):", P)
print("BERTScore (Recall):", R)
print("BERTScore (F1):", F1)

BERTScore (Precision): 0.9258360838890076
BERTScore (Recall): 0.9258360838890076
BERTScore (F1): 0.9258360838890076


In [20]:
# BARTScore 계산
bart_scores = [bart_score(pred, ref) for pred, ref in zip(predictions, references)]
average_bart_score = np.mean(bart_scores)
stddev_bart_score = np.std(bart_scores)

# BARTScore 결과 출력
print(f"BARTScore (avg): {average_bart_score:.2f} ± {stddev_bart_score:.2f}")

# 평가 시간 측정 종료
end_time = time.time()
evaluation_time = end_time - start_time

print(f"Evaluation Time: {evaluation_time:.2f} seconds")

BARTScore (avg): 7.07 ± 3.74
Evaluation Time: 87.68 seconds
