In [1]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from datasets import load_metric
import nltk
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
import pandas as pd
from datasets import Dataset
from tqdm import tqdm



In [2]:
# 載入訓練好的模型和 tokenizer
model_path = "./NetflixGPT-english"  # 修改為你訓練模型的儲存路徑
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50260, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50260, bias=False)
)

In [3]:

# 定義生成描述的函數
def generate_description(title, max_length=100):
    input_text = f"<|startoftext|>Title: {title}<|sep|>Description:"
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)
    
    with torch.no_grad():
        output = model.generate(
            input_ids,
            max_length=max_length,
            num_return_sequences=1,
            no_repeat_ngram_size=2,
            pad_token_id=tokenizer.eos_token_id,
            early_stopping=True
        )
    
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_text.replace(input_text, "").strip()



In [4]:
# 加載測試數據集
data = pd.read_csv('data/netflix_test_en.csv')
data = data[['title', 'description']]
test_data = Dataset.from_pandas(data)


In [5]:
# 初始化 BLEU 和 ROUGE 評分器
bleu_scores = { "BLEU-1": [], "BLEU-2": [], "BLEU-3": [], "BLEU-4": [] }
rouge_scorer_instance = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}

# 計算 BLEU 和 ROUGE 分數
for example in tqdm(test_data, desc="Evaluating on test dataset"):
    title = example['title']
    reference = example['description']
    prediction = generate_description(title)

    # 將參考文本和預測文本分詞
    reference_tokens = [reference.split()]
    prediction_tokens = prediction.split()
    
    # 計算 BLEU-1 到 BLEU-4 分數
    bleu_scores["BLEU-1"].append(sentence_bleu(reference_tokens, prediction_tokens, weights=(1, 0, 0, 0)))
    bleu_scores["BLEU-2"].append(sentence_bleu(reference_tokens, prediction_tokens, weights=(0.5, 0.5, 0, 0)))
    bleu_scores["BLEU-3"].append(sentence_bleu(reference_tokens, prediction_tokens, weights=(0.33, 0.33, 0.33, 0)))
    bleu_scores["BLEU-4"].append(sentence_bleu(reference_tokens, prediction_tokens, weights=(0.25, 0.25, 0.25, 0.25)))

    # 計算 ROUGE 分數
    rouge_result = rouge_scorer_instance.score(reference, prediction)
    rouge_scores['rouge1'].append(rouge_result['rouge1'].fmeasure)
    rouge_scores['rouge2'].append(rouge_result['rouge2'].fmeasure)
    rouge_scores['rougeL'].append(rouge_result['rougeL'].fmeasure)

# 計算每個 BLEU 和 ROUGE 分數的平均值
average_bleu1 = sum(bleu_scores["BLEU-1"]) / len(bleu_scores["BLEU-1"])
average_bleu2 = sum(bleu_scores["BLEU-2"]) / len(bleu_scores["BLEU-2"])
average_bleu3 = sum(bleu_scores["BLEU-3"]) / len(bleu_scores["BLEU-3"])
average_bleu4 = sum(bleu_scores["BLEU-4"]) / len(bleu_scores["BLEU-4"])

average_rouge1 = sum(rouge_scores['rouge1']) / len(rouge_scores['rouge1'])
average_rouge2 = sum(rouge_scores['rouge2']) / len(rouge_scores['rouge2'])
average_rougeL = sum(rouge_scores['rougeL']) / len(rouge_scores['rougeL'])

# 輸出結果
print("Average BLEU-1 Score:", average_bleu1)
print("Average BLEU-2 Score:", average_bleu2)
print("Average BLEU-3 Score:", average_bleu3)
print("Average BLEU-4 Score:", average_bleu4)

print("Average ROUGE-1 Score:", average_rouge1)
print("Average ROUGE-2 Score:", average_rouge2)
print("Average ROUGE-L Score:", average_rougeL)

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Evaluating on test dataset: 100%|██████████| 1762/1762 [05:37<00:00,  5.23it/s]

Average BLEU-1 Score: 0.12813252341268005
Average BLEU-2 Score: 0.017049738103892587
Average BLEU-3 Score: 0.002921133683896236
Average BLEU-4 Score: 0.000998687316397085
Average ROUGE-1 Score: 0.17700172676113987
Average ROUGE-2 Score: 0.017623737580960617
Average ROUGE-L Score: 0.13143432925502094



