In [1]:
import json
import spacy
import re
import torch
import numpy as np
from tqdm import tqdm
from collections import Counter
from evaluation_utils import evaluate_fluency, compute_stage_matching, calculate_ingredient_coverage_and_hallucination
from evaluation_utils import RecipeInferenceDataset, extract_instruction
from dataclass import load_tokenizer
from generator import RecipeGenerator
from transformers import BartForConditionalGeneration, BartTokenizer

# from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load data and model

In [2]:
# Locations of the planner predictions and of the preprocessed test split.
planner_result_path = '/home/yinhong/Documents/source/RecipeWithPlans/model-checkpoint/planner_results/'
test_data_path = '/home/yinhong/Documents/datasets/recipe1m+/preprocessed_data/test_dataset.json'

# Stage plans predicted by the planner (stored under the 'planner_prediction' key).
with open(planner_result_path + 'planner_prediction_test.json') as plan_file:
    planner_output = json.load(plan_file)
test_predicted_plan = planner_output['planner_prediction']

# Reference recipe texts and their reference stage labels.
with open(test_data_path) as data_file:
    test_split = json.load(data_file)
test_data = test_split['text']
test_plan = test_split['stage_label']

# Sanity check: both lengths are displayed so a mismatch is visible at a glance.
len(test_predicted_plan), len(test_data)

(121084, 121084)

In [8]:
# Checkpoint locations for the three model components.
generator_path = '/home/yinhong/Documents/source/RecipeWithPlans/model-checkpoint/generator_results/checkpoint-90000'
classifier_path = '/home/yinhong/Documents/source/RecipeWithPlans/model-checkpoint/classifier_results/checkpoint-45000'
planner_path = '/home/yinhong/Documents/source/RecipeWithPlans/model-checkpoint/planner_results/checkpoint-8000'

# Load device: prefer the GPU, but fall back to the CPU so the notebook still
# runs (slowly) on a machine without CUDA instead of failing at the first
# `.to(device)` call.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load tokenizer
tokenizer = load_tokenizer(generator_path)

# The planner itself is not reloaded here; its predictions are read from disk
# in the data-loading cell. Kept for reference:
# # Load planner
# planner_model = BartForConditionalGeneration.from_pretrained(planner_path)
# bart_tok = BartTokenizer.from_pretrained(planner_path)

# Load generator (it also receives the stage-classifier checkpoint path)
generator_model = RecipeGenerator(
    generator_path,
    tokenizer=tokenizer,
    device=device,
    classifier_path=classifier_path,
)

# Evaluate classifier and planner

In [7]:
def exact_match(predict_seq, reference_seq):
    """Position-wise agreement between predicted and reference plans.

    Plans are paired in order, and within each pair the stages are compared
    position by position (up to the shorter plan). A position counts as a
    match only when both stages are truthy and equal, so falsy stage values
    (e.g. 0, '' or None) never count as a match.

    Args:
        predict_seq: iterable of predicted plans (each a sized sequence).
        reference_seq: iterable of reference plans, aligned with predict_seq.

    Returns:
        Number of matching positions divided by the total number of
        predicted stages over all paired plans.

    Raises:
        ZeroDivisionError: if the paired predictions contain no stages.
    """
    matched = 0
    predicted_total = 0
    for hypothesis, reference in zip(predict_seq, reference_seq):
        matched += sum(
            1 for hyp_stage, ref_stage in zip(hypothesis, reference)
            if hyp_stage and ref_stage and hyp_stage == ref_stage
        )
        # The denominator is the full predicted length, not the compared length.
        predicted_total += len(hypothesis)
    return matched / predicted_total

def plan_to_unigram(plan):
    """Return the stages of `plan` as a new list (each stage is its own unigram)."""
    return list(plan)
    
def plan_to_bigram(plan):
    """Return every pair of adjacent stages in `plan` as a list of 2-tuples.

    Plans with fewer than two stages yield an empty list.
    """
    # Pairing the plan with itself shifted by one gives the adjacent pairs.
    return list(zip(plan, plan[1:]))

def plan_to_trigram(plan):
    """Return every run of three consecutive stages in `plan` as a list of 3-tuples.

    Plans with fewer than three stages yield an empty list.
    """
    # Three views of the plan, shifted by 0, 1 and 2 positions, zipped together.
    return list(zip(plan, plan[1:], plan[2:]))


def n_gram_match_rate(predict_seq, reference_seq, ngram=1):
    """Average share of predicted stage n-grams that also occur in the reference.

    Each (prediction, reference) pair is split into contiguous n-grams. The
    overlap is clipped, i.e. an n-gram is credited at most as often as it
    occurs in the reference, and divided by the number of predicted n-grams.
    Pairs whose prediction is too short to contain any n-gram are skipped;
    the remaining per-plan rates are averaged.

    Args:
        predict_seq: iterable of predicted plans (sequences of hashable stages).
        reference_seq: iterable of reference plans, aligned with predict_seq.
        ngram: n-gram order; any positive integer (1, 2, 3, ...).

    Returns:
        The mean match rate as a float (nan when no plan contributes). The
        value is also printed, labelled with the n-gram order.

    Raises:
        ValueError: if `ngram` is not a positive integer.
    """
    try:
        order = int(ngram)
    except (TypeError, ValueError):
        order = 0
    if order < 1 or order != ngram:
        raise ValueError('ngram must be a positive integer, got {!r}'.format(ngram))

    def _ngrams(plan):
        # Contiguous windows of `order` stages; empty when the plan is too short.
        return [tuple(plan[i:i + order]) for i in range(len(plan) - order + 1)]

    match_rates = []
    for reference_plan, predicted_plan in zip(reference_seq, predict_seq):
        reference_cnt = Counter(_ngrams(reference_plan))
        predicted_cnt = Counter(_ngrams(predicted_plan))
        predicted_total = sum(predicted_cnt.values())
        if predicted_total == 0:
            # No predicted n-gram of this order: the pair does not enter the average.
            continue
        # Counter intersection keeps min(count) per n-gram, i.e. the clipped overlap.
        overlap = sum((predicted_cnt & reference_cnt).values())
        match_rates.append(overlap / predicted_total)

    mean_rate = float(np.mean(match_rates)) if match_rates else float('nan')
    label = {1: 'Unigram', 2: 'Bigram', 3: 'Trigram'}.get(order, '{}-gram'.format(order))
    print('{} match rates'.format(label), mean_rate)
    return mean_rate




In [8]:
# n_gram_match_rate prints each rate itself, so it is called without a
# surrounding print() to avoid echoing its return value on a separate line.
for ngram_order in (1, 2, 3):
    n_gram_match_rate(test_predicted_plan, test_plan, ngram=ngram_order)


Unigram match rates 0.676046819285604
None
Unigram match rates 0.3312019771473539
None
Unigram match rates 0.1311368567142325
None


In [44]:
from generator import StageClassifierModule

# Stage classifier loaded from classifier_path onto the selected device.
stage_classifier = StageClassifierModule(classifier_path, device)

correct_cnt, total_cnt = 0, 0
for text, reference_stages in tqdm(zip(test_data, test_plan), total=len(test_plan)):
    # Only the list form returned by extract_instruction is needed here.
    _, text_list = extract_instruction(text, return_list=True)
    if text_list == []:
        # Nothing to classify: this recipe adds to neither counter.
        continue
    stage_scores = stage_classifier.compute_bert_stage_scores(text_list)
    # Highest-scoring stage along dim=1 for each row of scores.
    predicted_stage = torch.argmax(stage_scores, dim=1)
    for predicted_label, reference_label in zip(predicted_stage, reference_stages):
        if predicted_label == reference_label:
            correct_cnt += 1
    # The denominator is the number of reference stages, even if fewer were compared.
    total_cnt += len(reference_stages)

accuracy = correct_cnt / total_cnt
print('Accuracy of the classifier on the test data: {}'.format(accuracy))
    


100%|██████████| 121084/121084 [10:41<00:00, 188.86it/s]

Accuracy of the classifier on the test data: 0.9570099670526061





# Evaluate recipe

In [9]:

# Pair the reference texts with the planner-predicted stage plans
# (not the reference plans) for generation.
inference_inputs = {'text': test_data, 'stage_label': test_predicted_plan}
test_dataset = RecipeInferenceDataset(
    inference_inputs,
    tokenizer,
    max_length=512,
    use_special_token=True,
    with_stage_label=False,
)


In [10]:
# Search weights passed to structure_search as alpha and beta.
a, b = 0, 0.35
# Only the first `evaluate_size` test items are generated.
evaluate_size = 200

generation_doc = []
generation_doc_list = []
reference_doc = []
reference_stage_label = []

for i in tqdm(range(evaluate_size)):
    datapoint = test_dataset[i]
    stage_plan = datapoint['stage_label']
    input_ids = datapoint['input_ids'].to(device)
    outputs = generator_model.structure_search(
        input_ids,
        beam_width=5,
        alpha=a,
        beta=b,
        stage_plan=stage_plan,
        max_length=512,
    )
    # Special tokens are kept in the decoded string that is handed to extract_instruction.
    generation = tokenizer.decode(outputs, skip_special_tokens=False)
    generated_text, generated_text_list = extract_instruction(generation, return_list=True)

    # Collect the generation (string and list form) next to its reference and plan.
    generation_doc.append(generated_text)
    generation_doc_list.append(generated_text_list)
    reference_doc.append(datapoint['reference_text'])
    reference_stage_label.append(stage_plan)
    

100%|██████████| 200/200 [05:40<00:00,  1.70s/it]


In [11]:
# evaluate_fluency prints the BLEU and Rouge-L report itself; wrapping it in
# print() only added a stray `None` to the cell output.
evaluate_fluency(generation_doc, reference_doc)

BLEU = 12.47 50.4/20.5/9.7/5.3 (BP = 0.824 ratio = 0.838 hyp_len = 17808 ref_len = 21258)
Rouge-L Score: 0.3828653154102322
None


In [12]:
# Stage-matching score of the generated recipes against the plans used for generation.
stage_matching_score = compute_stage_matching(generation_doc, reference_stage_label)
print(stage_matching_score)

100%|██████████| 200/200 [01:43<00:00,  1.93it/s]

0.0464485791985792



