In [3]:
import json
import spacy
import re
import torch
import numpy as np
from tqdm import tqdm
from collections import Counter
from evaluation_utils import evaluate_fluency, compute_stage_matching, calculate_ingredient_coverage_and_hallucination
from evaluation_utils import RecipeInferenceDataset, extract_instruction
from dataclass import load_tokenizer
from generator import RecipeGenerator
from transformers import BartForConditionalGeneration, BartTokenizer

# from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load data and model

In [7]:
# Planner output for the test split; the plans sit under the 'planner_prediction' key.
planner_result_path='/mnt/nas_home/yl535/RecipeWithPlans/model-checkpoint/planner_results/'
with open(planner_result_path+'planner_prediction_test.json') as f:
    planner_output = json.load(f)
test_predicted_plan = planner_output['planner_prediction']

# Preprocessed test set: recipe texts under 'text', reference stage labels under 'stage_label'.
test_data_path='/mnt/nas_home/yl535/datasets/recipe1m+/preprocessed_data/test_dataset.json'
with open(test_data_path) as f:
    test_payload = json.load(f)
test_data = test_payload['text']
test_plan = test_payload['stage_label']

# Displayed side by side so the two counts can be compared by eye.
len(test_predicted_plan), len(test_data)

(121084, 121084)

In [26]:
from collections import Counter
bigram_counter = []
# NOTE(review): plan_to_bigram is defined in a later cell of this notebook, so that
# cell has to be executed before this one on a fresh kernel.
plan_bigrams = list(map(plan_to_bigram, test_plan))

# Flatten the per-plan bigram lists into a single list.
bigrams = []
for bi in plan_bigrams:
    bigrams.extend(bi)

In [31]:
# Tally how often each entry of `bigrams` occurs.
bigram_cnter = Counter()
bigram_cnter.update(bigrams)
print(bigram_cnter)
# Last expression of the cell: the same counts as (bigram, count) pairs, most common first.
bigram_cnter.most_common()


Counter({(2, 2): 46763, (1, 1): 40751, (4, 4): 38740, (2, 4): 37730, (1, 2): 35604, (1, 4): 35505, (2, 1): 33449, (4, 2): 31593, (4, 5): 29284, (4, 1): 28776, (5, 4): 25747, (1, 0): 22475, (1, 5): 22473, (2, 5): 20615, (3, 4): 20288, (0, 4): 19185, (4, 0): 19054, (0, 1): 19049, (5, 1): 18062, (0, 2): 17973, (4, 6): 17514, (1, 3): 17373, (5, 5): 16922, (2, 0): 16821, (0, 0): 16371, (2, 3): 14589, (0, 5): 13635, (5, 0): 13136, (3, 1): 12050, (3, 5): 12002, (4, 3): 11953, (5, 2): 11925, (3, 2): 11666, (5, 6): 9693, (1, 6): 8977, (3, 0): 8901, (0, 3): 8656, (2, 6): 8018, (5, 3): 7993, (0, 6): 6973, (3, 3): 6793, (6, 0): 5593, (6, 6): 4324, (6, 1): 3973, (6, 4): 3893, (6, 5): 3872, (3, 6): 3532, (6, 2): 2750, (6, 3): 1691})


[((2, 2), 46763),
 ((1, 1), 40751),
 ((4, 4), 38740),
 ((2, 4), 37730),
 ((1, 2), 35604),
 ((1, 4), 35505),
 ((2, 1), 33449),
 ((4, 2), 31593),
 ((4, 5), 29284),
 ((4, 1), 28776),
 ((5, 4), 25747),
 ((1, 0), 22475),
 ((1, 5), 22473),
 ((2, 5), 20615),
 ((3, 4), 20288),
 ((0, 4), 19185),
 ((4, 0), 19054),
 ((0, 1), 19049),
 ((5, 1), 18062),
 ((0, 2), 17973),
 ((4, 6), 17514),
 ((1, 3), 17373),
 ((5, 5), 16922),
 ((2, 0), 16821),
 ((0, 0), 16371),
 ((2, 3), 14589),
 ((0, 5), 13635),
 ((5, 0), 13136),
 ((3, 1), 12050),
 ((3, 5), 12002),
 ((4, 3), 11953),
 ((5, 2), 11925),
 ((3, 2), 11666),
 ((5, 6), 9693),
 ((1, 6), 8977),
 ((3, 0), 8901),
 ((0, 3), 8656),
 ((2, 6), 8018),
 ((5, 3), 7993),
 ((0, 6), 6973),
 ((3, 3), 6793),
 ((6, 0), 5593),
 ((6, 6), 4324),
 ((6, 1), 3973),
 ((6, 4), 3893),
 ((6, 5), 3872),
 ((3, 6), 3532),
 ((6, 2), 2750),
 ((6, 3), 1691)]

In [3]:
# Checkpoint directories for the three trained components.
generator_path = '/mnt/nas_home/yl535/RecipeWithPlans/model-checkpoint/generator_results/checkpoint-150000'
classifier_path = '/mnt/nas_home/yl535/RecipeWithPlans/model-checkpoint/classifier_results/checkpoint-45000'
planner_path = '/mnt/nas_home/yl535/RecipeWithPlans/model-checkpoint/planner_results/checkpoint-8000'

# Load device: use the GPU when one is available, otherwise fall back to the CPU
# so the cell does not fail on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load tokenizer (read from the generator checkpoint directory)
tokenizer = load_tokenizer(generator_path)

# # Load planner
# planner_model = BartForConditionalGeneration.from_pretrained(planner_path)
# bart_tok = BartTokenizer.from_pretrained(planner_path)

# Load generator; it is also handed the classifier checkpoint path.
generator_model = RecipeGenerator(generator_path, 
                                tokenizer=tokenizer, 
                                device=device, 
                                classifier_path=classifier_path
                                )

OSError: Can't load tokenizer for '/mnt/nas_home/yl535/RecipeWithPlans/model-checkpoint/generator_results/checkpoint-180000'. Make sure that:

- '/mnt/nas_home/yl535/RecipeWithPlans/model-checkpoint/generator_results/checkpoint-180000' is a correct model identifier listed on 'https://huggingface.co/models'
  (make sure '/mnt/nas_home/yl535/RecipeWithPlans/model-checkpoint/generator_results/checkpoint-180000' is not a path to a local directory with something else, in that case)

- or '/mnt/nas_home/yl535/RecipeWithPlans/model-checkpoint/generator_results/checkpoint-180000' is the correct path to a directory containing relevant tokenizer files



# Evaluate classifier and planner

In [22]:
def exact_match(predict_seq, reference_seq):
    """Return the fraction of predicted stages equal to the reference stage at the same position.

    Args:
        predict_seq: iterable of predicted plans, each a sequence of stage labels.
        reference_seq: iterable of reference plans, aligned with ``predict_seq``.

    Returns:
        Matches divided by the total number of predicted stages, or ``0.0`` when
        there are no predicted stages at all.

    Positions are compared pairwise, so stages beyond the shorter of the two plans
    are never matched; the denominator still counts every predicted stage.
    """
    match_cnt = 0
    total_cnt = 0
    for predicted_plan, reference_plan in zip(predict_seq, reference_seq):
        # Compare against None explicitly: a plain truthiness test would discard
        # the valid stage label 0 and silently under-count matches.
        match_cnt += sum(
            1
            for p1, p2 in zip(predicted_plan, reference_plan)
            if p1 is not None and p2 is not None and p1 == p2
        )
        total_cnt += len(predicted_plan)

    if total_cnt == 0:
        # Nothing was predicted; avoid dividing by zero.
        return 0.0
    return match_cnt / total_cnt

def plan_to_unigram(plan):
    """Return the stages of ``plan`` as a new list, one entry per stage (not wrapped in tuples)."""
    return list(plan)
    
def plan_to_bigram(plan):
    """Return every pair of adjacent stages in ``plan`` as a list of 2-tuples.

    Plans with fewer than two stages yield an empty list.
    """
    # Pairing the plan with itself shifted by one gives each adjacent pair in order.
    return list(zip(plan, plan[1:]))

def plan_to_trigram(plan):
    """Return every run of three consecutive stages in ``plan`` as a list of 3-tuples.

    Plans with fewer than three stages yield an empty list.
    """
    # Zipping the plan with its one- and two-step shifts gives each window in order.
    return list(zip(plan, plan[1:], plan[2:]))


def _plan_to_ngrams(plan, n):
    """Return every run of ``n`` consecutive stages in ``plan`` as a tuple."""
    return [tuple(plan[i:i + n]) for i in range(len(plan) - n + 1)]


def n_gram_match_rate(predict_seq, reference_seq, ngram=1):
    """Average, over plans, the share of predicted n-grams that also occur in the reference.

    For each (reference, prediction) pair the n-grams of both plans are counted;
    a predicted n-gram is matched at most as many times as it occurs in the
    reference. The per-plan rate is matched / total predicted n-grams. Plans
    whose prediction is too short to contain any n-gram are skipped.

    Args:
        predict_seq: iterable of predicted plans (sequences of hashable stage labels).
        reference_seq: iterable of reference plans, aligned with ``predict_seq``.
        ngram: n-gram order, any integer >= 1.

    Returns:
        The mean per-plan match rate as a float, or ``nan`` when no plan could be
        scored. The same value is also printed with a label naming the order.

    Raises:
        ValueError: if ``ngram`` is not an integer >= 1.
    """
    if not isinstance(ngram, int) or ngram < 1:
        raise ValueError('Wrong n-gram number. ')

    # Name the order that was actually evaluated.
    label = {1: 'Unigram', 2: 'Bigram', 3: 'Trigram'}.get(ngram, '{}-gram'.format(ngram))

    match_rates = []
    for reference_plan, predicted_plan in zip(reference_seq, predict_seq):
        reference_cnt = Counter(_plan_to_ngrams(reference_plan, ngram))
        prediction_cnt = Counter(_plan_to_ngrams(predicted_plan, ngram))
        prediction_total = sum(prediction_cnt.values())
        if prediction_total == 0:
            continue
        # Counter intersection keeps, per n-gram, the smaller of the two counts.
        match_cnt = sum((reference_cnt & prediction_cnt).values())
        match_rates.append(match_cnt / prediction_total)

    # np.mean of an empty list warns and yields nan; make that case explicit.
    average = float(np.mean(match_rates)) if match_rates else float('nan')
    print('{} match rates'.format(label), average)
    return average




In [8]:
# n_gram_match_rate prints each score itself, so it is called directly: wrapping the
# call in print() only adds an extra line showing the function's return value.
for ngram_order in (1, 2, 3):
    n_gram_match_rate(test_predicted_plan, test_plan, ngram=ngram_order)


Unigram match rates 0.676046819285604
None
Unigram match rates 0.3312019771473539
None
Unigram match rates 0.1311368567142325
None


In [44]:
from generator import StageClassifierModule

stage_classifier = StageClassifierModule(classifier_path, device)

# Running tallies over all recipes: correctly classified steps / reference steps.
correct_cnt, total_cnt = 0, 0
pbar = tqdm(total=len(test_plan))
for text, reference_stages in zip(test_data, test_plan):
    pbar.update(1)
    _, text_list = extract_instruction(text, return_list=True)
    if text_list == []:
        # Nothing was extracted for this recipe; it contributes to neither tally.
        continue
    stage_scores = stage_classifier.compute_bert_stage_scores(text_list)
    # The highest-scoring stage along dim 1 is taken as the prediction.
    predicted_stage = torch.argmax(stage_scores, dim=1)
    correct_cnt += sum(
        1 for predicted, expected in zip(predicted_stage, reference_stages)
        if predicted == expected
    )
    total_cnt += len(reference_stages)
pbar.close()

accuracy = correct_cnt/total_cnt
print('Accuracy of the classifier on the test data: {}'.format(accuracy))
    


100%|██████████| 121084/121084 [10:41<00:00, 188.86it/s]

Accuracy of the classifier on the test data: 0.9570099670526061





# Evaluate recipe

In [4]:

# Recipe texts paired with the planner's predicted stage plans.
inference_inputs = {
    'text': test_data,
    'stage_label': test_predicted_plan,
}
test_dataset = RecipeInferenceDataset(
    inference_inputs,
    tokenizer,
    max_length=512,
    use_special_token=True,
    with_stage_label=False,
)


In [5]:
# a and b are forwarded to structure_search as its alpha and beta arguments.
a, b = 0, 0.35
# Number of leading test examples to generate for.
evaluate_size = 200
input_text = []
generation_doc, generation_doc_list = [], []
reference_doc, reference_stage_label = [], []
for i in tqdm(range(evaluate_size)):
    # Fetch the example once and reuse it, rather than indexing the dataset a
    # second time just to decode its input ids.
    datapoint = test_dataset[i]
    input_text.append(tokenizer.decode(datapoint['input_ids'][0]))
    # Decoding is guided by the stage plan stored with the dataset item.
    stage_plan = datapoint['stage_label']
    outputs = generator_model.structure_search(
                    datapoint['input_ids'].to(device),
                    beam_width=5,
                    alpha=a,
                    beta=b,
                    stage_plan=stage_plan,
                    max_length=512,
                    )
    # Special tokens are kept in the decoded text that is handed to extract_instruction.
    generation = tokenizer.decode(outputs, skip_special_tokens=False )
    generated_text, generated_text_list = extract_instruction(generation, return_list=True)
    generation_doc.append(generated_text)
    generation_doc_list.append(generated_text_list)
    reference_doc.append(datapoint['reference_text'])
    reference_stage_label.append(stage_plan)
    

100%|██████████| 200/200 [04:54<00:00,  1.47s/it]


In [6]:
# evaluate_fluency prints its BLEU / Rouge-L report itself; per the recorded output,
# wrapping it in print() only appended the None it returned.
evaluate_fluency(generation_doc, reference_doc)

BLEU = 13.65 55.5/23.5/11.8/6.8 (BP = 0.759 ratio = 0.784 hyp_len = 16668 ref_len = 21258)
Rouge-L Score: 0.3909915645829786
None


In [7]:
# Score the generated step lists against the stage plans collected alongside them.
stage_match_score = compute_stage_matching(generation_doc_list, reference_stage_label)
print(stage_match_score)

100%|██████████| 200/200 [00:02<00:00, 89.62it/s]

0.7570194943944943





In [40]:
# Oracle version
a, b = 0, 0.2
evaluate_size = 2000
generation_doc, generation_doc_list = [], []
reference_doc, reference_stage_label = [], []
for i in tqdm(range(evaluate_size)):
    datapoint = test_dataset[i]
    stage_plan = test_plan[i]
    outputs = generator_model.structure_search(
                    datapoint['input_ids'].to(device),
                    beam_width=5,
                    alpha=a,
                    beta=b,
                    stage_plan=stage_plan,
                    max_length=512,
                    )
    generation = tokenizer.decode(outputs, skip_special_tokens=False )
    generated_text, generated_text_list = extract_instruction(generation, return_list=True)
    generation_doc.append(generated_text)
    generation_doc_list.append(generated_text_list)
    reference_doc.append(datapoint['reference_text'])
    reference_stage_label.append(stage_plan)
    

100%|██████████| 2000/2000 [58:52<00:00,  1.77s/it] 


In [41]:
# evaluate_fluency prints its BLEU / Rouge-L report itself; per the recorded output,
# wrapping it in print() only appended the None it returned.
evaluate_fluency(generation_doc, reference_doc)
# compute_stage_matching returns the score, so this one does need print().
print(compute_stage_matching(generation_doc_list, reference_stage_label))

BLEU = 14.85 49.9/21.1/10.9/6.4 (BP = 0.902 ratio = 0.907 hyp_len = 183233 ref_len = 202049)
Rouge-L Score: 0.3990109247049082
None


100%|██████████| 2000/2000 [00:29<00:00, 67.62it/s]

0.5987462468087469



