# 0. imports, definitions, processing, hyperparameters


In [None]:
import pandas as pd
import os
import random
import re
import json
import sys
import time
import math
import pprint
import numpy as np
import torch as T
import openai
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler
from transformers import GPT2LMHeadModel,  GPT2Tokenizer, GPT2Config
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import PhrasalConstraint
from transformers import pipeline, set_seed
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from word_forms.word_forms import get_word_forms

import rouge 
rs = rouge.Rouge()

import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk.data
nltk_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

import logging
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)

import spacy
spacy.prefer_gpu()

pos_tagger = spacy.load('en_core_web_sm')
ps = PorterStemmer()
rs = rouge.Rouge()

template_len = 4
batch_size = 32
sample_every = 100
epochs = 8
learning_rate = 5e-5
warmup_steps = 1e2
epsilon = 1e-8
max_length = 128
device = T.device("cuda")

SEP = '[SEP]'
PAD = '[PAD]'
BOS = '[BOS]'
EOS = '[EOS]'

starting_texts = [
                ## template from this paper https://dl.acm.org/doi/abs/10.1145/3409256.3409817#:~:text=Recent%20research%20on%20conversational%20search%20highlights%20the%20importance,lexical%20baseline%2Cthat%20significantly%20outperforms%20the%20existing%20naive%20baselines.
                "[SEP] are you looking for",
                "[SEP] do you want to know",
                "[SEP] would you like to",
                "[SEP] are you interested in",
                "[SEP] do you need information",
                "[SEP] do you want information",
                "[SEP] do you need to",
                "[SEP] do you want to",
            ]

openai.api_key = ''
gpt3_examples = ["Find condos in Florida. Ask a question that contains words in the list ['specific', 'city']. Are you interested in any specific city in florida?",
        "What should I know about living in India? Ask a question that contains words in the list ['challenges']. Would you like to know about the economic challenges of living in India?"
        "Tell me more about Euclid. Ask a question that contains words in the list ['greece', 'math']. would you like to know what impact Euclid had on mathematics in ancient Greece?"
]

seed_val = 2022
random.seed(seed_val)
np.random.seed(seed_val)
T.manual_seed(seed_val)
T.cuda.manual_seed_all(seed_val)
set_seed(seed_val)

tokenizer = GPT2Tokenizer.from_pretrained('gpt2', bos_token=BOS, eos_token=EOS, pad_token=PAD)
configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=False, bos_token=BOS, eos_token=EOS, pad_token=PAD)
model = GPT2LMHeadModel.from_pretrained("gpt2", config=configuration) 
model.resize_token_embeddings(len(tokenizer))
model.cuda()

ppl_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
ppl_configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=False)
ppl_model = GPT2LMHeadModel.from_pretrained("gpt2", config=ppl_configuration) 
ppl_model.resize_token_embeddings(len(ppl_tokenizer))
ppl_model.cuda()

## create train/test split for reproduction, RUN ONLY ONCE!
if not (os.path.exists('data/question_cases_answered_test.csv') and os.path.exists('data/question_cases_answered_train_dev.csv')):
    print("Generating Usi train/test split for reproduction.")
    usi_train_file = 'data/question_cases_answered.csv'
    usi_train_data = pd.read_csv(usi_train_file) 

    topic_id_set = set(usi_train_data['topic_id'])
    test_topic = random.sample(topic_id_set, int(0.2 * len(topic_id_set)))

    usi_test = usi_train_data.loc[usi_train_data['topic_id'].isin(test_topic)]
    usi_train_dev = usi_train_data.drop(usi_test.index)

    usi_test = usi_test.sort_values(by='topic_id')
    usi_train_dev = usi_train_dev.sort_values(by='topic_id')

    usi_test.to_csv('data/question_cases_answered_test.csv', index=False)
    usi_train_dev.to_csv('data/question_cases_answered_train_dev.csv', index=False)

## create train/test split for reproduction, RUN ONLY ONCE!
if not (os.path.exists('data/clariq_f/ClariQ-FKw-trial.tsv') and os.path.exists('data/clariq_f/ClariQ-FKw-train_no_trial.tsv') ):
    print("Generating clariq-f train/test split for reproduction.")
    train_file = 'data/clariq_f/ClariQ-FKw-train.tsv'
    train_data = pd.read_csv(train_file, sep='\t') 

    topic_id_set = set(train_data['topic_id'])
    random.seed(17)
    topics = random.sample(topic_id_set, int(0.056 * len(topic_id_set)))

    trial_data = train_data.loc[train_data['topic_id'].isin(topics)]
    train_no_trial = train_data.drop(trial_data.index)

    trial_data = trial_data.sort_values(by='topic_id')
    train_no_trial = train_no_trial.sort_values(by='topic_id')

    trial_data.to_csv('data/clariq_f/ClariQ-FKw-trial.tsv', index=False, sep = '\t')
    train_no_trial.to_csv('data/clariq_f/ClariQ-FKw-train_no_trial.tsv', index=False, sep = '\t')

def process_clariq_f(data):
    data_dict = {}
    data = data.dropna(subset=['question', 'initial_request'])
    for iter, row in data.iterrows():
        q = str(data.at[iter, 'initial_request'])
        cq = str(data.at[iter, 'question'])
        f = str(data.at[iter, 'facet_desc'])

        data.at[iter, 'f_q'] = f + SEP + q
        data.at[iter, 'f_q_cq'] = f + SEP + q + BOS + cq + EOS
        data.at[iter, 'q_f'] = q + SEP + f
        data.at[iter, 'q_f_cq'] = q + SEP + f + BOS + cq + EOS
        data.at[iter, 'instructional_q_f_cq'] = q + ' '+ "Ask a question that contains words in the list" + ' ' + "[" + ", ".join(["'"+w+"'" for w in f.split()])  + '].' + ' ' + cq
        data.at[iter, 'instructional_q_f'] = q + ' '+ "Ask a question that contains words in the list" + ' ' + "[" + ", ".join(["'"+w+"'" for w in f.split()])  + '].'
        
    return data_dict, data

def compute_average_rouge(rouge_list):
    '''
    this function computes the average rouge f,p,r of a list of rouge scores
    rouge_list is a list of dictionaries in the following format:
    {
        "rouge-1": {
            "f": 0.4786324739396596,
            "p": 0.6363636363636364,
            "r": 0.3835616438356164
            },
        "rouge-2": {
            "f": 0.2608695605353498,
            "p": 0.3488372093023256,
            "r": 0.20833333333333334
            },
        "rouge-l": {
            "f": 0.44705881864636676,
            "p": 0.5277777777777778,
            "r": 0.3877551020408163
            }
    }
    '''
    # if length of rouge_list is 1 or it is not cast as a list of dicts
    if isinstance(rouge_list, dict):
        return rouge_list
    r_dict = {
        "rouge-1": {
            "f": 0,
            "p": 0,
            "r": 0
            },
        "rouge-2": {
            "f": 0,
            "p": 0,
            "r": 0
            },
        "rouge-l": {
            "f": 0,
            "p": 0,
            "r": 0
            }
    }
    for d in rouge_list:
        for k_len in d.keys():
            for k in d[k_len].keys():
                r_dict[k_len][k] += d[k_len][k]
    n_hyps = len(rouge_list)
    for k_len in r_dict.keys():
        for k in r_dict[k_len].keys():
            r_dict[k_len][k] /= n_hyps
    return r_dict

def calculatePerplexity(sentence,model,tokenizer):
    tokenize_input = tokenizer.tokenize(sentence)
    tensor_input = T.tensor([tokenizer.convert_tokens_to_ids(tokenize_input)]).to(device)
    loss=model(tensor_input, labels=tensor_input)[0]
    return math.exp(loss)

def process_generation(generation):
    processed_generation = re.sub(r'\[SEP\]',' ', generation)  # remove [SEP]
    processed_generation = re.sub(r'[^\w\s]',' ', processed_generation)  # remove punctuation
    return processed_generation

def calculate_WSDM(query, doc_list):
    lambda_t = 1
    lambda_o = 1
    lambda_u = 1
    mu = 25
    collection = ' '.join(doc_list)
    collection_size = len(collection)

    def tfq(word, doc):
        many_forms = get_word_forms(word)
        word_forms = [word for k in many_forms.keys() for word in many_forms[k]] 
        return sum( [sum([1 if w == wf else 0 for w in doc]) for wf in word_forms])

    def tf1(qk, qk1, doc):    
        many_formsk = get_word_forms(qk)
        word_formsk = list(set([word for k in many_formsk.keys() for word in many_formsk[k]] + [qk])) 
        many_formsk1 = get_word_forms(qk1)
        word_formsk1 = list(set([word for k in many_formsk1.keys() for word in many_formsk1[k]] + [qk1]))
        return sum( [sum([1 if qkf == doc[k] and qk1f == doc[k+1] else 0 for k in range(len(doc)-1)]) for qkf in word_formsk for qk1f in word_formsk1])
    
    def tfuw(qk, qj, doc):
        wsz = 2
        many_formsk = get_word_forms(qk)
        word_formsk = list(set([word for k in many_formsk.keys() for word in many_formsk[k]] + [qk])) 
        many_formsj = get_word_forms(qj)
        word_formsj = list(set([word for k in many_formsj.keys() for word in many_formsj[k]] + [qj])) 
        return sum( [sum([1 if qkf == doc[k] and qjf in doc[max(k-wsz,0):min(k+wsz,len(doc))] else 0 for k in range(len(doc))]) for qkf in word_formsk for qjf in word_formsj])

    def f_t(query, doc, collection):
        res = sum([(tfq(word, doc.split()) + mu * tfq(word, collection.split())/collection_size) / (len(doc.split()) + mu) for word in query.split() ])
        #print(query, doc)
        #print("ft", res)
        return res
    
    def f_o(query, doc, collection):
        query = query.split()
        if len(query) < 2:
            return 0
        res = sum([(tf1(query[k], query[k+1], doc.split()) + mu * tf1(query[k], query[k+1], collection.split())/collection_size) / (len(doc.split()) + mu)  for k in range(len(query)-1)])
        
        #print(query, doc)
        #print("fo", res)
        return res

    def f_u(query, doc, collection):
        query = list(set(query.split()))
        l = len(query)
        if l < 2:
            return 0
        res = sum([(tfuw(query[k], query[j], doc.split()) + mu * tfuw(query[k], query[j], collection.split())/collection_size) / (len(doc.split()) + mu)  for k in range(l) for j in range(k+1, l)])
        #print(query, doc)
        #print("fu", res)
        return res

    return {
        doc:lambda_t * f_t(query, doc, collection) + \
            lambda_o * f_o(query, doc, collection) + \
            lambda_u * f_u(query, doc, collection) 
        for doc in doc_list
    }

def round_metric(num):
    return round(num * 100, 2)

def auto_evaluation(ref, hyp, facet):
    ref = ref.strip()
    hyp = hyp.strip()
    tokenized_ref = word_tokenize(ref)
    tokenized_hyp = word_tokenize(hyp)

    rouge_score = rs.get_scores(hyp, ref)[0]['rouge-l']['f'] if hyp != '' else 0

    return sentence_bleu([tokenized_ref], 
                            tokenized_hyp, 
                            weights=(1, 0, 0, 0),
                            smoothing_function = SmoothingFunction().method1),\
            sentence_bleu([tokenized_ref], 
                            tokenized_hyp, 
                            weights=(1, 1, 0, 0),
                            smoothing_function = SmoothingFunction().method1),\
            sentence_bleu([tokenized_ref], 
                            tokenized_hyp, 
                            weights=(1, 1, 1, 0),
                            smoothing_function = SmoothingFunction().method1),\
            sentence_bleu([tokenized_ref], 
                            tokenized_hyp, 
                            weights=(1, 1, 1, 1),
                            smoothing_function = SmoothingFunction().method1),\
            meteor_score([' '.join(tokenized_ref)], ' '.join(tokenized_hyp)),\
            rouge_score,\
            1 - sum([1 if ps.stem(constraint) not in set([ps.stem(w) for w in word_tokenize(hyp)]) else 0 for constraint in facet.split() ]) / len(facet.split())

def evaluate_from_output(model_output):
    
    b1, b2, b3, b4 = [], [], [], []
    m = []
    r = []
    c = []

    t_b1, t_b2, t_b3, t_b4 = [], [], [], []
    t_m = []
    t_r = []
    t_c = []

    model_output_data = pd.read_csv(model_output)
    for iter, row in model_output_data.iterrows():
        query = model_output_data.at[iter, 'query']
        facet = model_output_data.at[iter, 'facet']
        ref = model_output_data.at[iter, 'reference']
        generated_cq = model_output_data.at[iter, 'candidate']
        
        if iter % sample_every == 0: 
            print(iter, query, "-", facet, '-', generated_cq)
        
        # full reference evaluation
        hyp_b1, hyp_b2, hyp_b3, hyp_b4, hyp_m, hyp_r, hyp_c = auto_evaluation(ref, generated_cq, facet)

        b1.append(hyp_b1)
        b2.append(hyp_b2)
        b3.append(hyp_b3)
        b4.append(hyp_b4)
        m.append(hyp_m)
        r.append(hyp_r)
        c.append(hyp_c)

        # question body evaluation
        truncate_ref = ' '.join(ref.split()[template_len:])
        truncate_generated_cq = ' '.join(generated_cq.split()[template_len:])
        
        t_hyp_b1, t_hyp_b2, t_hyp_b3, t_hyp_b4, t_hyp_m, t_hyp_r, t_hyp_c = auto_evaluation(truncate_ref, truncate_generated_cq, facet)

        t_b1.append(t_hyp_b1)
        t_b2.append(t_hyp_b2)
        t_b3.append(t_hyp_b3)
        t_b4.append(t_hyp_b4)
        t_m.append(t_hyp_m)
        t_r.append(t_hyp_r)
        t_c.append(t_hyp_c)
    
    return b1, b2, b3, b4, m, r, c,\
           t_b1, t_b2, t_b3, t_b4, t_m, t_r, t_c

: 

# RQ1.  How effective is facet information for clarifying question generation?

To answer this question, we compare our proposed zero-shot facet-constrained approach with a similar method but using query subject instead of facet for constraints.

## 1.1 No-facet (query subject) + neurologic decoding + WSDM ranker
### 1.1.1 Generate inputs for neurologic decoding. 

* Generate constraints file from query subjects
* Generate generation inputs file in the form of {query} + {template}

In [2]:
from word_forms.word_forms import get_word_forms
facet_test_file = 'data/clariq_f/ClariQ-FKw-dev.tsv'
facet_test_data = pd.read_csv(facet_test_file, sep='\t')

write_to_file = 'neurologic_decoding/dataset/clean/constraint/test.constraint.json'
prompt_write_to_file = 'neurologic_decoding/dataset/clean/init/commongen.test.init.txt'
no_prompt_write_to_file = 'neurologic_decoding/dataset/clean/init/commongen_no_prompt.test.init.txt'

pos_tagger = spacy.load('en_core_web_sm')

all_queries= []
all_constraints = []
for iter, row in facet_test_data.iterrows():
    facet = facet_test_data.at[iter, 'facet_desc']
    query = facet_test_data.at[iter, 'initial_request']
    noun_in_query = [token.text for token in pos_tagger(query) if token.pos_ == 'NOUN']
    propn_in_query = ' '.join([token.text.lower() for token in pos_tagger(query) if token.pos_ == 'PROPN'])
    all_queries.append(query)
    constraints = [[term] for term in noun_in_query]
    if propn_in_query != '':
        constraints += [[propn_in_query]] 
    all_constraints.append(constraints)


with open(write_to_file, 'w') as output:
    for constraints in all_constraints:
        for k, prompt in enumerate(starting_texts):
            json_str = json.dumps(constraints)
            output.write(json_str)
            output.write('\n')

with open(prompt_write_to_file, 'w') as output:
    for query in all_queries:
        for k, prompt in enumerate(starting_texts):
            output.write(query + prompt)
            output.write('\n')

with open(no_prompt_write_to_file, 'w') as output:
    for query in all_queries:
        output.write(query)
        output.write('\n')

### 1.1.2 Run neurologic decoding.
In AML terminal:

Set up neurologic decoding environment.
```
cd neurologic_decoding
conda create -n hug python=3.7
conda activate hug
pip install -r huggingface.txt
```
Run the generation code.
```
cd neurologic_decoding/zero_shot
conda activate hug
export PYTHONPATH=/home/azureuser/cloudfiles/code/Users/t-zhendwang/srconvsearch/neurologic_decoding
bash decode_pt.sh 0 test gpt2nofacet
``` 

Make sure we get the generation file 'gpt2nofacet'.

### 1.1.3 Evaluate

In [3]:
facet_test_file = 'data/clariq_f/ClariQ-FKw-dev.tsv'
facet_test_data = pd.read_csv(facet_test_file, sep='\t')
_, facet_test_data = process_clariq_f(facet_test_data)

r = []
b1, b2, b3, b4 = [], [], [], []
m = []
c = []

t_r = []
t_b1, t_b2, t_b3, t_b4 = [], [], [], []
t_m = []
t_c = []

model_output = 'zeroshot_subject_nd_wsdm.csv'
generated_file = 'neurologic_decoding/zero_shot/gpt2nofacet'

if os.path.isfile(model_output):
    b1, b2, b3, b4, m, r, c, t_b1, t_b2, t_b3, t_b4, t_m, t_r, t_c = evaluate_from_output(model_output)

else:
    generated_cq_all_templates = open(generated_file, 'r').readlines()
    generated_cq_grouped = [[generated_cq_all_templates[len(starting_texts) * k + l] 
                                for l in range(len(starting_texts))] 
                                for k in range(int(len(generated_cq_all_templates)/8))]
    for iter, row in facet_test_data.iterrows():
        query = facet_test_data.at[iter, 'initial_request']
        facet = facet_test_data.at[iter, 'facet_desc']
        ref = facet_test_data.at[iter, 'question']

        generated_cqs = []
        for full_sentence in generated_cq_grouped[iter]:
            query = re.sub('\[SEP\]', '&', full_sentence).split('&')[0].strip()
            generated_follow_up = re.sub('\[SEP\]', '&', full_sentence).split('&')[1].strip()
            generated_cq = re.sub('[.?]', '&', generated_follow_up).split('&')[0].strip()
            generated_cqs.append(generated_cq)
        
        noun_in_query = [token.text for token in pos_tagger(query) if token.pos_ == 'NOUN']
        propn_in_query = [token.text.lower() for token in pos_tagger(query) if token.pos_ == 'PROPN']

        template_scores = calculate_WSDM(query=' '.join(noun_in_query+propn_in_query), doc_list=generated_cqs)
        sorted_template_scores = sorted(template_scores.items(), key = lambda x: x[1], reverse=True)
        generated_cq = sorted(template_scores.keys(), key = lambda x: template_scores[x], reverse=True)[0] # Tie breaker? 
        facet_test_data.at[iter, 'generated'] = generated_cq
        
        if iter % sample_every == 0: 
            print(iter, query, "-", facet, '-', generated_cq)
            #pprint.pprint(sorted_template_scores)

        # full reference evaluation
        hyp_b1, hyp_b2, hyp_b3, hyp_b4, hyp_m, hyp_r, hyp_c = auto_evaluation(ref, generated_cq, facet)

        b1.append(hyp_b1)
        b2.append(hyp_b2)
        b3.append(hyp_b3)
        b4.append(hyp_b4)
        m.append(hyp_m)
        r.append(hyp_r)
        c.append(hyp_c)

        # question body evaluation
        truncate_ref = ' '.join(ref.split()[template_len:])
        truncate_generated_cq = ' '.join(generated_cq.split()[template_len:])
        
        t_hyp_b1, t_hyp_b2, t_hyp_b3, t_hyp_b4, t_hyp_m, t_hyp_r, t_hyp_c = auto_evaluation(truncate_ref, truncate_generated_cq, facet)

        t_b1.append(t_hyp_b1)
        t_b2.append(t_hyp_b2)
        t_b3.append(t_hyp_b3)
        t_b4.append(t_hyp_b4)
        t_m.append(t_hyp_m)
        t_r.append(t_hyp_r)
        t_c.append(t_hyp_c)
    
    output_df = facet_test_data[['initial_request', 'facet_desc', 'question', 'generated']]
    output_df.columns = ['query', 'facet', 'reference', 'candidate']
    output_df.to_csv(model_output)

# full reference results
print("================================================================")
print("Full reference evaluation")
print("================================================================")
print("b1", np.mean(b1), "b2", np.mean(b2), "b3", np.mean(b3), "b4", np.mean(b4))
print("rouge-L", np.mean(r))
print("m", np.mean(m))
print("c", np.mean(c))

zero_subject_nd_b1 = np.mean(b1)
zero_subject_nd_b2 = np.mean(b2)
zero_subject_nd_b3 = np.mean(b3)
zero_subject_nd_b4 = np.mean(b4)
zero_subject_nd_m = np.mean(m)
zero_subject_nd_r = np.mean(r)
zero_subject_nd_c = np.mean(c)

# question body results
print("================================================================")
print("Question body evaluation")
print("================================================================")
print("b1", np.mean(t_b1), "b2", np.mean(t_b2), "b3", np.mean(t_b3), "b4", np.mean(t_b4))
print("rouge-L", np.mean(t_r))
print("m", np.mean(t_m))
print("c", np.mean(t_c))

t_zero_subject_nd_b1 = np.mean(t_b1)
t_zero_subject_nd_b2 = np.mean(t_b2)
t_zero_subject_nd_b3 = np.mean(t_b3)
t_zero_subject_nd_b4 = np.mean(t_b4)
t_zero_subject_nd_m = np.mean(t_m)
t_zero_subject_nd_r = np.mean(t_r)
t_zero_subject_nd_c = np.mean(t_c)

0 tell me about cass county missouri - list homes sale - do you want to know what cass county cass county is cass county
100 Find information on ontario california airport. - directions location - would you like to visit ontario ontario information ontario
200 Where can I buy pressure washers? - washer - are you looking for washers or pressure washer
300 Tell me more about Rocky Mountain News - recent events historical - are you looking for news about rocky mountain news
400 Where should I order dog clean-up bags - specif bag type - would you like to order dog cleaning bags
Full reference evaluation
b1 0.2915715948377834 b2 0.12806654260239403 b3 0.07602362595745463 b4 0.051360580789430024
rouge-L 0.34784465264878195
m 0.28592940767560077
c 0.09819607843137255
Question body evaluation
b1 0.1451359435328646 b2 0.043740224796047354 b3 0.020391620599505958 b4 0.025673300004377137
rouge-L 0.1939367199929307
m 0.14507277229967497
c 0.09447058823529413


## 1.2 Using facet
### 1.2.1 Generate inputs for neurologic decoding. 

In [4]:
from word_forms.word_forms import get_word_forms
facet_test_file = 'data/clariq_f/ClariQ-FKw-dev.tsv'
facet_test_data = pd.read_csv(facet_test_file, sep='\t')

write_to_file = 'neurologic_decoding/dataset/clean/constraint/test.constraint.json'
prompt_write_to_file = 'neurologic_decoding/dataset/clean/init/commongen.test.init.txt'
no_prompt_write_to_file = 'neurologic_decoding/dataset/clean/init/commongen_no_prompt.test.init.txt'

pos_tagger = spacy.load('en_core_web_sm')

all_queries= []
all_constraints = []
for iter, row in facet_test_data.iterrows():
    facet = facet_test_data.at[iter, 'facet_desc']
    query = facet_test_data.at[iter, 'initial_request']
    noun_in_query = [token.text for token in pos_tagger(query) if token.pos_ == 'NOUN']
    propn_in_query = [token.text for token in pos_tagger(query) if token.pos_ == 'PROPN']
    all_queries.append(query)

    constraints = [[term] for term in facet.split()]
    #if propn_in_query != []: constraints += [[' '.join(propn_in_query)]]
    #for facet_word in facet.split():
    #for facet_word in facet.split():
    #    many_forms = get_word_forms(facet_word)
    #    constraints.append(list(set([word for k in many_forms.keys() for word in many_forms[k] ]+[facet_word])))
    all_constraints.append(constraints)

with open(write_to_file, 'w') as output:
    for constraints in all_constraints:
        for k, prompt in enumerate(starting_texts):
            json_str = json.dumps(constraints)
            output.write(json_str)
            output.write('\n')

with open(prompt_write_to_file, 'w') as output:
    for query in all_queries:
        for k, prompt in enumerate(starting_texts):
            output.write(query + prompt)
            output.write('\n')

with open(no_prompt_write_to_file, 'w') as output:
    for query in all_queries:
        output.write(query)
        output.write('\n')

### 1.2.2 Run neurologic decoding.
In a terminal:

Run the generation code.
```
cd neurologic_decoding/zero
conda activate hug
export PYTHONPATH=/home/azureuser/cloudfiles/code/Users/t-zhendwang/srconvsearch/neurologic_decoding
bash decode_pt.sh 0 test gpt2facet
``` 

Make sure we get the generation file 'gpt2facet'.

### 1.2.3 Evaluate

In [5]:
facet_test_file = 'data/clariq_f/ClariQ-FKw-dev.tsv'
facet_test_data = pd.read_csv(facet_test_file, sep='\t')
_, facet_test_data = process_clariq_f(facet_test_data)

b1, b2, b3, b4 = [], [], [], []
m = []
r = []
c = []

t_b1, t_b2, t_b3, t_b4 = [], [], [], []
t_m = []
t_r = []
t_c = []

model_output = 'zeroshot_nd_wsdm.csv'
generated_file = 'neurologic_decoding/zero_shot/gpt2facet'

if os.path.isfile(model_output):
    b1, b2, b3, b4, m, r, c, t_b1, t_b2, t_b3, t_b4, t_m, t_r, t_c = evaluate_from_output(model_output)

else:
    generated_cq_all_templates = open(generated_file, 'r').readlines()
    generated_cq_grouped = [[generated_cq_all_templates[len(starting_texts) * k + l] 
                                for l in range(len(starting_texts))] 
                                for k in range(int(len(generated_cq_all_templates)/8))]
    for iter, row in facet_test_data.iterrows():
        facet = facet_test_data.at[iter, 'facet_desc']
        ref = facet_test_data.at[iter, 'question']
        tokenized_ref = word_tokenize(ref)
        facet_list = facet.split()

        generated_cqs = []
        for full_sentence in generated_cq_grouped[iter]:
            query = re.sub('\[SEP\]', '&', full_sentence).split('&')[0].strip()
            generated_follow_up = re.sub('\[SEP\]', '&', full_sentence).split('&')[1].strip()
            generated_cq = re.sub('[.?]', '&', generated_follow_up).split('&')[0].strip()
            generated_cqs.append(generated_cq)
        
        noun_in_query = [token.text for token in pos_tagger(query) if token.pos_ == 'NOUN']
        propn_in_query = [token.text.lower() for token in pos_tagger(query) if token.pos_ == 'PROPN']

        template_scores = calculate_WSDM(query=' '.join(noun_in_query+propn_in_query+facet_list), doc_list=generated_cqs)
        sorted_template_scores = sorted(template_scores.items(), key = lambda x: x[1], reverse=True)
        generated_cq = sorted(template_scores.keys(), key = lambda x: template_scores[x], reverse=True)[0] 
        facet_test_data.at[iter, 'generated'] = generated_cq
        tokenized_hyp = word_tokenize(facet_test_data.at[iter, 'generated'])
        
        if iter % sample_every == 0: 
            print(iter, query, "-", facet, '-', generated_cq)
            #pprint.pprint(sorted_template_scores)

        # full reference evaluation
        hyp_b1, hyp_b2, hyp_b3, hyp_b4, hyp_m, hyp_r, hyp_c = auto_evaluation(ref, generated_cq, facet)

        b1.append(hyp_b1)
        b2.append(hyp_b2)
        b3.append(hyp_b3)
        b4.append(hyp_b4)
        m.append(hyp_m)
        r.append(hyp_r)
        c.append(hyp_c)

        # question body evaluation
        truncate_ref = ' '.join(ref.split()[template_len:])
        truncate_generated_cq = ' '.join(generated_cq.split()[template_len:])
        
        t_hyp_b1, t_hyp_b2, t_hyp_b3, t_hyp_b4, t_hyp_m, t_hyp_r, t_hyp_c = auto_evaluation(truncate_ref, truncate_generated_cq, facet)

        t_b1.append(t_hyp_b1)
        t_b2.append(t_hyp_b2)
        t_b3.append(t_hyp_b3)
        t_b4.append(t_hyp_b4)
        t_m.append(t_hyp_m)
        t_r.append(t_hyp_r)
        t_c.append(t_hyp_c)

    output_df = facet_test_data[['initial_request', 'facet_desc', 'question', 'generated']]
    output_df.columns = ['query', 'facet', 'reference', 'candidate']
    output_df.to_csv('zeroshot_nd_wsdm.csv')

# full reference results
print("================================================================")
print("Full reference evaluation")
print("================================================================")
print("b1", np.mean(b1), "b2", np.mean(b2), "b3", np.mean(b3), "b4", np.mean(b4))
print("rouge-L", np.mean(r))
print("m", np.mean(m))
print("c", np.mean(c))

zero_nd_wsdm_b1 = np.mean(b1)
zero_nd_wsdm_b2 = np.mean(b2)
zero_nd_wsdm_b3 = np.mean(b3)
zero_nd_wsdm_b4 = np.mean(b4)
zero_nd_wsdm_m = np.mean(m)
zero_nd_wsdm_r = np.mean(r)
zero_nd_wsdm_c = np.mean(c)


# question body results
print("================================================================")
print("Question body evaluation")
print("================================================================")
print("b1", np.mean(t_b1), "b2", np.mean(t_b2), "b3", np.mean(t_b3), "b4", np.mean(t_b4))
print("rouge-L", np.mean(t_r))
print("m", np.mean(t_m))
print("c", np.mean(t_c))

t_zero_nd_wsdm_b1 = np.mean(t_b1)
t_zero_nd_wsdm_b2 = np.mean(t_b2)
t_zero_nd_wsdm_b3 = np.mean(t_b3)
t_zero_nd_wsdm_b4 = np.mean(t_b4)
t_zero_nd_wsdm_m = np.mean(t_m)
t_zero_nd_wsdm_r = np.mean(t_r)
t_zero_nd_wsdm_c = np.mean(t_c)

0 tell me about cass county missouri - list homes sale - do you need information on the homes sale list
100 Find information on ontario california airport. - directions location - do you want information and directions to your location
200 Where can I buy pressure washers? - washer - would you like to buy washer washers
300 Tell me more about Rocky Mountain News - recent events historical - are you interested in recent historical events
400 Where should I order dog clean-up bags - specif bag type - do you want to know specifical bag type
Full reference evaluation
b1 0.41802206389295793 b2 0.17567385672035452 b3 0.0970553332319387 b4 0.0644596743271716
rouge-L 0.4418867567785013
m 0.3851975009273996
c 0.9896078431372548
Question body evaluation
b1 0.3850744050543095 b2 0.18168457375984648 b3 0.1080585210854704 b4 0.08461452029751804
rouge-L 0.43791545548265426
m 0.3746952017378225
c 0.9866666666666666


## 1.3 Comparing 1.1 and 1.2, and get the conclusion of RQ1: "Facet is indeed very useful for clarifying question generation"

In [6]:
print("-----------------------------------------------------------------------------------------")
print("|                               Full reference evaluation                               |")
print("-----------------------------------------------------------------------------------------")
print("|{:<24} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}|".format('MODEL', 'BLEU1','BLEU2','BLEU3','BLEU4','METEOR','ROUGE','COVERAGE'))
print("-----------------------------------------------------------------------------------------")
print("|{:<24} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}|".format('No facet', 
                                                            round_metric(zero_subject_nd_b1), 
                                                            round_metric(zero_subject_nd_b2), 
                                                            round_metric(zero_subject_nd_b3), 
                                                            round_metric(zero_subject_nd_b4), 
                                                            round_metric(zero_subject_nd_m), 
                                                            round_metric(zero_subject_nd_r), 
                                                            round_metric(zero_subject_nd_c)))
print("-----------------------------------------------------------------------------------------")
print("|{:<24} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}|".format('With facet', 
                                                            round_metric(zero_nd_wsdm_b1), 
                                                            round_metric(zero_nd_wsdm_b2), 
                                                            round_metric(zero_nd_wsdm_b3), 
                                                            round_metric(zero_nd_wsdm_b4), 
                                                            round_metric(zero_nd_wsdm_m), 
                                                            round_metric(zero_nd_wsdm_r),
                                                            round_metric(zero_nd_wsdm_c)))
print("-----------------------------------------------------------------------------------------")

print("-----------------------------------------------------------------------------------------")
print("|                                Question body evaluation                               |")
print("-----------------------------------------------------------------------------------------")
print("|{:<24} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}|".format('MODEL', 'BLEU1','BLEU2','BLEU3','BLEU4','METEOR','ROUGE','COVERAGE'))
print("-----------------------------------------------------------------------------------------")
print("|{:<24} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}|".format('No facet', 
                                                            round_metric(t_zero_subject_nd_b1), 
                                                            round_metric(t_zero_subject_nd_b2), 
                                                            round_metric(t_zero_subject_nd_b3), 
                                                            round_metric(t_zero_subject_nd_b4), 
                                                            round_metric(t_zero_subject_nd_m), 
                                                            round_metric(t_zero_subject_nd_r), 
                                                            round_metric(t_zero_subject_nd_c)))
print("-----------------------------------------------------------------------------------------")
print("|{:<24} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}|".format('With facet', 
                                                            round_metric(t_zero_nd_wsdm_b1), 
                                                            round_metric(t_zero_nd_wsdm_b2), 
                                                            round_metric(t_zero_nd_wsdm_b3), 
                                                            round_metric(t_zero_nd_wsdm_b4), 
                                                            round_metric(t_zero_nd_wsdm_m), 
                                                            round_metric(t_zero_nd_wsdm_r),
                                                            round_metric(t_zero_nd_wsdm_c)))
print("-----------------------------------------------------------------------------------------")


-----------------------------------------------------------------------------------------
|                               Full reference evaluation                               |
-----------------------------------------------------------------------------------------
|MODEL                    BLEU1    BLEU2    BLEU3    BLEU4    METEOR   ROUGE    COVERAGE|
-----------------------------------------------------------------------------------------
|No facet                 29.16    12.81    7.6      5.14     28.59    34.78    9.82    |
-----------------------------------------------------------------------------------------
|With facet               41.8     17.57    9.71     6.45     38.52    44.19    98.96   |
-----------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------
|                                Question body evaluation                               |
----------

# RQ2. How does our zero-shot facet-constrained approach compare to existing facet-driven baselines?

To answer this research question, we include some existing methods and a few other reasonable solutions not mentioned by previous works as our baseline models. Some of them are zero-shot, while others are not. However, we still compare their performances altogether to demonstrate the power of our zero-shot approach.

## 2.1 Our approach.
The same as in Section 1.2.

## 2.2 Template append facet approach
This method appends facet words directly to question templates. This baseline is not ideal. Admittedly, it can generate good questions such as:

𝑞: "I am looking for information about South Africa."

𝑓 : "population"

𝑐𝑞: "Are you interested in \[population\]"

However, sometimes the case is the facet itself cannot form a meaningful question:

𝑞: "I am interested in poker tournaments."

𝑓 : "online"

𝑐𝑞: "Are you interested in \[online\]"

In [7]:
facet_test_file = 'data/clariq_f/ClariQ-FKw-dev.tsv'
facet_test_data = pd.read_csv(facet_test_file, sep='\t')
_, facet_test_data = process_clariq_f(facet_test_data)

b1, b2, b3, b4 = [], [], [], []
m = []
r = []
c = []

t_b1, t_b2, t_b3, t_b4 = [], [], [], []
t_m = []
t_r = []
t_c = []

model_output = 'template_facet.csv'

if os.path.isfile(model_output):
    b1, b2, b3, b4, m, r, c, t_b1, t_b2, t_b3, t_b4, t_m, t_r, t_c = evaluate_from_output(model_output)

else:
    for iter, row in facet_test_data.iterrows():
        facet = facet_test_data.at[iter, 'facet_desc']
        query = facet_test_data.at[iter, 'initial_request']
        ref = facet_test_data.at[iter, 'question']
        tokenized_ref = word_tokenize(ref)

        template_scores = {}
        for s_t in starting_texts:
            generated_cq = s_t + ' ' + facet
            generated_cq = process_generation(generated_cq)
            generated_cq = ' '.join(word_tokenize(generated_cq))
            template_scores[generated_cq] = calculatePerplexity(sentence=generated_cq, model=ppl_model, tokenizer=ppl_tokenizer)
        
        generated_cq = sorted(template_scores.keys(), key = lambda x: template_scores[x])[0] 
        facet_test_data.at[iter, 'generated'] = generated_cq

        if iter % sample_every == 0:
            print(iter, query, '-', facet, '-', generated_cq)
            #pprint.pprint(template_scores)
        
        # full reference evaluation
        hyp_b1, hyp_b2, hyp_b3, hyp_b4, hyp_m, hyp_r, hyp_c = auto_evaluation(ref, generated_cq, facet)

        b1.append(hyp_b1)
        b2.append(hyp_b2)
        b3.append(hyp_b3)
        b4.append(hyp_b4)
        m.append(hyp_m)
        r.append(hyp_r)
        c.append(hyp_c)

        # question body evaluation
        truncate_ref = ' '.join(ref.split()[template_len:])
        truncate_generated_cq = ' '.join(generated_cq.split()[template_len:])
        
        t_hyp_b1, t_hyp_b2, t_hyp_b3, t_hyp_b4, t_hyp_m, t_hyp_r, t_hyp_c = auto_evaluation(truncate_ref, truncate_generated_cq, facet)

        t_b1.append(t_hyp_b1)
        t_b2.append(t_hyp_b2)
        t_b3.append(t_hyp_b3)
        t_b4.append(t_hyp_b4)
        t_m.append(t_hyp_m)
        t_r.append(t_hyp_r)
        t_c.append(t_hyp_c)

    output_df = facet_test_data[['initial_request', 'facet_desc', 'question', 'generated']]
    output_df.columns = ['query', 'facet', 'reference', 'candidate']
    output_df.to_csv(model_output)

# full reference results
print("================================================================")
print("Full reference evaluation")
print("================================================================")
print("b1", np.mean(b1), "b2", np.mean(b2), "b3", np.mean(b3), "b4", np.mean(b4))
print("rouge-L", np.mean(r))
print("m", np.mean(m))
print("c", np.mean(c))

template_facet_b1 = np.mean(b1)
template_facet_b2 = np.mean(b2)
template_facet_b3 = np.mean(b3)
template_facet_b4 = np.mean(b4)
template_facet_m = np.mean(m)
template_facet_r = np.mean(r)
template_facet_c = np.mean(c)

# question body results
print("================================================================")
print("Question body evaluation")
print("================================================================")
print("b1", np.mean(t_b1), "b2", np.mean(t_b2), "b3", np.mean(t_b3), "b4", np.mean(t_b4))
print("rouge-L", np.mean(t_r))
print("m", np.mean(t_m))
print("c", np.mean(t_c))

t_template_facet_b1 = np.mean(t_b1)
t_template_facet_b2 = np.mean(t_b2)
t_template_facet_b3 = np.mean(t_b3)
t_template_facet_b4 = np.mean(t_b4)
t_template_facet_m = np.mean(t_m)
t_template_facet_r = np.mean(t_r)
t_template_facet_c = np.mean(t_c)

0 tell me about cass county missouri - list homes sale - do you want to know list homes sale
100 Find information on ontario california airport. - directions location - do you want to know directions location
200 Where can I buy pressure washers? - washer - would you like to washer
300 Tell me more about Rocky Mountain News - recent events historical - do you want to know recent events historical
400 Where should I order dog clean-up bags - specif bag type - do you want to know specif bag type
Full reference evaluation
b1 0.3855760869404698 b2 0.14068762090838577 b3 0.09981932597720988 b4 0.07806508022694261
rouge-L 0.4602337846738828
m 0.338690733001687
c 1.0
Question body evaluation
b1 0.2548890473046932 b2 0.05751338297329412 b3 0.03272406390161522 b4 0.03350955982568657
rouge-L 0.38575429251434623
m 0.2208677078751569
c 1.0


## 2.3 Finetuned GPT2 approach ([Previous SOTA by Sekulic](https://dl.acm.org/doi/abs/10.1145/3471158.3472257)), which uses inputs structured as:

## {facet} \[SEP\] {query} \[BOS\] {clarifying question} \[EOS\]

In [8]:
batch_size = 32
sample_every = 100
epochs = 8
learning_rate = 5e-5
warmup_steps = 1e2
epsilon = 1e-8
max_length = 128
prompt_instruction = ''

facet_test_file = 'data/clariq_f/ClariQ-FKw-dev.tsv'
facet_test_data = pd.read_csv(facet_test_file, sep='\t')
_, facet_test_data = process_clariq_f(facet_test_data)

b1, b2, b3, b4 = [], [], [], []
m = []
r = []
c = []

t_b1, t_b2, t_b3, t_b4 = [], [], [], []
t_m = []
t_r = []
t_c = []

model_output = 'sekulic.csv'

if os.path.isfile(model_output):
    b1, b2, b3, b4, m, r, c, t_b1, t_b2, t_b3, t_b4, t_m, t_r, t_c = evaluate_from_output(model_output)

else:
    clariq_f_train_file = 'data/clariq_f/ClariQ-FKw-train.tsv'
    clariq_f_train_data = pd.read_csv(clariq_f_train_file, sep='\t') 
    clariq_f_train_dict, clariq_f_train_data = process_clariq_f(clariq_f_train_data)
    clariq_f_train_text_list = clariq_f_train_data['f_q_cq']

    class GPT2Dataset(Dataset):
        def __init__(self, txt_list, tokenizer, gpt2_type="gpt2", max_length=768):
            self.tokenizer = tokenizer
            self.input_ids = []
            self.attn_masks = []
        
            print("training text example", txt_list[0])
            for txt in txt_list:
                encodings_dict = tokenizer(txt, truncation=True, max_length=max_length, padding="max_length")
                self.input_ids.append(T.tensor(encodings_dict['input_ids']))
                self.attn_masks.append(T.tensor(encodings_dict['attention_mask']))
        
        def __len__(self):
            return len(self.input_ids)
        
        def __getitem__(self, idx):
            return self.input_ids[idx], self.attn_masks[idx] 
            
    dataset = GPT2Dataset(clariq_f_train_text_list, tokenizer, max_length=max_length)

    train_size = int(0.99 * len(dataset))
    val_size = len(dataset) - train_size

    train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
    print('{:>5,} train /{:>5,} val'.format(train_size, val_size))

    train_dataloader = DataLoader(
        train_dataset,
        sampler = RandomSampler(train_dataset),
        batch_size = batch_size
        )

    validation_dataloader = DataLoader(
        val_dataset,
        sampler = SequentialSampler(val_dataset),
        batch_size = batch_size
        )

    device = T.device("cuda")
    model.cuda()

    optimizer = AdamW(model.parameters(),
        lr = learning_rate,
        eps = epsilon
        )

    total_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, 
        num_warmup_steps = warmup_steps, 
        num_training_steps = total_steps
        )

    training_stats = []

    model = model.to(device)

    for epoch_i in range(0, epochs):
        # ========================================
        #               Training
        # ========================================
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print('Training...')

        total_train_loss = 0

        model.train()

        for step, batch in enumerate(train_dataloader):

            b_input_ids = batch[0].to(device)
            b_labels = batch[0].to(device)
            b_masks = batch[1].to(device)

            model.zero_grad()        

            outputs = model(  b_input_ids,
                            labels=b_labels, 
                            attention_mask = b_masks,
                            token_type_ids=None
                            )
            loss = outputs[0]  

            batch_loss = loss.item()
            total_train_loss += batch_loss

            # Get sample every x batches.
            if step % sample_every == 0 and not step == 0:
                print('  Batch {:>5,}  of  {:>5,}. Loss: {:>5,}.'.format(step, len(train_dataloader), batch_loss))
                model.eval()
                model.train()

            loss.backward()
            optimizer.step()
            scheduler.step()

        # Calculate the average loss over all of the batches.
        avg_train_loss = total_train_loss / len(train_dataloader)       
        
        # Measure how long this epoch took.

        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))       
        # ========================================
        #               Validation
        # ========================================
        print("")
        print("Running Validation...")
        model.eval()
        total_eval_loss = 0
        nb_eval_steps = 0

        # Evaluate data for one epoch
        for batch in validation_dataloader:       
            b_input_ids = batch[0].to(device)
            b_labels = batch[0].to(device)
            b_masks = batch[1].to(device)
            
            with T.no_grad():        
                outputs  = model(b_input_ids, 
                                attention_mask = b_masks,
                                labels=b_labels)         
                loss = outputs[0]  
                
            batch_loss = loss.item()
            total_eval_loss += batch_loss        

        avg_val_loss = total_eval_loss / len(validation_dataloader)

        print("  Validation Loss: {0:.2f}".format(avg_val_loss))

        # Record all statistics from this epoch.
        training_stats.append(
            {
                'epoch': epoch_i + 1,
                'Training Loss': avg_train_loss,
                'Valid. Loss': avg_val_loss,
            }
        )

    print("")
    print("Training complete!")

    output_dir = './model_save/'+prompt_instruction+str(epochs)+'/'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    print("Saving model to %s" % output_dir)

    # Save a trained model, configuration and tokenizer using `save_pretrained()`.
    # They can then be reloaded using `from_pretrained()`
    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
    model_to_save.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    clariq_f_test_file = 'data/clariq_f/ClariQ-FKw-dev.tsv'
    clariq_f_test_data = pd.read_csv(clariq_f_test_file, sep='\t') 
    clariq_f_test_dict, clariq_f_test_data = process_clariq_f(clariq_f_test_data)

    rs = rouge.Rouge()
    rs_list = []
    b1, b2, b3, b4 = [], [], [], []
    m = []
    
    for iter, row in clariq_f_test_data.iterrows():
        query = clariq_f_test_data.at[iter, 'f_q']
        ref = clariq_f_test_data.at[iter, 'question']
        prompt_input = prompt_instruction + query
        prompt_input_BOS = prompt_input + BOS
        tokenized_prompt_input = T.tensor(tokenizer.encode(prompt_input_BOS)).unsqueeze(0)
        tokenized_prompt_input = tokenized_prompt_input.to(device)
        generated_text = ''
        generated_cq = ''
        attempt, max_attempt = 0, 4
        while generated_cq == '' and attempt <= max_attempt: # to ensure the generation is not empty
            attempt += 1
            sample_outputs = model.generate(
                tokenized_prompt_input,
                do_sample=True,   
                top_k=0, 
                max_length = len(tokenized_prompt_input[0]) + 10,
                top_p=0.9, 
                temperature = 0.7,
                num_return_sequences=1,
                pad_token_id=tokenizer.eos_token_id
                )

            generated_text = tokenizer.decode(sample_outputs[0], skip_special_tokens=True)
            generated_cq = generated_text[len(prompt_input):]
            
            clariq_f_test_data.at[iter, 'generated'] = process_generation(generated_cq)

            if generated_cq == '':
                generated_cq = 'nan'

        # full reference evaluation
        hyp_b1, hyp_b2, hyp_b3, hyp_b4, hyp_m, hyp_r, hyp_c = auto_evaluation(ref, generated_cq, facet)

        b1.append(hyp_b1)
        b2.append(hyp_b2)
        b3.append(hyp_b3)
        b4.append(hyp_b4)
        m.append(hyp_m)
        r.append(hyp_r)
        c.append(hyp_c)

        # question body evaluation
        truncate_ref = ' '.join(ref.split()[template_len:])
        truncate_generated_cq = ' '.join(generated_cq.split()[template_len:])
        
        t_hyp_b1, t_hyp_b2, t_hyp_b3, t_hyp_b4, t_hyp_m, t_hyp_r, t_hyp_c = auto_evaluation(truncate_ref, truncate_generated_cq, facet)

        t_b1.append(t_hyp_b1)
        t_b2.append(t_hyp_b2)
        t_b3.append(t_hyp_b3)
        t_b4.append(t_hyp_b4)
        t_m.append(t_hyp_m)
        t_r.append(t_hyp_r)
        t_c.append(t_hyp_c)

    output_df = clariq_f_test_data[['initial_request', 'facet_desc', 'question', 'generated']]
    output_df.columns = ['query', 'facet', 'reference', 'candidate']
    output_df.to_csv(model_output)

# full reference results
print("================================================================")
print("Full reference evaluation")
print("================================================================")
print("b1", np.mean(b1), "b2", np.mean(b2), "b3", np.mean(b3), "b4", np.mean(b4))
print("rouge-L", np.mean(r))
print("m", np.mean(m))
print("c", np.mean(c))

sekulic_b1 = np.mean(b1)
sekulic_b2 = np.mean(b2)
sekulic_b3 = np.mean(b3)
sekulic_b4 = np.mean(b4)
sekulic_m = np.mean(m)
sekulic_r = np.mean(r)
sekulic_c = np.mean(c)

# question body results
print("================================================================")
print("Question body evaluation")
print("================================================================")
print("b1", np.mean(t_b1), "b2", np.mean(t_b2), "b3", np.mean(t_b3), "b4", np.mean(t_b4))
print("rouge-L", np.mean(t_r))
print("m", np.mean(t_m))
print("c", np.mean(t_c))

t_sekulic_b1 = np.mean(t_b1)
t_sekulic_b2 = np.mean(t_b2)
t_sekulic_b3 = np.mean(t_b3)
t_sekulic_b4 = np.mean(t_b4)
t_sekulic_m = np.mean(t_m)
t_sekulic_r = np.mean(t_r)
t_sekulic_c = np.mean(t_c)

0 tell me about cass county missouri - list homes sale - are you looking for a specific list of the county homes sale in south america
100 Find information on ontario california airport. - directions location - do you want to know about the location of the ontario cal
200 Where can I buy pressure washers? - washer - do you want to know the difference between a vacuum was
300 Tell me more about Rocky Mountain News - recent events historical -  would you like to know about historical events that happened
400 Where should I order dog clean-up bags - specif bag type - would you like to know the type of bag
Full reference evaluation
b1 0.27745683326901827 b2 0.10785140305649912 b3 0.0603472849754279 b4 0.038306506703975875
rouge-L 0.3170536641435762
m 0.2855930527921367
c 0.20854901960784314
Question body evaluation
b1 0.16383397692101645 b2 0.05368179453201418 b3 0.026705204131886495 b4 0.023738660852511858
rouge-L 0.20138411844336415
m 0.18413326944858474
c 0.20207843137254902


## 2.4 Prompt-based finetuned GPT2 approach, which uses inputs structured as:

## {query} Ask a question that contains words in the list \[{facet}\] {clarifying question}

In [9]:
batch_size = 32
sample_every = 100
epochs = 8
learning_rate = 5e-5
warmup_steps = 1e2
epsilon = 1e-8
max_length = 128
prompt_instruction = ''

temperature = 0.1

facet_test_file = 'data/clariq_f/ClariQ-FKw-dev.tsv'
facet_test_data = pd.read_csv(facet_test_file, sep='\t')
_, facet_test_data = process_clariq_f(facet_test_data)

b1, b2, b3, b4 = [], [], [], []
m = []
r = []
c = []

t_b1, t_b2, t_b3, t_b4 = [], [], [], []
t_m = []
t_r = []
t_c = []

model_output = 'ftgpt2_prompt' + '_temp' + str(temperature) + '.csv'

if os.path.isfile(model_output):
    b1, b2, b3, b4, m, r, c, t_b1, t_b2, t_b3, t_b4, t_m, t_r, t_c = evaluate_from_output(model_output)

else:  
    print("Output file not found, generating output.") 
    model_dir = './model_save/'+str(epochs)+'/'
    if os.path.exists(model_dir):
        tokenizer = GPT2Tokenizer.from_pretrained(model_dir, bos_token=BOS, eos_token=EOS, pad_token=PAD) 
        configuration = GPT2Config.from_pretrained(model_dir, output_hidden_states=False)
        model = GPT2LMHeadModel.from_pretrained(model_dir, config=configuration)
        model.resize_token_embeddings(len(tokenizer))
        model.cuda()
    else:
        print("Model checkpoint not found, finetuning.")
        clariq_f_train_file = 'data/clariq_f/ClariQ-FKw-train.tsv'
        clariq_f_train_data = pd.read_csv(clariq_f_train_file, sep='\t') 
        clariq_f_train_dict, clariq_f_train_data = process_clariq_f(clariq_f_train_data)
        clariq_f_train_text_list = clariq_f_train_data['instructional_q_f_cq']

        class GPT2Dataset(Dataset):
            def __init__(self, txt_list, tokenizer, gpt2_type="gpt2", max_length=768):
                self.tokenizer = tokenizer
                self.input_ids = []
                self.attn_masks = []
            
                print("training text example", txt_list[0])
                for txt in txt_list:
                    encodings_dict = tokenizer(txt, truncation=True, max_length=max_length, padding="max_length")
                    self.input_ids.append(T.tensor(encodings_dict['input_ids']))
                    self.attn_masks.append(T.tensor(encodings_dict['attention_mask']))
            
            def __len__(self):
                return len(self.input_ids)
            
            def __getitem__(self, idx):
                return self.input_ids[idx], self.attn_masks[idx] 
            
        dataset = GPT2Dataset(clariq_f_train_text_list, tokenizer, max_length=max_length)

        train_size = int(0.99 * len(dataset))
        val_size = len(dataset) - train_size

        train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
        print('{:>5,} train /{:>5,} val'.format(train_size, val_size))

        train_dataloader = DataLoader(
        train_dataset,
            sampler = RandomSampler(train_dataset),
            batch_size = batch_size
        )

        validation_dataloader = DataLoader(
        val_dataset,
            sampler = SequentialSampler(val_dataset),
            batch_size = batch_size
        )

        optimizer = AdamW(model.parameters(),
            lr = learning_rate,
            eps = epsilon
        )

        total_steps = len(train_dataloader) * epochs
        scheduler = get_linear_schedule_with_warmup(optimizer, 
            num_warmup_steps = warmup_steps, 
            num_training_steps = total_steps
        )

        training_stats = []

        for epoch_i in range(0, epochs):
            # ========================================
            #               Training
            # ========================================
            print("")
            print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
            print('Training...')

            total_train_loss = 0

            model.train()

            for step, batch in enumerate(train_dataloader):

                b_input_ids = batch[0].to(device)
                b_labels = batch[0].to(device)
                b_masks = batch[1].to(device)

                model.zero_grad()        

                outputs = model(  b_input_ids,
                                labels=b_labels, 
                                attention_mask = b_masks,
                                token_type_ids=None
                                )
                loss = outputs[0]  

                batch_loss = loss.item()
                total_train_loss += batch_loss

                # Get sample every x batches.
                if step % sample_every == 0 and not step == 0:
                    print('  Batch {:>5,}  of  {:>5,}. Loss: {:>5,}.'.format(step, len(train_dataloader), batch_loss))
                    model.eval()
                    model.train()

                loss.backward()
                optimizer.step()
                scheduler.step()

            # Calculate the average loss over all of the batches.
            avg_train_loss = total_train_loss / len(train_dataloader)       
            
            # Measure how long this epoch took.

            print("")
            print("  Average training loss: {0:.2f}".format(avg_train_loss))       
            # ========================================
            #               Validation
            # ========================================
            print("")
            print("Running Validation...")
            model.eval()
            total_eval_loss = 0
            nb_eval_steps = 0

            # Evaluate data for one epoch
            for batch in validation_dataloader:       
                b_input_ids = batch[0].to(device)
                b_labels = batch[0].to(device)
                b_masks = batch[1].to(device)
                
                with T.no_grad():        
                    outputs  = model(b_input_ids, 
                                    attention_mask = b_masks,
                                    labels=b_labels)         
                    loss = outputs[0]  
                    
                batch_loss = loss.item()
                total_eval_loss += batch_loss        

            avg_val_loss = total_eval_loss / len(validation_dataloader)


            print("  Validation Loss: {0:.2f}".format(avg_val_loss))

            # Record all statistics from this epoch.
            training_stats.append(
                {
                    'epoch': epoch_i + 1,
                    'Training Loss': avg_train_loss,
                    'Valid. Loss': avg_val_loss,
                }
            )

        print("")
        print("Training complete!")

        output_dir = model_dir
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        print("Saving model to %s" % output_dir)

        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(output_dir)
        tokenizer.save_pretrained(output_dir)

    clariq_f_test_file = 'data/clariq_f/ClariQ-FKw-dev.tsv'
    clariq_f_test_data = pd.read_csv(clariq_f_test_file, sep='\t') 
    clariq_f_test_dict, clariq_f_test_data = process_clariq_f(clariq_f_test_data)

    rs = rouge.Rouge()
    rs_list = []
    b1, b2, b3, b4 = [], [], [], []
    m = []
    
    for iter, row in clariq_f_test_data.iterrows():
        query = clariq_f_test_data.at[iter, 'instructional_q_f']
        ref = clariq_f_test_data.at[iter, 'question']
        tokenized_input = T.tensor(tokenizer.encode(query)).unsqueeze(0).to(device)
        generated_text = ''
        generated_cq = ''
        
        sample_outputs = model.generate(
                tokenized_input,
                do_sample=True,   
                top_k=20, 
                max_length = len(tokenized_input[0]) + 32,
                top_p=0.9, 
                temperature = temperature,
                num_return_sequences=1,
                pad_token_id=tokenizer.eos_token_id
            )

        generated_text = tokenizer.decode(sample_outputs[0], skip_special_tokens=True)
        
        generated_cq = generated_text[len(query):].strip()
        generated_cq = re.sub('\[SEP\]', ' ', generated_cq).strip()
        generated_cq = re.sub('[.?]', '&', generated_cq).split('&')[0].strip()

        clariq_f_test_data.at[iter, 'generated'] = generated_cq

        if iter % sample_every == 0: 
            print(iter, query, '-', generated_cq)

        # full reference evaluation
        hyp_b1, hyp_b2, hyp_b3, hyp_b4, hyp_m, hyp_r, hyp_c = auto_evaluation(ref, generated_cq, facet)

        b1.append(hyp_b1)
        b2.append(hyp_b2)
        b3.append(hyp_b3)
        b4.append(hyp_b4)
        m.append(hyp_m)
        r.append(hyp_r)
        c.append(hyp_c)

        # question body evaluation
        truncate_ref = ' '.join(ref.split()[template_len:])
        truncate_generated_cq = ' '.join(generated_cq.split()[template_len:])
        
        t_hyp_b1, t_hyp_b2, t_hyp_b3, t_hyp_b4, t_hyp_m, t_hyp_r, t_hyp_c = auto_evaluation(truncate_ref, truncate_generated_cq, facet)

        t_b1.append(t_hyp_b1)
        t_b2.append(t_hyp_b2)
        t_b3.append(t_hyp_b3)
        t_b4.append(t_hyp_b4)
        t_m.append(t_hyp_m)
        t_r.append(t_hyp_r)
        t_c.append(t_hyp_c)

    output_df = clariq_f_test_data[['initial_request', 'facet_desc', 'question', 'generated']]
    output_df.columns = ['query', 'facet', 'reference', 'candidate']
    output_df.to_csv(model_output)


# full reference results
print("================================================================")
print("Full reference evaluation")
print("================================================================")
print("b1", np.mean(b1), "b2", np.mean(b2), "b3", np.mean(b3), "b4", np.mean(b4))
print("rouge-L", np.mean(r))
print("m", np.mean(m))
print("c", np.mean(c))

ftgpt2_prompt_b1 = np.mean(b1)
ftgpt2_prompt_b2 = np.mean(b2)
ftgpt2_prompt_b3 = np.mean(b3)
ftgpt2_prompt_b4 = np.mean(b4)
ftgpt2_prompt_m = np.mean(m)
ftgpt2_prompt_r = np.mean(r)
ftgpt2_prompt_c = np.mean(c)

# question body results
print("================================================================")
print("Question body evaluation")
print("================================================================")
print("b1", np.mean(t_b1), "b2", np.mean(t_b2), "b3", np.mean(t_b3), "b4", np.mean(t_b4))
print("rouge-L", np.mean(t_r))
print("m", np.mean(t_m))
print("c", np.mean(t_c))

t_ftgpt2_prompt_b1 = np.mean(t_b1)
t_ftgpt2_prompt_b2 = np.mean(t_b2)
t_ftgpt2_prompt_b3 = np.mean(t_b3)
t_ftgpt2_prompt_b4 = np.mean(t_b4)
t_ftgpt2_prompt_m = np.mean(t_m)
t_ftgpt2_prompt_r = np.mean(t_r)
t_ftgpt2_prompt_c = np.mean(t_c)

0 tell me about cass county missouri - list homes sale - are you looking for homes in the city of missouri
100 Find information on ontario california airport. - directions location - are you looking for directions to the nearest airport
200 Where can I buy pressure washers? - washer - are you looking for a pressure was are you looking for a washing machine or a washing machine
300 Tell me more about Rocky Mountain News - recent events historical - are you interested in historical events in the Rocky Mountain National park
400 Where should I order dog clean-up bags - specif bag type - are you looking for a type of bag for the dog that contains the specif referring to
Full reference evaluation
b1 0.3278795935788096 b2 0.1457659141540518 b3 0.08566811950120864 b4 0.05633487804157568
rouge-L 0.40809193442439157
m 0.37547626523857963
c 0.7254901960784315
Question body evaluation
b1 0.24126668552229252 b2 0.10373898361432152 b3 0.05732044392802645 b4 0.04493044603526353
rouge-L 0.32992141402

## 2.5 Comparing 2.1-2.4, and get the conclusion for RQ2. "Our zero-shot facet-constrained approach significantly improve baseline methods."

In [10]:
print("-----------------------------------------------------------------------------------------")
print("|                               Full reference evaluation                               |")
print("-----------------------------------------------------------------------------------------")
print("|{:<24} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}|".format('MODEL', 'BLEU1','BLEU2','BLEU3','BLEU4','METEOR','ROUGE','COVERAGE'))
print("-----------------------------------------------------------------------------------------")
print("|{:<24} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}|".format('Template append facet', 
                                                            round_metric(template_facet_b1), 
                                                            round_metric(template_facet_b2), 
                                                            round_metric(template_facet_b3), 
                                                            round_metric(template_facet_b4), 
                                                            round_metric(template_facet_m), 
                                                            round_metric(template_facet_r), 
                                                            round_metric(template_facet_c)))
print("-----------------------------------------------------------------------------------------")
print("|{:<24} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}|".format('Existing finetuned GPT2', 
                                                            round_metric(sekulic_b1), 
                                                            round_metric(sekulic_b2), 
                                                            round_metric(sekulic_b3), 
                                                            round_metric(sekulic_b4), 
                                                            round_metric(sekulic_m), 
                                                            round_metric(sekulic_r),
                                                            round_metric(sekulic_c)))
print("-----------------------------------------------------------------------------------------")
print("|{:<24} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}|".format('Prompt finetuned GPT2', 
                                                            round_metric(ftgpt2_prompt_b1), 
                                                            round_metric(ftgpt2_prompt_b2), 
                                                            round_metric(ftgpt2_prompt_b3), 
                                                            round_metric(ftgpt2_prompt_b4), 
                                                            round_metric(ftgpt2_prompt_m), 
                                                            round_metric(ftgpt2_prompt_r),
                                                            round_metric(ftgpt2_prompt_c)))
print("-----------------------------------------------------------------------------------------")
print("|{:<24} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}|".format('Ours', 
                                                            round_metric(zero_nd_wsdm_b1), 
                                                            round_metric(zero_nd_wsdm_b2), 
                                                            round_metric(zero_nd_wsdm_b3), 
                                                            round_metric(zero_nd_wsdm_b4), 
                                                            round_metric(zero_nd_wsdm_m), 
                                                            round_metric(zero_nd_wsdm_r),
                                                            round_metric(zero_nd_wsdm_c)))
print("-----------------------------------------------------------------------------------------")


print("-----------------------------------------------------------------------------------------")
print("|                                Question body evaluation                               |")
print("-----------------------------------------------------------------------------------------")
print("|{:<24} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}|".format('MODEL', 'BLEU1','BLEU2','BLEU3','BLEU4','METEOR','ROUGE','COVERAGE'))
print("-----------------------------------------------------------------------------------------")
print("|{:<24} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}|".format('Template append facet', 
                                                            round_metric(t_template_facet_b1), 
                                                            round_metric(t_template_facet_b2), 
                                                            round_metric(t_template_facet_b3), 
                                                            round_metric(t_template_facet_b4), 
                                                            round_metric(t_template_facet_m), 
                                                            round_metric(t_template_facet_r), 
                                                            round_metric(t_template_facet_c)))
print("-----------------------------------------------------------------------------------------")
print("|{:<24} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}|".format('Existing finetuned GPT2', 
                                                            round_metric(t_sekulic_b1), 
                                                            round_metric(t_sekulic_b2), 
                                                            round_metric(t_sekulic_b3), 
                                                            round_metric(t_sekulic_b4), 
                                                            round_metric(t_sekulic_m), 
                                                            round_metric(t_sekulic_r),
                                                            round_metric(t_sekulic_c)))
print("-----------------------------------------------------------------------------------------")
print("|{:<24} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}|".format('Prompt finetuned GPT2', 
                                                            round_metric(t_ftgpt2_prompt_b1), 
                                                            round_metric(t_ftgpt2_prompt_b2), 
                                                            round_metric(t_ftgpt2_prompt_b3), 
                                                            round_metric(t_ftgpt2_prompt_b4), 
                                                            round_metric(t_ftgpt2_prompt_m), 
                                                            round_metric(t_ftgpt2_prompt_r),
                                                            round_metric(t_ftgpt2_prompt_c)))
print("-----------------------------------------------------------------------------------------")
print("|{:<24} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}|".format('Ours', 
                                                            round_metric(t_zero_nd_wsdm_b1), 
                                                            round_metric(t_zero_nd_wsdm_b2), 
                                                            round_metric(t_zero_nd_wsdm_b3), 
                                                            round_metric(t_zero_nd_wsdm_b4), 
                                                            round_metric(t_zero_nd_wsdm_m), 
                                                            round_metric(t_zero_nd_wsdm_r),
                                                            round_metric(t_zero_nd_wsdm_c)))
print("-----------------------------------------------------------------------------------------")



-----------------------------------------------------------------------------------------
|                               Full reference evaluation                               |
-----------------------------------------------------------------------------------------
|MODEL                    BLEU1    BLEU2    BLEU3    BLEU4    METEOR   ROUGE    COVERAGE|
-----------------------------------------------------------------------------------------
|Template append facet    38.56    14.07    9.98     7.81     33.87    46.02    100.0   |
-----------------------------------------------------------------------------------------
|Existing finetuned GPT2  27.75    10.79    6.03     3.83     28.56    31.71    20.85   |
-----------------------------------------------------------------------------------------
|Prompt finetuned GPT2    32.79    14.58    8.57     5.63     37.55    40.81    72.55   |
-----------------------------------------------------------------------------------------
|Ours     

# RQ3. How does our question ranking model compare to other methods?

## 3.1 Perplexity

In [11]:
facet_test_file = 'data/clariq_f/ClariQ-FKw-dev.tsv'
facet_test_data = pd.read_csv(facet_test_file, sep='\t')
_, facet_test_data = process_clariq_f(facet_test_data)

r = []
b1, b2, b3, b4 = [], [], [], []
m = []
c = []

t_r = []
t_b1, t_b2, t_b3, t_b4 = [], [], [], []
t_m = []
t_c = []

model_output = 'zeroshot_nd_pp.csv'
generated_file = 'neurologic_decoding/zero_shot/gpt2facet'

if os.path.isfile(model_output):
    b1, b2, b3, b4, m, r, c, t_b1, t_b2, t_b3, t_b4, t_m, t_r, t_c = evaluate_from_output(model_output)

else:
    generated_cq_all_templates = open(generated_file, 'r').readlines()
    generated_cq_grouped = [[generated_cq_all_templates[len(starting_texts) * k + l] 
                                for l in range(len(starting_texts))] 
                                for k in range(int(len(generated_cq_all_templates)/8))]

    for iter, row in facet_test_data.iterrows():
        facet = facet_test_data.at[iter, 'facet_desc']
        ref = facet_test_data.at[iter, 'question']

        template_scores = {}
        for full_sentence in generated_cq_grouped[iter]:
            query = re.sub('\[SEP\]', '&', full_sentence).split('&')[0].strip()
            generated_follow_up = re.sub('\[SEP\]', '&', full_sentence).split('&')[1].strip()
            generated_cq = re.sub('[.?]', '&', generated_follow_up).split('&')[0].strip()
            constraint_penalty = 1
            for constraint in force_flexible:
                if ps.stem(constraint) not in set([ps.stem(w) for w in word_tokenize(generated_cq)]):
                    constraint_penalty *= 2
            template_scores[generated_cq] = calculatePerplexity(sentence=full_sentence, model=ppl_model, tokenizer=ppl_tokenizer) * constraint_penalty
        
        generated_cq = sorted(template_scores.keys(), key = lambda x: template_scores[x])[0] 
        facet_test_data.at[iter, 'generated'] = generated_cq
        
        # full reference evaluation
        hyp_b1, hyp_b2, hyp_b3, hyp_b4, hyp_m, hyp_r, hyp_c = auto_evaluation(ref, generated_cq, facet)

        b1.append(hyp_b1)
        b2.append(hyp_b2)
        b3.append(hyp_b3)
        b4.append(hyp_b4)
        m.append(hyp_m)
        r.append(hyp_r)
        c.append(hyp_c)

        # question body evaluation
        truncate_ref = ' '.join(ref.split()[template_len:])
        truncate_generated_cq = ' '.join(generated_cq.split()[template_len:])
        
        t_hyp_b1, t_hyp_b2, t_hyp_b3, t_hyp_b4, t_hyp_m, t_hyp_r, t_hyp_c = auto_evaluation(truncate_ref, truncate_generated_cq, facet)

        t_b1.append(t_hyp_b1)
        t_b2.append(t_hyp_b2)
        t_b3.append(t_hyp_b3)
        t_b4.append(t_hyp_b4)
        t_m.append(t_hyp_m)
        t_r.append(t_hyp_r)
        t_c.append(t_hyp_c)
    
    output_df = facet_test_data[['initial_request', 'facet_desc', 'question', 'generated']]
    output_df.columns = ['query', 'facet', 'reference', 'candidate']
    output_df.to_csv(model_output)


# full reference results
print("================================================================")
print("Full reference evaluation")
print("================================================================")
print("b1", np.mean(b1), "b2", np.mean(b2), "b3", np.mean(b3), "b4", np.mean(b4))
print("rouge-L", np.mean(r))
print("m", np.mean(m))
print("c", np.mean(c))

zero_nd_pp_b1 = np.mean(b1)
zero_nd_pp_b2 = np.mean(b2)
zero_nd_pp_b3 = np.mean(b3)
zero_nd_pp_b4 = np.mean(b4)
zero_nd_pp_m = np.mean(m)
zero_nd_pp_r = np.mean(r)
zero_nd_pp_c = np.mean(c)

# question body results
print("================================================================")
print("Question body evaluation")
print("================================================================")
print("b1", np.mean(t_b1), "b2", np.mean(t_b2), "b3", np.mean(t_b3), "b4", np.mean(t_b4))
print("rouge-L", np.mean(t_r))
print("m", np.mean(t_m))
print("c", np.mean(t_c))

t_zero_nd_pp_b1 = np.mean(t_b1)
t_zero_nd_pp_b2 = np.mean(t_b2)
t_zero_nd_pp_b3 = np.mean(t_b3)
t_zero_nd_pp_b4 = np.mean(t_b4)
t_zero_nd_pp_m = np.mean(t_m)
t_zero_nd_pp_r = np.mean(t_r)
t_zero_nd_pp_c = np.mean(t_c)

0 tell me about cass county missouri - list homes sale - do you want to know the list of sale homes
100 Find information on ontario california airport. - directions location - do you want to know the location and directions of the airport
200 Where can I buy pressure washers? - washer - are you interested in buying washer/dryer washers or do you have any questions
300 Tell me more about Rocky Mountain News - recent events historical - would you like to know more about recent historical events
400 Where should I order dog clean-up bags - specif bag type - do you need to be specifical about the type of bag you want to order
Full reference evaluation
b1 0.4277656503772235 b2 0.19007325089147045 b3 0.11408309937495881 b4 0.07559809979039746
rouge-L 0.45158240338278777
m 0.40982020211837566
c 0.9782352941176471
Question body evaluation
b1 0.36796210388220796 b2 0.1661403144295007 b3 0.0912581858918981 b4 0.06500624937497966
rouge-L 0.40577732395656707
m 0.37771407676902025
c 0.9747058823529

## 3.2 AutoScores

In [12]:
facet_test_file = 'data/clariq_f/ClariQ-FKw-dev.tsv'
facet_test_data = pd.read_csv(facet_test_file, sep='\t')
_, facet_test_data = process_clariq_f(facet_test_data)

r = []
b1, b2, b3, b4 = [], [], [], []
m = []
c = []

t_r = []
t_b1, t_b2, t_b3, t_b4 = [], [], [], []
t_m = []
t_c = []

model_output = 'zeroshot_nd_auto.csv'
generated_file = 'neurologic_decoding/zero_shot/gpt2facet'

if os.path.isfile(model_output):
    b1, b2, b3, b4, m, r, c, t_b1, t_b2, t_b3, t_b4, t_m, t_r, t_c = evaluate_from_output(model_output)

else:
    generated_cq_all_templates = open(generated_file, 'r').readlines()
    generated_cq_grouped = [[generated_cq_all_templates[len(starting_texts) * k + l] 
                                for l in range(len(starting_texts))] 
                                for k in range(int(len(generated_cq_all_templates)/8))]

    for iter, row in facet_test_data.iterrows():
        query = facet_test_data.at[iter, 'initial_request']
        facet = facet_test_data.at[iter, 'facet_desc']
        ref = facet_test_data.at[iter, 'question']
        facet_list = facet.split()

        noun_in_query = [token.text for token in pos_tagger(query) if token.pos_ == 'NOUN']
        propn_in_query = [token.text.lower() for token in pos_tagger(query) if token.pos_ == 'PROPN']

        template_scores = {}
        for full_sentence in generated_cq_grouped[iter]:
            query = re.sub('\[SEP\]', '&', full_sentence).split('&')[0].strip()
            generated_follow_up = re.sub('\[SEP\]', '&', full_sentence).split('&')[1].strip()
            generated_cq = re.sub('[.?]', '&', generated_follow_up).split('&')[0].strip()  
            template_scores[generated_cq] = rs.get_scores(generated_cq, ' '.join(noun_in_query + propn_in_query + facet_list))[0]['rouge-l']['f'] \
                                + sentence_bleu([word_tokenize(' '.join(noun_in_query + propn_in_query + facet_list))], word_tokenize(generated_cq), 
                                weights=(1, 1, 1, 1), smoothing_function = SmoothingFunction().method1) + \
                                meteor_score([word_tokenize(' '.join(noun_in_query + propn_in_query + facet_list))], word_tokenize(generated_cq))
        
        sorted_template_scores = sorted(template_scores.items(), key = lambda x: x[1], reverse=True)
        generated_cq = sorted(template_scores.keys(), key = lambda x: template_scores[x], reverse=True)[0] 
        facet_test_data.at[iter, 'generated'] = generated_cq

        # full reference evaluation
        hyp_b1, hyp_b2, hyp_b3, hyp_b4, hyp_m, hyp_r, hyp_c = auto_evaluation(ref, generated_cq, facet)

        b1.append(hyp_b1)
        b2.append(hyp_b2)
        b3.append(hyp_b3)
        b4.append(hyp_b4)
        m.append(hyp_m)
        r.append(hyp_r)
        c.append(hyp_c)

        # question body evaluation
        truncate_ref = ' '.join(ref.split()[template_len:])
        truncate_generated_cq = ' '.join(generated_cq.split()[template_len:])
        
        t_hyp_b1, t_hyp_b2, t_hyp_b3, t_hyp_b4, t_hyp_m, t_hyp_r, t_hyp_c = auto_evaluation(truncate_ref, truncate_generated_cq, facet)

        t_b1.append(t_hyp_b1)
        t_b2.append(t_hyp_b2)
        t_b3.append(t_hyp_b3)
        t_b4.append(t_hyp_b4)
        t_m.append(t_hyp_m)
        t_r.append(t_hyp_r)
        t_c.append(t_hyp_c)

    output_df = facet_test_data[['initial_request', 'facet_desc', 'question', 'generated']]
    output_df.columns = ['query', 'facet', 'reference', 'candidate']
    output_df.to_csv(model_output)

# full reference results
print("================================================================")
print("Full reference evaluation")
print("================================================================")
print("b1", np.mean(b1), "b2", np.mean(b2), "b3", np.mean(b3), "b4", np.mean(b4))
print("rouge-L", np.mean(r))
print("m", np.mean(m))
print("c", np.mean(c))

zero_nd_auto_b1 = np.mean(b1)
zero_nd_auto_b2 = np.mean(b2)
zero_nd_auto_b3 = np.mean(b3)
zero_nd_auto_b4 = np.mean(b4)
zero_nd_auto_m = np.mean(m)
zero_nd_auto_r = np.mean(r)
zero_nd_auto_c = np.mean(c)

# question body results
print("================================================================")
print("Question body evaluation")
print("================================================================")
print("b1", np.mean(t_b1), "b2", np.mean(t_b2), "b3", np.mean(t_b3), "b4", np.mean(t_b4))
print("rouge-L", np.mean(t_r))
print("m", np.mean(t_m))
print("c", np.mean(t_c))

t_zero_nd_auto_b1 = np.mean(t_b1)
t_zero_nd_auto_b2 = np.mean(t_b2)
t_zero_nd_auto_b3 = np.mean(t_b3)
t_zero_nd_auto_b4 = np.mean(t_b4)
t_zero_nd_auto_m = np.mean(t_m)
t_zero_nd_auto_r = np.mean(t_r)
t_zero_nd_auto_c = np.mean(t_c)

0 tell me about cass county missouri - list homes sale - are you looking for a list of sale homes
100 Find information on ontario california airport. - directions location - would you like to send directions to your location
200 Where can I buy pressure washers? - washer - are you looking for a washer
300 Tell me more about Rocky Mountain News - recent events historical - do you need information on recent historical events
400 Where should I order dog clean-up bags - specif bag type - do you want to know specifical bag type
Full reference evaluation
b1 0.4342383247669103 b2 0.20474687017114682 b3 0.12927541734987133 b4 0.09641742120931246
rouge-L 0.4786832211408991
m 0.41033188202381593
c 0.9827843137254902
Question body evaluation
b1 0.3488173362287711 b2 0.1624716052304113 b3 0.10472807656908299 b4 0.08069872668358835
rouge-L 0.434214050123522
m 0.3388249958920614
c 0.9582745098039215


## 3.3 [Cross-encoder](https://www.bing.com/search?q=poly+encoder+paper&cvid=46293035bd454d6d9745d27396391cfc&aqs=edge..69i57j0l2j69i59j69i64j69i11004.7939j0j1&pglt=41&FORM=ANNAB1&PC=LCTS)

In [13]:
import sys
sys.path.append("/home/azureuser/cloudfiles/code/Users/t-zhendwang/srconvsearch")
sys.path.append("/home/azureuser/cloudfiles/code/Users/t-zhendwang/srconvsearch/conversationalQA/ParlAI")
from conversationalQA.ParlAI.parlai.scripts.interactive import Interactive, rerank

facet_test_file = 'data/clariq_f/ClariQ-FKw-dev.tsv'
facet_test_data = pd.read_csv(facet_test_file, sep='\t')
_, facet_test_data = process_clariq_f(facet_test_data)

r = []
b1, b2, b3, b4 = [], [], [], []
m = []
c = []

t_r = []
t_b1, t_b2, t_b3, t_b4 = [], [], [], []
t_m = []
t_c = []

model_output = 'zeroshot_nd_cross.csv'
generated_file = 'neurologic_decoding/zero_shot/gpt2facet'

if os.path.isfile(model_output):
    b1, b2, b3, b4, m, r, c, t_b1, t_b2, t_b3, t_b4, t_m, t_r, t_c = evaluate_from_output(model_output)
    
else:
    question_reranker = Interactive.main(model = 'transformer/crossencoder', \
                            model_file = 'zoo:pretrained_transformers/cross_model_huge_reddit/model',  \
                            encode_candidate_vecs = False,  eval_candidates = 'inline', interactive_candidates = 'inline',
                            return_cand_scores = True)

    generated_cq_all_templates = open(generated_file, 'r').readlines()
    generated_cq_grouped = [[generated_cq_all_templates[len(starting_texts) * k + l] 
                                for l in range(len(starting_texts))] 
                                for k in range(int(len(generated_cq_all_templates)/8))]

    for iter, row in facet_test_data.iterrows():
        query = facet_test_data.at[iter, 'initial_request']
        facet = facet_test_data.at[iter, 'facet_desc']
        ref = facet_test_data.at[iter, 'question']
        
        generated_follow_ups = [re.sub('\[SEP\]', '&', full_sentence).split('&')[1].strip() for full_sentence in generated_cq_grouped[iter]]
        generated_cqs = [re.sub('[.?]', '&', generated_follow_up).split('&')[0].strip() for generated_follow_up in generated_follow_ups]

        questions, questions_scores = rerank(question_reranker, query, '', generated_cqs)
        generated_cq = questions[0]
        facet_test_data.at[iter, 'generated'] = generated_cq

        # full reference evaluation
        hyp_b1, hyp_b2, hyp_b3, hyp_b4, hyp_m, hyp_r, hyp_c = auto_evaluation(ref, generated_cq, facet)

        b1.append(hyp_b1)
        b2.append(hyp_b2)
        b3.append(hyp_b3)
        b4.append(hyp_b4)
        m.append(hyp_m)
        r.append(hyp_r)
        c.append(hyp_c)

        # question body evaluation
        truncate_ref = ' '.join(ref.split()[template_len:])
        truncate_generated_cq = ' '.join(generated_cq.split()[template_len:])
        
        t_hyp_b1, t_hyp_b2, t_hyp_b3, t_hyp_b4, t_hyp_m, t_hyp_r, t_hyp_c = auto_evaluation(truncate_ref, truncate_generated_cq, facet)

        t_b1.append(t_hyp_b1)
        t_b2.append(t_hyp_b2)
        t_b3.append(t_hyp_b3)
        t_b4.append(t_hyp_b4)
        t_m.append(t_hyp_m)
        t_r.append(t_hyp_r)
        t_c.append(t_hyp_c)

    output_df = facet_test_data[['initial_request', 'facet_desc', 'question', 'generated']]
    output_df.columns = ['query', 'facet', 'reference', 'candidate']
    output_df.to_csv(model_output)
    
# full reference results
print("================================================================")
print("Full reference evaluation")
print("================================================================")
print("b1", np.mean(b1), "b2", np.mean(b2), "b3", np.mean(b3), "b4", np.mean(b4))
print("rouge-L", np.mean(r))
print("m", np.mean(m))
print("c", np.mean(c))

zero_nd_cross_b1 = np.mean(b1)
zero_nd_cross_b2 = np.mean(b2)
zero_nd_cross_b3 = np.mean(b3)
zero_nd_cross_b4 = np.mean(b4)
zero_nd_cross_m = np.mean(m)
zero_nd_cross_r = np.mean(r)
zero_nd_cross_c = np.mean(c)

# question body results
print("================================================================")
print("Question body evaluation")
print("================================================================")
print("b1", np.mean(t_b1), "b2", np.mean(t_b2), "b3", np.mean(t_b3), "b4", np.mean(t_b4))
print("rouge-L", np.mean(t_r))
print("m", np.mean(t_m))
print("c", np.mean(t_c))

t_zero_nd_cross_b1 = np.mean(t_b1)
t_zero_nd_cross_b2 = np.mean(t_b2)
t_zero_nd_cross_b3 = np.mean(t_b3)
t_zero_nd_cross_b4 = np.mean(t_b4)
t_zero_nd_cross_m = np.mean(t_m)
t_zero_nd_cross_r = np.mean(t_r)
t_zero_nd_cross_c = np.mean(t_c)


0 tell me about cass county missouri - list homes sale - do you want to know the list of sale homes
100 Find information on ontario california airport. - directions location - do you want to get directions to this location
200 Where can I buy pressure washers? - washer - do you want to know which washer to use
300 Tell me more about Rocky Mountain News - recent events historical - are you looking for historical information about recent events
400 Where should I order dog clean-up bags - specif bag type - are you looking for specifc type bag
Full reference evaluation
b1 0.41373244461884806 b2 0.176513247123985 b3 0.10313598447532396 b4 0.06910221676005299
rouge-L 0.4417860678647769
m 0.3977539356223055
c 0.9165490196078431
Question body evaluation
b1 0.3625420522173297 b2 0.1688791182506196 b3 0.09992607000759766 b4 0.07243788969725326
rouge-L 0.40447814113736086
m 0.3724821276449544
c 0.9087058823529411


## 3.4 [NTES](https://arxiv.org/pdf/2010.14202.pdf) (Pretrained clarifying question ranker)
In AML terminal, run

```
cd Clariq_System
python rank.py
```
Make sure you get the generated question output "zero\_nd\_ntes.csv"

In [19]:
facet_test_file = 'data/clariq_f/ClariQ-FKw-dev.tsv'
facet_test_data = pd.read_csv(facet_test_file, sep='\t')
_, facet_test_data = process_clariq_f(facet_test_data)

r = []
b1, b2, b3, b4 = [], [], [], []
m = []
c = []

t_r = []
t_b1, t_b2, t_b3, t_b4 = [], [], [], []
t_m = []
t_c = []

model_output = 'zeroshot_nd_ntes.csv'
generated_file = 'neurologic_decoding/zero_shot/gpt2facet'

if os.path.isfile(model_output):
    b1, b2, b3, b4, m, r, c, t_b1, t_b2, t_b3, t_b4, t_m, t_r, t_c = evaluate_from_output(model_output)
else:
    print("missing output file from NTES code, please run the ranker first")


# full reference results
print("================================================================")
print("Full reference evaluation")
print("================================================================")
print("b1", np.mean(b1), "b2", np.mean(b2), "b3", np.mean(b3), "b4", np.mean(b4))
print("rouge-L", np.mean(r))
print("m", np.mean(m))
print("c", np.mean(c))

zero_nd_ntes_b1 = np.mean(b1)
zero_nd_ntes_b2 = np.mean(b2)
zero_nd_ntes_b3 = np.mean(b3)
zero_nd_ntes_b4 = np.mean(b4)
zero_nd_ntes_m = np.mean(m)
zero_nd_ntes_r = np.mean(r)
zero_nd_ntes_c = np.mean(c)

# question body results
print("================================================================")
print("Question body evaluation")
print("================================================================")
print("b1", np.mean(t_b1), "b2", np.mean(t_b2), "b3", np.mean(t_b3), "b4", np.mean(t_b4))
print("rouge-L", np.mean(t_r))
print("m", np.mean(t_m))
print("c", np.mean(t_c))

t_zero_nd_ntes_b1 = np.mean(t_b1)
t_zero_nd_ntes_b2 = np.mean(t_b2)
t_zero_nd_ntes_b3 = np.mean(t_b3)
t_zero_nd_ntes_b4 = np.mean(t_b4)
t_zero_nd_ntes_m = np.mean(t_m)
t_zero_nd_ntes_r = np.mean(t_r)
t_zero_nd_ntes_c = np.mean(t_c)

0 tell me about cass county missouri - list homes sale - would you like to list the sale of homes

100 Find information on ontario california airport. - directions location - are you interested in getting directions to this location

200 Where can I buy pressure washers? - washer - are you interested in buying washer/dryer washers or do you have any questions

300 Tell me more about Rocky Mountain News - recent events historical - do you want information about recent historical events

400 Where should I order dog clean-up bags - specif bag type - are you looking for specifc type bag

Full reference evaluation
b1 0.36920461276522515 b2 0.16492584626033616 b3 0.09993981802772525 b4 0.0738805427686033
rouge-L 0.41492680176106006
m 0.3569165665608735
c 0.7732156862745099
Question body evaluation
b1 0.29458733795768594 b2 0.1403471043416165 b3 0.09190463977437048 b4 0.07100761161471664
rouge-L 0.33816654471658264
m 0.2796704582312601
c 0.7732156862745099


## 3.5 WSDM (Ours)

Same as 1.2.

## 3.6 Oracle ranker

In [15]:
facet_test_file = 'data/clariq_f/ClariQ-FKw-dev.tsv'
facet_test_data = pd.read_csv(facet_test_file, sep='\t')
_, facet_test_data = process_clariq_f(facet_test_data)

r = []
b1, b2, b3, b4 = [], [], [], []
m = []
c = []

t_r = []
t_b1, t_b2, t_b3, t_b4 = [], [], [], []
t_m = []
t_c = []

model_output = 'zeroshot_nd_oracle.csv'
generated_file = 'neurologic_decoding/zero_shot/gpt2facet'

if os.path.isfile(model_output):
    b1, b2, b3, b4, m, r, c, t_b1, t_b2, t_b3, t_b4, t_m, t_r, t_c = evaluate_from_output(model_output)

else:
    generated_cq_all_templates = open(generated_file, 'r').readlines()
    generated_cq_grouped = [[generated_cq_all_templates[len(starting_texts) * k + l] 
                                for l in range(len(starting_texts))] 
                                for k in range(int(len(generated_cq_all_templates)/8))]

    for iter, row in facet_test_data.iterrows():
        facet = facet_test_data.at[iter, 'facet_desc']
        ref = facet_test_data.at[iter, 'question']

        template_scores = {}
        for full_sentence in generated_cq_grouped[iter]:
            query = re.sub('\[SEP\]', '&', full_sentence).split('&')[0].strip()
            generated_cq = re.sub('[.?]', '&', generated_follow_up).split('&')[0].strip()  
            template_scores[generated_cq] = rs.get_scores(generated_cq, ref)[0]['rouge-l']['f']
        
        generated_cq = sorted(template_scores.keys(), key = lambda x: template_scores[x], reverse=True)[0] 
        facet_test_data.at[iter, 'generated'] = generated_cq

        # full reference evaluation
        hyp_b1, hyp_b2, hyp_b3, hyp_b4, hyp_m, hyp_r, hyp_c = auto_evaluation(ref, generated_cq, facet)

        b1.append(hyp_b1)
        b2.append(hyp_b2)
        b3.append(hyp_b3)
        b4.append(hyp_b4)
        m.append(hyp_m)
        r.append(hyp_r)
        c.append(hyp_c)

        # question body evaluation
        truncate_ref = ' '.join(ref.split()[template_len:])
        truncate_generated_cq = ' '.join(generated_cq.split()[template_len:])
        
        t_hyp_b1, t_hyp_b2, t_hyp_b3, t_hyp_b4, t_hyp_m, t_hyp_r, t_hyp_c = auto_evaluation(truncate_ref, truncate_generated_cq, facet)

        t_b1.append(t_hyp_b1)
        t_b2.append(t_hyp_b2)
        t_b3.append(t_hyp_b3)
        t_b4.append(t_hyp_b4)
        t_m.append(t_hyp_m)
        t_r.append(t_hyp_r)
        t_c.append(t_hyp_c)

    output_df = facet_test_data[['initial_request', 'facet_desc', 'question', 'generated']]
    output_df.columns = ['query', 'facet', 'reference', 'candidate']
    output_df.to_csv(model_output)

# full reference results
print("================================================================")
print("Full reference evaluation")
print("================================================================")
print("b1", np.mean(b1), "b2", np.mean(b2), "b3", np.mean(b3), "b4", np.mean(b4))
print("rouge-L", np.mean(r))
print("m", np.mean(m))
print("c", np.mean(c))

zero_nd_oracle_b1 = np.mean(b1)
zero_nd_oracle_b2 = np.mean(b2)
zero_nd_oracle_b3 = np.mean(b3)
zero_nd_oracle_b4 = np.mean(b4)
zero_nd_oracle_m = np.mean(m)
zero_nd_oracle_r = np.mean(r)
zero_nd_oracle_c = np.mean(c)

# question body results
print("================================================================")
print("Question body evaluation")
print("================================================================")
print("b1", np.mean(t_b1), "b2", np.mean(t_b2), "b3", np.mean(t_b3), "b4", np.mean(t_b4))
print("rouge-L", np.mean(t_r))
print("m", np.mean(t_m))
print("c", np.mean(t_c))

t_zero_nd_oracle_b1 = np.mean(t_b1)
t_zero_nd_oracle_b2 = np.mean(t_b2)
t_zero_nd_oracle_b3 = np.mean(t_b3)
t_zero_nd_oracle_b4 = np.mean(t_b4)
t_zero_nd_oracle_m = np.mean(t_m)
t_zero_nd_oracle_r = np.mean(t_r)
t_zero_nd_oracle_c = np.mean(t_c)

0 tell me about cass county missouri - list homes sale - are you interested in a list of sale homes
100 Find information on ontario california airport. - directions location - would you like to send directions to your location
200 Where can I buy pressure washers? - washer - are you looking for a washer
300 Tell me more about Rocky Mountain News - recent events historical - do you want information about recent historical events
400 Where should I order dog clean-up bags - specif bag type - are you looking for specifc type bag
Full reference evaluation
b1 0.6160936744403871 b2 0.43807769108461336 b3 0.3346436810798287 b4 0.2414089762434254
rouge-L 0.6897123207824376
m 0.6517379490599675
c 0.9269019607843136
Question body evaluation
b1 0.4074544213117534 b2 0.21095173470334003 b3 0.13615480970344712 b4 0.09920045448032966
rouge-L 0.479522132674582
m 0.4090943786312325
c 0.9123921568627451


## 3.7 Comparing 3.1-3.6, getting the conclusion that "WSDM" is the best ranker choice in terms of question body quality.

In [20]:
print("-----------------------------------------------------------------------------------------")
print("|                               Full reference evaluation                               |")
print("-----------------------------------------------------------------------------------------")
print("|{:<24} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}|".format('MODEL', 'BLEU1','BLEU2','BLEU3','BLEU4','METEOR','ROUGE','COVERAGE'))
print("-----------------------------------------------------------------------------------------")
print("|{:<24} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}|".format('Perplexity', 
                                                            round_metric(zero_nd_pp_b1), 
                                                            round_metric(zero_nd_pp_b2), 
                                                            round_metric(zero_nd_pp_b3), 
                                                            round_metric(zero_nd_pp_b4), 
                                                            round_metric(zero_nd_pp_m), 
                                                            round_metric(zero_nd_pp_r), 
                                                            round_metric(zero_nd_pp_c)))
print("-----------------------------------------------------------------------------------------")
print("|{:<24} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}|".format('AutoScore', 
                                                            round_metric(zero_nd_auto_b1), 
                                                            round_metric(zero_nd_auto_b2), 
                                                            round_metric(zero_nd_auto_b3), 
                                                            round_metric(zero_nd_auto_b4), 
                                                            round_metric(zero_nd_auto_m), 
                                                            round_metric(zero_nd_auto_r), 
                                                            round_metric(zero_nd_auto_c)))
print("-----------------------------------------------------------------------------------------")
print("|{:<24} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}|".format('Cross encoder', 
                                                            round_metric(zero_nd_cross_b1), 
                                                            round_metric(zero_nd_cross_b2), 
                                                            round_metric(zero_nd_cross_b3), 
                                                            round_metric(zero_nd_cross_b4), 
                                                            round_metric(zero_nd_cross_m), 
                                                            round_metric(zero_nd_cross_r), 
                                                            round_metric(zero_nd_cross_c)))
print("-----------------------------------------------------------------------------------------")
print("|{:<24} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}|".format('NTES', 
                                                            round_metric(zero_nd_ntes_b1), 
                                                            round_metric(zero_nd_ntes_b2), 
                                                            round_metric(zero_nd_ntes_b3), 
                                                            round_metric(zero_nd_ntes_b4), 
                                                            round_metric(zero_nd_ntes_m), 
                                                            round_metric(zero_nd_ntes_r), 
                                                            round_metric(zero_nd_ntes_c)))
print("-----------------------------------------------------------------------------------------")
print("|{:<24} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}|".format('WSDM', 
                                                            round_metric(zero_nd_wsdm_b1), 
                                                            round_metric(zero_nd_wsdm_b2), 
                                                            round_metric(zero_nd_wsdm_b3), 
                                                            round_metric(zero_nd_wsdm_b4), 
                                                            round_metric(zero_nd_wsdm_m), 
                                                            round_metric(zero_nd_wsdm_r),
                                                            round_metric(zero_nd_wsdm_c)))
print("-----------------------------------------------------------------------------------------")
print("|{:<24} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}|".format('Oracle', 
                                                            round_metric(zero_nd_oracle_b1), 
                                                            round_metric(zero_nd_oracle_b2), 
                                                            round_metric(zero_nd_oracle_b3), 
                                                            round_metric(zero_nd_oracle_b4), 
                                                            round_metric(zero_nd_oracle_m), 
                                                            round_metric(zero_nd_oracle_r), 
                                                            round_metric(zero_nd_oracle_c)))
print("-----------------------------------------------------------------------------------------")



print("-----------------------------------------------------------------------------------------")
print("|                                Question body evaluation                               |")
print("-----------------------------------------------------------------------------------------")
print("|{:<24} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}|".format('MODEL', 'BLEU1','BLEU2','BLEU3','BLEU4','METEOR','ROUGE','COVERAGE'))
print("-----------------------------------------------------------------------------------------")
print("|{:<24} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}|".format('Perplexity', 
                                                            round_metric(t_zero_nd_pp_b1), 
                                                            round_metric(t_zero_nd_pp_b2), 
                                                            round_metric(t_zero_nd_pp_b3), 
                                                            round_metric(t_zero_nd_pp_b4), 
                                                            round_metric(t_zero_nd_pp_m), 
                                                            round_metric(t_zero_nd_pp_r), 
                                                            round_metric(t_zero_nd_pp_c)))
print("-----------------------------------------------------------------------------------------")
print("|{:<24} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}|".format('AutoScore', 
                                                            round_metric(t_zero_nd_auto_b1), 
                                                            round_metric(t_zero_nd_auto_b2), 
                                                            round_metric(t_zero_nd_auto_b3), 
                                                            round_metric(t_zero_nd_auto_b4), 
                                                            round_metric(t_zero_nd_auto_m), 
                                                            round_metric(t_zero_nd_auto_r), 
                                                            round_metric(t_zero_nd_auto_c)))
print("-----------------------------------------------------------------------------------------")
print("|{:<24} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}|".format('Cross encoder', 
                                                            round_metric(t_zero_nd_cross_b1), 
                                                            round_metric(t_zero_nd_cross_b2), 
                                                            round_metric(t_zero_nd_cross_b3), 
                                                            round_metric(t_zero_nd_cross_b4), 
                                                            round_metric(t_zero_nd_cross_m), 
                                                            round_metric(t_zero_nd_cross_r), 
                                                            round_metric(t_zero_nd_cross_c)))
print("-----------------------------------------------------------------------------------------")
print("|{:<24} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}|".format('NTES', 
                                                            round_metric(t_zero_nd_ntes_b1), 
                                                            round_metric(t_zero_nd_ntes_b2), 
                                                            round_metric(t_zero_nd_ntes_b3), 
                                                            round_metric(t_zero_nd_ntes_b4), 
                                                            round_metric(t_zero_nd_ntes_m), 
                                                            round_metric(t_zero_nd_ntes_r), 
                                                            round_metric(t_zero_nd_ntes_c)))
print("-----------------------------------------------------------------------------------------")
print("|{:<24} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}|".format('WSDM', 
                                                            round_metric(t_zero_nd_wsdm_b1), 
                                                            round_metric(t_zero_nd_wsdm_b2), 
                                                            round_metric(t_zero_nd_wsdm_b3), 
                                                            round_metric(t_zero_nd_wsdm_b4), 
                                                            round_metric(t_zero_nd_wsdm_m), 
                                                            round_metric(t_zero_nd_wsdm_r),
                                                            round_metric(t_zero_nd_wsdm_c)))
print("-----------------------------------------------------------------------------------------")
print("|{:<24} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}|".format('Oracle', 
                                                            round_metric(t_zero_nd_oracle_b1), 
                                                            round_metric(t_zero_nd_oracle_b2), 
                                                            round_metric(t_zero_nd_oracle_b3), 
                                                            round_metric(t_zero_nd_oracle_b4), 
                                                            round_metric(t_zero_nd_oracle_m), 
                                                            round_metric(t_zero_nd_oracle_r), 
                                                            round_metric(t_zero_nd_oracle_c)))
print("-----------------------------------------------------------------------------------------")


-----------------------------------------------------------------------------------------
|                               Full reference evaluation                               |
-----------------------------------------------------------------------------------------
|MODEL                    BLEU1    BLEU2    BLEU3    BLEU4    METEOR   ROUGE    COVERAGE|
-----------------------------------------------------------------------------------------
|Perplexity               42.78    19.01    11.41    7.56     40.98    45.16    97.82   |
-----------------------------------------------------------------------------------------
|AutoScore                43.42    20.47    12.93    9.64     41.03    47.87    98.28   |
-----------------------------------------------------------------------------------------
|Cross encoder            41.37    17.65    10.31    6.91     39.78    44.18    91.65   |
-----------------------------------------------------------------------------------------
|NTES     

# RQ4. How good is Neurologic Decoding for facet-driven clarifying? Specifically, how much does it improve over other facet-driven methods and how far is it from perfect?

Can be merged with RQ2.

# RQ5. How does our GPT-2-based zero-shot facet-constrained approach compare to using Large Language Models such as zero-shot GPT-3? 
To answer this question, we compare our proposed method with a few-shot prompt guided GPT-3 method, but we mainly compare zero-shot with zero-shot.
The few-shot GPT-3 method uses the prompt structure as in our proposed GPT-2 finetune method, which is:

## {query} Ask a question that contains words in the list \[{facet}\] {clarifying question}

## 5.1 Zero-shot GPT3

In [17]:
facet_test_file = 'data/clariq_f/ClariQ-FKw-dev.tsv'
facet_test_data = pd.read_csv(facet_test_file, sep='\t')
_, facet_test_data = process_clariq_f(facet_test_data)

b1, b2, b3, b4 = [], [], [], []
m = []
r = []
c = []

t_b1, t_b2, t_b3, t_b4 = [], [], [], []
t_m = []
t_r = []
t_c = []

temperature = 0
use_examples = 0
model_output = 'fewshot_gpt3' + '_examples' + str(use_examples) + '_temp' + str(temperature) + '.csv'
model_output_all_templates = 'fewshot_gpt3' + '_examples' + str(use_examples) + '_temp' + str(temperature) +'_all'

all_generations = [] # cache the generations to save time and load from calling gpt3

if os.path.isfile(model_output):
    b1, b2, b3, b4, m, r, c, t_b1, t_b2, t_b3, t_b4, t_m, t_r, t_c = evaluate_from_output(model_output)
    
else:
    if os.path.isfile(model_output_all_templates):
        generated_cq_all_templates = open(model_output_all_templates, 'r').readlines()
        generated_cq_grouped = [[generated_cq_all_templates[len(starting_texts) * k + l] 
                                    for l in range(len(starting_texts))] 
                                    for k in range(int(len(generated_cq_all_templates)/8))]

        for iter, row in facet_test_data.iterrows():
            facet = facet_test_data.at[iter, 'facet_desc']
            ref = facet_test_data.at[iter, 'question']
            facet_list = facet.split()

            generated_cqs = []
            for full_sentence in generated_cq_grouped[iter]:
                query = re.sub('\[SEP\]', '&', full_sentence).split('&')[0].strip()
                generated_follow_up = re.sub('\[SEP\]', '&', full_sentence).split('&')[1].strip()
                generated_cq = re.sub('[.?]', '&', generated_follow_up).split('&')[0].strip()
                generated_cqs.append(generated_cq)
            
            noun_in_query = [token.text for token in pos_tagger(query) if token.pos_ == 'NOUN']
            propn_in_query = [token.text.lower() for token in pos_tagger(query) if token.pos_ == 'PROPN']

            template_scores = calculate_WSDM(query=' '.join(noun_in_query+propn_in_query+facet_list), doc_list=generated_cqs)
            sorted_template_scores = sorted(template_scores.items(), key = lambda x: x[1], reverse=True)
            generated_cq = sorted(template_scores.keys(), key = lambda x: template_scores[x], reverse=True)[0] 
            facet_test_data.at[iter, 'generated'] = generated_cq
            
            if iter % sample_every == 0: 
                print(iter, query, "-", facet, '-', generated_cq)
                pprint.pprint(sorted_template_scores)

            # full reference evaluation
            hyp_b1, hyp_b2, hyp_b3, hyp_b4, hyp_m, hyp_r, hyp_c = auto_evaluation(ref, generated_cq, facet)

            b1.append(hyp_b1)
            b2.append(hyp_b2)
            b3.append(hyp_b3)
            b4.append(hyp_b4)
            m.append(hyp_m)
            r.append(hyp_r)
            c.append(hyp_c)

            # question body evaluation
            truncate_ref = ' '.join(ref.split()[template_len:])
            truncate_generated_cq = ' '.join(generated_cq.split()[template_len:])
            
            t_hyp_b1, t_hyp_b2, t_hyp_b3, t_hyp_b4, t_hyp_m, t_hyp_r, t_hyp_c = auto_evaluation(truncate_ref, truncate_generated_cq, facet)

            t_b1.append(t_hyp_b1)
            t_b2.append(t_hyp_b2)
            t_b3.append(t_hyp_b3)
            t_b4.append(t_hyp_b4)
            t_m.append(t_hyp_m)
            t_r.append(t_hyp_r)
            t_c.append(t_hyp_c)

    else:
        for iter, row in facet_test_data.iterrows():
            query = facet_test_data.at[iter, 'initial_request']
            facet = facet_test_data.at[iter, 'facet_desc']
            ref = facet_test_data.at[iter, 'question']
            facet_list = facet.split()

            generated_cqs = []
            for s_t in starting_texts:
                s_t = re.sub('\[SEP\]', ' ', s_t).strip()
                prompt = ' '.join(gpt3_examples[:use_examples]) + ' ' + query + ' ' + "Ask a question that contains words in the list" + ' ' + "[" + ", ".join(["'"+f+"'" for f in facet.split()])  + '].' + s_t
                response = openai.Completion.create(
                    model="text-davinci-002",
                    prompt= prompt,
                    temperature=temperature,
                    max_tokens=32,
                    top_p=1,
                    frequency_penalty=0.0,
                    presence_penalty=0.0,
                    stop=["\n"]
                )

                generated_cq = s_t + response['choices'][0]['text']
                generated_cq = re.sub('\[SEP\]', ' ', generated_cq).strip()
                generated_cq = re.sub('[.?]', '&', generated_cq).split('&')[0].strip()
                generated_cqs.append(generated_cq)
                all_generations.append(generated_cq)

            noun_in_query = [token.text for token in pos_tagger(query) if token.pos_ == 'NOUN']
            propn_in_query = [token.text.lower() for token in pos_tagger(query) if token.pos_ == 'PROPN']

            template_scores = calculate_WSDM(query=' '.join(noun_in_query+propn_in_query+facet_list), doc_list=generated_cqs)
            sorted_template_scores = sorted(template_scores.items(), key = lambda x: x[1], reverse=True)
            generated_cq = sorted(template_scores.keys(), key = lambda x: template_scores[x], reverse=True)[0] 
            facet_test_data.at[iter, 'generated'] = generated_cq

            if iter % sample_every == 0: 
                print(iter, query, "-", facet_list, '-', ' '.join(tokenized_hyp))
                pprint.pprint(sorted_template_scores)
        
            # full reference evaluation
            hyp_b1, hyp_b2, hyp_b3, hyp_b4, hyp_m, hyp_r, hyp_c = auto_evaluation(ref, generated_cq, facet)

            b1.append(hyp_b1)
            b2.append(hyp_b2)
            b3.append(hyp_b3)
            b4.append(hyp_b4)
            m.append(hyp_m)
            r.append(hyp_r)
            c.append(hyp_c)

            # question body evaluation
            truncate_ref = ' '.join(ref.split()[template_len:])
            truncate_generated_cq = ' '.join(generated_cq.split()[template_len:])
            
            t_hyp_b1, t_hyp_b2, t_hyp_b3, t_hyp_b4, t_hyp_m, t_hyp_r, t_hyp_c = auto_evaluation(truncate_ref, truncate_generated_cq, facet)

            t_b1.append(t_hyp_b1)
            t_b2.append(t_hyp_b2)
            t_b3.append(t_hyp_b3)
            t_b4.append(t_hyp_b4)
            t_m.append(t_hyp_m)
            t_r.append(t_hyp_r)
            t_c.append(t_hyp_c)

    output_df = facet_test_data[['initial_request', 'facet_desc', 'question', 'generated']]
    output_df.columns = ['query', 'facet', 'reference', 'candidate']
    output_df.to_csv(model_output)

with open(model_output_all_templates, 'w') as outputfile:
    for generation in all_generations:
        outputfile.write(generation)
        outputfile.write('\n')

# full reference results
print("================================================================")
print("Full reference evaluation")
print("================================================================")
print("b1", np.mean(b1), "b2", np.mean(b2), "b3", np.mean(b3), "b4", np.mean(b4))
print("rouge-L", np.mean(r))
print("m", np.mean(m))
print("c", np.mean(c))

gpt3_0_b1 = np.mean(b1)
gpt3_0_b2 = np.mean(b2)
gpt3_0_b3 = np.mean(b3)
gpt3_0_b4 = np.mean(b4)
gpt3_0_m = np.mean(m)
gpt3_0_r = np.mean(r)
gpt3_0_c = np.mean(c)

# question body results
print("================================================================")
print("Question body evaluation")
print("================================================================")
print("b1", np.mean(t_b1), "b2", np.mean(t_b2), "b3", np.mean(t_b3), "b4", np.mean(t_b4))
print("rouge-L", np.mean(t_r))
print("m", np.mean(t_m))
print("c", np.mean(t_c))

t_gpt3_0_b1 = np.mean(t_b1)
t_gpt3_0_b2 = np.mean(t_b2)
t_gpt3_0_b3 = np.mean(t_b3)
t_gpt3_0_b4 = np.mean(t_b4)
t_gpt3_0_m = np.mean(t_m)
t_gpt3_0_r = np.mean(t_r)
t_gpt3_0_c = np.mean(t_c)

0 tell me about cass county missouri - list homes sale - are you looking for a list of homes for sale in cass county missouri
100 Find information on ontario california airport. - directions location - do you need information on the ontario california airport
200 Where can I buy pressure washers? - washer - do you want information about a washing machine or a pressure washer
300 Tell me more about Rocky Mountain News - recent events historical - do you need information on recent events or historical events
400 Where should I order dog clean-up bags - specif bag type - do you want to know what type of bag to use for your dog
Full reference evaluation
b1 0.42959420323127595 b2 0.2144905020365997 b3 0.12975964645585925 b4 0.08756123899640086
rouge-L 0.4668552513632458
m 0.4595009686068181
c 0.8526666666666668
Question body evaluation
b1 0.3943690841916204 b2 0.22815387204349083 b3 0.15282352826624315 b4 0.1193501579145087
rouge-L 0.4620705224770397
m 0.4636303615409833
c 0.829529411764706

## 5.2 One-shot GPT3

In [18]:
facet_test_file = 'data/clariq_f/ClariQ-FKw-dev.tsv'
facet_test_data = pd.read_csv(facet_test_file, sep='\t')
_, facet_test_data = process_clariq_f(facet_test_data)

b1, b2, b3, b4 = [], [], [], []
m = []
r = []
c = []

t_b1, t_b2, t_b3, t_b4 = [], [], [], []
t_m = []
t_r = []
t_c = []

temperature = 0
use_examples = 1
model_output = 'fewshot_gpt3' + '_examples' + str(use_examples) + '_temp' + str(temperature) + '.csv'
model_output_all_templates = 'fewshot_gpt3' + '_examples' + str(use_examples) + '_temp' + str(temperature) +'_all'

all_generations = [] # cache the generations to save time and load from calling gpt3

if os.path.isfile(model_output):
    b1, b2, b3, b4, m, r, c, t_b1, t_b2, t_b3, t_b4, t_m, t_r, t_c = evaluate_from_output(model_output)
    
else:
    if os.path.isfile(model_output_all_templates):
        generated_cq_all_templates = open(model_output_all_templates, 'r').readlines()
        generated_cq_grouped = [[generated_cq_all_templates[len(starting_texts) * k + l] 
                                    for l in range(len(starting_texts))] 
                                    for k in range(int(len(generated_cq_all_templates)/8))]

        for iter, row in facet_test_data.iterrows():
            facet = facet_test_data.at[iter, 'facet_desc']
            ref = facet_test_data.at[iter, 'question']
            facet_list = facet.split()

            generated_cqs = []
            for full_sentence in generated_cq_grouped[iter]:
                query = re.sub('\[SEP\]', '&', full_sentence).split('&')[0].strip()
                generated_follow_up = re.sub('\[SEP\]', '&', full_sentence).split('&')[1].strip()
                generated_cq = re.sub('[.?]', '&', generated_follow_up).split('&')[0].strip()
                generated_cqs.append(generated_cq)
            
            noun_in_query = [token.text for token in pos_tagger(query) if token.pos_ == 'NOUN']
            propn_in_query = [token.text.lower() for token in pos_tagger(query) if token.pos_ == 'PROPN']

            template_scores = calculate_WSDM(query=' '.join(noun_in_query+propn_in_query+facet_list), doc_list=generated_cqs)
            sorted_template_scores = sorted(template_scores.items(), key = lambda x: x[1], reverse=True)
            generated_cq = sorted(template_scores.keys(), key = lambda x: template_scores[x], reverse=True)[0] 
            facet_test_data.at[iter, 'generated'] = generated_cq
            
            if iter % sample_every == 0: 
                print(iter, query, "-", facet, '-', generated_cq)
                pprint.pprint(sorted_template_scores)

            # full reference evaluation
            hyp_b1, hyp_b2, hyp_b3, hyp_b4, hyp_m, hyp_r, hyp_c = auto_evaluation(ref, generated_cq, facet)

            b1.append(hyp_b1)
            b2.append(hyp_b2)
            b3.append(hyp_b3)
            b4.append(hyp_b4)
            m.append(hyp_m)
            r.append(hyp_r)
            c.append(hyp_c)

            # question body evaluation
            truncate_ref = ' '.join(ref.split()[template_len:])
            truncate_generated_cq = ' '.join(generated_cq.split()[template_len:])
            
            t_hyp_b1, t_hyp_b2, t_hyp_b3, t_hyp_b4, t_hyp_m, t_hyp_r, t_hyp_c = auto_evaluation(truncate_ref, truncate_generated_cq, facet)

            t_b1.append(t_hyp_b1)
            t_b2.append(t_hyp_b2)
            t_b3.append(t_hyp_b3)
            t_b4.append(t_hyp_b4)
            t_m.append(t_hyp_m)
            t_r.append(t_hyp_r)
            t_c.append(t_hyp_c)

    else:
        for iter, row in facet_test_data.iterrows():
            query = facet_test_data.at[iter, 'initial_request']
            facet = facet_test_data.at[iter, 'facet_desc']
            ref = facet_test_data.at[iter, 'question']
            facet_list = facet.split()

            generated_cqs = []
            for s_t in starting_texts:
                s_t = re.sub('\[SEP\]', ' ', s_t).strip()
                prompt = ' '.join(gpt3_examples[:use_examples]) + ' ' + query + ' ' + "Ask a question that contains words in the list" + ' ' + "[" + ", ".join(["'"+f+"'" for f in facet.split()])  + '].' + s_t
                response = openai.Completion.create(
                    model="text-davinci-002",
                    prompt= prompt,
                    temperature=temperature,
                    max_tokens=32,
                    top_p=1,
                    frequency_penalty=0.0,
                    presence_penalty=0.0,
                    stop=["\n"]
                )

                generated_cq = s_t + response['choices'][0]['text']
                generated_cq = re.sub('\[SEP\]', ' ', generated_cq).strip()
                generated_cq = re.sub('[.?]', '&', generated_cq).split('&')[0].strip()
                generated_cqs.append(generated_cq)
                all_generations.append(generated_cq)

            noun_in_query = [token.text for token in pos_tagger(query) if token.pos_ == 'NOUN']
            propn_in_query = [token.text.lower() for token in pos_tagger(query) if token.pos_ == 'PROPN']

            template_scores = calculate_WSDM(query=' '.join(noun_in_query+propn_in_query+facet_list), doc_list=generated_cqs)
            sorted_template_scores = sorted(template_scores.items(), key = lambda x: x[1], reverse=True)
            generated_cq = sorted(template_scores.keys(), key = lambda x: template_scores[x], reverse=True)[0] 
            facet_test_data.at[iter, 'generated'] = generated_cq

            if iter % sample_every == 0: 
                print(iter, query, "-", facet_list, '-', ' '.join(tokenized_hyp))
                pprint.pprint(sorted_template_scores)
        
            # full reference evaluation
            hyp_b1, hyp_b2, hyp_b3, hyp_b4, hyp_m, hyp_r, hyp_c = auto_evaluation(ref, generated_cq, facet)

            b1.append(hyp_b1)
            b2.append(hyp_b2)
            b3.append(hyp_b3)
            b4.append(hyp_b4)
            m.append(hyp_m)
            r.append(hyp_r)
            c.append(hyp_c)

            # question body evaluation
            truncate_ref = ' '.join(ref.split()[template_len:])
            truncate_generated_cq = ' '.join(generated_cq.split()[template_len:])
            
            t_hyp_b1, t_hyp_b2, t_hyp_b3, t_hyp_b4, t_hyp_m, t_hyp_r, t_hyp_c = auto_evaluation(truncate_ref, truncate_generated_cq, facet)

            t_b1.append(t_hyp_b1)
            t_b2.append(t_hyp_b2)
            t_b3.append(t_hyp_b3)
            t_b4.append(t_hyp_b4)
            t_m.append(t_hyp_m)
            t_r.append(t_hyp_r)
            t_c.append(t_hyp_c)

    output_df = facet_test_data[['initial_request', 'facet_desc', 'question', 'generated']]
    output_df.columns = ['query', 'facet', 'reference', 'candidate']
    output_df.to_csv(model_output)

with open(model_output_all_templates, 'w') as outputfile:
    for generation in all_generations:
        outputfile.write(generation)
        outputfile.write('\n')

# full reference results
print("================================================================")
print("Full reference evaluation")
print("================================================================")
print("b1", np.mean(b1), "b2", np.mean(b2), "b3", np.mean(b3), "b4", np.mean(b4))
print("rouge-L", np.mean(r))
print("m", np.mean(m))
print("c", np.mean(c))

gpt3_1_b1 = np.mean(b1)
gpt3_1_b2 = np.mean(b2)
gpt3_1_b3 = np.mean(b3)
gpt3_1_b4 = np.mean(b4)
gpt3_1_m = np.mean(m)
gpt3_1_r = np.mean(r)
gpt3_1_c = np.mean(c)

# question body results
print("================================================================")
print("Question body evaluation")
print("================================================================")
print("b1", np.mean(t_b1), "b2", np.mean(t_b2), "b3", np.mean(t_b3), "b4", np.mean(t_b4))
print("rouge-L", np.mean(t_r))
print("m", np.mean(t_m))
print("c", np.mean(t_c))

t_gpt3_1_b1 = np.mean(t_b1)
t_gpt3_1_b2 = np.mean(t_b2)
t_gpt3_1_b3 = np.mean(t_b3)
t_gpt3_1_b4 = np.mean(t_b4)
t_gpt3_1_m = np.mean(t_m)
t_gpt3_1_r = np.mean(t_r)
t_gpt3_1_c = np.mean(t_c)

0 tell me about cass county missouri - list homes sale - do you want information about a specific home or a list of homes for sale
100 Find information on ontario california airport. - directions location - do you want information on the ontario california airport location or directions to the ontario california airport
200 Where can I buy pressure washers? - washer - do you need to buy a pressure washer
300 Tell me more about Rocky Mountain News - recent events historical - are you looking for recent events or historical events
400 Where should I order dog clean-up bags - specif bag type - do you want to know about a specific bag or type of bag
Full reference evaluation
b1 0.45450442193977114 b2 0.23043683433896323 b3 0.1453323074712589 b4 0.10288596669634774
rouge-L 0.48206610468118216
m 0.46551388301551
c 0.9543529411764706
Question body evaluation
b1 0.4253424715913936 b2 0.23735638076510057 b3 0.15818386619872948 b4 0.11904117520780316
rouge-L 0.47590761996212216
m 0.4598323230233

## 5.3 Two-shot GPT3

In [19]:
facet_test_file = 'data/clariq_f/ClariQ-FKw-dev.tsv'
facet_test_data = pd.read_csv(facet_test_file, sep='\t')
_, facet_test_data = process_clariq_f(facet_test_data)

b1, b2, b3, b4 = [], [], [], []
m = []
r = []
c = []

t_b1, t_b2, t_b3, t_b4 = [], [], [], []
t_m = []
t_r = []
t_c = []

temperature = 0
use_examples = 2
model_output = 'fewshot_gpt3' + '_examples' + str(use_examples) + '_temp' + str(temperature) + '.csv'
model_output_all_templates = 'fewshot_gpt3' + '_examples' + str(use_examples) + '_temp' + str(temperature) +'_all'

all_generations = [] # cache the generations to save time and load from calling gpt3

if os.path.isfile(model_output):
    b1, b2, b3, b4, m, r, c, t_b1, t_b2, t_b3, t_b4, t_m, t_r, t_c = evaluate_from_output(model_output)
    
else:
    if os.path.isfile(model_output_all_templates):
        generated_cq_all_templates = open(model_output_all_templates, 'r').readlines()
        generated_cq_grouped = [[generated_cq_all_templates[len(starting_texts) * k + l] 
                                    for l in range(len(starting_texts))] 
                                    for k in range(int(len(generated_cq_all_templates)/8))]

        for iter, row in facet_test_data.iterrows():
            facet = facet_test_data.at[iter, 'facet_desc']
            ref = facet_test_data.at[iter, 'question']
            facet_list = facet.split()

            generated_cqs = []
            for full_sentence in generated_cq_grouped[iter]:
                query = re.sub('\[SEP\]', '&', full_sentence).split('&')[0].strip()
                generated_follow_up = re.sub('\[SEP\]', '&', full_sentence).split('&')[1].strip()
                generated_cq = re.sub('[.?]', '&', generated_follow_up).split('&')[0].strip()
                generated_cqs.append(generated_cq)
            
            noun_in_query = [token.text for token in pos_tagger(query) if token.pos_ == 'NOUN']
            propn_in_query = [token.text.lower() for token in pos_tagger(query) if token.pos_ == 'PROPN']

            template_scores = calculate_WSDM(query=' '.join(noun_in_query+propn_in_query+facet_list), doc_list=generated_cqs)
            sorted_template_scores = sorted(template_scores.items(), key = lambda x: x[1], reverse=True)
            generated_cq = sorted(template_scores.keys(), key = lambda x: template_scores[x], reverse=True)[0] 
            facet_test_data.at[iter, 'generated'] = generated_cq
            
            if iter % sample_every == 0: 
                print(iter, query, "-", facet, '-', generated_cq)
                pprint.pprint(sorted_template_scores)

            # full reference evaluation
            hyp_b1, hyp_b2, hyp_b3, hyp_b4, hyp_m, hyp_r, hyp_c = auto_evaluation(ref, generated_cq, facet)

            b1.append(hyp_b1)
            b2.append(hyp_b2)
            b3.append(hyp_b3)
            b4.append(hyp_b4)
            m.append(hyp_m)
            r.append(hyp_r)
            c.append(hyp_c)

            # question body evaluation
            truncate_ref = ' '.join(ref.split()[template_len:])
            truncate_generated_cq = ' '.join(generated_cq.split()[template_len:])
            
            t_hyp_b1, t_hyp_b2, t_hyp_b3, t_hyp_b4, t_hyp_m, t_hyp_r, t_hyp_c = auto_evaluation(truncate_ref, truncate_generated_cq, facet)

            t_b1.append(t_hyp_b1)
            t_b2.append(t_hyp_b2)
            t_b3.append(t_hyp_b3)
            t_b4.append(t_hyp_b4)
            t_m.append(t_hyp_m)
            t_r.append(t_hyp_r)
            t_c.append(t_hyp_c)

    else:
        for iter, row in facet_test_data.iterrows():
            query = facet_test_data.at[iter, 'initial_request']
            facet = facet_test_data.at[iter, 'facet_desc']
            ref = facet_test_data.at[iter, 'question']
            facet_list = facet.split()

            generated_cqs = []
            for s_t in starting_texts:
                s_t = re.sub('\[SEP\]', ' ', s_t).strip()
                prompt = ' '.join(gpt3_examples[:use_examples]) + ' ' + query + ' ' + "Ask a question that contains words in the list" + ' ' + "[" + ", ".join(["'"+f+"'" for f in facet.split()])  + '].' + s_t
                response = openai.Completion.create(
                    model="text-davinci-002",
                    prompt= prompt,
                    temperature=temperature,
                    max_tokens=32,
                    top_p=1,
                    frequency_penalty=0.0,
                    presence_penalty=0.0,
                    stop=["\n"]
                )

                generated_cq = s_t + response['choices'][0]['text']
                generated_cq = re.sub('\[SEP\]', ' ', generated_cq).strip()
                generated_cq = re.sub('[.?]', '&', generated_cq).split('&')[0].strip()
                generated_cqs.append(generated_cq)
                all_generations.append(generated_cq)

            noun_in_query = [token.text for token in pos_tagger(query) if token.pos_ == 'NOUN']
            propn_in_query = [token.text.lower() for token in pos_tagger(query) if token.pos_ == 'PROPN']

            template_scores = calculate_WSDM(query=' '.join(noun_in_query+propn_in_query+facet_list), doc_list=generated_cqs)
            sorted_template_scores = sorted(template_scores.items(), key = lambda x: x[1], reverse=True)
            generated_cq = sorted(template_scores.keys(), key = lambda x: template_scores[x], reverse=True)[0] 
            facet_test_data.at[iter, 'generated'] = generated_cq

            if iter % sample_every == 0: 
                print(iter, query, "-", facet_list, '-', ' '.join(tokenized_hyp))
                pprint.pprint(sorted_template_scores)
        
            # full reference evaluation
            hyp_b1, hyp_b2, hyp_b3, hyp_b4, hyp_m, hyp_r, hyp_c = auto_evaluation(ref, generated_cq, facet)

            b1.append(hyp_b1)
            b2.append(hyp_b2)
            b3.append(hyp_b3)
            b4.append(hyp_b4)
            m.append(hyp_m)
            r.append(hyp_r)
            c.append(hyp_c)

            # question body evaluation
            truncate_ref = ' '.join(ref.split()[template_len:])
            truncate_generated_cq = ' '.join(generated_cq.split()[template_len:])
            
            t_hyp_b1, t_hyp_b2, t_hyp_b3, t_hyp_b4, t_hyp_m, t_hyp_r, t_hyp_c = auto_evaluation(truncate_ref, truncate_generated_cq, facet)

            t_b1.append(t_hyp_b1)
            t_b2.append(t_hyp_b2)
            t_b3.append(t_hyp_b3)
            t_b4.append(t_hyp_b4)
            t_m.append(t_hyp_m)
            t_r.append(t_hyp_r)
            t_c.append(t_hyp_c)

    output_df = facet_test_data[['initial_request', 'facet_desc', 'question', 'generated']]
    output_df.columns = ['query', 'facet', 'reference', 'candidate']
    output_df.to_csv(model_output)

with open(model_output_all_templates, 'w') as outputfile:
    for generation in all_generations:
        outputfile.write(generation)
        outputfile.write('\n')

# full reference results
print("================================================================")
print("Full reference evaluation")
print("================================================================")
print("b1", np.mean(b1), "b2", np.mean(b2), "b3", np.mean(b3), "b4", np.mean(b4))
print("rouge-L", np.mean(r))
print("m", np.mean(m))
print("c", np.mean(c))

gpt3_2_b1 = np.mean(b1)
gpt3_2_b2 = np.mean(b2)
gpt3_2_b3 = np.mean(b3)
gpt3_2_b4 = np.mean(b4)
gpt3_2_m = np.mean(m)
gpt3_2_r = np.mean(r)
gpt3_2_c = np.mean(c)

# question body results
print("================================================================")
print("Question body evaluation")
print("================================================================")
print("b1", np.mean(t_b1), "b2", np.mean(t_b2), "b3", np.mean(t_b3), "b4", np.mean(t_b4))
print("rouge-L", np.mean(t_r))
print("m", np.mean(t_m))
print("c", np.mean(t_c))

t_gpt3_2_b1 = np.mean(t_b1)
t_gpt3_2_b2 = np.mean(t_b2)
t_gpt3_2_b3 = np.mean(t_b3)
t_gpt3_2_b4 = np.mean(t_b4)
t_gpt3_2_m = np.mean(t_m)
t_gpt3_2_r = np.mean(t_r)
t_gpt3_2_c = np.mean(t_c)

0 tell me about cass county missouri - list homes sale - are you looking for a list of homes for sale in cass county missouri
100 Find information on ontario california airport. - directions location - do you want information on how to get to the ontario california airport or information about its location
200 Where can I buy pressure washers? - washer - are you interested in purchasing a pressure washer
300 Tell me more about Rocky Mountain News - recent events historical - are you looking for recent events or historical events
400 Where should I order dog clean-up bags - specif bag type - are you interested in a specific type of bag or just any bag that will work for dog clean-up
Full reference evaluation
b1 0.42323949261930216 b2 0.21974406668664828 b3 0.13476144197480297 b4 0.09177524838662333
rouge-L 0.4577721705677639
m 0.5238679886110711
c 0.9356470588235294
Question body evaluation
b1 0.3894379076796257 b2 0.22635225534200076 b3 0.14553205532907226 b4 0.10370774309698337
rouge-

## 5.4 Three-shot GPT3

In [20]:
facet_test_file = 'data/clariq_f/ClariQ-FKw-dev.tsv'
facet_test_data = pd.read_csv(facet_test_file, sep='\t')
_, facet_test_data = process_clariq_f(facet_test_data)

b1, b2, b3, b4 = [], [], [], []
m = []
r = []
c = []

t_b1, t_b2, t_b3, t_b4 = [], [], [], []
t_m = []
t_r = []
t_c = []

temperature = 0
use_examples = 3
model_output = 'fewshot_gpt3' + '_examples' + str(use_examples) + '_temp' + str(temperature) + '.csv'
model_output_all_templates = 'fewshot_gpt3' + '_examples' + str(use_examples) + '_temp' + str(temperature) +'_all'

all_generations = [] # cache the generations to save time and load from calling gpt3

if os.path.isfile(model_output):
    b1, b2, b3, b4, m, r, c, t_b1, t_b2, t_b3, t_b4, t_m, t_r, t_c = evaluate_from_output(model_output)
    
else:
    if os.path.isfile(model_output_all_templates):
        generated_cq_all_templates = open(model_output_all_templates, 'r').readlines()
        generated_cq_grouped = [[generated_cq_all_templates[len(starting_texts) * k + l] 
                                    for l in range(len(starting_texts))] 
                                    for k in range(int(len(generated_cq_all_templates)/8))]

        for iter, row in facet_test_data.iterrows():
            facet = facet_test_data.at[iter, 'facet_desc']
            ref = facet_test_data.at[iter, 'question']
            facet_list = facet.split()

            generated_cqs = []
            for full_sentence in generated_cq_grouped[iter]:
                query = re.sub('\[SEP\]', '&', full_sentence).split('&')[0].strip()
                generated_follow_up = re.sub('\[SEP\]', '&', full_sentence).split('&')[1].strip()
                generated_cq = re.sub('[.?]', '&', generated_follow_up).split('&')[0].strip()
                generated_cqs.append(generated_cq)
            
            noun_in_query = [token.text for token in pos_tagger(query) if token.pos_ == 'NOUN']
            propn_in_query = [token.text.lower() for token in pos_tagger(query) if token.pos_ == 'PROPN']

            template_scores = calculate_WSDM(query=' '.join(noun_in_query+propn_in_query+facet_list), doc_list=generated_cqs)
            sorted_template_scores = sorted(template_scores.items(), key = lambda x: x[1], reverse=True)
            generated_cq = sorted(template_scores.keys(), key = lambda x: template_scores[x], reverse=True)[0] 
            facet_test_data.at[iter, 'generated'] = generated_cq
            
            if iter % sample_every == 0: 
                print(iter, query, "-", facet, '-', generated_cq)
                pprint.pprint(sorted_template_scores)

            # full reference evaluation
            hyp_b1, hyp_b2, hyp_b3, hyp_b4, hyp_m, hyp_r, hyp_c = auto_evaluation(ref, generated_cq, facet)

            b1.append(hyp_b1)
            b2.append(hyp_b2)
            b3.append(hyp_b3)
            b4.append(hyp_b4)
            m.append(hyp_m)
            r.append(hyp_r)
            c.append(hyp_c)

            # question body evaluation
            truncate_ref = ' '.join(ref.split()[template_len:])
            truncate_generated_cq = ' '.join(generated_cq.split()[template_len:])
            
            t_hyp_b1, t_hyp_b2, t_hyp_b3, t_hyp_b4, t_hyp_m, t_hyp_r, t_hyp_c = auto_evaluation(truncate_ref, truncate_generated_cq, facet)

            t_b1.append(t_hyp_b1)
            t_b2.append(t_hyp_b2)
            t_b3.append(t_hyp_b3)
            t_b4.append(t_hyp_b4)
            t_m.append(t_hyp_m)
            t_r.append(t_hyp_r)
            t_c.append(t_hyp_c)

    else:
        for iter, row in facet_test_data.iterrows():
            query = facet_test_data.at[iter, 'initial_request']
            facet = facet_test_data.at[iter, 'facet_desc']
            ref = facet_test_data.at[iter, 'question']
            facet_list = facet.split()

            generated_cqs = []
            for s_t in starting_texts:
                s_t = re.sub('\[SEP\]', ' ', s_t).strip()
                prompt = ' '.join(gpt3_examples[:use_examples]) + ' ' + query + ' ' + "Ask a question that contains words in the list" + ' ' + "[" + ", ".join(["'"+f+"'" for f in facet.split()])  + '].' + s_t
                response = openai.Completion.create(
                    model="text-davinci-002",
                    prompt= prompt,
                    temperature=temperature,
                    max_tokens=32,
                    top_p=1,
                    frequency_penalty=0.0,
                    presence_penalty=0.0,
                    stop=["\n"]
                )

                generated_cq = s_t + response['choices'][0]['text']
                generated_cq = re.sub('\[SEP\]', ' ', generated_cq).strip()
                generated_cq = re.sub('[.?]', '&', generated_cq).split('&')[0].strip()
                generated_cqs.append(generated_cq)
                all_generations.append(generated_cq)

            noun_in_query = [token.text for token in pos_tagger(query) if token.pos_ == 'NOUN']
            propn_in_query = [token.text.lower() for token in pos_tagger(query) if token.pos_ == 'PROPN']

            template_scores = calculate_WSDM(query=' '.join(noun_in_query+propn_in_query+facet_list), doc_list=generated_cqs)
            sorted_template_scores = sorted(template_scores.items(), key = lambda x: x[1], reverse=True)
            generated_cq = sorted(template_scores.keys(), key = lambda x: template_scores[x], reverse=True)[0] 
            facet_test_data.at[iter, 'generated'] = generated_cq

            if iter % sample_every == 0: 
                print(iter, query, "-", facet_list, '-', ' '.join(tokenized_hyp))
                pprint.pprint(sorted_template_scores)
        
            # full reference evaluation
            hyp_b1, hyp_b2, hyp_b3, hyp_b4, hyp_m, hyp_r, hyp_c = auto_evaluation(ref, generated_cq, facet)

            b1.append(hyp_b1)
            b2.append(hyp_b2)
            b3.append(hyp_b3)
            b4.append(hyp_b4)
            m.append(hyp_m)
            r.append(hyp_r)
            c.append(hyp_c)

            # question body evaluation
            truncate_ref = ' '.join(ref.split()[template_len:])
            truncate_generated_cq = ' '.join(generated_cq.split()[template_len:])
            
            t_hyp_b1, t_hyp_b2, t_hyp_b3, t_hyp_b4, t_hyp_m, t_hyp_r, t_hyp_c = auto_evaluation(truncate_ref, truncate_generated_cq, facet)

            t_b1.append(t_hyp_b1)
            t_b2.append(t_hyp_b2)
            t_b3.append(t_hyp_b3)
            t_b4.append(t_hyp_b4)
            t_m.append(t_hyp_m)
            t_r.append(t_hyp_r)
            t_c.append(t_hyp_c)

    output_df = facet_test_data[['initial_request', 'facet_desc', 'question', 'generated']]
    output_df.columns = ['query', 'facet', 'reference', 'candidate']
    output_df.to_csv(model_output)

with open(model_output_all_templates, 'w') as outputfile:
    for generation in all_generations:
        outputfile.write(generation)
        outputfile.write('\n')

# full reference results
print("================================================================")
print("Full reference evaluation")
print("================================================================")
print("b1", np.mean(b1), "b2", np.mean(b2), "b3", np.mean(b3), "b4", np.mean(b4))
print("rouge-L", np.mean(r))
print("m", np.mean(m))
print("c", np.mean(c))

gpt3_3_b1 = np.mean(b1)
gpt3_3_b2 = np.mean(b2)
gpt3_3_b3 = np.mean(b3)
gpt3_3_b4 = np.mean(b4)
gpt3_3_m = np.mean(m)
gpt3_3_r = np.mean(r)
gpt3_3_c = np.mean(c)

# question body results
print("================================================================")
print("Question body evaluation")
print("================================================================")
print("b1", np.mean(t_b1), "b2", np.mean(t_b2), "b3", np.mean(t_b3), "b4", np.mean(t_b4))
print("rouge-L", np.mean(t_r))
print("m", np.mean(t_m))
print("c", np.mean(t_c))

t_gpt3_3_b1 = np.mean(t_b1)
t_gpt3_3_b2 = np.mean(t_b2)
t_gpt3_3_b3 = np.mean(t_b3)
t_gpt3_3_b4 = np.mean(t_b4)
t_gpt3_3_m = np.mean(t_m)
t_gpt3_3_r = np.mean(t_r)
t_gpt3_3_c = np.mean(t_c)

0 tell me about cass county missouri - list homes sale - are you looking for a list of homes for sale in cass county missouri
100 Find information on ontario california airport. - directions location - do you want information on how to get to the ontario california airport or information about its location
200 Where can I buy pressure washers? - washer - are you interested in purchasing a pressure washer
300 Tell me more about Rocky Mountain News - recent events historical - are you looking for recent events or historical events
400 Where should I order dog clean-up bags - specif bag type - are you interested in a specific type of bag or just any bag that will work for dog clean-up
Full reference evaluation
b1 0.42387925535792526 b2 0.2209577594993678 b3 0.13653434343394935 b4 0.09335854470351058
rouge-L 0.4580263700523444
m 0.5252491996168146
c 0.9352549019607843
Question body evaluation
b1 0.3899627493086948 b2 0.22707399306205175 b3 0.1470466676760886 b4 0.10557090206974408
rouge-L 

## 5.5 Ours
Same as 1.2

## 5.6 Comparing 3.1-3.5, get the conclusion for RQ3. "Our approach is a good replacement for GPT3."

In [23]:
print("-----------------------------------------------------------------------------------------")
print("|                               Full reference evaluation                               |")
print("-----------------------------------------------------------------------------------------")
print("|{:<24} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}|".format('MODEL', 'BLEU1','BLEU2','BLEU3','BLEU4','METEOR','ROUGE','COVERAGE'))
print("-----------------------------------------------------------------------------------------")
print("|{:<24} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}|".format('Ours-WSDM', 
                                                            round_metric(zero_nd_wsdm_b1), 
                                                            round_metric(zero_nd_wsdm_b2), 
                                                            round_metric(zero_nd_wsdm_b3), 
                                                            round_metric(zero_nd_wsdm_b4), 
                                                            round_metric(zero_nd_wsdm_m), 
                                                            round_metric(zero_nd_wsdm_r), 
                                                            round_metric(zero_nd_wsdm_c)))
print("-----------------------------------------------------------------------------------------")
print("|{:<24} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}|".format('Ours-AutoScore', 
                                                            round_metric(zero_nd_auto_b1), 
                                                            round_metric(zero_nd_auto_b2), 
                                                            round_metric(zero_nd_auto_b3), 
                                                            round_metric(zero_nd_auto_b4), 
                                                            round_metric(zero_nd_auto_m), 
                                                            round_metric(zero_nd_auto_r), 
                                                            round_metric(zero_nd_auto_c)))
print("-----------------------------------------------------------------------------------------")
print("|{:<24} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}|".format('Zero-shot GPT3', 
                                                            round_metric(gpt3_0_b1), 
                                                            round_metric(gpt3_0_b2), 
                                                            round_metric(gpt3_0_b3), 
                                                            round_metric(gpt3_0_b4), 
                                                            round_metric(gpt3_0_m), 
                                                            round_metric(gpt3_0_r),
                                                            round_metric(gpt3_0_c)))
print("-----------------------------------------------------------------------------------------")
print("|{:<24} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}|".format('One-shot GPT3', 
                                                            round_metric(gpt3_1_b1), 
                                                            round_metric(gpt3_1_b2), 
                                                            round_metric(gpt3_1_b3), 
                                                            round_metric(gpt3_1_b4), 
                                                            round_metric(gpt3_1_m), 
                                                            round_metric(gpt3_1_r),
                                                            round_metric(gpt3_1_c)))
print("-----------------------------------------------------------------------------------------")
print("|{:<24} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}|".format('Two-shot GPT3', 
                                                            round_metric(gpt3_2_b1), 
                                                            round_metric(gpt3_2_b2), 
                                                            round_metric(gpt3_2_b3), 
                                                            round_metric(gpt3_2_b4), 
                                                            round_metric(gpt3_2_m), 
                                                            round_metric(gpt3_2_r),
                                                            round_metric(gpt3_2_c)))
print("-----------------------------------------------------------------------------------------")
print("|{:<24} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}|".format('Three-shot GPT3', 
                                                            round_metric(gpt3_3_b1), 
                                                            round_metric(gpt3_3_b2), 
                                                            round_metric(gpt3_3_b3), 
                                                            round_metric(gpt3_3_b4), 
                                                            round_metric(gpt3_3_m), 
                                                            round_metric(gpt3_3_r),
                                                            round_metric(gpt3_3_c)))
print("-----------------------------------------------------------------------------------------")



print("-----------------------------------------------------------------------------------------")
print("|                                Question body evaluation                               |")
print("-----------------------------------------------------------------------------------------")
print("|{:<24} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}|".format('MODEL', 'BLEU1','BLEU2','BLEU3','BLEU4','METEOR','ROUGE','COVERAGE'))
print("-----------------------------------------------------------------------------------------")
print("|{:<24} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}|".format('Ours-WSDM', 
                                                            round_metric(t_zero_nd_wsdm_b1), 
                                                            round_metric(t_zero_nd_wsdm_b2), 
                                                            round_metric(t_zero_nd_wsdm_b3), 
                                                            round_metric(t_zero_nd_wsdm_b4), 
                                                            round_metric(t_zero_nd_wsdm_m), 
                                                            round_metric(t_zero_nd_wsdm_r), 
                                                            round_metric(t_zero_nd_wsdm_c)))
print("-----------------------------------------------------------------------------------------")
print("|{:<24} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}|".format('Ours-AutoScore', 
                                                            round_metric(t_zero_nd_auto_b1), 
                                                            round_metric(t_zero_nd_auto_b2), 
                                                            round_metric(t_zero_nd_auto_b3), 
                                                            round_metric(t_zero_nd_auto_b4), 
                                                            round_metric(t_zero_nd_auto_m), 
                                                            round_metric(t_zero_nd_auto_r), 
                                                            round_metric(t_zero_nd_auto_c)))
print("-----------------------------------------------------------------------------------------")
print("|{:<24} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}|".format('Zero-shot GPT3', 
                                                            round_metric(t_gpt3_0_b1), 
                                                            round_metric(t_gpt3_0_b2), 
                                                            round_metric(t_gpt3_0_b3), 
                                                            round_metric(t_gpt3_0_b4), 
                                                            round_metric(t_gpt3_0_m), 
                                                            round_metric(t_gpt3_0_r),
                                                            round_metric(t_gpt3_0_c)))
print("-----------------------------------------------------------------------------------------")
print("|{:<24} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}|".format('One-shot GPT3', 
                                                            round_metric(t_gpt3_1_b1), 
                                                            round_metric(t_gpt3_1_b2), 
                                                            round_metric(t_gpt3_1_b3), 
                                                            round_metric(t_gpt3_1_b4), 
                                                            round_metric(t_gpt3_1_m), 
                                                            round_metric(t_gpt3_1_r),
                                                            round_metric(t_gpt3_1_c)))
print("-----------------------------------------------------------------------------------------")
print("|{:<24} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}|".format('Two-shot GPT3', 
                                                            round_metric(t_gpt3_2_b1), 
                                                            round_metric(t_gpt3_2_b2), 
                                                            round_metric(t_gpt3_2_b3), 
                                                            round_metric(t_gpt3_2_b4), 
                                                            round_metric(t_gpt3_2_m), 
                                                            round_metric(t_gpt3_2_r),
                                                            round_metric(t_gpt3_2_c)))
print("-----------------------------------------------------------------------------------------")
print("|{:<24} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}|".format('Three-shot GPT3', 
                                                            round_metric(t_gpt3_3_b1), 
                                                            round_metric(t_gpt3_3_b2), 
                                                            round_metric(t_gpt3_3_b3), 
                                                            round_metric(t_gpt3_3_b4), 
                                                            round_metric(t_gpt3_3_m), 
                                                            round_metric(t_gpt3_3_r),
                                                            round_metric(t_gpt3_3_c)))
print("-----------------------------------------------------------------------------------------")

-----------------------------------------------------------------------------------------
|                               Full reference evaluation                               |
-----------------------------------------------------------------------------------------
|MODEL                    BLEU1    BLEU2    BLEU3    BLEU4    METEOR   ROUGE    COVERAGE|
-----------------------------------------------------------------------------------------
|Ours-WSDM                41.8     17.57    9.71     6.45     38.52    44.19    98.96   |
-----------------------------------------------------------------------------------------
|Ours-AutoScore           43.42    20.47    12.93    9.64     41.03    47.87    98.28   |
-----------------------------------------------------------------------------------------
|Zero-shot GPT3           42.96    21.45    12.98    8.76     45.95    46.69    85.27   |
-----------------------------------------------------------------------------------------
|One-shot 

# *Previous  experiments
## 2.1. Template + Facet + Huggingface constrained decoding + ranked by perplexity

In [2]:
facet_test_file = 'data/clariq_f/ClariQ-FKw-dev.tsv'
facet_test_data = pd.read_csv(facet_test_file, sep='\t')
_, facet_test_data = process_clariq_f(facet_test_data)

rs = rouge.Rouge()
rs_list = []
b1, b2, b3, b4 = [], [], [], []
m = []
c = []
ps = PorterStemmer()

model_output = 'zeroshot_hf.csv'

if os.path.isfile(model_output):
    model_output_data = pd.read_csv(model_output)

    for iter, row in facet_test_data.iterrows():
        query = model_output_data.at[iter, 'query']
        facet = facet_test_data.at[iter, 'facet_desc']
        ref = model_output_data.at[iter, 'reference']
        tokenized_ref = word_tokenize(ref)
        generated_cq = model_output_data.at[iter, 'candidate']
        tokenized_hyp = word_tokenize(generated_cq)
        
        if iter % 50 == 0: 
            print(iter, query, "-", facet, '-', generated_cq)

        rs_list.append(rs.get_scores(generated_cq, ref)[0]['rouge-l']['f'])
        
        b1.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(1, 0, 0, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b2.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 1, 0, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b3.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 0, 1, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b4.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 0, 0, 1),
                                smoothing_function = SmoothingFunction().method1
                                ))

        m.append(meteor_score([tokenized_ref], tokenized_hyp))

        constraint_unsatisfied = 0
        force_words = facet.split()
        for constraint in force_words:
            if ps.stem(constraint) not in set([ps.stem(w) for w in word_tokenize(generated_cq)]):
                constraint_unsatisfied += 1
        c.append(1 - constraint_unsatisfied/len(force_words))

else:
    for iter, row in facet_test_data.iterrows():
        facet = facet_test_data.at[iter, 'facet_desc']
        query = facet_test_data.at[iter, 'initial_request']
        ref = facet_test_data.at[iter, 'question']
        tokenized_ref = word_tokenize(ref)

        force_flexible = facet.split()
        force_words_ids = [ tokenizer(word, add_prefix_space=True, add_special_tokens=False).input_ids[0]
                            for word in force_flexible ]
        constraints = [PhrasalConstraint(force_words_ids)] 

        template_scores = {}
        for s_t in starting_texts:
            starting_text = query + s_t
            input_ids = tokenizer(starting_text, return_tensors="pt").input_ids.to(device)
            outputs = model.generate(
                input_ids,
                constraints=constraints,
                num_beams=10,
                num_return_sequences=5,
                no_repeat_ngram_size=1,
                remove_invalid_values=True,
                max_length = len(input_ids[0]) + 10,
                top_p=0.9, 
                temperature = 1,
                pad_token_id=tokenizer.eos_token_id
            )
            generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
            generated_cq = nltk_tokenizer.tokenize(generated_text[len(query):])[0]
            generated_cq = process_generation(generated_cq)
            generated_cq = ' '.join(word_tokenize(generated_cq))

            constraint_penalty = 1
            for constraint in force_flexible:
                if ps.stem(constraint) not in set([ps.stem(w) for w in word_tokenize(generated_cq)]):
                    constraint_penalty *= 2
            template_scores[generated_cq] = calculatePerplexity(sentence=query+generated_cq, model=model, tokenizer=tokenizer) \
                                            * constraint_penalty
        
        generated_cq = sorted(template_scores.keys(), key = lambda x: template_scores[x])[0] 

        facet_test_data.at[iter, 'generated'] = generated_cq
        tokenized_hyp = word_tokenize(facet_test_data.at[iter, 'generated'])
        
        if iter % 50 == 0: 
            print(iter, query, "-", force_flexible, '-', ' '.join(tokenized_hyp))
            pprint.pprint(template_scores)   

        rs_list.append(rs.get_scores(generated_cq, ref)[0]['rouge-l']['f'])
        
        b1.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(1, 0, 0, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b2.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 1, 0, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b3.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 0, 1, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b4.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 0, 0, 1),
                                smoothing_function = SmoothingFunction().method1
                                ))

        m.append(meteor_score([tokenized_ref], tokenized_hyp))
        
    output_df = facet_test_data[['initial_request', 'facet_desc', 'question', 'generated']]
    output_df.columns = ['query', 'facet', 'reference', 'candidate']
    output_df.to_csv(model_output)

print("b1", np.mean(b1), "b2", np.mean(b2), "b3", np.mean(b3), "b4", np.mean(b4))
print("rouge-L", np.mean(rs_list))
print("m", np.mean(m))
print("cons", np.mean(c))

zero_hf_b1 = np.mean(b1)
zero_hf_b2 = np.mean(b2)
zero_hf_b3 = np.mean(b3)
zero_hf_b4 = np.mean(b4)
zero_hf_m = np.mean(m)
zero_hf_r = np.mean(rs_list)
zero_hf_c = np.mean(c)

0 tell me about cass county missouri - list homes sale - would you like to know how list homes are list homes sale
50 tell me about memory - computer - do you want to know what it was like when the computer went down
100 Find information on ontario california airport. - directions location - would you like to be notified via e mail when this directions location
150 Find me map of USA - roads - do you need information on the roads and highways in your area
200 Where can I buy pressure washers? - washer - do you want to know what is the difference between a vacuum sealer and
250 Tell me about defender - lyrics - do you want to play with
300 Tell me more about Rocky Mountain News - recent events historical - do you want to know what the recent events have been like recent events historical
350 I want to know about appraisals. - appraisal cost - do you want to know if the appraisal is valid or not
400 Where should I order dog clean-up bags - specif bag type - do you need to know about spec

## 2.2 Only template + ranked by perplexity

In [3]:
facet_test_file = 'data/clariq_f/ClariQ-FKw-dev.tsv'
facet_test_data = pd.read_csv(facet_test_file, sep='\t')
_, facet_test_data = process_clariq_f(facet_test_data)

rs = rouge.Rouge()
rs_list = []
b1, b2, b3, b4 = [], [], [], []
m = []
c = []

model_output = 'template.csv'

if os.path.isfile(model_output):
    model_output_data = pd.read_csv(model_output)

    for iter, row in facet_test_data.iterrows():
        query = model_output_data.at[iter, 'query']
        facet = facet_test_data.at[iter, 'facet_desc']
        ref = model_output_data.at[iter, 'reference']
        tokenized_ref = word_tokenize(ref)
        generated_cq = model_output_data.at[iter, 'candidate']
        tokenized_hyp = word_tokenize(generated_cq)
        
        if iter % 50 == 0: 
            print(iter, query, "-", facet, '-', generated_cq)
        
        rs_list.append(rs.get_scores(generated_cq, ref)[0]['rouge-l']['f'])
        
        b1.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(1, 0, 0, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b2.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 1, 0, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b3.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 0, 1, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b4.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 0, 0, 1),
                                smoothing_function = SmoothingFunction().method1
                                ))

        m.append(meteor_score([tokenized_ref], tokenized_hyp))

        constraint_unsatisfied = 0
        force_words = facet.split()
        for constraint in force_words:
            if ps.stem(constraint) not in set([ps.stem(w) for w in word_tokenize(generated_cq)]):
                constraint_unsatisfied += 1
        c.append(1 - constraint_unsatisfied/len(force_words))
else:
    for iter, row in facet_test_data.iterrows():
        facet = facet_test_data.at[iter, 'facet_desc']
        query = facet_test_data.at[iter, 'initial_request']
        ref = facet_test_data.at[iter, 'question']
        tokenized_ref = word_tokenize(ref)

        template_scores = {}
        for s_t in starting_texts:
            starting_text = query + s_t
            input_ids = tokenizer(starting_text, return_tensors="pt").input_ids.to(device)
            outputs = model.generate(
                input_ids,
                num_beams=10,
                num_return_sequences=1,
                no_repeat_ngram_size=1,
                remove_invalid_values=True,
                max_length = len(input_ids[0]) + 20,
                top_p=0.9, 
                temperature = 0.7,
                pad_token_id=tokenizer.eos_token_id
            )
            generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
            generated_cq = nltk_tokenizer.tokenize(generated_text[len(query):])[0]
            generated_cq = process_generation(generated_cq)
            generated_cq = ' '.join(word_tokenize(generated_cq))

            
            template_scores[generated_cq] = calculatePerplexity(sentence=query+generated_cq, model=model, tokenizer=tokenizer)
        
        generated_cq = sorted(template_scores.keys(), key = lambda x: template_scores[x])[0] 

        facet_test_data.at[iter, 'generated'] = generated_cq
        tokenized_hyp = word_tokenize(facet_test_data.at[iter, 'generated'])
        
        if iter % 50 == 0: 
            print(iter, query, "-", facet, '-', generated_cq)
            pprint.pprint(template_scores)
        rs_list.append(rs.get_scores(generated_cq, ref)[0]['rouge-l']['f'])
        
        b1.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(1, 0, 0, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b2.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 1, 0, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b3.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 0, 1, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b4.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 0, 0, 1),
                                smoothing_function = SmoothingFunction().method1
                                ))

        m.append(meteor_score([tokenized_ref], tokenized_hyp))

    output_df = facet_test_data[['initial_request', 'facet_desc', 'question', 'generated']]
    output_df.columns = ['query', 'facet', 'reference', 'candidate']
    output_df.to_csv(model_output)

print("b1", np.mean(b1), "b2", np.mean(b2), "b3", np.mean(b3), "b4", np.mean(b4))
print("rouge-L", np.mean(rs_list))
print("m", np.mean(m))
print("c", np.mean(c))

template_b1 = np.mean(b1)
template_b2 = np.mean(b2)
template_b3 = np.mean(b3)
template_b4 = np.mean(b4)
template_m = np.mean(m)
template_r = np.mean(rs_list)
template_c = np.mean(c)

0 tell me about cass county missouri - list homes sale - do you want to know how many people have been killed in the state of Missouri and what are your thoughts on that
50 tell me about memory - computer - are you looking for something else to do with your life
100 Find information on ontario california airport. - directions location - would you like to be notified via e mail when this article is published
150 Find me map of USA - roads - do you want information on how to get there
200 Where can I buy pressure washers? - washer - do you want to know if there are any other types of saws in the world that will work well for this job
250 Tell me about defender - lyrics - do you need to be in the team
300 Tell me more about Rocky Mountain News - recent events historical - do you need information on how to get involved in the community
350 I want to know about appraisals. - appraisal cost - do you want to know how much money is involved in the appraisal process
400 Where should I order dog

## 2.3. No prompt, Facet + neurologic decoding + ranked by perplexity

In [6]:
facet_test_file = 'data/clariq_f/ClariQ-FKw-dev.tsv'
facet_test_data = pd.read_csv(facet_test_file, sep='\t')
_, facet_test_data = process_clariq_f(facet_test_data)

generated_file = 'neurologic_decoding/zero_shot/gpt2noprompt'

rs = rouge.Rouge()
rs_list = []
b1, b2, b3, b4 = [], [], [], []
m = []
c = []
ps = PorterStemmer()

generated_cqs = open(generated_file, 'r').readlines()

for iter, row in facet_test_data.iterrows():
    query = model_output_data.at[iter, 'query']
    facet = facet_test_data.at[iter, 'facet_desc']
    ref = model_output_data.at[iter, 'reference']
    tokenized_ref = word_tokenize(ref)
    generated_cq = generated_cqs[iter][len(facet_test_data.at[iter, 'initial_request']):].strip()
    facet_test_data.at[iter, 'generated'] = generated_cq
    tokenized_hyp = word_tokenize(facet_test_data.at[iter, 'generated'])
    
    if iter % 50 == 0: 
        print(iter, query, "-", facet, '-', generated_cq)
    
    rs_list.append(rs.get_scores(generated_cq, ref)[0]['rouge-l']['f'])
    
    b1.append(sentence_bleu([tokenized_ref], 
                            tokenized_hyp, 
                            weights=(1, 0, 0, 0),
                            smoothing_function = SmoothingFunction().method1
                            ))
    b2.append(sentence_bleu([tokenized_ref], 
                            tokenized_hyp, 
                            weights=(0, 1, 0, 0),
                            smoothing_function = SmoothingFunction().method1
                            ))
    b3.append(sentence_bleu([tokenized_ref], 
                            tokenized_hyp, 
                            weights=(0, 0, 1, 0),
                            smoothing_function = SmoothingFunction().method1
                            ))
    b4.append(sentence_bleu([tokenized_ref], 
                            tokenized_hyp, 
                            weights=(0, 0, 0, 1),
                            smoothing_function = SmoothingFunction().method1
                            ))

    m.append(meteor_score([tokenized_ref], tokenized_hyp))

    constraint_unsatisfied = 0
    force_words = facet.split()
    for constraint in force_words:
        if ps.stem(constraint) not in set([ps.stem(w) for w in word_tokenize(generated_cq)]):
            constraint_unsatisfied += 1
    c.append(1 - constraint_unsatisfied/len(force_words))

print("b1", np.mean(b1), "b2", np.mean(b2), "b3", np.mean(b3), "b4", np.mean(b4))
print("rouge-L", np.mean(rs_list))
print("m", np.mean(m))
print("c", np.mean(c))

output_df = facet_test_data[['initial_request', 'facet_desc', 'question', 'generated']]
output_df.columns = ['query', 'facet', 'reference', 'candidate']
output_df.to_csv('zeroshot_nd_noprompt_pp.csv')

facet_nd_b1 = np.mean(b1)
facet_nd_b2 = np.mean(b2)
facet_nd_b3 = np.mean(b3)
facet_nd_b4 = np.mean(b4)
facet_nd_m = np.mean(m)
facet_nd_r = np.mean(rs_list)
facet_nd_c = np.mean(c)

0 tell me about cass county missouri - list homes sale - county list of homes with sale records.
50 tell me about memory - computer - in computer science," he said.
100 Find information on ontario california airport. - directions location - Click here for directions and location.
150 Find me map of USA - roads - and UK roads and highways.
200 Where can I buy pressure washers? - washer - I don't know what they are.
250 Tell me about defender - lyrics - lyrics and what they mean.
300 Tell me more about Rocky Mountain News - recent events historical - and the recent historical events.
350 I want to know about appraisals. - appraisal cost - The cost of an appraisal.
400 Where should I order dog clean-up bags - specif bag type - and specifc type bag.
b1 0.21009466369496277 b2 0.06442674864631723 b3 0.028553873315365456 b4 0.022501687356610053
rouge-L 0.23507797559462323
m 0.19491474938070089
c 0.9027450980392155


## 2.7. Template + Facet&subject + neuro + perplexity (Ours v2)

In [11]:
facet_test_file = 'data/clariq_f/ClariQ-FKw-dev.tsv'
facet_test_data = pd.read_csv(facet_test_file, sep='\t')
_, facet_test_data = process_clariq_f(facet_test_data)


rs = rouge.Rouge()
rs_list = []
b1, b2, b3, b4 = [], [], [], []
m = []
ps = PorterStemmer()
c = []

model_output = 'zeroshot_taggednoun_nd_pp.csv'

if os.path.isfile(model_output):
    model_output_data = pd.read_csv(model_output)

    for iter, row in facet_test_data.iterrows():
        query = model_output_data.at[iter, 'query']
        facet = facet_test_data.at[iter, 'facet_desc']
        ref = model_output_data.at[iter, 'reference']
        tokenized_ref = word_tokenize(ref)
        generated_cq = model_output_data.at[iter, 'candidate']
        tokenized_hyp = word_tokenize(generated_cq)
        
        if iter % 50 == 0: 
            print(iter, query, "-", facet, '-', generated_cq)
            
        rs_list.append(rs.get_scores(generated_cq, ref)[0]['rouge-l']['f'])
        
        b1.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(1, 0, 0, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b2.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 1, 0, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b3.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 0, 1, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b4.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 0, 0, 1),
                                smoothing_function = SmoothingFunction().method1
                                ))

        m.append(meteor_score([tokenized_ref], tokenized_hyp))

        constraint_unsatisfied = 0
        force_words = facet.split()
        for constraint in force_words:
            if ps.stem(constraint) not in set([ps.stem(w) for w in word_tokenize(generated_cq)]):
                constraint_unsatisfied += 1
        c.append(1 - constraint_unsatisfied/len(force_words))
else:
    generated_file = 'neurologic_decoding/zero_shot/gpt2facet_taggednoun'
    generated_cq_all_templates = open(generated_file, 'r').readlines()
    generated_cq_grouped = [[generated_cq_all_templates[len(starting_texts) * k + l] 
                                for l in range(len(starting_texts))] 
                                for k in range(int(len(generated_cq_all_templates)/8))]
    for iter, row in facet_test_data.iterrows():
        query = ''
        facet = facet_test_data.at[iter, 'facet_desc']
        ref = facet_test_data.at[iter, 'question']
        tokenized_ref = word_tokenize(ref)
        force_flexible = facet.split()

        template_scores = {}
        for full_sentence in generated_cq_grouped[iter]:
            query = re.sub('\[SEP\]', '&', full_sentence).split('&')[0].strip()
            generated_follow_up = re.sub('\[SEP\]', '&', full_sentence).split('&')[1].strip()
            generated_cq = re.sub('[.?]', '&', generated_follow_up).split('&')[0].strip()
            constraint_penalty = 1
            for constraint in force_flexible:
                if ps.stem(constraint) not in set([ps.stem(w) for w in word_tokenize(generated_cq)]):
                    constraint_penalty *= 2
            template_scores[generated_cq] = calculatePerplexity(sentence=full_sentence, model=model, tokenizer=tokenizer) * constraint_penalty
        
        generated_cq = sorted(template_scores.keys(), key = lambda x: template_scores[x])[0] 

        facet_test_data.at[iter, 'generated'] = generated_cq
        tokenized_hyp = word_tokenize(facet_test_data.at[iter, 'generated'])
        
        if iter % 50 == 0: 
            print(iter, query, "-", facet, '-', generated_cq)
            pprint.pprint(template_scores)
        rs_list.append(rs.get_scores(generated_cq, ref)[0]['rouge-l']['f'])
        
        b1.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(1, 0, 0, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b2.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 1, 0, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b3.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 0, 1, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b4.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 0, 0, 1),
                                smoothing_function = SmoothingFunction().method1
                                ))

        m.append(meteor_score([tokenized_ref], tokenized_hyp))
    
    output_df = facet_test_data[['initial_request', 'facet_desc', 'question', 'generated']]
    output_df.columns = ['query', 'facet', 'reference', 'candidate']
    output_df.to_csv(model_output)

print("b1", np.mean(b1), "b2", np.mean(b2), "b3", np.mean(b3), "b4", np.mean(b4))
print("rouge-L", np.mean(rs_list))
print("m", np.mean(m))
print("c", np.mean(c))

zero_taggednoun_nd_b1 = np.mean(b1)
zero_taggednoun_nd_b2 = np.mean(b2)
zero_taggednoun_nd_b3 = np.mean(b3)
zero_taggednoun_nd_b4 = np.mean(b4)
zero_taggednoun_nd_m = np.mean(m)
zero_taggednoun_nd_r = np.mean(rs_list)
zero_taggednoun_nd_c = np.mean(c)

0 tell me about cass county missouri - list homes sale - do you want to know the list of sale homes
50 tell me about memory - computer - are you interested in computer science
100 Find information on ontario california airport. - directions location - do you want to know the location of the airport
150 Find me map of USA - roads - do you want to know what roads are in the map
200 Where can I buy pressure washers? - washer - do you want to know what pressure washer washers are
250 Tell me about defender - lyrics - do you want to know more about defender lyrics
300 Tell me more about Rocky Mountain News - recent events historical - would you like to know more about recent historical events
350 I want to know about appraisals. - appraisal cost - do you want to know what appraisal cost appra apprais
400 Where should I order dog clean-up bags - specif bag type - do you want to know what type specifical dog bag bags are available
b1 0.44002332248763826 b2 0.1849682947950811 b3 0.105298812629

## 2.8 Only "Are you interested in", no question body (boarderline)

In [12]:
facet_test_file = 'data/clariq_f/ClariQ-FKw-dev.tsv'
facet_test_data = pd.read_csv(facet_test_file, sep='\t')
_, facet_test_data = process_clariq_f(facet_test_data)

rs = rouge.Rouge()
rs_list = []
b1, b2, b3, b4 = [], [], [], []
m = []
ps = PorterStemmer()
c = []

generated_file = 'neurologic_decoding/zero_shot/gpt2facet'
generated_cq_all_templates = open(generated_file, 'r').readlines()
generated_cq_grouped = [[generated_cq_all_templates[len(starting_texts) * k + l] 
                            for l in range(len(starting_texts))] 
                            for k in range(int(len(generated_cq_all_templates)/8         
                            ))]

for iter, row in facet_test_data.iterrows():
    query = ''
    facet = facet_test_data.at[iter, 'facet_desc']
    ref = facet_test_data.at[iter, 'question']
    tokenized_ref = word_tokenize(ref)
    force_flexible = facet.split()

    generated_cq = 'Are you interested in'

    facet_test_data.at[iter, 'generated'] = generated_cq
    tokenized_hyp = word_tokenize(facet_test_data.at[iter, 'generated'])
    
    #print(query, "-", ' '.join(tokenized_hyp))
    #pprint.pprint(template_scores)
    rs_list.append(rs.get_scores(generated_cq, ref)[0]['rouge-l']['f'])
    
    b1.append(sentence_bleu([tokenized_ref], 
                            tokenized_hyp, 
                            weights=(1, 0, 0, 0),
                            smoothing_function = SmoothingFunction().method1
                            ))
    b2.append(sentence_bleu([tokenized_ref], 
                            tokenized_hyp, 
                            weights=(0, 1, 0, 0),
                            smoothing_function = SmoothingFunction().method1
                            ))
    b3.append(sentence_bleu([tokenized_ref], 
                            tokenized_hyp, 
                            weights=(0, 0, 1, 0),
                            smoothing_function = SmoothingFunction().method1
                            ))
    b4.append(sentence_bleu([tokenized_ref], 
                            tokenized_hyp, 
                            weights=(0, 0, 0, 1),
                            smoothing_function = SmoothingFunction().method1
                            ))

    m.append(meteor_score([tokenized_ref], tokenized_hyp))

    constraint_unsatisfied = 0
    force_words = facet.split()
    for constraint in force_words:
        if ps.stem(constraint) not in set([ps.stem(w) for w in word_tokenize(generated_cq)]):
            constraint_unsatisfied += 1
    c.append(1 - constraint_unsatisfied/len(force_words))

print("b1", np.mean(b1), "b2", np.mean(b2), "b3", np.mean(b3), "b4", np.mean(b4))
print("rouge-L", np.mean(rs_list))
print("m", np.mean(m))
print("c", np.mean(c))

template_only_b1 = np.mean(b1)
template_only_b2 = np.mean(b2)
template_only_b3 = np.mean(b3)
template_only_b4 = np.mean(b4)
template_only_m = np.mean(m)
template_only_r = np.mean(rs_list)
template_only_c = np.mean(c)

b1 0.09052243592983396 b2 0.03240188263147355 b3 0.030044300631148498 b4 0.027229303729754527
rouge-L 0.20390190521530863
m 0.16745748922004236
c 0.004705882352941176


### 4.2.1. Template + Facet & tagged noun + neuro + WSDM

In [17]:
from word_forms.word_forms import get_word_forms
facet_test_file = 'data/clariq_f/ClariQ-FKw-dev.tsv'
facet_test_data = pd.read_csv(facet_test_file, sep='\t')
_, facet_test_data = process_clariq_f(facet_test_data)


def calculate_WSDM(query, doc_list):
    lambda_t = 1
    lambda_o = 1
    lambda_u = 1
    mu = 25
    collection = ' '.join(doc_list)
    collection_size = len(collection)

    def tfq(word, doc):
        many_forms = get_word_forms(word)
        word_forms = [word for k in many_forms.keys() for word in many_forms[k]] 
        return sum( [sum([1 if w == wf else 0 for w in doc]) for wf in word_forms])

    def tf1(qk, qk1, doc):    
        many_formsk = get_word_forms(qk)
        word_formsk = [word for k in many_formsk.keys() for word in many_formsk[k]] 
        many_formsk1 = get_word_forms(qk1)
        word_formsk1 = [word for k in many_formsk1.keys() for word in many_formsk1[k]] 
        return sum( [sum([1 if qkf == doc[k] and qk1f == doc[k+1] else 0 for k in range(len(doc)-1)]) for qkf in word_formsk for qk1f in word_formsk1])
    
    def tfuw(qk, qj, doc):
        wsz = 2
        many_formsk = get_word_forms(qk)
        word_formsk = [word for k in many_formsk.keys() for word in many_formsk[k]] 
        many_formsj = get_word_forms(qj)
        word_formsj = [word for k in many_formsj.keys() for word in many_formsj[k]] 
        return sum( [sum([1 if qkf == doc[k] and qjf in doc[max(k-wsz,0):min(k+wsz,len(doc))] else 0 for k in range(len(doc))]) for qkf in word_formsk for qjf in word_formsj])

    def f_t(query, doc, collection):
        return sum([(tfq(word, doc.split()) + mu * tfq(word, collection.split())/collection_size) / (len(doc.split()) + mu) for word in query.split() ])
    
    def f_o(query, doc, collection):
        query = query.split()
        if len(query) < 2:
            return 0
        return sum([(tf1(query[k], query[k+1], doc.split()) + mu * tf1(query[k], query[k+1], collection.split())/collection_size) / (len(doc.split()) + mu)  for k in range(len(query)-1)])

    def f_u(query, doc, collection):
        query = list(set(query.split()))
        l = len(query)
        if l < 2:
            return 0
        return sum([(tfuw(query[k], query[j], doc.split()) + mu * tfuw(query[k], query[j], collection.split())/collection_size) / (len(doc.split()) + mu)  for k in range(l) for j in range(k+1, l)])

    return {
        doc:lambda_t * f_t(query, doc, collection) + \
            lambda_o * f_o(query, doc, collection) + \
            lambda_u * f_u(query, doc, collection) 
        for doc in doc_list
    }

rs_list = []
b1, b2, b3, b4 = [], [], [], []
m = []
ps = PorterStemmer()
c = []

model_output = 'zeroshot_taggednoun_nd_wsdm.csv'

if os.path.isfile(model_output):
    model_output_data = pd.read_csv(model_output)

    for iter, row in facet_test_data.iterrows():
        query = model_output_data.at[iter, 'query']
        facet = facet_test_data.at[iter, 'facet_desc']
        ref = model_output_data.at[iter, 'reference']
        tokenized_ref = word_tokenize(ref)
        generated_cq = model_output_data.at[iter, 'candidate']
        tokenized_hyp = word_tokenize(generated_cq)
        
        if iter % 50 == 0: 
            print(iter, query, "-", facet, '-', generated_cq)
            
        rs_list.append(rs.get_scores(generated_cq, ref)[0]['rouge-l']['f'])
        
        b1.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(1, 0, 0, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b2.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 1, 0, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b3.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 0, 1, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b4.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 0, 0, 1),
                                smoothing_function = SmoothingFunction().method1
                                ))

        m.append(meteor_score([tokenized_ref], tokenized_hyp))

        constraint_unsatisfied = 0
        force_words = facet.split()
        for constraint in force_words:
            if ps.stem(constraint) not in set([ps.stem(w) for w in word_tokenize(generated_cq)]):
                constraint_unsatisfied += 1
        c.append(1 - constraint_unsatisfied/len(force_words))


else:

    generated_file = 'neurologic_decoding/zero_shot/gpt2facet_taggednoun'
    generated_cq_all_templates = open(generated_file, 'r').readlines()
    generated_cq_grouped = [[generated_cq_all_templates[len(starting_texts) * k + l] 
                                for l in range(len(starting_texts))] 
                                for k in range(int(len(generated_cq_all_templates)/8))]
    for iter, row in facet_test_data.iterrows():
        query = ''
        facet = facet_test_data.at[iter, 'facet_desc']
        ref = facet_test_data.at[iter, 'question']
        tokenized_ref = word_tokenize(ref)
        force_flexible = facet.split()

        generated_cqs = []
        for full_sentence in generated_cq_grouped[iter]:
            query = re.sub('\[SEP\]', '&', full_sentence).split('&')[0].strip()
            generated_follow_up = re.sub('\[SEP\]', '&', full_sentence).split('&')[1].strip()
            generated_cq = re.sub('[.?]', '&', generated_follow_up).split('&')[0].strip()
            generated_cqs.append(generated_cq)
        
        template_scores = calculate_WSDM(query=facet, doc_list=generated_cqs)
        
        sorted_template_scores = sorted(template_scores.items(), key = lambda x: x[1], reverse=True)

        generated_cq = sorted(template_scores.keys(), key = lambda x: template_scores[x], reverse=True)[0] 

        facet_test_data.at[iter, 'generated'] = generated_cq
        tokenized_hyp = word_tokenize(facet_test_data.at[iter, 'generated'])
        
        
        if iter % 50 == 0: 
            print(iter, query, "-", facet, '-', generated_cq)
            pprint.pprint(sorted_template_scores)
        rs_list.append(rs.get_scores(generated_cq, ref)[0]['rouge-l']['f'])
        
        b1.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(1, 0, 0, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b2.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 1, 0, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b3.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 0, 1, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b4.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 0, 0, 1),
                                smoothing_function = SmoothingFunction().method1
                                ))

        m.append(meteor_score([tokenized_ref], tokenized_hyp))

    output_df = facet_test_data[['initial_request', 'facet_desc', 'question', 'generated']]
    output_df.columns = ['query', 'facet', 'reference', 'candidate']
    output_df.to_csv(model_output)

print("b1", np.mean(b1), "b2", np.mean(b2), "b3", np.mean(b3), "b4", np.mean(b4))
print("rouge-L", np.mean(rs_list))
print("m", np.mean(m))
print("c", np.mean(c))

zero_taggednoun_nd_wsdm_b1 = np.mean(b1)
zero_taggednoun_nd_wsdm_b2 = np.mean(b2)
zero_taggednoun_nd_wsdm_b3 = np.mean(b3)
zero_taggednoun_nd_wsdm_b4 = np.mean(b4)
zero_taggednoun_nd_wsdm_m = np.mean(m)
zero_taggednoun_nd_wsdm_r = np.mean(rs_list)
zero_taggednoun_nd_wsdm_c = np.mean(c)

0 tell me about cass county missouri - list homes sale - do you need information on the homes sale list
50 tell me about memory - computer - are you interested in computer science
100 Find information on ontario california airport. - directions location - are you looking for directions or location information
150 Find me map of USA - roads - are you looking for a map of the roads
200 Where can I buy pressure washers? - washer - are you looking for pressure washer washers
250 Tell me about defender - lyrics - do you want to know more about defender lyrics
300 Tell me more about Rocky Mountain News - recent events historical - are you interested in recent historical events
350 I want to know about appraisals. - appraisal cost - are you looking for appraisal cost
400 Where should I order dog clean-up bags - specif bag type - would you like to specifiy dog bag type bags
b1 0.4242048794497764 b2 0.17426603364908366 b3 0.09484695215887695 b4 0.06482489616750807
rouge-L 0.4457258369368304
m 0

# 5.  Neurologic Decoding result analysis by each question prompt (template)

In [23]:
facet_test_file = 'data/clariq_f/ClariQ-FKw-dev.tsv'
facet_test_data = pd.read_csv(facet_test_file, sep='\t')
_, facet_test_data = process_clariq_f(facet_test_data)

generated_file = 'neurologic_decoding/zero_shot/gpt2facet'

rs = rouge.Rouge()
rs_list = []
b1, b2, b3, b4 = [], [], [], []
m = []

generated_cq_all_templates = open(generated_file, 'r').readlines()
generated_cq_grouped = [[generated_cq_all_templates[len(starting_texts) * k + l] 
                            for l in range(len(starting_texts))] 
                            for k in range(int(len(generated_cq_all_templates)/8))]

ps = PorterStemmer()

for template_i in range(len(starting_texts)):
    rs_list = []
    b1, b2, b3, b4 = [], [], [], []
    m = []
    c = []
    for iter, row in facet_test_data.iterrows():
        query = ''
        facet = facet_test_data.at[iter, 'facet_desc']
        ref = facet_test_data.at[iter, 'question']
        tokenized_ref = word_tokenize(ref)
        force_flexible = facet.split()

        generated_cqs = []
        for full_sentence in generated_cq_grouped[iter]:
            query = re.sub('\[SEP\]', '&', full_sentence).split('&')[0].strip()
            generated_follow_up = re.sub('\[SEP\]', '&', full_sentence).split('&')[1].strip()
            generated_cq = re.sub('[.?]', '&', generated_follow_up).split('&')[0].strip()
            generated_cqs.append(generated_cq)

        '''
        rs_best = [(k, rs.get_scores(generated_cq, ref)[0]['rouge-l']['f'])
            for k, generated_cq in enumerate(generated_cqs)
        ]

        rs_best = sorted(rs_best, key = lambda x:x[1], reverse = True)
        '''
        best_id = template_i

        facet_test_data.at[iter, 'generated'] = generated_cqs[best_id]

        rs_list.append(rs.get_scores(generated_cqs[best_id], ref)[0]['rouge-l']['f'])
        
        b1.append(
            sentence_bleu([tokenized_ref],  word_tokenize(generated_cqs[best_id]), 
                                weights=(1, 0, 0, 0),
                                smoothing_function = SmoothingFunction().method1
        ))
        b2.append(
            sentence_bleu([tokenized_ref],  word_tokenize(generated_cqs[best_id]), 
                                weights=(0, 1, 0, 0),
                                smoothing_function = SmoothingFunction().method1
        ))
        b3.append(
            sentence_bleu([tokenized_ref],  word_tokenize(generated_cqs[best_id]), 
                                weights=(0, 0, 1, 0),
                                smoothing_function = SmoothingFunction().method1
        ))
        b4.append(
            sentence_bleu([tokenized_ref],  word_tokenize(generated_cqs[best_id]), 
                                weights=(0, 0, 0, 1),
                                smoothing_function = SmoothingFunction().method1
        ))

        m.append(meteor_score([tokenized_ref], word_tokenize(generated_cqs[best_id])))

        constraint_unsatisfied = 0
        force_words = facet.split()
        for constraint in force_words:
            if ps.stem(constraint) not in set([ps.stem(w) for w in word_tokenize(generated_cq)]):
                constraint_unsatisfied += 1
        c.append(1 - constraint_unsatisfied/len(force_words))
        
    
    print(starting_texts[template_i])
    print("b1", np.mean(b1), "b2", np.mean(b2), "b3", np.mean(b3), "b4", np.mean(b4))
    print("m", np.mean(m))
    print("rouge-L", np.mean(rs_list))
    print("c", np.mean(c))

## oracle

rs_list = []
b1, b2, b3, b4 = [], [], [], []
m = []
c = []

for iter, row in facet_test_data.iterrows():
    query = ''
    facet = facet_test_data.at[iter, 'facet_desc']
    ref = facet_test_data.at[iter, 'question']
    tokenized_ref = word_tokenize(ref)
    force_flexible = facet.split()

    generated_cqs = []
    for full_sentence in generated_cq_grouped[iter]:
        query = re.sub('\[SEP\]', '&', full_sentence).split('&')[0].strip()
        generated_follow_up = re.sub('\[SEP\]', '&', full_sentence).split('&')[1].strip()
        generated_cq = re.sub('[.?]', '&', generated_follow_up).split('&')[0].strip()
        generated_cqs.append(generated_cq)

    
    rs_best = [(k, rs.get_scores(generated_cq, ref)[0]['rouge-l']['f'])
        for k, generated_cq in enumerate(generated_cqs)
    ]

    rs_best = sorted(rs_best, key = lambda x:x[1], reverse = True)
    best_id = rs_best[0][0]

    facet_test_data.at[iter, 'generated'] = generated_cqs[best_id]

    rs_list.append(rs.get_scores(generated_cqs[best_id], ref)[0]['rouge-l']['f'])
    
    b1.append(
        sentence_bleu([tokenized_ref],  word_tokenize(generated_cqs[best_id]), 
                            weights=(1, 0, 0, 0),
                            smoothing_function = SmoothingFunction().method1
    ))
    b2.append(
        sentence_bleu([tokenized_ref],  word_tokenize(generated_cqs[best_id]), 
                            weights=(0, 1, 0, 0),
                            smoothing_function = SmoothingFunction().method1
    ))
    b3.append(
        sentence_bleu([tokenized_ref],  word_tokenize(generated_cqs[best_id]), 
                            weights=(0, 0, 1, 0),
                            smoothing_function = SmoothingFunction().method1
    ))
    b4.append(
        sentence_bleu([tokenized_ref],  word_tokenize(generated_cqs[best_id]), 
                            weights=(0, 0, 0, 1),
                            smoothing_function = SmoothingFunction().method1
    ))

    m.append(meteor_score([tokenized_ref], word_tokenize(generated_cqs[best_id])))

    constraint_unsatisfied = 0
    force_words = facet.split()
    for constraint in force_words:
        if ps.stem(constraint) not in set([ps.stem(w) for w in word_tokenize(generated_cq)]):
            constraint_unsatisfied += 1
    c.append(1 - constraint_unsatisfied/len(force_words))


print('oracle')
print("b1", np.mean(b1), "b2", np.mean(b2), "b3", np.mean(b3), "b4", np.mean(b4))
print("m", np.mean(m))
print("rouge-L", np.mean(rs_list))
print("c", np.mean(c))

[SEP] are you looking for
b1 0.4174069226224852 b2 0.22187649477432006 b3 0.1533567781840256 b4 0.12163278454272482
m 0.4121104559849342
rouge-L 0.46478386214961215
c 0.8964705882352941
[SEP] do you want to know
b1 0.4321942454843958 b2 0.19389510498075813 b3 0.11558840523981023 b4 0.07344376763353411
m 0.4138996199559468
rouge-L 0.45402395768220605
c 0.8964705882352941
[SEP] would you like to
b1 0.41677213227730003 b2 0.17027090657557872 b3 0.10502780242708211 b4 0.06548335471949998
m 0.3862641413515435
rouge-L 0.44457932910308434
c 0.8964705882352941
[SEP] are you interested in
b1 0.41052554687096887 b2 0.18729103322918872 b3 0.10072285376409679 b4 0.0685977893139586
m 0.38309885935181376
rouge-L 0.4396009010664169
c 0.8964705882352941
[SEP] do you need information
b1 0.3624816392337853 b2 0.13168083028262945 b3 0.05690658424184476 b4 0.029389967541808372
m 0.35076128904595166
rouge-L 0.3819194215378325
c 0.8964705882352941
[SEP] do you want information
b1 0.3801903034295857 b2 0.154

# 5.1. Neurologic Decoding result analysis by facet word count

In [24]:
facet_test_file = 'data/clariq_f/ClariQ-FKw-dev.tsv'
facet_test_data = pd.read_csv(facet_test_file, sep='\t')
_, facet_test_data = process_clariq_f(facet_test_data)

generated_file = 'neurologic_decoding/zero_shot/gpt2facet'

rs = rouge.Rouge()
b1_1, b2_1, b3_1, b4_1 = [], [], [], []
m_1 = []
rs_list_1 = []
ranking_acc_1 = []
c_1 = []

b1_2, b2_2, b3_2, b4_2 = [], [], [], []
m_2 = []
rs_list_2 = []
ranking_acc_2 = []
c_2 = []

b1_3, b2_3, b3_3, b4_3 = [], [], [], []
m_3 = []
rs_list_3 = []
ranking_acc_3 = []
c_3 = []

b1_4, b2_4, b3_4, b4_4 = [], [], [], []
m_4 = []
rs_list_4 = []
ranking_acc_4 = []
c_4 = []

generated_cq_all_templates = open(generated_file, 'r').readlines()
generated_cq_grouped = [[generated_cq_all_templates[len(starting_texts) * k + l] 
                            for l in range(len(starting_texts))] 
                            for k in range(int(len(generated_cq_all_templates)/8))]

ps = PorterStemmer()

for iter, row in facet_test_data.iterrows():
    query = ''
    facet = facet_test_data.at[iter, 'facet_desc']
    ref = facet_test_data.at[iter, 'question']
    tokenized_ref = word_tokenize(ref)
    force_flexible = facet.split()        

    template_scores = {}
    generated_cqs = []
    for full_sentence in generated_cq_grouped[iter]:
        query = re.sub('\[SEP\]', '&', full_sentence).split('&')[0].strip()
        generated_follow_up = re.sub('\[SEP\]', '&', full_sentence).split('&')[1].strip()
        generated_cq = re.sub('[.?]', '&', generated_follow_up).split('&')[0].strip()
        generated_cqs.append(generated_cq)
        constraint_penalty = 1
        for constraint in force_flexible:
            if ps.stem(constraint) not in set([ps.stem(w) for w in word_tokenize(generated_cq)]):
                constraint_penalty *= 2
        template_scores[generated_cq] = calculatePerplexity(sentence=full_sentence, model=model, tokenizer=tokenizer) * constraint_penalty
    
    rs_best = [(generated_cq, rs.get_scores(generated_cq, ref)[0]['rouge-l']['f'])
        for k, generated_cq in enumerate(generated_cqs)
    ]

    rs_best = sorted(rs_best, key = lambda x:x[1], reverse = True)
    best_cq = rs_best[0][0]

    generated_cq = sorted(template_scores.keys(), key = lambda x: template_scores[x])[0] 

    facet_test_data.at[iter, 'generated'] = generated_cq
    tokenized_hyp = word_tokenize(facet_test_data.at[iter, 'generated'])
    
    if len(force_flexible) == 1:

        rs_list_1.append(rs.get_scores(generated_cq, ref)[0]['rouge-l']['f'])
        
        b1_1.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(1, 0, 0, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b2_1.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 1, 0, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b3_1.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 0, 1, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b4_1.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 0, 0, 1),
                                smoothing_function = SmoothingFunction().method1
                                ))

        m_1.append(meteor_score([tokenized_ref], tokenized_hyp))
        ranking_acc_1.append(1 if best_cq == generated_cq else 0)

        constraint_unsatisfied = 0
        force_words = facet.split()
        for constraint in force_words:
            if ps.stem(constraint) not in set([ps.stem(w) for w in word_tokenize(generated_cq)]):
                constraint_unsatisfied += 1
        c_1.append(1 - constraint_unsatisfied/len(force_words))


    elif len(force_flexible) == 2:
        rs_list_2.append(rs.get_scores(generated_cq, ref)[0]['rouge-l']['f'])
        
        b1_2.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(1, 0, 0, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b2_2.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 1, 0, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b3_2.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 0, 1, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b4_2.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 0, 0, 1),
                                smoothing_function = SmoothingFunction().method1
                                ))

        m_2.append(meteor_score([tokenized_ref], tokenized_hyp))
        ranking_acc_2.append(1 if best_cq == generated_cq else 0)

        constraint_unsatisfied = 0
        force_words = facet.split()
        for constraint in force_words:
            if ps.stem(constraint) not in set([ps.stem(w) for w in word_tokenize(generated_cq)]):
                constraint_unsatisfied += 1
        c_2.append(1 - constraint_unsatisfied/len(force_words))

    elif len(force_flexible) == 3:
        rs_list_3.append(rs.get_scores(generated_cq, ref)[0]['rouge-l']['f'])
        
        b1_3.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(1, 0, 0, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b2_3.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 1, 0, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b3_3.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 0, 1, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b4_3.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 0, 0, 1),
                                smoothing_function = SmoothingFunction().method1
                                ))

        m_3.append(meteor_score([tokenized_ref], tokenized_hyp))
        ranking_acc_3.append(1 if best_cq == generated_cq else 0)

        constraint_unsatisfied = 0
        force_words = facet.split()
        for constraint in force_words:
            if ps.stem(constraint) not in set([ps.stem(w) for w in word_tokenize(generated_cq)]):
                constraint_unsatisfied += 1
        c_3.append(1 - constraint_unsatisfied/len(force_words))


    else:
        rs_list_4.append(rs.get_scores(generated_cq, ref)[0]['rouge-l']['f'])
        
        b1_4.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(1, 0, 0, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b2_4.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 1, 0, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b3_4.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 0, 1, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b4_4.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 0, 0, 1),
                                smoothing_function = SmoothingFunction().method1
                                ))

        m_4.append(meteor_score([tokenized_ref], tokenized_hyp))
        ranking_acc_4.append(1 if best_cq == generated_cq else 0)

        constraint_unsatisfied = 0
        force_words = facet.split()
        for constraint in force_words:
            if ps.stem(constraint) not in set([ps.stem(w) for w in word_tokenize(generated_cq)]):
                constraint_unsatisfied += 1
        c_4.append(1 - constraint_unsatisfied/len(force_words))


print("Facet 1", "count", len(m_1), "%", len(m_1)/(len(m_1+m_2+m_3+m_4)))
print("b1", np.mean(b1_1), "b2", np.mean(b2_1), "b3", np.mean(b3_1), "b4", np.mean(b4_1))
print("rouge-L", np.mean(rs_list_1))
print("m", np.mean(m_1))
print("reranking", np.mean(ranking_acc_1))
print("c", np.mean(c_1))

print("Facet 2", "count", len(m_2), "%", len(m_2)/(len(m_1+m_2+m_3+m_4)))
print("b1", np.mean(b1_2), "b2", np.mean(b2_2), "b3", np.mean(b3_2), "b4", np.mean(b4_2))
print("rouge-L", np.mean(rs_list_2))
print("m", np.mean(m_2))
print("reranking", np.mean(ranking_acc_2))
print("c", np.mean(c_2))

print("Facet 3", "count", len(m_3), "%", len(m_3)/(len(m_1+m_2+m_3+m_4)))
print("b1", np.mean(b1_3), "b2", np.mean(b2_3), "b3", np.mean(b3_3), "b4", np.mean(b4_3))
print("rouge-L", np.mean(rs_list_3))
print("m", np.mean(m_3))
print("reranking", np.mean(ranking_acc_3))
print("c", np.mean(c_3))

print("Facet 4+", "count", len(m_4), "%", len(m_4)/(len(m_1+m_2+m_3+m_4)))
print("b1", np.mean(b1_4), "b2", np.mean(b2_4), "b3", np.mean(b3_4), "b4", np.mean(b4_4))
print("rouge-L", np.mean(rs_list_4))
print("m", np.mean(m_4))
print("reranking", np.mean(ranking_acc_4))
print("c", np.mean(c_4))

Facet 1 count 176 % 0.41411764705882353
b1 0.37066004628009935 b2 0.16648085923807768 b3 0.10720332666666116 b4 0.07229187120720551
rouge-L 0.4143029459167021
m 0.36525013976195686
reranking 0.23295454545454544
c 0.9545454545454546
Facet 2 count 167 % 0.39294117647058824
b1 0.44930889741235686 b2 0.1926663900118912 b3 0.11052348497415793 b4 0.07530655005025497
rouge-L 0.4613084686140523
m 0.41954701182229287
reranking 0.16766467065868262
c 0.9940119760479041
Facet 3 count 57 % 0.13411764705882354
b1 0.49581447382812605 b2 0.23869937283842516 b3 0.14656948195102065 b4 0.09623476150229873
rouge-L 0.5178166345544951
m 0.4764083837759933
reranking 0.15789473684210525
c 1.0
Facet 4+ count 25 % 0.058823529411764705
b1 0.5307288955582292 b2 0.22797396076788806 b3 0.11222597116530317 b4 0.05376991257708576
rouge-L 0.49804562112769185
m 0.5067972981060292
reranking 0.12
c 0.99


# 5.2. Analysis by facet word count on HF decoding results

In [25]:
facet_test_file = 'data/clariq_f/ClariQ-FKw-dev.tsv'
facet_test_data = pd.read_csv(facet_test_file, sep='\t')
_, facet_test_data = process_clariq_f(facet_test_data)

generated_df = pd.read_csv('zeroshot_hf.csv')

rs = rouge.Rouge()
rs_list_1 = []
b1_1, b2_1, b3_1, b4_1 = [], [], [], []
m_1 = []
rs_list_1 = []

b1_2, b2_2, b3_2, b4_2 = [], [], [], []
m_2 = []
rs_list_2 = []

b1_3, b2_3, b3_3, b4_3 = [], [], [], []
m_3 = []
rs_list_3 = []

b1_4, b2_4, b3_4, b4_4 = [], [], [], []
m_4 = []
rs_list_4 = []


ps = PorterStemmer()

for iter, row in facet_test_data.iterrows():
    query = ''
    facet = facet_test_data.at[iter, 'facet_desc']
    ref = facet_test_data.at[iter, 'question']
    tokenized_ref = word_tokenize(ref)
    force_flexible = facet.split()

    generated_cq = generated_df.at[iter, 'candidate']
    facet_test_data.at[iter, 'generated'] = generated_df.at[iter, 'candidate']
    tokenized_hyp = word_tokenize(facet_test_data.at[iter, 'generated'])
    
    if len(force_flexible) == 1:

        rs_list_1.append(rs.get_scores(generated_cq, ref)[0]['rouge-l']['f'])
        
        b1_1.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(1, 0, 0, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b2_1.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 1, 0, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b3_1.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 0, 1, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b4_1.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 0, 0, 1),
                                smoothing_function = SmoothingFunction().method1
                                ))

        m_1.append(meteor_score([tokenized_ref], tokenized_hyp))

    elif len(force_flexible) == 2:
        rs_list_2.append(rs.get_scores(generated_cq, ref)[0]['rouge-l']['f'])
        
        b1_2.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(1, 0, 0, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b2_2.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 1, 0, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b3_2.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 0, 1, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b4_2.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 0, 0, 1),
                                smoothing_function = SmoothingFunction().method1
                                ))

        m_2.append(meteor_score([tokenized_ref], tokenized_hyp))

    elif len(force_flexible) == 3:
        rs_list_3.append(rs.get_scores(generated_cq, ref)[0]['rouge-l']['f'])
        
        b1_3.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(1, 0, 0, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b2_3.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 1, 0, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b3_3.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 0, 1, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b4_3.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 0, 0, 1),
                                smoothing_function = SmoothingFunction().method1
                                ))

        m_3.append(meteor_score([tokenized_ref], tokenized_hyp))

    else:
        rs_list_4.append(rs.get_scores(generated_cq, ref)[0]['rouge-l']['f'])
        
        b1_4.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(1, 0, 0, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b2_4.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 1, 0, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b3_4.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 0, 1, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b4_4.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 0, 0, 1),
                                smoothing_function = SmoothingFunction().method1
                                ))

        m_4.append(meteor_score([tokenized_ref], tokenized_hyp))


print("Facet 1", "count", len(m_1), "%", len(m_1)/(len(m_1+m_2+m_3+m_4)))
print("b1", np.mean(b1_1), "b2", np.mean(b2_1), "b3", np.mean(b3_1), "b4", np.mean(b4_1))
print("rouge-L", np.mean(rs_list_1))
print("m", np.mean(m_1))

print("Facet 2", "count", len(m_2), "%", len(m_2)/(len(m_1+m_2+m_3+m_4)))
print("b1", np.mean(b1_2), "b2", np.mean(b2_2), "b3", np.mean(b3_2), "b4", np.mean(b4_2))
print("rouge-L", np.mean(rs_list_2))
print("m", np.mean(m_2))

print("Facet 3", "count", len(m_3), "%", len(m_3)/(len(m_1+m_2+m_3+m_4)))
print("b1", np.mean(b1_3), "b2", np.mean(b2_3), "b3", np.mean(b3_3), "b4", np.mean(b4_3))
print("rouge-L", np.mean(rs_list_3))
print("m", np.mean(m_3))

print("Facet 4+", "count", len(m_4), "%", len(m_4)/(len(m_1+m_2+m_3+m_4)))
print("b1", np.mean(b1_4), "b2", np.mean(b2_4), "b3", np.mean(b3_4), "b4", np.mean(b4_4))
print("rouge-L", np.mean(rs_list_4))
print("m", np.mean(m_4))

Facet 1 count 176 % 0.41411764705882353
b1 0.27550844287382986 b2 0.08444366092780525 b3 0.0482652234780481 b4 0.033841097393735003
rouge-L 0.2965379003453253
m 0.27554149450784154
Facet 2 count 167 % 0.39294117647058824
b1 0.3211927016220878 b2 0.11521572272004516 b3 0.07002232708974651 b4 0.04742007288917467
rouge-L 0.3436276368035422
m 0.32994245947243234
Facet 3 count 57 % 0.13411764705882354
b1 0.37100093680480056 b2 0.11233053592905683 b3 0.0516392139102477 b4 0.03381056724647838
rouge-L 0.3856013355210162
m 0.3516601800854668
Facet 4+ count 25 % 0.058823529411764705
b1 0.42528586585242734 b2 0.13284774192629373 b3 0.07730630169513093 b4 0.0515464719139813
rouge-L 0.3994982516907758
m 0.35643684579964274


# 5.3. Analysis by facet word count on finetuning baseline

In [26]:
facet_test_file = 'data/clariq_f/ClariQ-FKw-dev.tsv'
facet_test_data = pd.read_csv(facet_test_file, sep='\t')
_, facet_test_data = process_clariq_f(facet_test_data)

generated_df = pd.read_csv('sekulic.csv')

rs = rouge.Rouge()
rs_list_1 = []
b1_1, b2_1, b3_1, b4_1 = [], [], [], []
m_1 = []
rs_list_1 = []

b1_2, b2_2, b3_2, b4_2 = [], [], [], []
m_2 = []
rs_list_2 = []

b1_3, b2_3, b3_3, b4_3 = [], [], [], []
m_3 = []
rs_list_3 = []

b1_4, b2_4, b3_4, b4_4 = [], [], [], []
m_4 = []
rs_list_4 = []


ps = PorterStemmer()

for iter, row in facet_test_data.iterrows():
    query = ''
    facet = facet_test_data.at[iter, 'facet_desc']
    ref = facet_test_data.at[iter, 'question']
    tokenized_ref = word_tokenize(ref)
    force_flexible = facet.split()

    generated_cq = generated_df.at[iter, 'candidate']
    facet_test_data.at[iter, 'generated'] = generated_df.at[iter, 'candidate']
    tokenized_hyp = word_tokenize(facet_test_data.at[iter, 'generated'])
    
    if len(force_flexible) == 1:

        rs_list_1.append(rs.get_scores(generated_cq, ref)[0]['rouge-l']['f'])
        
        b1_1.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(1, 0, 0, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b2_1.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 1, 0, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b3_1.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 0, 1, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b4_1.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 0, 0, 1),
                                smoothing_function = SmoothingFunction().method1
                                ))

        m_1.append(meteor_score([tokenized_ref], tokenized_hyp))

    elif len(force_flexible) == 2:
        rs_list_2.append(rs.get_scores(generated_cq, ref)[0]['rouge-l']['f'])
        
        b1_2.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(1, 0, 0, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b2_2.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 1, 0, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b3_2.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 0, 1, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b4_2.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 0, 0, 1),
                                smoothing_function = SmoothingFunction().method1
                                ))

        m_2.append(meteor_score([tokenized_ref], tokenized_hyp))

    elif len(force_flexible) == 3:
        rs_list_3.append(rs.get_scores(generated_cq, ref)[0]['rouge-l']['f'])
        
        b1_3.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(1, 0, 0, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b2_3.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 1, 0, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b3_3.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 0, 1, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b4_3.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 0, 0, 1),
                                smoothing_function = SmoothingFunction().method1
                                ))

        m_3.append(meteor_score([tokenized_ref], tokenized_hyp))

    else:
        rs_list_4.append(rs.get_scores(generated_cq, ref)[0]['rouge-l']['f'])
        
        b1_4.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(1, 0, 0, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b2_4.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 1, 0, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b3_4.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 0, 1, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b4_4.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 0, 0, 1),
                                smoothing_function = SmoothingFunction().method1
                                ))

        m_4.append(meteor_score([tokenized_ref], tokenized_hyp))


print("Facet 1", "count", len(m_1), "%", len(m_1)/(len(m_1+m_2+m_3+m_4)))
print("b1", np.mean(b1_1), "b2", np.mean(b2_1), "b3", np.mean(b3_1), "b4", np.mean(b4_1))
print("rouge-L", np.mean(rs_list_1))
print("m", np.mean(m_1))

print("Facet 2", "count", len(m_2), "%", len(m_2)/(len(m_1+m_2+m_3+m_4)))
print("b1", np.mean(b1_2), "b2", np.mean(b2_2), "b3", np.mean(b3_2), "b4", np.mean(b4_2))
print("rouge-L", np.mean(rs_list_2))
print("m", np.mean(m_2))

print("Facet 3", "count", len(m_3), "%", len(m_3)/(len(m_1+m_2+m_3+m_4)))
print("b1", np.mean(b1_3), "b2", np.mean(b2_3), "b3", np.mean(b3_3), "b4", np.mean(b4_3))
print("rouge-L", np.mean(rs_list_3))
print("m", np.mean(m_3))

print("Facet 4+", "count", len(m_4), "%", len(m_4)/(len(m_1+m_2+m_3+m_4)))
print("b1", np.mean(b1_4), "b2", np.mean(b2_4), "b3", np.mean(b3_4), "b4", np.mean(b4_4))
print("rouge-L", np.mean(rs_list_4))
print("m", np.mean(m_4))

Facet 1 count 176 % 0.41411764705882353
b1 0.27280088325905355 b2 0.10163992421528394 b3 0.053693751447859574 b4 0.030843726070661395
rouge-L 0.31157616449628356
m 0.28959097021452185
Facet 2 count 167 % 0.39294117647058824
b1 0.2670542077791494 b2 0.1063942612909681 b3 0.06144455024652616 b4 0.04133252298327092
rouge-L 0.30837631521180603
m 0.27363438989728
Facet 3 count 57 % 0.13411764705882354
b1 0.30781525643213287 b2 0.1252111938715249 b3 0.07710040862054839 b4 0.05537387034736785
rouge-L 0.3456929112139785
m 0.3024132761408417
Facet 4+ count 25 % 0.058823529411764705
b1 0.3105070547995927 b2 0.12173359803414291 b3 0.06166130708769793 b4 0.03171710450988532
rouge-L 0.34828246920422423
m 0.2989814730411402


## 6.9. Huggingface decoding question body

In [35]:
facet_test_file = 'data/clariq_f/ClariQ-FKw-dev.tsv'
facet_test_data = pd.read_csv(facet_test_file, sep='\t')
_, facet_test_data = process_clariq_f(facet_test_data)

model_output = 'zeroshot_hf.csv'
model_output_data = pd.read_csv(model_output)

rs = rouge.Rouge()
rs_list = []
b1, b2, b3, b4 = [], [], [], []
m = []
c = []

generated_cq_all_templates = open(generated_file, 'r').readlines()
generated_cq_grouped = [[generated_cq_all_templates[len(starting_texts) * k + l] 
                            for l in range(len(starting_texts))] 
                            for k in range(int(len(generated_cq_all_templates)/8))]

template_len = 4
ps = PorterStemmer()

for iter, row in facet_test_data.iterrows():
    facet = facet_test_data.at[iter,'facet_desc']
    ref = ' '.join(facet_test_data.at[iter, 'question'].split()[template_len:])
    generated_cq = ' '.join(model_output_data.at[iter, 'candidate'].split()[template_len:])

    tokenized_ref = word_tokenize(ref)
    tokenized_hyp = word_tokenize(generated_cq)

    if generated_cq == '':
        rs_list.append(0)
    else:
        rs_list.append(rs.get_scores(generated_cq, ref)[0]['rouge-l']['f'])
    
    b1.append(sentence_bleu([tokenized_ref], 
                            tokenized_hyp, 
                            weights=(1, 0, 0, 0),
                            smoothing_function = SmoothingFunction().method1
                            ))
    b2.append(sentence_bleu([tokenized_ref], 
                            tokenized_hyp, 
                            weights=(0, 1, 0, 0),
                            smoothing_function = SmoothingFunction().method1
                            ))
    b3.append(sentence_bleu([tokenized_ref], 
                            tokenized_hyp, 
                            weights=(0, 0, 1, 0),
                            smoothing_function = SmoothingFunction().method1
                            ))
    b4.append(sentence_bleu([tokenized_ref], 
                            tokenized_hyp, 
                            weights=(0, 0, 0, 1),
                            smoothing_function = SmoothingFunction().method1
                            ))

    m.append(meteor_score([tokenized_ref], tokenized_hyp))

    constraint_unsatisfied = 0
    force_words = facet.split()
    for constraint in force_words:
        if ps.stem(constraint) not in set([ps.stem(w) for w in word_tokenize(generated_cq)]):
            constraint_unsatisfied += 1
    c.append(1 - constraint_unsatisfied/len(force_words))

print("b1", np.mean(b1), "b2", np.mean(b2), "b3", np.mean(b3), "b4", np.mean(b4))
print("rouge-L", np.mean(rs_list))
print("m", np.mean(m))
print("c", np.mean(c))

hf_truncate_b1 = np.mean(b1)
hf_truncate_b2 = np.mean(b2)
hf_truncate_b3 = np.mean(b3)
hf_truncate_b4 = np.mean(b4)
hf_truncate_m = np.mean(m)
hf_truncate_r = np.mean(rs_list)
hf_truncate_c = np.mean(c)

b1 0.20177504099202298 b2 0.04947821965166067 b3 0.022721678213357348 b4 0.01882946356277396
rouge-L 0.22264293607342042
m 0.20600085382212535
c 0.7207843137254902


## 6.10. No prompt, facet only + neuro.


In [36]:
facet_test_file = 'data/clariq_f/ClariQ-FKw-dev.tsv'
facet_test_data = pd.read_csv(facet_test_file, sep='\t')
_, facet_test_data = process_clariq_f(facet_test_data)

model_output = 'zeroshot_nd_noprompt_pp.csv'
model_output_data = pd.read_csv(model_output)

rs = rouge.Rouge()
rs_list = []
b1, b2, b3, b4 = [], [], [], []
m = []
c = []

generated_cq_all_templates = open(generated_file, 'r').readlines()
generated_cq_grouped = [[generated_cq_all_templates[len(starting_texts) * k + l] 
                            for l in range(len(starting_texts))] 
                            for k in range(int(len(generated_cq_all_templates)/8))]

template_len = 0
ps = PorterStemmer()

for iter, row in facet_test_data.iterrows():
    facet = facet_test_data.at[iter,'facet_desc']
    ref = ' '.join(facet_test_data.at[iter, 'question'].split()[template_len:])
    generated_cq = ' '.join(model_output_data.at[iter, 'candidate'].split()[template_len:])

    tokenized_ref = word_tokenize(ref)
    tokenized_hyp = word_tokenize(generated_cq)

    if generated_cq == '':
        rs_list.append(0)
    else:
        rs_list.append(rs.get_scores(generated_cq, ref)[0]['rouge-l']['f'])
    
    b1.append(sentence_bleu([tokenized_ref], 
                            tokenized_hyp, 
                            weights=(1, 0, 0, 0),
                            smoothing_function = SmoothingFunction().method1
                            ))
    b2.append(sentence_bleu([tokenized_ref], 
                            tokenized_hyp, 
                            weights=(0, 1, 0, 0),
                            smoothing_function = SmoothingFunction().method1
                            ))
    b3.append(sentence_bleu([tokenized_ref], 
                            tokenized_hyp, 
                            weights=(0, 0, 1, 0),
                            smoothing_function = SmoothingFunction().method1
                            ))
    b4.append(sentence_bleu([tokenized_ref], 
                            tokenized_hyp, 
                            weights=(0, 0, 0, 1),
                            smoothing_function = SmoothingFunction().method1
                            ))

    m.append(meteor_score([tokenized_ref], tokenized_hyp))

    constraint_unsatisfied = 0
    force_words = facet.split()
    for constraint in force_words:
        if ps.stem(constraint) not in set([ps.stem(w) for w in word_tokenize(generated_cq)]):
            constraint_unsatisfied += 1
    c.append(1 - constraint_unsatisfied/len(force_words))

print("b1", np.mean(b1), "b2", np.mean(b2), "b3", np.mean(b3), "b4", np.mean(b4))
print("rouge-L", np.mean(rs_list))
print("m", np.mean(m))
print("c", np.mean(c))

facet_nd_truncate_b1 = np.mean(b1)
facet_nd_truncate_b2 = np.mean(b2)
facet_nd_truncate_b3 = np.mean(b3)
facet_nd_truncate_b4 = np.mean(b4)
facet_nd_truncate_m = np.mean(m)
facet_nd_truncate_r = np.mean(rs_list)
facet_nd_truncate_c = np.mean(c)

b1 0.21009466369496277 b2 0.06442674864631723 b3 0.028553873315365456 b4 0.022501687356610053
rouge-L 0.23507797559462323
m 0.19491474938070089
c 0.9027450980392155


## Full question evaluation adding GPT3 results and finetune GPT2 with prompt

In [44]:
def round_metric(num):
    return round(num * 100, 2)

print ("Full question evaluation")
print ("{:<24} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}".format('MODEL', 'BLEU1','BLEU2','BLEU3','BLEU4','METEOR','ROUGE', 'COVERAGE'))
print ("{:<24} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}".format('Template-only BDL', 
                                                            round_metric(template_only_b1), 
                                                            round_metric(template_only_b2), 
                                                            round_metric(template_only_b3), 
                                                            round_metric(template_only_b4), 
                                                            round_metric(template_only_m), 
                                                            round_metric(template_only_r), 
                                                            round_metric(template_only_c)))
print ("{:<24} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}".format('Prompt', 
                                                            round_metric(template_b1), 
                                                            round_metric(template_b2), 
                                                            round_metric(template_b3), 
                                                            round_metric(template_b4), 
                                                            round_metric(template_m), 
                                                            round_metric(template_r), 
                                                            round_metric(template_c)))
print ("{:<24} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}".format('Facet only (ND)', 
                                                            round_metric(facet_nd_b1), 
                                                            round_metric(facet_nd_b2), 
                                                            round_metric(facet_nd_b3), 
                                                            round_metric(facet_nd_b4), 
                                                            round_metric(facet_nd_m), 
                                                            round_metric(facet_nd_r), 
                                                            round_metric(facet_nd_c)))
print ("{:<24} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}".format('Prompt append Facet', 
                                                            round_metric(template_facet_b1), 
                                                            round_metric(template_facet_b2), 
                                                            round_metric(template_facet_b3), 
                                                            round_metric(template_facet_b4), 
                                                            round_metric(template_facet_m), 
                                                            round_metric(template_facet_r), 
                                                            round_metric(template_facet_c)))
print ("{:<24} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}".format('Finetuned GPT2 (Base)', 
                                                            round_metric(sekulic_b1), 
                                                            round_metric(sekulic_b2), 
                                                            round_metric(sekulic_b3), 
                                                            round_metric(sekulic_b4), 
                                                            round_metric(sekulic_m), 
                                                            round_metric(sekulic_r), 
                                                            round_metric(sekulic_c)))
print ("{:<24} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}".format('Finetuned GPT2 w/ prompt', 
                                                            round_metric(ftgpt2_prompt_b1), 
                                                            round_metric(ftgpt2_prompt_b2), 
                                                            round_metric(ftgpt2_prompt_b3), 
                                                            round_metric(ftgpt2_prompt_b4), 
                                                            round_metric(ftgpt2_prompt_m), 
                                                            round_metric(ftgpt2_prompt_r), 
                                                            round_metric(ftgpt2_prompt_c)))
print ("{:<24} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}".format('Zero-shot HF decoding', 
                                                            round_metric(zero_hf_b1), 
                                                            round_metric(zero_hf_b2), 
                                                            round_metric(zero_hf_b3), 
                                                            round_metric(zero_hf_b4), 
                                                            round_metric(zero_hf_m), 
                                                            round_metric(zero_hf_r), 
                                                            round_metric(zero_hf_c)))
print ("{:<24} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}".format('Zero-shot ND decoding', 
                                                            round_metric(zero_nd_b1), 
                                                            round_metric(zero_nd_b2), 
                                                            round_metric(zero_nd_b3), 
                                                            round_metric(zero_nd_b4), 
                                                            round_metric(zero_nd_m), 
                                                            round_metric(zero_nd_r), 
                                                            round_metric(zero_nd_c)))
print ("{:<24} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}".format('0-shot GPT3', 
                                                            round_metric(gpt3_0_b1), 
                                                            round_metric(gpt3_0_b2), 
                                                            round_metric(gpt3_0_b3), 
                                                            round_metric(gpt3_0_b4), 
                                                            round_metric(gpt3_0_m), 
                                                            round_metric(gpt3_0_r), 
                                                            round_metric(gpt3_0_c)))
print ("{:<24} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}".format('1-shot GPT3', 
                                                            round_metric(gpt3_1_b1), 
                                                            round_metric(gpt3_1_b2), 
                                                            round_metric(gpt3_1_b3), 
                                                            round_metric(gpt3_1_b4), 
                                                            round_metric(gpt3_1_m), 
                                                            round_metric(gpt3_1_r), 
                                                            round_metric(gpt3_1_c)))
print ("{:<24} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}".format('2-shot GPT3', 
                                                            round_metric(gpt3_2_b1), 
                                                            round_metric(gpt3_2_b2), 
                                                            round_metric(gpt3_2_b3), 
                                                            round_metric(gpt3_2_b4), 
                                                            round_metric(gpt3_2_m), 
                                                            round_metric(gpt3_2_r), 
                                                            round_metric(gpt3_2_c)))
print ("{:<24} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8} {:<8}".format('3-shot GPT3', 
                                                            round_metric(gpt3_3_b1), 
                                                            round_metric(gpt3_3_b2), 
                                                            round_metric(gpt3_3_b3), 
                                                            round_metric(gpt3_3_b4), 
                                                            round_metric(gpt3_3_m), 
                                                            round_metric(gpt3_3_r), 
                                                            round_metric(gpt3_3_c)))


Full question evaluation
MODEL                    BLEU1    BLEU2    BLEU3    BLEU4    METEOR   ROUGE    COVERAGE
Template-only BDL        9.05     3.24     3.0      2.72     16.75    20.39    0.47    
Prompt                   19.92    7.06     4.68     3.08     21.35    22.95    2.22    
Facet only (ND)          21.01    6.44     2.86     2.25     19.49    23.51    90.27   
Prompt append Facet      37.93    13.89    9.65     7.51     33.74    45.21    100.0   
Finetuned GPT2 (Base)    27.75    10.79    6.03     3.83     28.56    31.71    20.85   
Finetuned GPT2 w/ prompt 32.79    14.58    8.57     5.63     37.55    40.81    72.55   
Zero-shot HF decoding    31.51    10.31    5.9      4.02     31.19    33.3     72.55   
Zero-shot ND decoding    42.78    19.01    11.41    7.56     40.98    45.16    97.82   
0-shot GPT3              43.11    21.0     13.13    8.65     47.54    46.92    86.35   
1-shot GPT3              45.27    23.0     14.25    9.32     49.06    48.13    93.84   
2-shot 

# Generating trial evaluation data

## zero-shot gpt2

In [71]:
from word_forms.word_forms import get_word_forms
facet_test_file = 'data/clariq_f/ClariQ-FKw-trial.tsv'
facet_test_data = pd.read_csv(facet_test_file, sep='\t')

write_to_file = 'neurologic_decoding/dataset/clean/constraint/test.constraint.json'
prompt_write_to_file = 'neurologic_decoding/dataset/clean/init/commongen.test.init.txt'
no_prompt_write_to_file = 'neurologic_decoding/dataset/clean/init/commongen_no_prompt.test.init.txt'

pos_tagger = spacy.load('en_core_web_sm')

all_queries= []
all_constraints = []
for iter, row in facet_test_data.iterrows():
    facet = facet_test_data.at[iter, 'facet_desc']
    query = facet_test_data.at[iter, 'initial_request']
    noun_in_query = [token.text for token in pos_tagger(query) if token.pos_ == 'NOUN']
    propn_in_query = [token.text for token in pos_tagger(query) if token.pos_ == 'PROPN']
    all_queries.append(query)

    constraints = [[term] for term in facet.split()+noun_in_query]
    #constraints += [[' '.join(propn_in_query)]]
    #for facet_word in facet.split():
    #for facet_word in facet.split():
    #    many_forms = get_word_forms(facet_word)
    #    constraints.append(list(set([word for k in many_forms.keys() for word in many_forms[k] ]+[facet_word])))
    all_constraints.append(constraints)


with open(write_to_file, 'w') as output:
    for constraints in all_constraints:
        for k, prompt in enumerate(starting_texts):
            json_str = json.dumps(constraints)
            output.write(json_str)
            output.write('\n')

with open(prompt_write_to_file, 'w') as output:
    for query in all_queries:
        for k, prompt in enumerate(starting_texts):
            output.write(query + prompt)
            output.write('\n')

with open(no_prompt_write_to_file, 'w') as output:
    for query in all_queries:
        output.write(query)
        output.write('\n')

## using WSDM ranker

In [10]:
facet_test_file = 'data/clariq_f/ClariQ-FKw-trial.tsv'
facet_test_data = pd.read_csv(facet_test_file, sep='\t')
_, facet_test_data = process_clariq_f(facet_test_data)

rs = rouge.Rouge()
rs_list = []
b1, b2, b3, b4 = [], [], [], []
m = []
ps = PorterStemmer()
c = []

model_output = 'zeroshot_trial.csv'

if os.path.isfile(model_output):
    model_output_data = pd.read_csv(model_output)

    for iter, row in facet_test_data.iterrows():
        query = model_output_data.at[iter, 'query']
        facet = facet_test_data.at[iter, 'facet_desc']
        ref = model_output_data.at[iter, 'reference']
        tokenized_ref = word_tokenize(ref)
        generated_cq = model_output_data.at[iter, 'candidate']
        tokenized_hyp = word_tokenize(generated_cq)
        
        if iter % 50 == 0: 
            print(iter, query, "-", facet, '-', generated_cq)
            
        rs_list.append(rs.get_scores(generated_cq, ref)[0]['rouge-l']['f'])
        
        b1.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(1, 0, 0, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b2.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 1, 0, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b3.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 0, 1, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b4.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 0, 0, 1),
                                smoothing_function = SmoothingFunction().method1
                                ))

        m.append(meteor_score([tokenized_ref], tokenized_hyp))

        constraint_unsatisfied = 0
        force_words = facet.split()
        for constraint in force_words:
            if ps.stem(constraint) not in set([ps.stem(w) for w in word_tokenize(generated_cq)]):
                constraint_unsatisfied += 1
        c.append(1 - constraint_unsatisfied/len(force_words))
else:
    pos_tagger = spacy.load('en_core_web_sm')
    generated_file = 'neurologic_decoding/zero_shot/gpt2trial'
    generated_cq_all_templates = open(generated_file, 'r').readlines()
    generated_cq_grouped = [[generated_cq_all_templates[len(starting_texts) * k + l] 
                                for l in range(len(starting_texts))] 
                                for k in range(int(len(generated_cq_all_templates)/8))]
    for iter, row in facet_test_data.iterrows():
        query = ''
        facet = facet_test_data.at[iter, 'facet_desc']
        ref = facet_test_data.at[iter, 'question']
        tokenized_ref = word_tokenize(ref)
        force_flexible = facet.split()
            
        generated_cqs = []
        for full_sentence in generated_cq_grouped[iter]:
            query = re.sub('\[SEP\]', '&', full_sentence).split('&')[0].strip()
            generated_follow_up = re.sub('\[SEP\]', '&', full_sentence).split('&')[1].strip()
            generated_cq = re.sub('[.?]', '&', generated_follow_up).split('&')[0].strip()
            generated_cqs.append(generated_cq)
        
        center_words = ' '.join([token.text for token in pos_tagger(query) if token.pos_ == 'NOUN' or  token.pos_ == 'PROPN'])

        template_scores = calculate_WSDM(query=facet + ' ' + center_words, doc_list=generated_cqs)
        
        sorted_template_scores = sorted(template_scores.items(), key = lambda x: x[1], reverse=True)

        generated_cq = sorted(template_scores.keys(), key = lambda x: template_scores[x], reverse=True)[0] 

        facet_test_data.at[iter, 'generated'] = generated_cq
        tokenized_hyp = word_tokenize(facet_test_data.at[iter, 'generated'])
        
        if iter % 1 == 0: 
            print(iter, query, "-", facet, '-', generated_cq)
            pprint.pprint(template_scores)
        rs_list.append(rs.get_scores(generated_cq, ref)[0]['rouge-l']['f'])
        
        b1.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(1, 0, 0, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b2.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 1, 0, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b3.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 0, 1, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b4.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 0, 0, 1),
                                smoothing_function = SmoothingFunction().method1
                                ))

        m.append(meteor_score([tokenized_ref], tokenized_hyp))
    
    output_df = facet_test_data[['initial_request', 'facet_desc', 'question', 'generated']]
    output_df.columns = ['query', 'facet', 'reference', 'candidate']
    output_df.to_csv(model_output)

print("b1", np.mean(b1), "b2", np.mean(b2), "b3", np.mean(b3), "b4", np.mean(b4))
print("rouge-L", np.mean(rs_list))
print("m", np.mean(m))
print("c", np.mean(c))

0 tell me about uss yorktown charleston SC - aircrafts - are you looking for aircrafts
50 What are specific dangers of asbestos? - exposure asnestos - would you like to discuss asbestos exposure dangers
b1 0.4141665089717724 b2 0.1253300292517271 b3 0.056597530124790185 b4 0.0315843977495502
rouge-L 0.4001556893146436
m 0.3492758853895794
c 0.9595959595959596


## zero-shot gpt3

In [76]:
import os
import openai

facet_test_file = 'data/clariq_f/ClariQ-FKw-trial.tsv'
facet_test_data = pd.read_csv(facet_test_file, sep='\t')
_, facet_test_data = process_clariq_f(facet_test_data)

rs_list = []
b1, b2, b3, b4 = [], [], [], []
m = []
c = []

temperature = 0
use_examples = 0
model_output = 'zeroshotgpt3_trial.csv'

if os.path.isfile(model_output):
    model_output_data = pd.read_csv(model_output)

    for iter, row in facet_test_data.iterrows():
        query = model_output_data.at[iter, 'query']
        facet = facet_test_data.at[iter, 'facet_desc']
        ref = model_output_data.at[iter, 'reference']
        tokenized_ref = word_tokenize(ref)
        generated_cq = model_output_data.at[iter, 'candidate']
        tokenized_hyp = word_tokenize(generated_cq)
        
        if iter % 50 == 0: 
            print(iter, query, "-", facet, '-', generated_cq)
        
        rs_list.append(rs.get_scores(generated_cq, ref)[0]['rouge-l']['f'])
        
        b1.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(1, 0, 0, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b2.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 1, 0, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b3.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 0, 1, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b4.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 0, 0, 1),
                                smoothing_function = SmoothingFunction().method1
                                ))

        m.append(meteor_score([tokenized_ref], tokenized_hyp))

        constraint_unsatisfied = 0
        force_words = facet.split()
        for constraint in force_words:
            if ps.stem(constraint) not in set([ps.stem(w) for w in word_tokenize(generated_cq)]):
                constraint_unsatisfied += 1
        c.append(1 - constraint_unsatisfied/len(force_words))
else:
    for iter, row in facet_test_data.iterrows():
        query = facet_test_data.at[iter, 'initial_request']
        facet = facet_test_data.at[iter, 'facet_desc']
        ref = facet_test_data.at[iter, 'question']
        tokenized_ref = word_tokenize(ref)
        force_flexible = facet.split()

        template_scores = {}
        for s_t in starting_texts:
            s_t = re.sub('\[SEP\]', ' ', s_t).strip()
            prompt = ' '.join(gpt3_examples[:use_examples]) + ' ' + query + ' ' + "Ask a question that contains words in the list" + ' ' + "[" + ", ".join(["'"+f+"'" for f in facet.split()])  + '].' + s_t
            response = openai.Completion.create(
                model="text-davinci-002",
                prompt= prompt,
                temperature=temperature,
                max_tokens=32,
                top_p=1,
                frequency_penalty=0.0,
                presence_penalty=0.0,
                stop=["\n"]
            )
            
            generated_cq = s_t + response['choices'][0]['text']
            generated_cq = re.sub('\[SEP\]', ' ', generated_cq).strip()
            generated_cq = re.sub('[.?]', '&', generated_cq).split('&')[0].strip()

            constraint_penalty = 1
            for constraint in force_flexible:
                if ps.stem(constraint) not in set([ps.stem(w) for w in word_tokenize(generated_cq)]):
                    constraint_penalty *= 2
            template_scores[generated_cq] = calculatePerplexity(sentence=query + generated_cq, model=model, tokenizer=tokenizer) * constraint_penalty
        
        generated_cq = sorted(template_scores.keys(), key = lambda x: template_scores[x])[0] 
        facet_test_data.at[iter, 'generated'] = generated_cq
        tokenized_hyp = word_tokenize(facet_test_data.at[iter, 'generated'])
        

        if iter % 50 == 0: 
            print(iter, query, "-", force_flexible, '-', ' '.join(tokenized_hyp))
            pprint.pprint(template_scores)
    
        rs_list.append(rs.get_scores(generated_cq, ref)[0]['rouge-l']['f'])
        
        b1.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(1, 0, 0, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b2.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 1, 0, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b3.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 0, 1, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b4.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 0, 0, 1),
                                smoothing_function = SmoothingFunction().method1
                                ))

        m.append(meteor_score([tokenized_ref], tokenized_hyp))

    output_df = facet_test_data[['initial_request', 'facet_desc', 'question', 'generated']]
    output_df.columns = ['query', 'facet', 'reference', 'candidate']
    output_df.to_csv(model_output)


print("b1", np.mean(b1), "b2", np.mean(b2), "b3", np.mean(b3), "b4", np.mean(b4))
print("rouge-L", np.mean(rs_list))
print("m", np.mean(m))
print("c", np.mean(c))


0 tell me about uss yorktown charleston SC - aircrafts - are you interested in aircrafts
50 What are specific dangers of asbestos? - exposure asnestos - are you interested in the dangers of asbestos exposure
b1 0.4055506205267506 b2 0.21345373382999158 b3 0.1277100675827996 b4 0.08804459598751083
rouge-L 0.4553104620841699
m 0.4382431443929386
c 0.791077441077441


## finetuning gpt2

In [16]:
batch_size = 32
sample_every = 100
epochs = 8
learning_rate = 5e-5
warmup_steps = 1e2
epsilon = 1e-8
max_length = 128
prompt_instruction = ''

facet_test_file = 'data/clariq_f/ClariQ-FKw-trial.tsv'
facet_test_data = pd.read_csv(facet_test_file, sep='\t')
_, facet_test_data = process_clariq_f(facet_test_data)

rs_list = []
b1, b2, b3, b4 = [], [], [], []
m = []
c = []

temperature = 0.1
model_output = 'ftgpt2_trial.csv'

if os.path.isfile(model_output):
    model_output_data = pd.read_csv(model_output)

    for iter, row in facet_test_data.iterrows():
        query = model_output_data.at[iter, 'query']
        facet = facet_test_data.at[iter, 'facet_desc']
        ref = model_output_data.at[iter, 'reference']
        tokenized_ref = word_tokenize(ref)
        generated_cq = model_output_data.at[iter, 'candidate']
        tokenized_hyp = word_tokenize(generated_cq)
        
        if iter % 50 == 0: 
            print(iter, query, "-", facet, '-', generated_cq)
        
        rs_list.append(rs.get_scores(generated_cq, ref)[0]['rouge-l']['f'])
        
        b1.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(1, 0, 0, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b2.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 1, 0, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b3.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 0, 1, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b4.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 0, 0, 1),
                                smoothing_function = SmoothingFunction().method1
                                ))

        m.append(meteor_score([tokenized_ref], tokenized_hyp))

        constraint_unsatisfied = 0
        force_words = facet.split()
        for constraint in force_words:
            if ps.stem(constraint) not in set([ps.stem(w) for w in word_tokenize(generated_cq)]):
                constraint_unsatisfied += 1
        c.append(1 - constraint_unsatisfied/len(force_words))

else:  
    print("Output file not found, generating output.") 
    model_dir = './model_save/'+str(epochs)+'/'
    if os.path.exists(model_dir):
        tokenizer = GPT2Tokenizer.from_pretrained(model_dir, bos_token=BOS, eos_token=EOS, pad_token=PAD) 
        configuration = GPT2Config.from_pretrained(model_dir, output_hidden_states=False)
        model = GPT2LMHeadModel.from_pretrained(model_dir, config=configuration)
        model.resize_token_embeddings(len(tokenizer))
        model.cuda()
    else:
        print("Model checkpoint not found, finetuning.")
        clariq_f_train_file = 'data/clariq_f/ClariQ-FKw-train_no_trial.tsv'
        clariq_f_train_data = pd.read_csv(clariq_f_train_file, sep='\t') 
        clariq_f_train_dict, clariq_f_train_data = process_clariq_f(clariq_f_train_data)
        clariq_f_train_text_list = clariq_f_train_data['instructional_q_f_cq']

        class GPT2Dataset(Dataset):
            def __init__(self, txt_list, tokenizer, gpt2_type="gpt2", max_length=768):
                self.tokenizer = tokenizer
                self.input_ids = []
                self.attn_masks = []
            
                print("training text example", txt_list[0])
                for txt in txt_list:
                    encodings_dict = tokenizer(txt, truncation=True, max_length=max_length, padding="max_length")
                    self.input_ids.append(T.tensor(encodings_dict['input_ids']))
                    self.attn_masks.append(T.tensor(encodings_dict['attention_mask']))
            
            def __len__(self):
                return len(self.input_ids)
            
            def __getitem__(self, idx):
                return self.input_ids[idx], self.attn_masks[idx] 
            
        dataset = GPT2Dataset(clariq_f_train_text_list, tokenizer, max_length=max_length)

        train_size = int(0.99 * len(dataset))
        val_size = len(dataset) - train_size

        train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
        print('{:>5,} train /{:>5,} val'.format(train_size, val_size))

        train_dataloader = DataLoader(
        train_dataset,
            sampler = RandomSampler(train_dataset),
            batch_size = batch_size
        )

        validation_dataloader = DataLoader(
        val_dataset,
            sampler = SequentialSampler(val_dataset),
            batch_size = batch_size
        )

        optimizer = AdamW(model.parameters(),
            lr = learning_rate,
            eps = epsilon
        )

        total_steps = len(train_dataloader) * epochs
        scheduler = get_linear_schedule_with_warmup(optimizer, 
            num_warmup_steps = warmup_steps, 
            num_training_steps = total_steps
        )

        training_stats = []

        for epoch_i in range(0, epochs):
            # ========================================
            #               Training
            # ========================================
            print("")
            print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
            print('Training...')

            total_train_loss = 0

            model.train()

            for step, batch in enumerate(train_dataloader):

                b_input_ids = batch[0].to(device)
                b_labels = batch[0].to(device)
                b_masks = batch[1].to(device)

                model.zero_grad()        

                outputs = model(  b_input_ids,
                                labels=b_labels, 
                                attention_mask = b_masks,
                                token_type_ids=None
                                )
                loss = outputs[0]  

                batch_loss = loss.item()
                total_train_loss += batch_loss

                # Get sample every x batches.
                if step % sample_every == 0 and not step == 0:
                    print('  Batch {:>5,}  of  {:>5,}. Loss: {:>5,}.'.format(step, len(train_dataloader), batch_loss))
                    model.eval()
                    model.train()

                loss.backward()
                optimizer.step()
                scheduler.step()

            # Calculate the average loss over all of the batches.
            avg_train_loss = total_train_loss / len(train_dataloader)       
            
            # Measure how long this epoch took.

            print("")
            print("  Average training loss: {0:.2f}".format(avg_train_loss))       
            # ========================================
            #               Validation
            # ========================================
            print("")
            print("Running Validation...")
            model.eval()
            total_eval_loss = 0
            nb_eval_steps = 0

            # Evaluate data for one epoch
            for batch in validation_dataloader:       
                b_input_ids = batch[0].to(device)
                b_labels = batch[0].to(device)
                b_masks = batch[1].to(device)
                
                with T.no_grad():        
                    outputs  = model(b_input_ids, 
                                    attention_mask = b_masks,
                                    labels=b_labels)         
                    loss = outputs[0]  
                    
                batch_loss = loss.item()
                total_eval_loss += batch_loss        

            avg_val_loss = total_eval_loss / len(validation_dataloader)


            print("  Validation Loss: {0:.2f}".format(avg_val_loss))

            # Record all statistics from this epoch.
            training_stats.append(
                {
                    'epoch': epoch_i + 1,
                    'Training Loss': avg_train_loss,
                    'Valid. Loss': avg_val_loss,
                }
            )

        print("")
        print("Training complete!")

        output_dir = model_dir
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        print("Saving model to %s" % output_dir)

        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(output_dir)
        tokenizer.save_pretrained(output_dir)

    clariq_f_test_file = 'data/clariq_f/ClariQ-FKw-trial.tsv'
    clariq_f_test_data = pd.read_csv(clariq_f_test_file, sep='\t') 
    clariq_f_test_dict, clariq_f_test_data = process_clariq_f(clariq_f_test_data)

    rs = rouge.Rouge()
    rs_list = []
    b1, b2, b3, b4 = [], [], [], []
    m = []
    
    for iter, row in clariq_f_test_data.iterrows():
        query = clariq_f_test_data.at[iter, 'instructional_q_f']
        ref = clariq_f_test_data.at[iter, 'question']
        tokenized_input = T.tensor(tokenizer.encode(query)).unsqueeze(0).to(device)
        generated_text = ''
        generated_cq = ''
        
        sample_outputs = model.generate(
                tokenized_input,
                do_sample=True,   
                top_k=20, 
                max_length = len(tokenized_input[0]) + 32,
                top_p=0.9, 
                temperature = temperature,
                num_return_sequences=1,
                pad_token_id=tokenizer.eos_token_id
            )

        generated_text = tokenizer.decode(sample_outputs[0], skip_special_tokens=True)
        generated_cq = generated_text[len(query):].strip()
        generated_cq = re.sub('\[SEP\]', ' ', generated_cq).strip()
        generated_cq = re.sub('[.?]', '&', generated_cq).split('&')[0].strip()

        tokenized_ref = word_tokenize(ref)
        clariq_f_test_data.at[iter, 'generated'] = generated_cq
        tokenized_hyp = word_tokenize(clariq_f_test_data.at[iter, 'generated'])

        if iter % 50 == 0: 
            print(iter, query, '-', generated_cq)

        rs_list.append(rs.get_scores(generated_cq, ref)[0]['rouge-l']['f'])
        
        b1.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(1, 0, 0, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b2.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 1, 0, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b3.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 0, 1, 0),
                                smoothing_function = SmoothingFunction().method1
                                ))
        b4.append(sentence_bleu([tokenized_ref], 
                                tokenized_hyp, 
                                weights=(0, 0, 0, 1),
                                smoothing_function = SmoothingFunction().method1
                                ))

        m.append(meteor_score([tokenized_ref], tokenized_hyp))

    output_df = clariq_f_test_data[['initial_request', 'facet_desc', 'question', 'generated']]
    output_df.columns = ['query', 'facet', 'reference', 'candidate']
    output_df.to_csv(model_output)
    
print("b1", np.mean(b1), "b2", np.mean(b2), "b3", np.mean(b3), "b4", np.mean(b4))
print("rouge-L", np.mean(rs_list))
print("m", np.mean(m))
print("c", np.mean(c))

Output file not found, generating output.
0 tell me about uss yorktown charleston SC Ask a question that contains words in the list ['aircrafts']. - do you want to know about aircrafts
50 What are specific dangers of asbestos? Ask a question that contains words in the list ['exposure', 'asnestos']. - are you interested in the sun exposure the sun exposure information the sun exposure
b1 0.28625975382135704 b2 0.10700706063394176 b3 0.050937711173140464 b4 0.02941782018847191
rouge-L 0.37583581111308995
m 0.35637739326231804
c nan


## Concatenate them

In [17]:
zerogpt2_output = 'zeroshot_trial.csv'
gpt3_output = 'zeroshotgpt3_trial.csv'
ftgpt2_output = 'ftgpt2_trial.csv'

zerogpt2_output_data = pd.read_csv(zerogpt2_output)
gpt3_output_data = pd.read_csv(gpt3_output)
ftgpt2_output_data = pd.read_csv(ftgpt2_output)

assert len(gpt3_output_data) == len(zerogpt2_output_data) and len(gpt3_output_data) == len(ftgpt2_output_data)

trial_data = gpt3_output_data[['query', 'facet', 'reference']]
zerogpt2_candidates = zerogpt2_output_data['candidate'].tolist()
gpt3_candidates = gpt3_output_data['candidate'].tolist()
ftgpt2_candidates = ftgpt2_output_data['candidate'].tolist()

index_to_model = {
    0: "zeroshotgpt2",
    1: "oneshotgpt3",
    2: "finetunegpt2"
}

index_to_df = {
    0: zerogpt2_candidates,
    1: gpt3_candidates,
    2: ftgpt2_candidates
}

n_models = len(index_to_model.keys())
indices = []

for k in range(len(zerogpt2_candidates)):
    perm = np.random.permutation(n_models)
    indices.append(perm.tolist())
    for j in range(n_models):
        trial_data.at[k, 'generation'+str(j)] = index_to_df[perm[j]][k]

np.savetxt('trial_indices', np.array(indices), fmt='%d', delimiter=' ')
trial_data.to_csv("trial_data.csv")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trial_data.at[k, 'generation'+str(j)] = index_to_df[perm[j]][k]


# Comparing our generation with SIGIR'22 paper

## processing their data and generate files for neurologic decoding

In [17]:
original_output = pd.read_csv('srqg-gen_results.txt', sep='\t', header=None)
original_output.columns = ['initial_request', 'facet_desc', 'srqg-gen']
for iter, row in original_output.iterrows():
    processed_facets = re.sub('[\[\]\']','', original_output.at[iter, 'facet_desc'] )
    processed_facets = re.sub('<unknown>', '', processed_facets) # removing empty facet
    original_output.at[iter, 'facet_desc'] = ','.join([facet.strip() for facet in processed_facets.split(',') if facet.strip() != '']).strip()
original_output.to_csv('data/srqg/test.tsv', sep = '\t')

######################################################

from word_forms.word_forms import get_word_forms
facet_test_file = 'data/srqg/test.tsv'
facet_test_data = pd.read_csv(facet_test_file, sep='\t')

write_to_file = 'neurologic_decoding/dataset/clean/constraint/test.constraint.json'
prompt_write_to_file = 'neurologic_decoding/dataset/clean/init/commongen.test.init.txt'
no_prompt_write_to_file = 'neurologic_decoding/dataset/clean/init/commongen_no_prompt.test.init.txt'

pos_tagger = spacy.load('en_core_web_sm')

all_queries= []
all_constraints = []
for iter, row in facet_test_data.iterrows():
    facet = facet_test_data.at[iter, 'facet_desc']
    query = facet_test_data.at[iter, 'initial_request']
    noun_in_query = [token.text for token in pos_tagger(query) if token.pos_ == 'NOUN']
    propn_in_query = [token.text for token in pos_tagger(query) if token.pos_ == 'PROPN']

    for term in facet.split(','):
        if term.strip() != '':
            all_queries.append(query)
            all_constraints.append([[term.strip()]])


with open(write_to_file, 'w') as output:
    for constraints in all_constraints:
        for k, prompt in enumerate(starting_texts):
            json_str = json.dumps(constraints)
            output.write(json_str)
            output.write('\n')

with open(prompt_write_to_file, 'w') as output:
    for query in all_queries:
        for k, prompt in enumerate(starting_texts):
            output.write(query + prompt)
            output.write('\n')

with open(no_prompt_write_to_file, 'w') as output:
    for query in all_queries:
        output.write(query)
        output.write('\n')

In [23]:
from word_forms.word_forms import get_word_forms
facet_test_file = 'data/srqg/test.tsv'
facet_test_data = pd.read_csv(facet_test_file, sep='\t')

model_output = 'zeroshot_srqg_new.csv'

generated_file = 'neurologic_decoding/zero_shot/gpt2newsrqg'
generated_cq_all_templates = open(generated_file, 'r').readlines()
generated_cq_grouped = [[generated_cq_all_templates[len(starting_texts) * k + l] 
                            for l in range(len(starting_texts))] 
                            for k in range(int(len(generated_cq_all_templates)/8))]
for iter, row in facet_test_data.iterrows():
    facet = facet_test_data.at[iter, 'facet_desc']
    force_flexible = facet.split()

    generated_cqs = []
    for full_sentence in generated_cq_grouped[iter]:
        query = re.sub('\[SEP\]', '&', full_sentence).split('&')[0].strip()
        generated_follow_up = re.sub('\[SEP\]', '&', full_sentence).split('&')[1].strip()
        generated_cq = re.sub('[.?]', '&', generated_follow_up).split('&')[0].strip()
        generated_cqs.append(generated_cq)
    
    template_scores = calculate_WSDM(query=facet, doc_list=generated_cqs)
    
    sorted_template_scores = sorted(template_scores.items(), key = lambda x: x[1], reverse=True)

    generated_cq = sorted(template_scores.keys(), key = lambda x: template_scores[x], reverse=True)[0] 

    facet_test_data.at[iter, 'generated'] = generated_cq
    tokenized_hyp = word_tokenize(facet_test_data.at[iter, 'generated'])
    
    
    if iter % 50 == 0: 
        print(iter, query, "-", facet, '-', generated_cq)
        pprint.pprint(sorted_template_scores)
   

output_df = facet_test_data[['initial_request', 'facet_desc', 'generated']]
output_df.columns = ['query', 'facet', 'candidate']
output_df.to_csv(model_output)

0 google chrome exe - 64 bit,32 bit - are you looking for 64 bit or 32 bit
[('are you looking for 64 bit or 32 bit', 0.1678720668595307),
 ('are you interested in the 64 bit version', 0.14265606888557708),
 ('would you like to see the 64 bit version', 0.13846030215364835),
 ('do you need information about the 64 bit version', 0.13846030215364835),
 ('do you want information about the 64 bit version', 0.13846030215364835),
 ('do you need to install 64 bit on your computer', 0.13450429352068696),
 ('do you want to know the 64 bit version of Chrome', 0.13076806314511233),
 ('do you want to install 64 bit Chrome on your computer', 0.13076806314511233)]
50 romans 9 - female,male - are you looking for
[('are you looking for', 0.0),
 ('do you want to know what nlt is', 0.0),
 ('would you like to see the nltv', 0.0),
 ('are you interested in joining nltv', 0.0),
 ('do you need information about the nltv', 0.0),
 ('do you want information about the nltv', 0.0),
 ('do you need to use nltools to 

# Compute trial set agreement

In [26]:
from sklearn.metrics import cohen_kappa_score
from collections import Counter

zd_df = pd.read_csv("trial_data_zd.csv")
yc_df = pd.read_csv("trial_data_yc.csv")
mi_df = pd.read_csv("trial_data_mi.csv")

zerogpt2_df = pd.read_csv("zeroshot_trial.csv")
gpt3_df = pd.read_csv("zeroshotgpt3_trial.csv")
ftgpt2_df = pd.read_csv("ftgpt2_trial.csv")

labeler_df = mi_df
for iter, row in labeler_df.iterrows():
    row = [0,1,2]
    for k in range(3):
        if labeler_df.at[iter, 'generation'+str(k)] == zerogpt2_df.at[iter,'candidate']:
            zerogpt2_df.at[iter, 'mi_naturalness'] = labeler_df.at[iter, 'Naturalness'+str(k)].strip()
            zerogpt2_df.at[iter, 'mi_usefulness'] = labeler_df.at[iter, 'Usefulness'+str(k)].strip()
            row.remove(k)
        if labeler_df.at[iter, 'generation'+str(k)] == gpt3_df.at[iter,'candidate']:
            gpt3_df.at[iter, 'mi_naturalness'] = labeler_df.at[iter, 'Naturalness'+str(k)].strip()
            gpt3_df.at[iter, 'mi_usefulness'] = labeler_df.at[iter, 'Usefulness'+str(k)].strip()
            row.remove(k)
        if labeler_df.at[iter, 'generation'+str(k)] == ftgpt2_df.at[iter,'candidate']:
            ftgpt2_df.at[iter, 'mi_naturalness'] = labeler_df.at[iter, 'Naturalness'+str(k)].strip()
            ftgpt2_df.at[iter, 'mi_usefulness'] = labeler_df.at[iter, 'Usefulness'+str(k)].strip()
            row.remove(k)
    
    assert len(row) == 0

labeler_df = yc_df
for iter, row in labeler_df.iterrows():
    row = [0,1,2]
    for k in range(3):
        if labeler_df.at[iter, 'generation'+str(k)] == zerogpt2_df.at[iter,'candidate']:
            zerogpt2_df.at[iter, 'yc_naturalness'] = labeler_df.at[iter, 'Naturalness'+str(k)].strip()
            zerogpt2_df.at[iter, 'yc_usefulness'] = labeler_df.at[iter, 'Usefulness'+str(k)].strip()
            row.remove(k)
        if labeler_df.at[iter, 'generation'+str(k)] == gpt3_df.at[iter,'candidate']:
            gpt3_df.at[iter, 'yc_naturalness'] = labeler_df.at[iter, 'Naturalness'+str(k)].strip()
            gpt3_df.at[iter, 'yc_usefulness'] = labeler_df.at[iter, 'Usefulness'+str(k)].strip()
            row.remove(k)
        if labeler_df.at[iter, 'generation'+str(k)] == ftgpt2_df.at[iter,'candidate']:
            ftgpt2_df.at[iter, 'yc_naturalness'] = labeler_df.at[iter, 'Naturalness'+str(k)].strip()
            ftgpt2_df.at[iter, 'yc_usefulness'] = labeler_df.at[iter, 'Usefulness'+str(k)].strip()
            row.remove(k)
    
    assert len(row) == 0

labeler_df = zd_df
for iter, row in labeler_df.iterrows():
    row = [0,1,2]
    for k in range(3):
        if labeler_df.at[iter, 'generation'+str(k)] == zerogpt2_df.at[iter,'candidate']:
            zerogpt2_df.at[iter, 'zd_naturalness'] = labeler_df.at[iter, 'Naturalness'+str(k)].strip()
            zerogpt2_df.at[iter, 'zd_usefulness'] = labeler_df.at[iter, 'Usefulness'+str(k)].strip()
            row.remove(k)
        if labeler_df.at[iter, 'generation'+str(k)] == gpt3_df.at[iter,'candidate']:
            gpt3_df.at[iter, 'zd_naturalness'] = labeler_df.at[iter, 'Naturalness'+str(k)].strip()
            gpt3_df.at[iter, 'zd_usefulness'] = labeler_df.at[iter, 'Usefulness'+str(k)].strip()
            row.remove(k)
        if labeler_df.at[iter, 'generation'+str(k)] == ftgpt2_df.at[iter,'candidate']:
            ftgpt2_df.at[iter, 'zd_naturalness'] = labeler_df.at[iter, 'Naturalness'+str(k)].strip()
            ftgpt2_df.at[iter, 'zd_usefulness'] = labeler_df.at[iter, 'Usefulness'+str(k)].strip()
            row.remove(k)
    
    assert len(row) == 0

zerogpt2_df.to_csv("zeroshot_trial_labeled.csv")
gpt3_df.to_csv("fewshotgpt3_trial_labeled.csv")
ftgpt2_df.to_csv("ftgpt2_trial_labeled.csv")

zerogpt2_zipped_naturalness = zip(zerogpt2_df['mi_naturalness'].tolist(), zerogpt2_df['zd_naturalness'].tolist(), zerogpt2_df['yc_naturalness'].tolist())
zerogpt2_zipped_usefulness = zip(zerogpt2_df['mi_usefulness'].tolist(), zerogpt2_df['zd_usefulness'].tolist(), zerogpt2_df['yc_usefulness'].tolist())
zerogpt2_naturalness_major_vote = [Counter(list(labels)).most_common()[0][0] if Counter(list(labels)).most_common()[0][1] > 1 else 'fair' for labels in zerogpt2_zipped_naturalness]
zerogpt2_usefulness_major_vote = [Counter(list(labels)).most_common()[0][0] if Counter(list(labels)).most_common()[0][1] > 1 else 'fair' for labels in zerogpt2_zipped_usefulness]

print("Zerogpt2")
print("By major vote")
print("Naturalness", Counter(zerogpt2_naturalness_major_vote))
print("Usefulness", Counter(zerogpt2_usefulness_major_vote))
print("By all vote")
print("Naturalness", Counter(zerogpt2_df['mi_naturalness'].tolist() + zerogpt2_df['zd_naturalness'].tolist() + zerogpt2_df['yc_naturalness'].tolist()))
print("Usefulness", Counter(zerogpt2_df['mi_usefulness'].tolist() + zerogpt2_df['zd_usefulness'].tolist() + zerogpt2_df['yc_usefulness'].tolist()))
print("Agreements")
print("Naturalness")
print("mi_yc", cohen_kappa_score(zerogpt2_df['mi_naturalness'].tolist(), zerogpt2_df['yc_naturalness'].tolist()))
print("mi_zd", cohen_kappa_score(zerogpt2_df['mi_naturalness'].tolist(), zerogpt2_df['zd_naturalness'].tolist()))
print("zd_yc", cohen_kappa_score(zerogpt2_df['zd_naturalness'].tolist(), zerogpt2_df['yc_naturalness'].tolist()))
print("mi_mv", cohen_kappa_score(zerogpt2_df['mi_naturalness'].tolist(), zerogpt2_naturalness_major_vote))
print("zd_mv", cohen_kappa_score(zerogpt2_df['zd_naturalness'].tolist(), zerogpt2_naturalness_major_vote))
print("yc_mv", cohen_kappa_score(zerogpt2_df['yc_naturalness'].tolist(), zerogpt2_naturalness_major_vote))

print("Usefulness")
print("mi_yc", cohen_kappa_score(zerogpt2_df['mi_usefulness'].tolist(), zerogpt2_df['yc_usefulness'].tolist()))
print("mi_zd", cohen_kappa_score(zerogpt2_df['mi_usefulness'].tolist(), zerogpt2_df['zd_usefulness'].tolist()))
print("zd_yc", cohen_kappa_score(zerogpt2_df['zd_usefulness'].tolist(), zerogpt2_df['yc_usefulness'].tolist()))
print("mi_mv", cohen_kappa_score(zerogpt2_df['mi_usefulness'].tolist(), zerogpt2_usefulness_major_vote))
print("zd_mv", cohen_kappa_score(zerogpt2_df['zd_usefulness'].tolist(), zerogpt2_usefulness_major_vote))
print("yc_mv", cohen_kappa_score(zerogpt2_df['yc_usefulness'].tolist(), zerogpt2_usefulness_major_vote))


gpt3_zipped_naturalness = zip(gpt3_df['mi_naturalness'].tolist(), gpt3_df['zd_naturalness'].tolist(), gpt3_df['yc_naturalness'].tolist())
gpt3_zipped_usefulness = zip(gpt3_df['mi_usefulness'].tolist(), gpt3_df['zd_usefulness'].tolist(), gpt3_df['yc_usefulness'].tolist())
gpt3_naturalness_major_vote = [Counter(list(labels)).most_common()[0][0] if Counter(list(labels)).most_common()[0][1] > 1 else 'fair' for labels in gpt3_zipped_naturalness]
gpt3_usefulness_major_vote = [Counter(list(labels)).most_common()[0][0] if Counter(list(labels)).most_common()[0][1] > 1 else 'fair' for labels in gpt3_zipped_usefulness]

print("gpt3")
print("By major vote")
print("Naturalness", Counter(gpt3_naturalness_major_vote))
print("Usefulness", Counter(gpt3_usefulness_major_vote))
print("By all vote")
print("Naturalness", Counter(gpt3_df['mi_naturalness'].tolist() + gpt3_df['zd_naturalness'].tolist() + gpt3_df['yc_naturalness'].tolist()))
print("Usefulness", Counter(gpt3_df['mi_usefulness'].tolist() + gpt3_df['zd_usefulness'].tolist() + gpt3_df['yc_usefulness'].tolist()))
print("Agreements")
print("Naturalness")
print("mi_yc", cohen_kappa_score(gpt3_df['mi_naturalness'].tolist(), gpt3_df['yc_naturalness'].tolist()))
print("mi_zd", cohen_kappa_score(gpt3_df['mi_naturalness'].tolist(), gpt3_df['zd_naturalness'].tolist()))
print("zd_yc", cohen_kappa_score(gpt3_df['zd_naturalness'].tolist(), gpt3_df['yc_naturalness'].tolist()))
print("mi_mv", cohen_kappa_score(gpt3_df['mi_naturalness'].tolist(), gpt3_naturalness_major_vote))
print("zd_mv", cohen_kappa_score(gpt3_df['zd_naturalness'].tolist(), gpt3_naturalness_major_vote))
print("yc_mv", cohen_kappa_score(gpt3_df['yc_naturalness'].tolist(), gpt3_naturalness_major_vote))

print("Usefulness")
print("mi_yc", cohen_kappa_score(gpt3_df['mi_usefulness'].tolist(), gpt3_df['yc_usefulness'].tolist()))
print("mi_zd", cohen_kappa_score(gpt3_df['mi_usefulness'].tolist(), gpt3_df['zd_usefulness'].tolist()))
print("zd_yc", cohen_kappa_score(gpt3_df['zd_usefulness'].tolist(), gpt3_df['yc_usefulness'].tolist()))

ftgpt2_zipped_naturalness = zip(ftgpt2_df['mi_naturalness'].tolist(), ftgpt2_df['zd_naturalness'].tolist(), ftgpt2_df['yc_naturalness'].tolist())
ftgpt2_zipped_usefulness = zip(ftgpt2_df['mi_usefulness'].tolist(), ftgpt2_df['zd_usefulness'].tolist(), ftgpt2_df['yc_usefulness'].tolist())
ftgpt2_naturalness_major_vote = [Counter(list(labels)).most_common()[0][0] if Counter(list(labels)).most_common()[0][1] > 1 else 'fair' for labels in ftgpt2_zipped_naturalness]
ftgpt2_usefulness_major_vote = [Counter(list(labels)).most_common()[0][0] if Counter(list(labels)).most_common()[0][1] > 1 else 'fair' for labels in ftgpt2_zipped_usefulness]

print("Finetune gpt2")
print("By major vote")
print("Naturalness", Counter(ftgpt2_naturalness_major_vote))
print("Usefulness", Counter(ftgpt2_usefulness_major_vote))
print("By all vote")
print("Naturalness", Counter(ftgpt2_df['mi_naturalness'].tolist() + ftgpt2_df['zd_naturalness'].tolist() + ftgpt2_df['yc_naturalness'].tolist()))
print("Usefulness", Counter(ftgpt2_df['mi_usefulness'].tolist() + ftgpt2_df['zd_usefulness'].tolist() + ftgpt2_df['yc_usefulness'].tolist()))
print("Agreements")
print("Naturalness")
print("mi_yc", cohen_kappa_score(ftgpt2_df['mi_naturalness'].tolist(), ftgpt2_df['yc_naturalness'].tolist()))
print("mi_zd", cohen_kappa_score(ftgpt2_df['mi_naturalness'].tolist(), ftgpt2_df['zd_naturalness'].tolist()))
print("zd_yc", cohen_kappa_score(ftgpt2_df['zd_naturalness'].tolist(), ftgpt2_df['yc_naturalness'].tolist()))

print("Usefulness")
print("mi_yc", cohen_kappa_score(ftgpt2_df['mi_usefulness'].tolist(), ftgpt2_df['yc_usefulness'].tolist()))
print("mi_zd", cohen_kappa_score(ftgpt2_df['mi_usefulness'].tolist(), ftgpt2_df['zd_usefulness'].tolist()))
print("zd_yc", cohen_kappa_score(ftgpt2_df['zd_usefulness'].tolist(), ftgpt2_df['yc_usefulness'].tolist()))

Zerogpt2
By major vote
Naturalness Counter({'fair': 43, 'bad': 34, 'good': 22})
Usefulness Counter({'good': 55, 'bad': 25, 'fair': 19})
By all vote
Naturalness Counter({'bad': 106, 'fair': 105, 'good': 86})
Usefulness Counter({'good': 154, 'bad': 78, 'fair': 65})
Agreements
Naturalness
mi_yc 0.40839754678408546
mi_zd 0.16238681905892838
zd_yc 0.1643218908272368
mi_mv 0.7795801526717557
zd_mv 0.3622471910112359
yc_mv 0.5996266915538964
Usefulness
mi_yc 0.10483187341056799
mi_zd 0.3437317215831547
zd_yc 0.2172767203513909
mi_mv 0.4609101516919487
zd_mv 0.8117219917012448
yc_mv 0.37598944591029027
gpt3
By major vote
Naturalness Counter({'good': 90, 'bad': 6, 'fair': 3})
Usefulness Counter({'good': 76, 'bad': 22, 'fair': 1})
By all vote
Naturalness Counter({'good': 260, 'bad': 21, 'fair': 16})
Usefulness Counter({'good': 218, 'bad': 65, 'fair': 14})
Agreements
Naturalness
mi_yc 0.4698795180722891
mi_zd 0.6333333333333333
zd_yc 0.425189816882537
mi_mv 0.8450704225352113
zd_mv 0.740973312401

# Generate human annotation file for RQ2

In [30]:
taf_output_data = pd.read_csv('template_facet.csv')
sekulic_output_data = pd.read_csv('sekulic.csv')
ftgpt2_output_data = pd.read_csv('ftgpt2_prompt_temp0.1.csv')
zerogpt2_output_data = pd.read_csv('zeroshot_nd_wsdm.csv')

assert len(taf_output_data) == len(sekulic_output_data) and len(sekulic_output_data) == len(ftgpt2_output_data) and len(ftgpt2_output_data) == len(zerogpt2_output_data)

trial_data = zerogpt2_output_data[['query', 'facet', 'reference']]
taf_candidates = taf_output_data['candidate'].tolist()
sekulic_candidates = sekulic_output_data['candidate'].tolist()
ftgpt2_candidates = ftgpt2_output_data['candidate'].tolist()
zerogpt2_candidates = zerogpt2_output_data['candidate'].tolist()

index_to_model = {
    0: "taf",
    1: "sekulic",
    2: "prompt",
    3: "zeroshot"
}

index_to_df = {
    0: taf_candidates,
    1: sekulic_candidates,
    2: ftgpt2_candidates,
    3: zerogpt2_candidates
}

n_models = len(index_to_model.keys())
indices = []

for k in range(len(zerogpt2_candidates)):
    perm = np.random.permutation(n_models)
    indices.append(perm.tolist())
    for j in range(n_models):
        trial_data.at[k, 'generation'+str(j)] = index_to_df[perm[j]][k]

np.savetxt('trial_indices', np.array(indices), fmt='%d', delimiter=' ')
trial_data.to_csv("human.csv")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trial_data.at[k, 'generation'+str(j)] = index_to_df[perm[j]][k]


In [14]:
from transformers import pipeline, set_seed
generator = pipeline('text-generation', model='gpt2')
set_seed(42)
generator("Hello, I'm a language model,", max_length=30, num_return_sequences=1)


batch_size = 32
sample_every = 100
epochs = 8
learning_rate = 5e-5
warmup_steps = 1e2
epsilon = 1e-8
max_length = 128
prompt_instruction = ''

facet_test_file = 'data/clariq_f/ClariQ-FKw-dev.tsv'
facet_test_data = pd.read_csv(facet_test_file, sep='\t')
_, facet_test_data = process_clariq_f(facet_test_data)

b1, b2, b3, b4 = [], [], [], []
m = []
r = []
c = []

t_b1, t_b2, t_b3, t_b4 = [], [], [], []
t_m = []
t_r = []
t_c = []

model_output = 'gpt2.csv'

if os.path.isfile(model_output):
    b1, b2, b3, b4, m, r, c, t_b1, t_b2, t_b3, t_b4, t_m, t_r, t_c = evaluate_from_output(model_output)

else:
    clariq_f_train_file = 'data/clariq_f/ClariQ-FKw-train.tsv'
    clariq_f_train_data = pd.read_csv(clariq_f_train_file, sep='\t') 
    clariq_f_train_dict, clariq_f_train_data = process_clariq_f(clariq_f_train_data)
    clariq_f_train_text_list = clariq_f_train_data['f_q_cq']

    clariq_f_test_file = 'data/clariq_f/ClariQ-FKw-dev.tsv'
    clariq_f_test_data = pd.read_csv(clariq_f_test_file, sep='\t') 
    clariq_f_test_dict, clariq_f_test_data = process_clariq_f(clariq_f_test_data)

    rs = rouge.Rouge()
    rs_list = []
    b1, b2, b3, b4 = [], [], [], []
    m = []
    
    for iter, row in clariq_f_test_data.iterrows():
        query = clariq_f_test_data.at[iter, 'initial_request'] + BOS
        ref = clariq_f_test_data.at[iter, 'question']
        facet = clariq_f_test_data.at[iter, 'facet_desc']

        sample_outputs = generator(query, max_length=len(query) + 30, num_return_sequences=1)
        
        generated_text = sample_outputs[0]['generated_text']
        generated_cq = generated_text[len(query):]
        generated_cq = re.sub('\[SEP\]', ' ', generated_cq).strip()
        generated_cq = re.sub('[.?\n]', '&', generated_cq).split('&')[0].strip()

        clariq_f_test_data.at[iter, 'generated'] = process_generation(generated_cq)


        # full reference evaluation
        hyp_b1, hyp_b2, hyp_b3, hyp_b4, hyp_m, hyp_r, hyp_c = auto_evaluation(ref, generated_cq, facet)

        b1.append(hyp_b1)
        b2.append(hyp_b2)
        b3.append(hyp_b3)
        b4.append(hyp_b4)
        m.append(hyp_m)
        r.append(hyp_r)
        c.append(hyp_c)

        # question body evaluation
        truncate_ref = ' '.join(ref.split()[template_len:])
        truncate_generated_cq = ' '.join(generated_cq.split()[template_len:])
        
        t_hyp_b1, t_hyp_b2, t_hyp_b3, t_hyp_b4, t_hyp_m, t_hyp_r, t_hyp_c = auto_evaluation(truncate_ref, truncate_generated_cq, facet)

        t_b1.append(t_hyp_b1)
        t_b2.append(t_hyp_b2)
        t_b3.append(t_hyp_b3)
        t_b4.append(t_hyp_b4)
        t_m.append(t_hyp_m)
        t_r.append(t_hyp_r)
        t_c.append(t_hyp_c)

    output_df = clariq_f_test_data[['initial_request', 'facet_desc', 'question', 'generated']]
    output_df.columns = ['query', 'facet', 'reference', 'candidate']
    output_df.to_csv(model_output)

# full reference results
print("================================================================")
print("Full reference evaluation")
print("================================================================")
print("b1", np.mean(b1), "b2", np.mean(b2), "b3", np.mean(b3), "b4", np.mean(b4))
print("rouge-L", np.mean(r))
print("m", np.mean(m))
print("c", np.mean(c))

sekulic0_b1 = np.mean(b1)
sekulic0_b2 = np.mean(b2)
sekulic0_b3 = np.mean(b3)
sekulic0_b4 = np.mean(b4)
sekulic0_m = np.mean(m)
sekulic0_r = np.mean(r)
sekulic0_c = np.mean(c)

# question body results
print("================================================================")
print("Question body evaluation")
print("================================================================")
print("b1", np.mean(t_b1), "b2", np.mean(t_b2), "b3", np.mean(t_b3), "b4", np.mean(t_b4))
print("rouge-L", np.mean(t_r))
print("m", np.mean(t_m))
print("c", np.mean(t_c))

t_sekulic0_b1 = np.mean(t_b1)
t_sekulic0_b2 = np.mean(t_b2)
t_sekulic0_b3 = np.mean(t_b3)
t_sekulic0_b4 = np.mean(t_b4)
t_sekulic0_m = np.mean(t_m)
t_sekulic0_r = np.mean(t_r)
t_sekulic0_c = np.mean(t_c)



Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

Full reference evaluation
b1 0.04311297961020846 b2 0.006631606205278626 b3 0.004148754940587422 b4 0.00432941193005007
rouge-L 0.052052737118538465
m 0.055976241870738526
c 0.013333333333333334
Question body evaluation
b1 0.023189283411068365 b2 0.0040683780339581446 b3 0.0034799485175119427 b4 0.00405636676922024
rouge-L 0.03308281607841434
m 0.03714642349666363
c 0.009215686274509804


# Multiple template

In [13]:
from transformers import pipeline, set_seed
generator = pipeline('text-generation', model='gpt2')
set_seed(42)


batch_size = 32
sample_every = 100
epochs = 8
learning_rate = 5e-5
warmup_steps = 1e2
epsilon = 1e-8
max_length = 128
prompt_instruction = ''

facet_test_file = 'data/clariq_f/ClariQ-FKw-dev.tsv'
facet_test_data = pd.read_csv(facet_test_file, sep='\t')
_, facet_test_data = process_clariq_f(facet_test_data)

b1, b2, b3, b4 = [], [], [], []
m = []
r = []
c = []

t_b1, t_b2, t_b3, t_b4 = [], [], [], []
t_m = []
t_r = []
t_c = []

model_output = 'template0.csv'

if os.path.isfile(model_output):
    b1, b2, b3, b4, m, r, c, t_b1, t_b2, t_b3, t_b4, t_m, t_r, t_c = evaluate_from_output(model_output)

else:
    clariq_f_train_file = 'data/clariq_f/ClariQ-FKw-train.tsv'
    clariq_f_train_data = pd.read_csv(clariq_f_train_file, sep='\t') 
    clariq_f_train_dict, clariq_f_train_data = process_clariq_f(clariq_f_train_data)
    clariq_f_train_text_list = clariq_f_train_data['f_q_cq']

    clariq_f_test_file = 'data/clariq_f/ClariQ-FKw-dev.tsv'
    clariq_f_test_data = pd.read_csv(clariq_f_test_file, sep='\t') 
    clariq_f_test_dict, clariq_f_test_data = process_clariq_f(clariq_f_test_data)

    rs = rouge.Rouge()
    rs_list = []
    b1, b2, b3, b4 = [], [], [], []
    m = []
    
    for iter, row in clariq_f_test_data.iterrows():
        query = clariq_f_test_data.at[iter, 'initial_request']
        ref = clariq_f_test_data.at[iter, 'question']
        facet = clariq_f_test_data.at[iter, 'facet_desc']
    
        noun_in_query = [token.text for token in pos_tagger(query) if token.pos_ == 'NOUN']
        propn_in_query = [token.text.lower() for token in pos_tagger(query) if token.pos_ == 'PROPN']

        generated_cqs = []
        for st in starting_texts:
            sample_outputs = generator(query + ' ' + st, max_length=64, num_return_sequences=1)
        
            generated_text = sample_outputs[0]['generated_text']
            generated_cq = generated_text[len(query):]
            generated_cq = re.sub('\[SEP\]', ' ', generated_cq).strip()
            generated_cq = re.sub('[.?\n]', '&', generated_cq).split('&')[0].strip()
            generated_cqs.append(generated_cq)

        template_scores = calculate_WSDM(query=' '.join(noun_in_query+propn_in_query), doc_list=generated_cqs)
        generated_cq = sorted(template_scores.keys(), key = lambda x: template_scores[x], reverse=True)[0] 
        #print(generated_cq)
        clariq_f_test_data.at[iter, 'generated'] = process_generation(generated_cq)


        # full reference evaluation
        hyp_b1, hyp_b2, hyp_b3, hyp_b4, hyp_m, hyp_r, hyp_c = auto_evaluation(ref, generated_cq, facet)

        b1.append(hyp_b1)
        b2.append(hyp_b2)
        b3.append(hyp_b3)
        b4.append(hyp_b4)
        m.append(hyp_m)
        r.append(hyp_r)
        c.append(hyp_c)

        # question body evaluation
        truncate_ref = ' '.join(ref.split()[template_len:])
        truncate_generated_cq = ' '.join(generated_cq.split()[template_len:])
        
        t_hyp_b1, t_hyp_b2, t_hyp_b3, t_hyp_b4, t_hyp_m, t_hyp_r, t_hyp_c = auto_evaluation(truncate_ref, truncate_generated_cq, facet)

        t_b1.append(t_hyp_b1)
        t_b2.append(t_hyp_b2)
        t_b3.append(t_hyp_b3)
        t_b4.append(t_hyp_b4)
        t_m.append(t_hyp_m)
        t_r.append(t_hyp_r)
        t_c.append(t_hyp_c)

    output_df = clariq_f_test_data[['initial_request', 'facet_desc', 'question', 'generated']]
    output_df.columns = ['query', 'facet', 'reference', 'candidate']
    output_df.to_csv(model_output)

# full reference results
print("================================================================")
print("Full reference evaluation")
print("================================================================")
print("b1", np.mean(b1), "b2", np.mean(b2), "b3", np.mean(b3), "b4", np.mean(b4))
print("rouge-L", np.mean(r))
print("m", np.mean(m))
print("c", np.mean(c))

sekulic0_b1 = np.mean(b1)
sekulic0_b2 = np.mean(b2)
sekulic0_b3 = np.mean(b3)
sekulic0_b4 = np.mean(b4)
sekulic0_m = np.mean(m)
sekulic0_r = np.mean(r)
sekulic0_c = np.mean(c)

# question body results
print("================================================================")
print("Question body evaluation")
print("================================================================")
print("b1", np.mean(t_b1), "b2", np.mean(t_b2), "b3", np.mean(t_b3), "b4", np.mean(t_b4))
print("rouge-L", np.mean(t_r))
print("m", np.mean(t_m))
print("c", np.mean(t_c))

t_sekulic0_b1 = np.mean(t_b1)
t_sekulic0_b2 = np.mean(t_b2)
t_sekulic0_b3 = np.mean(t_b3)
t_sekulic0_b4 = np.mean(t_b4)
t_sekulic0_m = np.mean(t_m)
t_sekulic0_r = np.mean(t_r)
t_sekulic0_c = np.mean(t_c)



Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

Full reference evaluation
b1 0.21265806579183102 b2 0.09247087817195276 b3 0.05440921170863705 b4 0.03858218109211079
rouge-L 0.2758348540127664
m 0.24035101874009449
c 0.06858823529411766
Question body evaluation
b1 0.07888041389072298 b2 0.025424614734841443 b3 0.012360211398353017 b4 0.011564095956348113
rouge-L 0.10652104329201147
m 0.11132045940816086
c 0.06270588235294117


In [42]:
from sklearn.metrics import cohen_kappa_score
from collections import Counter
import pandas as pd

human1 = pd.read_csv("csv/Deshmukh.csv", encoding = "utf-8")
human2 = pd.read_csv("csv/Gloria.csv", encoding = "utf-8")
human3 = pd.read_csv("csv/Jyotsna.csv", encoding = "utf-8")
human4 = pd.read_csv("csv/Kranthi.csv", encoding = "utf-8")
human5 = pd.read_csv("csv/Mithila.csv", encoding = "utf-8")

zerogpt2_df = pd.read_csv("csv/zeroshot_nd_wsdm.csv")
rewriting_df = pd.read_csv("csv/template_facet.csv")
sekulic_df = pd.read_csv("csv/sekulic.csv")
ftgpt2_df = pd.read_csv("csv/ftgpt2_prompt_temp0.1.csv")

labeler_df = human1
labeler_name = 'human1'
for iter, irow in labeler_df.iterrows():
    row = [0,1,2,3]
    for k in range(4):
        if labeler_df.at[iter, 'generation'+str(k)] == zerogpt2_df.at[iter,'candidate']:
            zerogpt2_df.at[iter, labeler_name + '_naturalness'] = labeler_df.at[iter, 'Naturalness'+str(k)].strip().lower()
            zerogpt2_df.at[iter, labeler_name + '_usefulness'] = labeler_df.at[iter, 'Usefulness'+str(k)].strip().lower()
            try:
                row.remove(k)
            except:
                pass
        if labeler_df.at[iter, 'generation'+str(k)] == rewriting_df.at[iter,'candidate']:
            rewriting_df.at[iter, labeler_name + '_naturalness'] = labeler_df.at[iter, 'Naturalness'+str(k)].strip().lower()
            rewriting_df.at[iter, labeler_name + '_usefulness'] = labeler_df.at[iter, 'Usefulness'+str(k)].strip().lower()
            try:
                row.remove(k)
            except:
                pass
        if labeler_df.at[iter, 'generation'+str(k)] == ftgpt2_df.at[iter,'candidate']:
            ftgpt2_df.at[iter, labeler_name + '_naturalness'] = labeler_df.at[iter, 'Naturalness'+str(k)].strip().lower()
            ftgpt2_df.at[iter, labeler_name + '_usefulness'] = labeler_df.at[iter, 'Usefulness'+str(k)].strip().lower()
            try:
                row.remove(k)
            except:
                pass
        if labeler_df.at[iter, 'generation'+str(k)] == sekulic_df.at[iter,'candidate']:
            sekulic_df.at[iter, labeler_name + '_naturalness'] = labeler_df.at[iter, 'Naturalness'+str(k)].strip().lower()
            sekulic_df.at[iter, labeler_name + '_usefulness'] = labeler_df.at[iter, 'Usefulness'+str(k)].strip().lower()
            try:
                row.remove(k)
            except:
                pass
    
    try:
        assert len(row) == 0
    except:
        pass

labeler_df = human2
labeler_name = 'human2'
for iter, irow in labeler_df.iterrows():
    row = [0,1,2,3]
    for k in range(4):
        if labeler_df.at[iter, 'generation'+str(k)] == zerogpt2_df.at[iter,'candidate']:
            zerogpt2_df.at[iter, labeler_name + '_naturalness'] = labeler_df.at[iter, 'Naturalness'+str(k)].strip().lower()
            zerogpt2_df.at[iter, labeler_name + '_usefulness'] = labeler_df.at[iter, 'Usefulness'+str(k)].strip().lower()
            try:
                row.remove(k)
            except:
                pass
        if labeler_df.at[iter, 'generation'+str(k)] == rewriting_df.at[iter,'candidate']:
            rewriting_df.at[iter, labeler_name + '_naturalness'] = labeler_df.at[iter, 'Naturalness'+str(k)].strip().lower()
            rewriting_df.at[iter, labeler_name + '_usefulness'] = labeler_df.at[iter, 'Usefulness'+str(k)].strip().lower()
            try:
                row.remove(k)
            except:
                pass
        if labeler_df.at[iter, 'generation'+str(k)] == ftgpt2_df.at[iter,'candidate']:
            ftgpt2_df.at[iter, labeler_name + '_naturalness'] = labeler_df.at[iter, 'Naturalness'+str(k)].strip().lower()
            ftgpt2_df.at[iter, labeler_name + '_usefulness'] = labeler_df.at[iter, 'Usefulness'+str(k)].strip().lower()
            try:
                row.remove(k)
            except:
                pass
        if labeler_df.at[iter, 'generation'+str(k)] == sekulic_df.at[iter,'candidate']:
            sekulic_df.at[iter, labeler_name + '_naturalness'] = labeler_df.at[iter, 'Naturalness'+str(k)].strip().lower()
            sekulic_df.at[iter, labeler_name + '_usefulness'] = labeler_df.at[iter, 'Usefulness'+str(k)].strip().lower()
            try:
                row.remove(k)
            except:
                pass
    
    try:
        assert len(row) == 0
    except:
        pass


labeler_df = human3
labeler_name = 'human3'
for iter, irow in labeler_df.iterrows():
    row = [0,1,2,3]
    for k in range(4):
        if labeler_df.at[iter, 'generation'+str(k)] == zerogpt2_df.at[iter,'candidate']:
            zerogpt2_df.at[iter, labeler_name + '_naturalness'] = labeler_df.at[iter, 'Naturalness'+str(k)].strip().lower()
            zerogpt2_df.at[iter, labeler_name + '_usefulness'] = labeler_df.at[iter, 'Usefulness'+str(k)].strip().lower()
            try:
                row.remove(k)
            except:
                pass
        if labeler_df.at[iter, 'generation'+str(k)] == rewriting_df.at[iter,'candidate']:
            rewriting_df.at[iter, labeler_name + '_naturalness'] = labeler_df.at[iter, 'Naturalness'+str(k)].strip().lower()
            rewriting_df.at[iter, labeler_name + '_usefulness'] = labeler_df.at[iter, 'Usefulness'+str(k)].strip().lower()
            try:
                row.remove(k)
            except:
                pass
        if labeler_df.at[iter, 'generation'+str(k)] == ftgpt2_df.at[iter,'candidate']:
            ftgpt2_df.at[iter, labeler_name + '_naturalness'] = labeler_df.at[iter, 'Naturalness'+str(k)].strip().lower()
            ftgpt2_df.at[iter, labeler_name + '_usefulness'] = labeler_df.at[iter, 'Usefulness'+str(k)].strip().lower()
            try:
                row.remove(k)
            except:
                pass
        if labeler_df.at[iter, 'generation'+str(k)] == sekulic_df.at[iter,'candidate']:
            sekulic_df.at[iter, labeler_name + '_naturalness'] = labeler_df.at[iter, 'Naturalness'+str(k)].strip().lower()
            sekulic_df.at[iter, labeler_name + '_usefulness'] = labeler_df.at[iter, 'Usefulness'+str(k)].strip().lower()
            try:
                row.remove(k)
            except:
                pass
    
    try:
        assert len(row) == 0
    except:
        pass


labeler_df = human4
labeler_name = 'human4'
for iter, irow in labeler_df.iterrows():
    row = [0,1,2,3]
    for k in range(4):
        if labeler_df.at[iter, 'generation'+str(k)] == zerogpt2_df.at[iter,'candidate']:
            zerogpt2_df.at[iter, labeler_name + '_naturalness'] = labeler_df.at[iter, 'Naturalness'+str(k)].strip().lower()
            zerogpt2_df.at[iter, labeler_name + '_usefulness'] = labeler_df.at[iter, 'Usefulness'+str(k)].strip().lower()
            try:
                row.remove(k)
            except:
                pass
        if labeler_df.at[iter, 'generation'+str(k)] == rewriting_df.at[iter,'candidate']:
            rewriting_df.at[iter, labeler_name + '_naturalness'] = labeler_df.at[iter, 'Naturalness'+str(k)].strip().lower()
            rewriting_df.at[iter, labeler_name + '_usefulness'] = labeler_df.at[iter, 'Usefulness'+str(k)].strip().lower()
            try:
                row.remove(k)
            except:
                pass
        if labeler_df.at[iter, 'generation'+str(k)] == ftgpt2_df.at[iter,'candidate']:
            ftgpt2_df.at[iter, labeler_name + '_naturalness'] = labeler_df.at[iter, 'Naturalness'+str(k)].strip().lower()
            ftgpt2_df.at[iter, labeler_name + '_usefulness'] = labeler_df.at[iter, 'Usefulness'+str(k)].strip().lower()
            try:
                row.remove(k)
            except:
                pass
        if labeler_df.at[iter, 'generation'+str(k)] == sekulic_df.at[iter,'candidate']:
            sekulic_df.at[iter, labeler_name + '_naturalness'] = labeler_df.at[iter, 'Naturalness'+str(k)].strip().lower()
            sekulic_df.at[iter, labeler_name + '_usefulness'] = labeler_df.at[iter, 'Usefulness'+str(k)].strip().lower()
            try:
                row.remove(k)
            except:
                pass
    
    try:
        assert len(row) == 0
    except:
        pass


labeler_df = human5
labeler_name = 'human5'
for iter, irow in labeler_df.iterrows():
    row = [0,1,2,3]
    for k in range(4):
        if labeler_df.at[iter, 'generation'+str(k)] == zerogpt2_df.at[iter,'candidate']:
            zerogpt2_df.at[iter, labeler_name + '_naturalness'] = labeler_df.at[iter, 'Naturalness'+str(k)].strip().lower()
            zerogpt2_df.at[iter, labeler_name + '_usefulness'] = labeler_df.at[iter, 'Usefulness'+str(k)].strip().lower()
            try:
                row.remove(k)
            except:
                pass
        if labeler_df.at[iter, 'generation'+str(k)] == rewriting_df.at[iter,'candidate']:
            rewriting_df.at[iter, labeler_name + '_naturalness'] = labeler_df.at[iter, 'Naturalness'+str(k)].strip().lower()
            rewriting_df.at[iter, labeler_name + '_usefulness'] = labeler_df.at[iter, 'Usefulness'+str(k)].strip().lower()
            try:
                row.remove(k)
            except:
                pass
        if labeler_df.at[iter, 'generation'+str(k)] == ftgpt2_df.at[iter,'candidate']:
            ftgpt2_df.at[iter, labeler_name + '_naturalness'] = labeler_df.at[iter, 'Naturalness'+str(k)].strip().lower()
            ftgpt2_df.at[iter, labeler_name + '_usefulness'] = labeler_df.at[iter, 'Usefulness'+str(k)].strip().lower()
            try:
                row.remove(k)
            except:
                pass
        if labeler_df.at[iter, 'generation'+str(k)] == sekulic_df.at[iter,'candidate']:
            sekulic_df.at[iter, labeler_name + '_naturalness'] = labeler_df.at[iter, 'Naturalness'+str(k)].strip().lower()
            sekulic_df.at[iter, labeler_name + '_usefulness'] = labeler_df.at[iter, 'Usefulness'+str(k)].strip().lower()
            try:
                row.remove(k)
            except:
                pass    
    
    try:
        assert len(row) == 0
    except:
        pass


zerogpt2_df.to_csv("csv/zeroshot_labeled.csv")
rewriting_df.to_csv("csv/template_facet_labeled.csv")
sekulic_df.to_csv("csv/sekulic_labeled.csv")
ftgpt2_df.to_csv("csv/ftgpt2_prompt_labeled.csv")

zerogpt2_zipped_naturalness = zip(zerogpt2_df['human1_naturalness'].tolist(), zerogpt2_df['human2_naturalness'].tolist(), zerogpt2_df['human3_naturalness'].tolist(), zerogpt2_df['human4_naturalness'].tolist())
zerogpt2_zipped_usefulness = zip(zerogpt2_df['human1_usefulness'].tolist(), zerogpt2_df['human2_usefulness'].tolist(), zerogpt2_df['human3_usefulness'].tolist(), zerogpt2_df['human4_usefulness'].tolist())
zerogpt2_naturalness_major_vote = [Counter(list(labels)).most_common()[0][0] if Counter(list(labels)).most_common()[0][1] > 1 else 'fair' for labels in zerogpt2_zipped_naturalness]
zerogpt2_usefulness_major_vote = [Counter(list(labels)).most_common()[0][0] if Counter(list(labels)).most_common()[0][1] > 1 else 'fair' for labels in zerogpt2_zipped_usefulness]

print("Zerogpt2")
print("By major vote")
print("Naturalness", Counter(zerogpt2_naturalness_major_vote))
print("Usefulness", Counter(zerogpt2_usefulness_major_vote))
print("By all vote")
print("Naturalness", Counter(zerogpt2_df['human1_naturalness'].tolist() + zerogpt2_df['human2_naturalness'].tolist() + zerogpt2_df['human3_naturalness'].tolist() + zerogpt2_df['human4_naturalness'].tolist()))
print("Usefulness", Counter(zerogpt2_df['human1_usefulness'].tolist() + zerogpt2_df['human2_usefulness'].tolist() + zerogpt2_df['human3_usefulness'].tolist() + zerogpt2_df['human4_usefulness'].tolist()))

human1_naturalness = zerogpt2_df['human1_naturalness'].tolist() + rewriting_df['human1_naturalness'].tolist() + ftgpt2_df['human1_naturalness'].tolist() + sekulic_df['human1_naturalness'].tolist()
human2_naturalness = zerogpt2_df['human2_naturalness'].tolist() + rewriting_df['human2_naturalness'].tolist() + ftgpt2_df['human2_naturalness'].tolist() + sekulic_df['human2_naturalness'].tolist()
human3_naturalness = zerogpt2_df['human3_naturalness'].tolist() + rewriting_df['human3_naturalness'].tolist() + ftgpt2_df['human3_naturalness'].tolist() + sekulic_df['human3_naturalness'].tolist()
human4_naturalness = zerogpt2_df['human4_naturalness'].tolist() + rewriting_df['human4_naturalness'].tolist() + ftgpt2_df['human4_naturalness'].tolist() + sekulic_df['human4_naturalness'].tolist()
human5_naturalness = zerogpt2_df['human5_naturalness'].tolist() + rewriting_df['human5_naturalness'].tolist() + ftgpt2_df['human5_naturalness'].tolist() + sekulic_df['human5_naturalness'].tolist()

human1_usefulness = zerogpt2_df['human1_usefulness'].tolist() + rewriting_df['human1_usefulness'].tolist() + ftgpt2_df['human1_usefulness'].tolist() + sekulic_df['human1_usefulness'].tolist()
human2_usefulness = zerogpt2_df['human2_usefulness'].tolist() + rewriting_df['human2_usefulness'].tolist() + ftgpt2_df['human2_usefulness'].tolist() + sekulic_df['human2_usefulness'].tolist()
human3_usefulness = zerogpt2_df['human3_usefulness'].tolist() + rewriting_df['human3_usefulness'].tolist() + ftgpt2_df['human3_usefulness'].tolist() + sekulic_df['human3_usefulness'].tolist()
human4_usefulness = zerogpt2_df['human4_usefulness'].tolist() + rewriting_df['human4_usefulness'].tolist() + ftgpt2_df['human4_usefulness'].tolist() + sekulic_df['human4_usefulness'].tolist()
human5_usefulness = zerogpt2_df['human5_usefulness'].tolist() + rewriting_df['human5_usefulness'].tolist() + ftgpt2_df['human5_usefulness'].tolist() + sekulic_df['human5_usefulness'].tolist()




rewriting_df_zipped_naturalness = zip(rewriting_df['human1_naturalness'].tolist(), rewriting_df['human2_naturalness'].tolist(), rewriting_df['human3_naturalness'].tolist(), rewriting_df['human4_naturalness'].tolist())
rewriting_df_zipped_usefulness = zip(rewriting_df['human1_usefulness'].tolist(), rewriting_df['human2_usefulness'].tolist(), rewriting_df['human3_usefulness'].tolist(), rewriting_df['human4_usefulness'].tolist())
rewriting_df_naturalness_major_vote = [Counter(list(labels)).most_common()[0][0] if Counter(list(labels)).most_common()[0][1] > 1 else 'fair' for labels in rewriting_df_zipped_naturalness]
rewriting_df_usefulness_major_vote = [Counter(list(labels)).most_common()[0][0] if Counter(list(labels)).most_common()[0][1] > 1 else 'fair' for labels in rewriting_df_zipped_usefulness]

print("Rewriting.")
print("By major vote")
print("Naturalness", Counter(rewriting_df_naturalness_major_vote))
print("Usefulness", Counter(rewriting_df_usefulness_major_vote))
print("By all vote")
print("Naturalness", Counter(rewriting_df['human1_naturalness'].tolist() + rewriting_df['human2_naturalness'].tolist() + rewriting_df['human3_naturalness'].tolist() + rewriting_df['human4_naturalness'].tolist()))
print("Usefulness", Counter(rewriting_df['human1_usefulness'].tolist() + rewriting_df['human2_usefulness'].tolist() + rewriting_df['human3_usefulness'].tolist() + rewriting_df['human4_usefulness'].tolist()))


sekulic_df_zipped_naturalness = zip(sekulic_df['human1_naturalness'].tolist(), sekulic_df['human2_naturalness'].tolist(), sekulic_df['human3_naturalness'].tolist(), sekulic_df['human4_naturalness'].tolist())
sekulic_df_zipped_usefulness = zip(sekulic_df['human1_usefulness'].tolist(), sekulic_df['human2_usefulness'].tolist(), sekulic_df['human3_usefulness'].tolist(), sekulic_df['human4_usefulness'].tolist())
sekulic_df_naturalness_major_vote = [Counter(list(labels)).most_common()[0][0] if Counter(list(labels)).most_common()[0][1] > 1 else 'fair' for labels in sekulic_df_zipped_naturalness]
sekulic_df_usefulness_major_vote = [Counter(list(labels)).most_common()[0][0] if Counter(list(labels)).most_common()[0][1] > 1 else 'fair' for labels in sekulic_df_zipped_usefulness]

print("Sekulic.")
print("By major vote")
print("Naturalness", Counter(sekulic_df_naturalness_major_vote))
print("Usefulness", Counter(sekulic_df_usefulness_major_vote))
print("By all vote")
print("Naturalness", Counter(sekulic_df['human1_naturalness'].tolist() + sekulic_df['human2_naturalness'].tolist() + sekulic_df['human3_naturalness'].tolist() + sekulic_df['human4_naturalness'].tolist()))
print("Usefulness", Counter(sekulic_df['human1_usefulness'].tolist() + sekulic_df['human2_usefulness'].tolist() + sekulic_df['human3_usefulness'].tolist() + sekulic_df['human4_usefulness'].tolist()))


ftgpt2_df_zipped_naturalness = zip(ftgpt2_df['human1_naturalness'].tolist(), ftgpt2_df['human2_naturalness'].tolist(), ftgpt2_df['human3_naturalness'].tolist(), ftgpt2_df['human4_naturalness'].tolist())
ftgpt2_df_zipped_usefulness = zip(ftgpt2_df['human1_usefulness'].tolist(), ftgpt2_df['human2_usefulness'].tolist(), ftgpt2_df['human3_usefulness'].tolist(), ftgpt2_df['human4_usefulness'].tolist())
ftgpt2_df_naturalness_major_vote = [Counter(list(labels)).most_common()[0][0] if Counter(list(labels)).most_common()[0][1] > 1 else 'fair' for labels in ftgpt2_df_zipped_naturalness]
ftgpt2_df_usefulness_major_vote = [Counter(list(labels)).most_common()[0][0] if Counter(list(labels)).most_common()[0][1] > 1 else 'fair' for labels in ftgpt2_df_zipped_usefulness]

print("prompt finetuning.")
print("By major vote")
print("Naturalness", Counter(ftgpt2_df_naturalness_major_vote))
print("Usefulness", Counter(ftgpt2_df_usefulness_major_vote))
print("By all vote")
print("Naturalness", Counter(ftgpt2_df['human1_naturalness'].tolist() + ftgpt2_df['human2_naturalness'].tolist() + ftgpt2_df['human3_naturalness'].tolist() + ftgpt2_df['human4_naturalness'].tolist()))
print("Usefulness", Counter(ftgpt2_df['human1_usefulness'].tolist() + ftgpt2_df['human2_usefulness'].tolist() + ftgpt2_df['human3_usefulness'].tolist() + ftgpt2_df['human4_usefulness'].tolist()))


print("Agreements")
print("Naturalness")
print("1_2", cohen_kappa_score(human1_naturalness, human2_naturalness))
print("1_3", cohen_kappa_score(human1_naturalness, human3_naturalness))
print("1_4", cohen_kappa_score(human1_naturalness, human4_naturalness))
print("1_5", cohen_kappa_score(human1_naturalness, human5_naturalness))
print("2_3", cohen_kappa_score(human2_naturalness, human3_naturalness))
print("2_4", cohen_kappa_score(human2_naturalness, human4_naturalness))
print("2_5", cohen_kappa_score(human2_naturalness, human5_naturalness))
print("3_4", cohen_kappa_score(human3_naturalness, human4_naturalness))
print("3_5", cohen_kappa_score(human3_naturalness, human5_naturalness))
print("4_5", cohen_kappa_score(human4_naturalness, human5_naturalness))
print("Usefulness")
print("1_2", cohen_kappa_score(human1_usefulness, human2_usefulness))
print("1_3", cohen_kappa_score(human1_usefulness, human3_usefulness))
print("1_4", cohen_kappa_score(human1_usefulness, human4_usefulness))
print("1_5", cohen_kappa_score(human1_usefulness, human5_usefulness))
print("2_3", cohen_kappa_score(human2_usefulness, human3_usefulness))
print("2_4", cohen_kappa_score(human2_usefulness, human4_usefulness))
print("2_5", cohen_kappa_score(human2_usefulness, human5_usefulness))
print("3_4", cohen_kappa_score(human3_usefulness, human4_usefulness))
print("3_5", cohen_kappa_score(human3_usefulness, human5_usefulness))
print("4_5", cohen_kappa_score(human4_usefulness, human5_usefulness))


human1_naturalness = zerogpt2_df['human1_naturalness'].tolist() + rewriting_df['human1_naturalness'].tolist() + ftgpt2_df['human1_naturalness'].tolist() + sekulic_df['human1_naturalness'].tolist()
human2_naturalness = zerogpt2_df['human2_naturalness'].tolist() + rewriting_df['human2_naturalness'].tolist() + ftgpt2_df['human2_naturalness'].tolist() + sekulic_df['human2_naturalness'].tolist()
human3_naturalness = zerogpt2_df['human3_naturalness'].tolist() + rewriting_df['human3_naturalness'].tolist() + ftgpt2_df['human3_naturalness'].tolist() + sekulic_df['human3_naturalness'].tolist()
human4_naturalness = zerogpt2_df['human4_naturalness'].tolist() + rewriting_df['human4_naturalness'].tolist() + ftgpt2_df['human4_naturalness'].tolist() + sekulic_df['human4_naturalness'].tolist()
human5_naturalness = zerogpt2_df['human5_naturalness'].tolist() + rewriting_df['human5_naturalness'].tolist() + ftgpt2_df['human5_naturalness'].tolist() + sekulic_df['human5_naturalness'].tolist()

human1_usefulness = zerogpt2_df['human1_usefulness'].tolist() + rewriting_df['human1_usefulness'].tolist() + ftgpt2_df['human1_usefulness'].tolist() + sekulic_df['human1_usefulness'].tolist()
human2_usefulness = zerogpt2_df['human2_usefulness'].tolist() + rewriting_df['human2_usefulness'].tolist() + ftgpt2_df['human2_usefulness'].tolist() + sekulic_df['human2_usefulness'].tolist()
human3_usefulness = zerogpt2_df['human3_usefulness'].tolist() + rewriting_df['human3_usefulness'].tolist() + ftgpt2_df['human3_usefulness'].tolist() + sekulic_df['human3_usefulness'].tolist()
human4_usefulness = zerogpt2_df['human4_usefulness'].tolist() + rewriting_df['human4_usefulness'].tolist() + ftgpt2_df['human4_usefulness'].tolist() + sekulic_df['human4_usefulness'].tolist()
human5_usefulness = zerogpt2_df['human5_usefulness'].tolist() + rewriting_df['human5_usefulness'].tolist() + ftgpt2_df['human5_usefulness'].tolist() + sekulic_df['human5_usefulness'].tolist()


n_1_2 = cohen_kappa_score(human1_naturalness, human2_naturalness)
n_1_3 = cohen_kappa_score(human1_naturalness, human3_naturalness)
n_1_4 = cohen_kappa_score(human1_naturalness, human4_naturalness)
n_1_5 = cohen_kappa_score(human1_naturalness, human5_naturalness)
n_2_3 = cohen_kappa_score(human2_naturalness, human3_naturalness)
n_2_4 = cohen_kappa_score(human2_naturalness, human4_naturalness)
n_2_5 = cohen_kappa_score(human2_naturalness, human5_naturalness)
n_3_4 = cohen_kappa_score(human3_naturalness, human4_naturalness)
n_3_5 = cohen_kappa_score(human3_naturalness, human5_naturalness)
n_4_5 = cohen_kappa_score(human4_naturalness, human5_naturalness)


u_1_2 = cohen_kappa_score(human1_usefulness, human2_usefulness)
u_1_3 = cohen_kappa_score(human1_usefulness, human3_usefulness)
u_1_4 = cohen_kappa_score(human1_usefulness, human4_usefulness)
u_1_5 = cohen_kappa_score(human1_usefulness, human5_usefulness)
u_2_3 = cohen_kappa_score(human2_usefulness, human3_usefulness)
u_2_4 = cohen_kappa_score(human2_usefulness, human4_usefulness)
u_2_5 = cohen_kappa_score(human2_usefulness, human5_usefulness)
u_3_4 = cohen_kappa_score(human3_usefulness, human4_usefulness)
u_3_5 = cohen_kappa_score(human3_usefulness, human5_usefulness)
u_4_5 = cohen_kappa_score(human4_usefulness, human5_usefulness)


print("Agreements all")
print("Naturalness")
print((n_1_2 + n_1_3 + n_1_4 + n_1_5 + n_2_3 + n_2_4 + n_2_5 + n_3_4 + n_3_5 + n_4_5)/10)
print("Usefulness")
print((u_1_2 + u_1_3 + u_1_4 + u_1_5 + u_2_3 + u_2_4 + u_2_5 + u_3_4 + u_3_5 + u_4_5)/10)


print("Agreements 1,2,5")
print("Naturalness")
print((n_1_2 + n_1_5 + n_2_5)/3)
print("Usefulness")
print((u_1_2 + u_1_5 + u_2_5)/3)


Zerogpt2
By major vote
Naturalness Counter({'good': 351, 'fair': 59, 'bad': 15})
Usefulness Counter({'good': 293, 'fair': 93, 'bad': 39})
By all vote
Naturalness Counter({'good': 1222, 'fair': 358, 'bad': 120})
Usefulness Counter({'good': 1000, 'fair': 469, 'bad': 231})
Rewriting.
By major vote
Naturalness Counter({'good': 244, 'fair': 140, 'bad': 41})
Usefulness Counter({'good': 216, 'fair': 153, 'bad': 56})
By all vote
Naturalness Counter({'good': 858, 'fair': 632, 'bad': 210})
Usefulness Counter({'good': 753, 'fair': 678, 'bad': 269})
Sekulic.
By major vote
Naturalness Counter({'good': 253, 'bad': 112, 'fair': 60})
Usefulness Counter({'bad': 302, 'fair': 74, 'good': 49})
By all vote
Naturalness Counter({'good': 894, 'bad': 431, 'fair': 374, nan: 1})
Usefulness Counter({'bad': 1048, 'fair': 377, 'good': 274, nan: 1})
prompt finetuning.
By major vote
Naturalness Counter({'bad': 192, 'good': 186, 'fair': 47})
Usefulness Counter({'bad': 205, 'good': 125, 'fair': 95})
By all vote
Natural