# Definition Modeling Experiments

Notebook for what I imagine the first set of experiments will be. It will consist of testing a few different basic modeling approaches.  

1) **Sequence-to-Sequence**: Includes finetuned BART (question: {} context: {}) setup. 

2) **Language Modeling**: Includes off-the-shelf few-shot GPT2 and GPT3 (from the paper: ```We append two held-out term and definition pairs, along with the first abstract drawn from each supporting document (we use the first abstract only to keep the full input within the models’ context windows)```). Also includes finetuned GPT2 (with ```<question> term q <context> context doc <definition> definition``` tags).

3) **Information Retrieval**: An extractive approach over the support docs, using BiDAF.



In [None]:
from transformers import (
     AutoTokenizer,
     AutoModelForSeq2SeqLM,
     AutoConfig, 
     AutoModelForCausalLM,
)

from importlib import reload  
from joblib import dump, load

import os
import sys
from importlib import reload  
import pandas as pd
import numpy as np
import csv
import nlp
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import  word_tokenize, sent_tokenize
import random
import uuid
import spacy 
from tqdm import tqdm
from spacy.pipeline import Sentencizer
import re
from typing import Iterable, List


# Fix the seed of the stdlib `random` module.
random.seed(42)

# Extend the import path before importing `utils`; the append must run first.
# NOTE(review): `utils` is presumably a project-local helper module living under /lib — confirm.
sys.path.append('/lib')
import utils

# Base directories used to build model and data paths in later cells.
# NOTE(review): all three are empty strings here — presumably placeholders to be
# filled in before running. RESOURCE_DIR is not referenced in the visible cells.
MODEL_DIR = ''
RESOURCE_DIR = ''
DATA_DIR = ''




# Data

In [None]:
##### Dataset reloading
# Read the pre-split train / dev / test CSVs from DATA_DIR/model_data.
df_wiki_medq_strat_train = pd.read_csv('{}/model_data/train.csv'.format(DATA_DIR))
df_wiki_medq_strat_dev = pd.read_csv('{}/model_data/dev.csv'.format(DATA_DIR))
df_wiki_medq_strat_test = pd.read_csv('{}/model_data/test.csv'.format(DATA_DIR))


# All three splits stacked in train, dev, test order (original row indices are kept).
df_medq_wiki = pd.concat([df_wiki_medq_strat_train, df_wiki_medq_strat_dev, df_wiki_medq_strat_test])

# we use the dev set for testing the base generators, and reserve our test set for 
# complexity control, for training the generators we use splits of the train set
# Take a copy: later cells add columns to df_test (e.g. 'gpt2_task_row'), and without
# the copy those assignments would silently modify df_wiki_medq_strat_dev as well.
df_test = df_wiki_medq_strat_dev.copy()

# GPUs

In [None]:
# GPU selection. Both identifiers are blank in this notebook.
GPU_NUMBER = ''
CUDA_DEVICE = ''

# Tell CUDA to enumerate devices by PCI bus id and to expose only GPU_NUMBER.
os.environ.update({
    'CUDA_DEVICE_ORDER': 'PCI_BUS_ID',
    'CUDA_VISIBLE_DEVICES': GPU_NUMBER,
})

# Device string passed to the models and to generate_answers in later cells.
DEVICE = 'cuda:0'


# Model Generations

In [None]:


# function for wrapping the rather long code for generating answers across a df
def generate_answers(df, model, tokenizer, model_type,
                     num_return_sequences=1,
                     num_beams=5,
                     max_length=64,
                     min_length=8,
                     early_stopping=True,
                     temperature=None,
                     do_sample=True,
                     top_k=50,
                     top_p=0.9,
                     max_input_length=1024,
                     no_repeat_ngram_size=3,
                     device=None):
    """Generate responses for every row of ``df`` and collect them in one DataFrame.

    The input text and the length handling depend on ``model_type``:

    * a type containing ``'bart'`` reads the ``'q_s2orc_doc'`` column and lets the
      tokenizer truncate it to ``max_input_length`` tokens;
    * a type containing ``'gpt'`` reads the ``'gpt2_task_row'`` column. The prompt
      is limited to ``max_input_length - max_length`` tokens by dropping tokens
      from the *front* (so the end of the prompt survives), and ``min_length`` /
      ``max_length`` are shifted by the prompt length because the generated
      sequence includes the prompt. Unless the type also contains ``'ootb'``,
      generation stops at ``tokenizer.additional_special_tokens_ids[1]``
      (the context token, per the original author's note).

    Args:
        df: DataFrame with the model-specific input column plus ``'question'``,
            ``'category'`` and ``'first_sentence'``.
        model: Object exposing ``generate(...)`` in the Hugging Face style.
        tokenizer: Tokenizer matching ``model``.
        model_type: Label selecting the branch above; also stored in the output.
        num_return_sequences, num_beams, early_stopping, temperature, do_sample,
        top_k, top_p, no_repeat_ngram_size: Forwarded unchanged to ``model.generate``.
        max_length: Maximum number of generated tokens (excluding a GPT prompt).
        min_length: Minimum number of generated tokens (excluding a GPT prompt).
        max_input_length: Token budget for the input (BART) or for prompt plus
            generation (GPT).
        device: Passed to ``inputs.to(device)`` before generation.

    Returns:
        DataFrame with columns ``response``, ``scores``, ``model-type``,
        ``question``, ``category`` and ``first_sentence``; one row per returned
        sequence. Per-question frames are concatenated without resetting the
        index. ``scores`` is ``None`` when the generation output carries no
        ``sequences_scores`` (e.g. ``num_beams=1``). An empty ``df`` yields an
        empty frame with the same columns.

    Raises:
        ValueError: If ``model_type`` names neither a BART nor a GPT model, or
            if a GPT model is left with no room for the prompt
            (``max_input_length <= max_length``).
    """
    model_key = model_type.lower()
    is_bart = 'bart' in model_key
    is_gpt = (not is_bart) and 'gpt' in model_key
    if not (is_bart or is_gpt):
        raise ValueError('Model type {} not found!'.format(model_type))

    gpt_input_budget = None
    if is_gpt:
        # make space for input based on max output allowed
        gpt_input_budget = max_input_length - max_length
        if gpt_input_budget <= 0:
            # a zero budget would turn the `[-budget:]` slice below into "keep everything"
            raise ValueError(
                'max_input_length ({}) must be larger than max_length ({}) for GPT models.'.format(
                    max_input_length, max_length))

    output_columns = ['response', 'scores', 'model-type', 'question', 'category', 'first_sentence']
    if len(df) == 0:
        # pd.concat([]) raises, so hand back an empty frame with the usual columns
        return pd.DataFrame(columns=output_columns)

    answer_df_lists = []
    for _, r in tqdm(df.iterrows(), total=len(df)):
        row = r.to_dict()

        # per-row generation limits (shifted below for gpt models)
        max_output_length = max_length
        min_output_length = min_length
        eos_token_id = tokenizer.eos_token_id
        # number of leading tokens in each generated sequence that belong to the prompt
        prompt_length = 0

        if is_bart:
            # for bart, just truncate to max input length
            inputs = tokenizer(row['q_s2orc_doc'], return_tensors='pt',
                               max_length=max_input_length, truncation=True)
        else:
            # for gpt2 truncate from the front to preserve the question at the end.
            inputs = tokenizer(row['gpt2_task_row'], return_tensors='pt', truncation=False)
            prompt_length = inputs['input_ids'].shape[1]
            if prompt_length > gpt_input_budget:
                print('Input length of {} is too long for max input of {}, truncating to {}.'.format(
                    prompt_length, gpt_input_budget, gpt_input_budget))
                # keep only the last `gpt_input_budget` tokens
                inputs['input_ids'] = inputs['input_ids'][:, -gpt_input_budget:]
                inputs['attention_mask'] = inputs['attention_mask'][:, -gpt_input_budget:]
                prompt_length = gpt_input_budget

            # generated sequences contain the prompt, so the limits move with it
            max_output_length += prompt_length
            min_output_length += prompt_length
            if 'ootb' not in model_key:
                # also set the special end token (if this is a finetuned gpt model) to the context token
                eos_token_id = tokenizer.additional_special_tokens_ids[1]

        outputs = model.generate(**inputs.to(device),
                                 decoder_start_token_id=tokenizer.bos_token_id,
                                 num_return_sequences=num_return_sequences,
                                 num_beams=num_beams,
                                 min_length=min_output_length,
                                 max_length=max_output_length,
                                 early_stopping=early_stopping,
                                 temperature=temperature,
                                 do_sample=do_sample,
                                 top_k=top_k,
                                 top_p=top_p,
                                 eos_token_id=eos_token_id,
                                 no_repeat_ngram_size=no_repeat_ngram_size,
                                 output_scores=True,
                                 return_dict_in_generate=True)

        # because GPT2 includes what the answer was conditioned on, we have to strip that
        # (prompt_length is 0 for bart, so nothing is stripped there)
        answers = [tokenizer.decode(ans_ids[prompt_length:], skip_special_tokens=True).strip()
                   for ans_ids in outputs['sequences']]

        # sequence-level scores are only present on beam-search style outputs
        sequence_scores = getattr(outputs, 'sequences_scores', None)
        if sequence_scores is None:
            scores = [None] * len(answers)
        else:
            scores = sequence_scores.tolist()

        # save all the answers with their associated scores in a df
        df_answers = pd.DataFrame({'response': answers, 'scores': scores})
        df_answers['model-type'] = model_type

        # save information about the question
        df_answers['question'] = row['question']
        df_answers['category'] = row['category']
        df_answers['first_sentence'] = row['first_sentence']

        answer_df_lists.append(df_answers)

    return pd.concat(answer_df_lists)


## BART

In [None]:
# Bart model - finetuned on medq s2orc
bart_model_path = "{}/bart_medq_wiki_gen".format(MODEL_DIR)
bart_tokenizer = AutoTokenizer.from_pretrained(bart_model_path)
# Put the model on DEVICE: generate_answers moves the tokenized inputs to the
# device it is given, so a CPU-resident model would hit a device mismatch.
bart_model = AutoModelForSeq2SeqLM.from_pretrained(bart_model_path).to(DEVICE)
# inference mode; the result is bound to `_` only to suppress the notebook's repr output
_ = bart_model.eval()


In [None]:
# Generate BART definitions for the dev rows. generate_answers has no
# `use_decoder_prefix` parameter, so that keyword is not passed.
bart_answers = generate_answers(df=df_test, model=bart_model, tokenizer=bart_tokenizer, model_type='bart', device=DEVICE)

In [None]:
# Show ten random BART generations with their question, reference sentence and
# score; column width is unlimited so long text is not cut off.
with pd.option_context('display.max_colwidth', None):
    display(
        bart_answers.loc[:, ['question', 'first_sentence', 'response', 'scores']].sample(10)
    )

In [None]:
# generation is finished: bring the bart model back to the CPU
bart_model = bart_model.to('cpu')

## Finetuned GPT2

In [None]:
# One utils.make_gpt_doc(...) result per dev row, built from the row's question and
# sparse s2orc support doc (third argument None, testing=True); stored as 'gpt2_task_row'.
df_test['gpt2_task_row'] = [
    utils.make_gpt_doc(question, support_doc, None, testing=True)
    for question, support_doc in zip(df_test['question'], df_test['support_doc_sparse_s2orc'])
]

In [None]:
# GPT2 model - finetuned on medq and s2orc
finetuned_gpt2_model_path = '/homes/gws/taugust/Projects/ARK/sci_comm/models/gpt2_full_gen'

# tokenizer and weights are read from the same checkpoint directory;
# the model is placed on DEVICE right after loading
gpt2_tokenizer = AutoTokenizer.from_pretrained(finetuned_gpt2_model_path)
gpt2_model = AutoModelForCausalLM.from_pretrained(finetuned_gpt2_model_path)
gpt2_model = gpt2_model.to(DEVICE)

In [None]:
# Generate with the finetuned GPT2 model; 'gpt2' selects the gpt branch of generate_answers.
gpt2_answers = generate_answers(
    df=df_test,
    model=gpt2_model,
    tokenizer=gpt2_tokenizer,
    model_type='gpt2',
    device=DEVICE,
)

In [None]:
# Show every finetuned-GPT2 generation without truncating long text.
# The generations live in `gpt2_answers`, the name assigned by the generation cell.
with pd.option_context('display.max_colwidth', None):
    display(gpt2_answers[['question', 'first_sentence', 'response', 'scores']])

In [None]:
# generation is finished: bring the finetuned gpt2 model back to the CPU
gpt2_model = gpt2_model.to('cpu')

# OOTB GPT2

Right now we are using the first two rows as the few-shot examples, so MAKE SURE YOU TAKE THEM OUT IN SCORING.

In [None]:
### Input should be the context doc + 2 new lines + a templated question

def make_templated_input(support_doc, question):
    """Return ``support_doc`` and ``question`` joined by one blank line."""
    template = '{}\n\n{}'
    return template.format(support_doc, question)


def make_few_shot_templated_input(support_doc, question, few_shot_prefix):
    """Return the few-shot prefix, then the support doc, a blank line and the question.

    The prefix and the support doc are separated by a newline followed by a
    single space.
    """
    template = '{}\n {}\n\n{}'
    pieces = (few_shot_prefix, support_doc, question)
    return template.format(*pieces)

In [None]:
#### Ignoring the support doc

def make_qa_prompt(q, definition):
    """Format one question/definition pair as a two-line prompt block.

    The first line is ``question: <q> `` (with a trailing space) and the second
    is ``definition: <definition>``. Pass an empty ``definition`` to leave the
    slot open for the model to complete.
    """
    return 'question: {} \ndefinition: {}'.format(q, definition)

def make_task_row(few_shot_prefix, row):
    """Return the prompt for ``row``: the few-shot prefix plus an open question block.

    Args:
        few_shot_prefix: Text placed before the question. The argument itself is
            used, not a module-level global, so callers control the examples.
        row: Mapping-like object with a ``'question'`` entry.
    """
    return '{}{}'.format(few_shot_prefix, make_qa_prompt(row['question'], ''))


# The first two dev rows serve as the few-shot examples.
row1, row2 = (df_wiki_medq_strat_dev.iloc[position] for position in (0, 1))

# Each example is a filled-in question/definition block.
ex1, ex2 = (
    make_qa_prompt(example['question'], example['first_sentence'])
    for example in (row1, row2)
)

# Both examples, each followed by a '###' separator line.
qa_few_shot_prefix = '{}\n\n###\n\n{}\n\n###\n\n'.format(ex1, ex2)


In [None]:
# generate_answers reads its gpt prompt from 'gpt2_task_row', so overwrite that
# column with the few-shot prompt for each row
df_test['gpt2_task_row'] = [
    make_task_row(qa_few_shot_prefix, dev_row)
    for _idx, dev_row in df_test.iterrows()
]

In [None]:
# Off-the-shelf gpt2-medium: tokenizer first, then the weights, which are placed on DEVICE.
ootb_gpt2_tokenizer = AutoTokenizer.from_pretrained('gpt2-medium')
ootb_gpt2_model = AutoModelForCausalLM.from_pretrained('gpt2-medium')
ootb_gpt2_model = ootb_gpt2_model.to(DEVICE)

In [None]:
# Skip the first two rows: they are the few-shot examples inside every prompt.
ootb_gpt2_answers = generate_answers(
    df=df_test.iloc[2:],
    model=ootb_gpt2_model,
    tokenizer=ootb_gpt2_tokenizer,
    model_type='ootb-gpt2',
    device=DEVICE,
)

In [None]:
# Show every off-the-shelf GPT2 generation without truncating long text.
# The frame is `ootb_gpt2_answers` and generate_answers stores the generated
# text in the 'response' column.
with pd.option_context('display.max_colwidth', None):
    display(ootb_gpt2_answers[['question', 'first_sentence', 'response', 'scores']])

In [None]:
# generation is finished: bring the model back to the CPU (a no-op if it never left)
ootb_gpt2_model = ootb_gpt2_model.to('cpu')

# OOTB GPT3 - maxed out 

In [None]:
import openai
ENV_DIR = ''

# get the api key: the first line of ENV_DIR/vars.txt must read OPENAI_API_KEY=<key>
with open('{}/vars.txt'.format(ENV_DIR), 'r') as f:
    # split on the first '=' only, so a value containing '=' stays in one piece
    env_vars = [tuple(line.split('=', 1)) for line in f]

# explicit check instead of `assert`, which is skipped under `python -O`
if not env_vars or len(env_vars[0]) != 2 or env_vars[0][0] != 'OPENAI_API_KEY':
    raise ValueError('Expected the first line of {}/vars.txt to be OPENAI_API_KEY=<key>'.format(ENV_DIR))

open_api_key = env_vars[0][1]
# strip the trailing newline left over from reading the line
openai.api_key = open_api_key.strip()

In [None]:

# from: https://beta.openai.com/docs/introduction/factual-responses
def get_gpt3_output(openai, prompt, max_tokens=64):
    """Request one completion for ``prompt`` and return its text.

    Args:
        openai: Object exposing ``Completion.create(**kwargs)``.
        prompt: Text to complete.
        max_tokens: Forwarded as the completion's ``max_tokens``.

    Returns:
        The ``'text'`` field of the first entry in the response's ``'choices'``.
    """
    request = {
        'engine': "davinci",
        'prompt': prompt,
        'max_tokens': max_tokens,
        'top_p': 0.9,
        'frequency_penalty': 0,
        'presence_penalty': 0,
        # the same marker that separates the few-shot examples
        'stop': ["###"],
    }
    completion = openai.Completion.create(**request)
    first_choice = completion['choices'][0]
    return first_choice['text']

In [None]:
# Scratch prompt for a single example: the few-shot prefix followed by an open
# question/definition block.
# NOTE(review): `test_row` is not defined in any visible cell — confirm where it
# should come from (presumably one row of df_test) before running this cell.
prompt = '{}{}'.format(qa_few_shot_prefix, make_qa_prompt(test_row['question'], ''))

In [None]:
# Sample 100 dev rows for GPT-3. The first two rows are excluded because they are
# the few-shot examples inside every prompt and must not be scored.
ootb_gpt3_answers = df_test.iloc[2:].sample(100, random_state=84, replace=False).copy()


In [None]:
### because gpt3 just gives back out answers, reuse the sampled dev rows and set the response column to the output
ootb_gpt3_answers['model-type'] = 'ootb-gpt3'
# the prompts are read from the same frame the responses are written to, so they line up row-for-row
ootb_gpt3_answers['response'] = [get_gpt3_output(openai, p, max_tokens=64) for p in tqdm(ootb_gpt3_answers['gpt2_task_row'])]

In [None]:
# Show every GPT-3 response beside its question and reference sentence,
# with unlimited column width so long text is not cut off.
with pd.option_context('display.max_colwidth', None):
    display(ootb_gpt3_answers.loc[:, ['question', 'first_sentence', 'response']])

# Evaluating 

Computing ROUGE and BERTScore.

In [None]:
from datasets import list_metrics, load_metric

# Load both metric objects by name: BERTScore first, then ROUGE.
bert_score, rouge = [load_metric(metric_name) for metric_name in ("bertscore", "rouge")]

In [None]:
def calc_rouge(df, rouge_metric, ref_col, response_col):
    """Compute aggregated ROUGE F-measures for one set of responses.

    Args:
        df: DataFrame holding the generated and the reference texts.
        rouge_metric: Object whose ``compute(predictions=..., references=...,
            rouge_types=..., use_stemmer=...)`` returns a mapping from ROUGE type
            to an aggregate exposing ``.mid.fmeasure``.
        ref_col: Name of the column with the reference texts.
        response_col: Name of the column with the generated texts.

    Returns:
        DataFrame with one row per ROUGE type and the columns ``score_type``,
        ``score_measure`` (always ``'F'``) and ``score`` (the mid F-measure).
    """
    # Aggregation is left at the metric's default (on). The keyword for it is
    # spelled differently across `datasets` releases, so it is not passed.
    # The columns are handed over as plain lists rather than pandas Series.
    scores = rouge_metric.compute(
        predictions=df[response_col].tolist(),
        references=df[ref_col].tolist(),
        rouge_types=['rouge1', 'rouge2', 'rougeL', 'rougeLsum'],
        use_stemmer=False,
    )

    # how to pull each reported measure out of an aggregate score
    measure_mapping = {'F': lambda x: x.mid.fmeasure}

    rows = [
        {'score_type': score_type, 'score_measure': measure, 'score': extract(aggregate)}
        for score_type, aggregate in scores.items()
        for measure, extract in measure_mapping.items()
    ]
    return pd.DataFrame(rows)

In [None]:
# Stack the per-model answer frames so the metrics below can group by 'model-type'.
# NOTE(review): `df_extracted_answers` is not defined in any visible cell (presumably the
# extractive/IR answers) and `ootb_gpt3_answers` is not included — confirm both before running.
df = pd.concat([ootb_gpt2_answers, gpt2_answers, bart_answers, df_extracted_answers])

In [None]:
# ROUGE F-measures per model: one calc_rouge table for each 'model-type' group,
# scoring 'response' against 'first_sentence'.
df.groupby(['model-type']).apply(
    lambda model_rows: calc_rouge(
        model_rows, rouge, ref_col='first_sentence', response_col='response'
    )
)

In [None]:
# BERTScore of every response against its reference first sentence.
scores = bert_score.compute(
    predictions=df['response'],
    references=df['first_sentence'],
    lang='en',
    verbose=False,
    device=None,
)

# keep the F1 values as a column, then summarise them per model
df['bert_scores'] = scores['f1']
df.groupby(['model-type'])['bert_scores'].describe()