# 1. Process the train, dev, and test sets for Qulac and ClariQ

In [None]:
import csv
import json 
import pandas as pd
from tqdm import tqdm
import random
import pandas as pd
import numpy as np
import torch as T
import os
from tqdm import tqdm
import re
from autocorrect import Speller
# Module-level English spell-checker instance from the autocorrect package.
# NOTE(review): `spell` is not referenced anywhere else in the visible source —
# confirm it is still needed.
spell = Speller(lang='en')

import torch as T
import pandas as pd
import tqdm
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
)

# Answers that type_answer() labels as 'idk'. The check is an exact,
# whole-string membership test, so every entry must match the full answer.
idk_list = [
    "i dont know",
    "i do not know",
    "im not sure",
    "i am not sure",
    "unsure",
    "possibly",
    "this is not related to my search",
]

# Tokens that make type_answer() return 'no' when one of them appears among
# the first three whitespace-separated words of an answer.
negation_list = ["no", "not", "none", "isnt", "isn't", "dont", "don't"]


# Auxiliary verbs. This list is not referenced anywhere else in the visible
# source.
# NOTE(review): 'it' sits where 'is' would be expected in a list of auxiliary
# verbs — possibly a typo; confirm before relying on this list.
auxiliary_verb_list = [
    "did", "do", "does",
    "it",
    "are", "was", "were",
    "have", "has",
    "can", "could",
    "will", "would",
]

def normalize_text(text):
    """Lowercase *text*, strip an enclosing pair of single quotes and end with '.' instead of '?'.

    Args:
        text: A Python ``str``.

    Returns:
        The normalised string. An empty string is returned unchanged; the
        earlier ``text[-1]`` lookup raised ``IndexError`` for it.
    """
    text = text.lower()
    # Greedy match: on each line, the first and the last single quote are
    # removed and everything in between is kept.
    text = re.sub("'(.*)'", r"\1", text)
    # endswith() is safe on '' whereas indexing with [-1] is not.
    if text.endswith('?'):
        text = text[:-1] + '.'
    return text

def type_answer(answer):
    """Classify an answer string as 'idk', 'yes', 'no' or 'open'.

    The checks run in this order:
      1. the whole answer is one of ``idk_list``                 -> 'idk'
      2. 'yes' is among the first three whitespace-split words  -> 'yes'
      3. any ``negation_list`` token is among those words       -> 'no'
      4. otherwise                                               -> 'open'
    """
    if answer in idk_list:
        return 'idk'

    # Only the first three words are inspected.
    leading_words = answer.split()[:3]
    if 'yes' in leading_words:
        return 'yes'
    if any(token in leading_words for token in negation_list):
        return 'no'
    return 'open'

def u2i(text):
    """Rewrite second-person wording in *text* into first person.

    'are you' becomes 'am i', 'your' becomes 'my' and a stand-alone 'you'
    becomes 'i'. Matching is case-sensitive and anchored on word boundaries,
    so words that merely contain these letters ('young', 'youtube',
    'share you') are left intact. The earlier unanchored patterns turned
    'your' into 'ir' and 'young' into 'ing'.

    Args:
        text: The string to rewrite (expected to be lowercased already).

    Returns:
        The rewritten string.
    """
    text = re.sub(r'\bare you\b', 'am i', text)
    # Handle the possessive before the bare pronoun so it maps to 'my'.
    text = re.sub(r'\byour\b', 'my', text)
    text = re.sub(r'\byou\b', 'i', text)
    return text

# Raw Qulac JSON splits and the processed CSV each one is written to. The two
# lists are paired by position in the zip() loop that follows.
# NOTE(review): position-wise, qulac.test.json is written to qulac_dev.csv and
# qulac.valid.json to qulac_test.csv — confirm this cross-naming is intended
# before changing the order of either list.
qulac_data_files = [
    "../data/qulac/qulac.train.json",
    "../data/qulac/qulac.test.json",
    "../data/qulac/qulac.valid.json",
]

qulac_output_names = [
    "../data/processed/qulac_train.csv",
    "../data/processed/qulac_dev.csv",
    "../data/processed/qulac_test.csv",
]


# Make sure the output directory for the processed CSVs exists. exist_ok=True
# turns this into a no-op when the directory is already there and removes the
# window between a separate existence check and the creation call.
os.makedirs('../data/processed', exist_ok=True)


# Build the derived model-input columns for every Qulac split and write each
# split out as CSV.
for data_file, output_name in zip(qulac_data_files, qulac_output_names):
    df = pd.read_json(data_file)
    # Treat empty / placeholder strings as missing so dropna() discards them.
    df.replace(['', "NaN", 'NaT'], np.nan, inplace=True)
    df.dropna(subset=['question', 'facet_desc', 'answer'], how='any', inplace=True)
    # NOTE(review): 'topic' is not part of the dropna subset, so a missing
    # topic would reach normalize_text() as NaN — confirm the data never has one.
    df = df[['topic', 'facet_desc', 'question', 'answer']].copy(deep=True)

    # Walk the index labels directly: the row Series built by iterrows() was
    # never used, and the former loop variable `iter` shadowed the builtin.
    for idx in df.index:
        query = normalize_text(df.at[idx, 'topic'])
        facet = normalize_text(df.at[idx, 'facet_desc'])
        question = normalize_text(df.at[idx, 'question'])
        answer = normalize_text(df.at[idx, 'answer'])
        df.at[idx, 't5-question'] = facet + ' . ' + query + ' . ' + question
        # ' ? \\n ' holds a literal backslash followed by 'n', not a newline.
        df.at[idx, 'unifiedqa-question'] = u2i(question) + ' ? \\n ' + 'i am looking for ' + facet
        # Store the same once-normalised text that answer-type and answer-len
        # are derived from; a second normalize_text() pass could strip a
        # further pair of quotes and make the columns disagree.
        df.at[idx, 'answer'] = answer
        df.at[idx, 'answer-type'] = type_answer(answer)
        df.at[idx, 'answer-len'] = len(answer.split())

    df.to_csv(output_name, index=False)


# Raw ClariQ TSV splits and the processed CSV each one is written to. The two
# lists are paired by position in the zip() loop that follows.
clariq_data_files = [
    "../data/clariq/clariq_train.tsv",
    "../data/clariq/clariq_dev.tsv",
    "../data/clariq/clariq_test.tsv",
]

clariq_output_names = [
    "../data/processed/clariq_train.csv",
    "../data/processed/clariq_dev.csv",
    "../data/processed/clariq_test.csv",
]

# Build the derived model-input columns for every ClariQ split and write each
# split out as CSV. All original TSV columns are kept alongside the new ones.
for data_file, output_name in zip(clariq_data_files, clariq_output_names):
    df = pd.read_csv(data_file, delimiter='\t')
    # Treat empty / placeholder strings as missing so dropna() discards them.
    df.replace(['', "NaN", 'NaT'], np.nan, inplace=True)
    # NOTE(review): this requires every listed TSV to have 'question',
    # 'facet_desc' and 'answer' columns; 'initial_request' is not part of the
    # subset, so a missing request would reach normalize_text() as NaN.
    df.dropna(subset=['question', 'facet_desc', 'answer'], how='any', inplace=True)

    # Walk the index labels directly: the row Series built by iterrows() was
    # never used, and the former loop variable `iter` shadowed the builtin.
    for idx in df.index:
        query = normalize_text(df.at[idx, 'initial_request'])
        facet = normalize_text(df.at[idx, 'facet_desc'])
        question = normalize_text(df.at[idx, 'question'])
        answer = normalize_text(df.at[idx, 'answer'])
        df.at[idx, 't5-question'] = facet + ' . ' + query + ' . ' + question
        # ' ? \\n ' holds a literal backslash followed by 'n', not a newline.
        df.at[idx, 'unifiedqa-question'] = u2i(question) + ' ? \\n ' + 'i am looking for ' + facet
        # Store the same once-normalised text that answer-type and answer-len
        # are derived from; a second normalize_text() pass could strip a
        # further pair of quotes and make the columns disagree.
        df.at[idx, 'answer'] = answer
        df.at[idx, 'answer-type'] = type_answer(answer)
        df.at[idx, 'answer-len'] = len(answer.split())

    df.to_csv(output_name, index=False)

# 2. Train RoBERTa on Qulac

In [None]:
import pandas as pd
import datasets
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification,Trainer, TrainingArguments
import torch.nn as nn
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm
import os

# Load pretrained roberta-base with a 4-label classification head and move it
# to the GPU.
# NOTE(review): this same `model` object is passed to both Trainer instances
# in this file, so when the cells run in order the ClariQ run presumably
# starts from the Qulac-fine-tuned weights — confirm that is intended.
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=4).cuda()
# NOTE(review): the length limit is passed here as `max_length`; tokenizers
# usually take it as `model_max_length` — verify this kwarg has any effect.
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', max_length = 512)

def tokenization(batched_text):
    """Tokenize the 'text' entry of a batch with the module-level tokenizer.

    Padding and truncation are both switched on. The tokenizer's output is
    returned as is.
    """
    texts = batched_text['text']
    encoded = tokenizer(texts, padding=True, truncation=True)
    return encoded

def to_categorical(batch):
    """Give ``batch['labels']`` a new leading axis and return the same mapping.

    Despite the name, no one-hot encoding happens here: the label value is
    only passed through ``unsqueeze(0)``. The mapping is modified in place.
    """
    labels = batch['labels']
    batch['labels'] = labels.unsqueeze(0)
    return batch

# qulac experiments
print("training roberta on qulac dataset")
# Load the three processed Qulac CSVs as named splits.
data_files = {"train": "qulac_train.csv", "dev": "qulac_dev.csv", "test": "qulac_test.csv"}
dataset = datasets.load_dataset("../data/processed", data_files=data_files)
# NOTE(review): dev_data is loaded but not used in the visible code.
train_data, dev_data, test_data = dataset['train'],dataset['dev'],dataset['test']

# Keep only the classifier input ('unifiedqa-question' -> 'text') and the
# target ('answer-type' -> 'labels'), then turn the label strings into class ids.
train_data = train_data.remove_columns(['topic', 'facet_desc', 'question', 'answer', 't5-question', 'answer-len'])
train_data = train_data.rename_column('unifiedqa-question', 'text')
train_data = train_data.rename_column('answer-type', 'labels')
train_data = train_data.class_encode_column('labels')

# NOTE(review): labels are class-encoded separately for train and test; the
# two id mappings presumably only agree when both splits contain the same set
# of answer types — verify.
test_data = test_data.remove_columns(['topic', 'facet_desc', 'question', 'answer', 't5-question',  'answer-len'])
test_data = test_data.rename_column('unifiedqa-question', 'text')
test_data = test_data.rename_column('answer-type', 'labels')
test_data = test_data.class_encode_column('labels')

# Each split is tokenized as one single batch (batch_size = split length), so
# padding=True inside tokenization() presumably pads to the longest example
# of the whole split.
train_data = train_data.map(tokenization, batched = True, batch_size = len(train_data))
test_data = test_data.map(tokenization, batched = True, batch_size = len(test_data))

# Expose only the model inputs and labels, as torch tensors.
train_data.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
test_data.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

# Add a leading axis to every label via to_categorical().
# NOTE(review): batch_size is passed together with batched = False —
# presumably it has no effect here; confirm.
train_data = train_data.map(to_categorical, batched = False, batch_size = len(train_data))
test_data = test_data.map(to_categorical, batched = False, batch_size = len(test_data))

# define accuracy metrics
def compute_metrics(pred):
    """Return accuracy plus macro-averaged precision, recall and F1.

    Args:
        pred: Object exposing ``label_ids`` (reference labels) and
            ``predictions``; the predicted class is the argmax of
            ``predictions`` over its last axis.

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    macro_precision, macro_recall, macro_f1, _support = precision_recall_fscore_support(
        y_true, y_pred, average='macro'
    )
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': macro_f1,
        'precision': macro_precision,
        'recall': macro_recall,
    }

# define the training arguments
# Checkpoints go to ./ckpt/roberta-qulac/ and logs to its log/ sub-directory.
# Evaluation and checkpoint saving both happen once per epoch.
# per_device_train_batch_size (32) x gradient_accumulation_steps (16) gives
# 512 examples per optimizer step on each device.
# NOTE(review): no metric_for_best_model is given although
# load_best_model_at_end=True — presumably the evaluation loss is used; verify.
# NOTE(review): the checkpoint names loaded later in this file
# (checkpoint-102 / checkpoint-100) suggest runs of roughly 100 optimizer
# steps; if so, warmup_steps=500 would span the entire run — verify.
training_args = TrainingArguments(
    output_dir = './ckpt/roberta-qulac/',
    num_train_epochs=100,
    per_device_train_batch_size = 32,
    gradient_accumulation_steps = 16,    
    per_device_eval_batch_size= 32,
    evaluation_strategy = "epoch",
    save_strategy = 'epoch',
    disable_tqdm = False, 
    load_best_model_at_end=True,
    warmup_steps=500,
    weight_decay=0.01,
    logging_steps = 8,
    fp16 = True,
    logging_dir='./ckpt/roberta-qulac/log',
    dataloader_num_workers = 8,
    run_name = 'roberta-classification',
)

# The per-epoch evaluation runs on test_data (dev_data is not passed in).
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=test_data
)

trainer.train()


# 3. Train RoBERTa on ClariQ

In [None]:

# clariq experiments
print("training roberta on clariq dataset")
# NOTE(review): the "test" split points at clariq_dev.csv, the same file as
# "dev", so test_data and dev_data hold the same rows — confirm this is
# deliberate. dev_data itself is not used in the visible code.
data_files = {"train": "clariq_train.csv", "dev": "clariq_dev.csv", "test": "clariq_dev.csv"}
dataset = datasets.load_dataset("../data/processed", data_files=data_files)
train_data, dev_data, test_data = dataset['train'],dataset['dev'],dataset['test']


# Keep only the classifier input ('unifiedqa-question' -> 'text') and the
# target ('answer-type' -> 'labels'), then turn the label strings into class ids.
train_data = train_data.remove_columns(['topic_id', 'initial_request' , 'topic_desc', 'clarification_need', 'facet_id' ,'facet_desc', 'question_id', 'question', 'answer', 't5-question', 'answer-len'])
train_data = train_data.rename_column('unifiedqa-question', 'text')
train_data = train_data.rename_column('answer-type', 'labels')
train_data = train_data.class_encode_column('labels')

# NOTE(review): labels are class-encoded separately for train and test; the
# two id mappings presumably only agree when both splits contain the same set
# of answer types — verify.
test_data = test_data.remove_columns(['topic_id', 'initial_request' , 'topic_desc', 'clarification_need', 'facet_id' ,'facet_desc', 'question_id', 'question', 'answer', 't5-question', 'answer-len'])
test_data = test_data.rename_column('unifiedqa-question', 'text')
test_data = test_data.rename_column('answer-type', 'labels')
test_data = test_data.class_encode_column('labels')

# Each split is tokenized as one single batch (batch_size = split length), so
# padding=True inside tokenization() presumably pads to the longest example
# of the whole split.
train_data = train_data.map(tokenization, batched = True, batch_size = len(train_data))
test_data = test_data.map(tokenization, batched = True, batch_size = len(test_data))

# Expose only the model inputs and labels, as torch tensors.
train_data.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
test_data.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])


# Add a leading axis to every label via to_categorical().
# NOTE(review): batch_size is passed together with batched = False —
# presumably it has no effect here; confirm.
train_data = train_data.map(to_categorical, batched = False, batch_size = len(train_data))
test_data = test_data.map(to_categorical, batched = False, batch_size = len(test_data))

# define accuracy metrics
def compute_metrics(pred):
    """Score predictions with accuracy and macro precision / recall / F1.

    Args:
        pred: Object with ``label_ids`` (reference labels) and
            ``predictions``; classes are chosen by argmax over the last axis
            of ``predictions``.

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    references = pred.label_ids
    predicted = pred.predictions.argmax(-1)
    prf = precision_recall_fscore_support(references, predicted, average='macro')
    # prf is (precision, recall, f1, support); support is not reported.
    prec, rec, f_score, _ = prf
    accuracy = accuracy_score(references, predicted)
    return {'accuracy': accuracy, 'f1': f_score, 'precision': prec, 'recall': rec}

# define the training arguments
# Checkpoints go to ./ckpt/roberta-clariq/ and logs to its log/ sub-directory.
# Evaluation and checkpoint saving both happen once per epoch.
# per_device_train_batch_size (32) x gradient_accumulation_steps (16) gives
# 512 examples per optimizer step on each device.
# NOTE(review): no metric_for_best_model is given although
# load_best_model_at_end=True — presumably the evaluation loss is used; verify.
# NOTE(review): run_name is the same 'roberta-classification' used for the
# Qulac run — confirm the two runs should share a name.
training_args = TrainingArguments(
    output_dir = './ckpt/roberta-clariq/',
    num_train_epochs=100,
    per_device_train_batch_size = 32,
    gradient_accumulation_steps = 16,    
    per_device_eval_batch_size= 32,
    evaluation_strategy = "epoch",
    save_strategy = 'epoch',
    disable_tqdm = False, 
    load_best_model_at_end=True,
    warmup_steps=500,
    weight_decay=0.01,
    logging_steps = 8,
    fp16 = True,
    logging_dir='./ckpt/roberta-clariq/log',
    dataloader_num_workers = 8,
    run_name = 'roberta-classification',
)

# NOTE(review): `model` is the object created before the Qulac run, not a
# fresh roberta-base; when the cells run in order it presumably already
# carries the Qulac fine-tuning — confirm that is intended.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=test_data
)

trainer.train()


# 4. Generate decoder inputs with fine-tuned RoBERTa (requires step 1 and the checkpoints from steps 2 and 3)

In [None]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification

# Decoder prefix written to the 'decoder-input' column for each predicted
# class id; class 2 maps to the empty string.
# NOTE(review): presumably the ids follow the alphabetical order of the
# answer types ('idk', 'no', 'open', 'yes') — confirm against the label
# mapping of the trained checkpoints.
id2prefix = {
    0: "i dont know",
    1: "no",
    2: "",
    3: "yes",
}

# Classify every 'unifiedqa-question' with the fine-tuned checkpoints and
# store the matching prefix from id2prefix in a new 'decoder-input' column.
# Each processed CSV is read, extended and written back to the same path.
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# --- Qulac: classifier fine-tuned on Qulac ---
roberta_model = RobertaForSequenceClassification.from_pretrained("ckpt/roberta-qulac/checkpoint-102")
for output_name in qulac_output_names:
    df = pd.read_csv(output_name)
    # Walk the index labels directly: the row Series built by iterrows() was
    # never used, and the former loop variable `iter` shadowed the builtin.
    for idx in df.index:
        # truncation=True mirrors the training-time tokenization() call, so an
        # over-long question is cut to the tokenizer's limit instead of being
        # passed to the model at full length.
        clf_inputs = roberta_tokenizer(
            df.at[idx, 'unifiedqa-question'], return_tensors="pt", truncation=True
        )
        with T.no_grad():
            logits = roberta_model(**clf_inputs).logits
        predicted_class_id = logits.argmax().item()
        df.at[idx, 'decoder-input'] = id2prefix[predicted_class_id]
    df.to_csv(output_name, index=False)

# --- ClariQ: classifier fine-tuned on ClariQ ---
roberta_model = RobertaForSequenceClassification.from_pretrained("ckpt/roberta-clariq/checkpoint-100")
for output_name in clariq_output_names:
    df = pd.read_csv(output_name)
    for idx in df.index:
        # Same truncation as above, for consistency with training.
        clf_inputs = roberta_tokenizer(
            df.at[idx, 'unifiedqa-question'], return_tensors="pt", truncation=True
        )
        with T.no_grad():
            logits = roberta_model(**clf_inputs).logits
        predicted_class_id = logits.argmax().item()
        df.at[idx, 'decoder-input'] = id2prefix[predicted_class_id]
    df.to_csv(output_name, index=False)