## InstructBench

In [1]:
import json
import math
import os
import pickle as pk
import random
import time
from collections import defaultdict

import datasets
import torch
import yaml
from torch import nn
from torch.optim import AdamW
from torch.quasirandom import SobolEngine
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer

from data import *
from info import *
from util import *
from model import *

### Dataset: Migrate, Clean, and Sample

In [None]:
class Args:
    """Plain container for the two settings passed to ``Info``.

    Attributes are set directly from the constructor arguments:
    ``tuning_model`` (defaults to the empty string) and ``soft_token``
    (defaults to 0).
    """

    def __init__(self, tuning_model: str = '', soft_token: int = 0) -> None:
        self.tuning_model, self.soft_token = tuning_model, soft_token

def sample(split, sample_size):
    """Return up to ``sample_size`` randomly chosen instances of ``split``.

    The requested size is capped at ``len(split)``, so asking for more
    instances than exist returns every instance (in random order).
    Instances are drawn without replacement via ``random.sample``.
    """
    total = len(split)
    how_many = min(total, sample_size)
    chosen_positions = random.sample(range(total), how_many)
    return [split[position] for position in chosen_positions]

# Build the configuration with the default Args values (tuning_model='',
# soft_token=0). `Info` is not defined in this file; presumably it comes from
# one of the project star imports (data/info/util/model) — verify.
args = Args()
info = Info(args)

In [None]:
# from InstructInduct
#
# Convert every raw InstructInduct task file that is registered in
# `info.dataset2task` into the unified {'train', 'val', 'test'} JSON layout
# under `info.DIR_INPUT`. The raw 'induce' split becomes the train/val pool
# and the raw 'execute' split becomes the test pool.

DIR_RAW = 'instruction-induction/data/raw'
for file_name in os.listdir(os.path.join(DIR_RAW, 'induce')):
    data_name = file_name.split('.')[0]
    if data_name not in info.dataset2task:
        continue
    task_type = info.dataset2task[data_name]

    dataset = {}
    for sr_name, tg_name in zip(['induce', 'execute'], ['train', 'test']):
        # Close the raw file as soon as it has been parsed.
        with open(os.path.join(DIR_RAW, sr_name, file_name), 'r') as f:
            sr_split = json.load(f)
        tg_split = []

        for example in sr_split['examples'].values():
            if task_type == 'generation':
                if data_name in ['translation_en-de', 'translation_en-es', 'translation_en-fr']:
                    question = example['input']
                    answer = example['possible_translations']
                elif data_name == 'rhymes':
                    question = example['input']
                    answer = example['other_rhymes']
                else:
                    question = example['input']
                    answer = example['output']
                tg_split.append({'question': question, 'answer': answer})

            elif task_type == 'classification':
                if data_name == 'cause_and_effect':
                    # Present cause and effect in random order; the answer is
                    # the position of the 'cause' sentence.
                    question_keys = ['cause', 'effect']
                    random.shuffle(question_keys)
                    question = f'Sentence 1: {example[question_keys[0]]} Sentence 2: {example[question_keys[1]]}'
                    option = [example[question_keys[0]], example[question_keys[1]]]
                    answer = question_keys.index('cause')
                elif data_name == 'larger_animal':
                    question = example['input']
                    option = [animal.strip() for animal in example['input'].split(',')]
                    answer = option.index(example['output'])
                elif data_name == 'sentence_similarity':
                    question = example['input']
                    option = ['definitely not', 'probably not', 'possibly', 'probably', 'almost perfectly', 'perfectly']
                    # Drops the first four characters of the raw output,
                    # presumably a leading "<score> - " prefix — TODO confirm.
                    answer = option.index(example['output'][4:])
                elif data_name == 'sentiment':
                    question = example['input']
                    option = ['positive', 'negative']
                    answer = option.index(example['output'])
                elif data_name == 'word_in_context':
                    question = example['input']
                    option = ['same', 'not the same']
                    answer = option.index(example['output'])
                else:
                    # Without this guard an unhandled dataset would silently
                    # reuse question/option/answer left over from an earlier
                    # example (or raise NameError on the very first one).
                    raise ValueError(f'Unsupported classification dataset: {data_name}')
                tg_split.append({'question': question, 'answer': answer, 'option': option})
        dataset[tg_name] = tg_split

    train_val_samples = sample(dataset['train'], info.num_train + info.num_val)
    test_samples = sample(dataset['test'], info.num_test)
    # Small pools: keep the first num_demo examples for training and use the
    # rest for validation. Otherwise the last num_val examples are validation.
    num_sep = info.num_demo if len(train_val_samples) <= info.num_demo + info.num_val else -info.num_val
    dataset = {'train': train_val_samples[:num_sep], 'val': train_val_samples[num_sep:], 'test': test_samples}
    # The context manager guarantees the JSON is flushed and closed.
    with open(os.path.join(info.DIR_INPUT, f'{data_name}.json'), 'w') as f:
        json.dump(dataset, f, indent=2)

In [None]:
# from InstructEval
#
# Download each InstructEval dataset through `datasets.load_dataset`, convert
# it to the unified question/answer(/option) layout and write a sampled
# {'train', 'val', 'test'} JSON file under `info.DIR_INPUT`.

data_names = ['ag_news', 'anli', 'boolq', 'cosmos_qa', 'hellaswag', 'imdb', 'nq_open', 'trivia_qa', 'tweet_emotion']
# Source split used as the train pool and as the test pool, per dataset.
data2file = {'ag_news': ['train', 'test'], 'anli': ['train_r1', 'test_r1'], 'boolq': ['train', 'validation'],
             'cosmos_qa': ['train', 'validation'], 'hellaswag': ['train', 'validation'], 'imdb': ['train', 'test'],
             'nq_open': ['train', 'validation'], 'trivia_qa': ['train', 'validation'], 'tweet_emotion': ['train', 'test']}

for data_name in data_names:
    task_type = info.dataset2task[data_name]

    if data_name == 'imdb':
        # NOTE(review): newer `datasets` releases replace `ignore_verifications`
        # with `verification_mode` — confirm against the installed version.
        data = datasets.load_dataset('imdb', ignore_verifications=True)
    elif data_name == 'trivia_qa':
        data = datasets.load_dataset('trivia_qa', 'rc.web.nocontext')
    elif data_name == 'tweet_emotion':
        data = datasets.load_dataset('tweet_eval', 'emotion')
    else:
        data = datasets.load_dataset(data_name)

    dataset = {}
    for sr_name, tg_name in zip(data2file[data_name], ['train', 'test']):
        tg_split = []

        for example in data[sr_name]:
            if task_type == 'generation':
                if data_name == 'nq_open':
                    question = example['question']
                    # Only the first listed answer is kept.
                    answer = example['answer'][0]
                elif data_name == 'trivia_qa':
                    question = example['question']
                    answer = example['answer']['normalized_aliases']
                else:
                    # Avoid silently reusing values from a previous example.
                    raise ValueError(f'Unsupported generation dataset: {data_name}')
                tg_split.append({'question': question, 'answer': answer})

            elif task_type == 'classification':
                # For datasets with a fixed option list, the list order is
                # assumed to follow the dataset's integer label ids — TODO confirm.
                if data_name == 'ag_news':
                    question = example['text']
                    option = ['World', 'Sports', 'Business', 'Sci/Tech']
                    answer = example['label']
                elif data_name == 'anli':
                    question = '\n'.join([f'{key}: {example[key]}' for key in ['premise', 'hypothesis']])
                    option = ['Entail', 'Neutral', 'Contradict']
                    answer = example['label']
                elif data_name == 'boolq':
                    question = '\n'.join([f'{key}: {example[key]}' for key in ['passage', 'question']])
                    option = ['True', 'False']
                    # The raw answer is stringified to look it up in `option`.
                    answer = option.index(str(example['answer']))
                elif data_name == 'cosmos_qa':
                    question = '\n'.join([f'{key}: {example[key]}' for key in ['context', 'question']])
                    option = [example[key] for key in ['answer0', 'answer1', 'answer2', 'answer3']]
                    answer = example['label']
                elif data_name == 'hellaswag':
                    question = example['ctx']
                    option = example['endings']
                    answer = int(example['label'])
                elif data_name == 'imdb':
                    question = example['text']
                    option = ['Negative', 'Positive']
                    answer = example['label']
                elif data_name == 'tweet_emotion':
                    question = example['text']
                    option = ['Anger', 'Joy', 'Optimism', 'Sadness']
                    answer = example['label']
                else:
                    # Avoid silently reusing values from a previous example.
                    raise ValueError(f'Unsupported classification dataset: {data_name}')
                tg_split.append({'question': question, 'answer': answer, 'option': option})
        dataset[tg_name] = tg_split

    train_val_samples = sample(dataset['train'], info.num_train + info.num_val)
    test_samples = sample(dataset['test'], info.num_test)
    # Small pools: keep the first num_demo examples for training and use the
    # rest for validation. Otherwise the last num_val examples are validation.
    num_sep = info.num_demo if len(train_val_samples) <= info.num_demo + info.num_val else -info.num_val
    dataset = {'train': train_val_samples[:num_sep], 'val': train_val_samples[num_sep:], 'test': test_samples}
    # The context manager guarantees the JSON is flushed and closed.
    with open(os.path.join(info.DIR_INPUT, f'{data_name}.json'), 'w') as f:
        json.dump(dataset, f, indent=2)

### Hard Instruction: Migrate, Clean, and Generate

In [None]:
class Args:
    """Plain container for the two settings passed to ``Info``.

    Attributes are set directly from the constructor arguments:
    ``tuning_model`` (defaults to the empty string) and ``soft_token``
    (defaults to 0).
    """

    def __init__(self, tuning_model: str = '', soft_token: int = 0) -> None:
        self.tuning_model, self.soft_token = tuning_model, soft_token
        
def get_model(model_name):
    """Build the model wrapper matching ``model_name``.

    Names containing ``'gpt'`` are wrapped in ``Model_OpenAI``. Any other
    name is loaded through ``transformers`` (slow tokenizer, float16 weights,
    automatic device placement) and wrapped in ``Model_HF``.
    """
    if 'gpt' not in model_name:
        hf_tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
        hf_model = AutoModelForCausalLM.from_pretrained(
            model_name,
            low_cpu_mem_usage=True,
            torch_dtype=torch.float16,
            device_map='auto',
        )
        return Model_HF(hf_tokenizer, hf_model)
    return Model_OpenAI(model_name)

# Rebuild the configuration with the default Args values (tuning_model='',
# soft_token=0). `Info` is not defined in this file; presumably it comes from
# one of the project star imports — verify.
args = Args()
info = Info(args)

In [None]:
# Hard instructions grouped by their source, then by dataset name.
instructions = {'gold': {}, 'gpt-35': {}, 'llama-2-70b': {}}

# (directory, target source key, parser applied to the open file handle)
INSTRUCTION_SOURCES = [
    # from InstructInduct: JSON files, instructions live under 'annotations'
    ('instruction-induction/data/annotations', 'gold', lambda handle: json.load(handle)['annotations']),
    # from InstructEval: manually written instructions (YAML)
    ('InstructEval/instructions/manual', 'gold', yaml.safe_load),
    # from InstructEval: ChatGPT-written instructions (YAML)
    ('InstructEval/instructions/chat_gpt_prompts', 'gpt-35', yaml.safe_load),
]

for DIR_INSTRUCT, source_name, parse in INSTRUCTION_SOURCES:
    for file_name in os.listdir(DIR_INSTRUCT):
        data_name = file_name.split('.')[0]
        # Skip files for datasets this benchmark does not use.
        if data_name not in info.dataset2task:
            continue
        # Close each instruction file right after it has been parsed.
        with open(os.path.join(DIR_INSTRUCT, file_name), 'r') as f:
            instructions[source_name][data_name] = parse(f)

In [None]:
# Ask each listed model to write `num_instruct` instructions per dataset from
# freshly shuffled demonstrations, then save all instructions to disk.
num_instruct = 5
for model_name in ['llama-2-70b']:#, 'gpt-35']:
    model = get_model(info.model2name[model_name])

    for data_name in tqdm(os.listdir(info.DIR_INPUT)):
        data_name = data_name.split('.')[0]
        dataset = Dataset(info, data_name)
        # A dict is used as an insertion-ordered set: duplicates are dropped
        # but the saved order is reproducible (a plain set is not).
        instructions_dataset = {}

        for _ in range(num_instruct):
            # Shuffles the training split in place, then takes the first
            # num_demo examples as demonstrations.
            random.shuffle(dataset.splits['train'])
            examples = dataset.splits['train'][:info.num_demo]
            instructions_dataset[model.generate_instruction_vanilla(examples, dataset.task_type)] = None
            # Pause between calls when using the OpenAI-backed model.
            if isinstance(model, Model_OpenAI):
                time.sleep(20)

        instructions[model_name][data_name] = list(instructions_dataset)

# The context manager guarantees the JSON is flushed and closed.
with open(info.FILE_INSTRUCTION, 'w') as f:
    json.dump(instructions, f, indent=2)

In [None]:
# Count how many hard instructions each dataset has across all sources and
# report the minimum, maximum and average.
dataset2nums = {}
for data_name in os.listdir(info.DIR_INPUT):
    data_name = data_name.split('.')[0]
    # A source may have no instructions for this dataset; count it as zero
    # instead of raising KeyError.
    dataset2nums[data_name] = sum(len(each.get(data_name, [])) for each in instructions.values())
# List of (dataset, count) pairs, fewest instructions first.
dataset2nums = sorted(dataset2nums.items(), key=lambda x: x[1])

if dataset2nums:
    # Divide by the real number of datasets rather than a hard-coded 30.
    average = sum(v for _, v in dataset2nums) / len(dataset2nums)
    print('Minimum:', dataset2nums[0])
    print('Maximum:', dataset2nums[-1])
    print('Average:', average)
else:
    average = 0.0
    print('No datasets found in', info.DIR_INPUT)

### Prediction: Run Each Model on Each Combination of (Dataset, Hard Instruction)

In [None]:
class Args:
    """Plain container for the two settings passed to ``Info``.

    Attributes are set directly from the constructor arguments:
    ``tuning_model`` (defaults to the empty string) and ``soft_token``
    (defaults to 0).
    """

    def __init__(self, tuning_model: str = '', soft_token: int = 0) -> None:
        self.tuning_model, self.soft_token = tuning_model, soft_token
        
def get_model(model_name):
    """Build the model wrapper matching ``model_name``.

    Names containing ``'gpt'`` are wrapped in ``Model_OpenAI``. Any other
    name is loaded through ``transformers`` (slow tokenizer, float16 weights,
    automatic device placement) and wrapped in ``Model_HF``.
    """
    if 'gpt' not in model_name:
        hf_tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
        hf_model = AutoModelForCausalLM.from_pretrained(
            model_name,
            low_cpu_mem_usage=True,
            torch_dtype=torch.float16,
            device_map='auto',
        )
        return Model_HF(hf_tokenizer, hf_model)
    return Model_OpenAI(model_name)

# Rebuild the configuration with the default Args values (tuning_model='',
# soft_token=0). `Info` is not defined in this file; presumably it comes from
# one of the project star imports — verify.
args = Args()
info = Info(args)

In [None]:
# Run one model on every (dataset, hard instruction) pair and store its
# predictions. Results are saved after each instruction, and instructions
# already present in the output file are skipped, so the cell can be resumed.
with open(info.FILE_INSTRUCTION, 'r') as f:
    instructions = json.load(f)

model_name = 'llama-30b'
model = get_model(info.model2name[model_name])

filename = os.path.join(info.DIR_OUTPUT, f'{model_name}.json')
if os.path.isfile(filename):
    with open(filename, 'r') as f:
        outputs = json.load(f)
else:
    outputs = {}

for data_name in os.listdir(info.DIR_INPUT):
    data_name = data_name.split('.')[0]
    dataset = Dataset(info, data_name)
    outputs.setdefault(data_name, {})
    print(f'Start Evaluating {model_name} on {data_name}')

    predict = model.generate_prediction if dataset.task_type == 'generation' else model.classify_prediction
    # Pool the instructions of every source. dict.fromkeys removes duplicates
    # while keeping a reproducible order, and .get tolerates sources that
    # have no instructions for this dataset.
    instructions_ = list(dict.fromkeys(
        instruct
        for source_instructions in instructions.values()
        for instruct in source_instructions.get(data_name, [])
    ))

    for idx, instruct in enumerate(instructions_):
        print(f'Instruction {idx}: {instruct}')
        # Already evaluated in a previous run.
        if instruct in outputs[data_name]:
            continue
        outputs[data_name][instruct] = []

        for example in dataset.splits['test']:
            outputs[data_name][instruct].append(predict(example, instruct))
            # Pause between calls when using the OpenAI-backed model.
            if isinstance(model, Model_OpenAI):
                time.sleep(20)
        # Checkpoint after every finished instruction.
        with open(filename, 'w') as f:
            json.dump(outputs, f, indent=2)
    print()

### Score: Evaluate

In [7]:
class Args:
    """Plain container for the two settings passed to ``Info``.

    Attributes are set directly from the constructor arguments:
    ``tuning_model`` (defaults to the empty string) and ``soft_token``
    (defaults to 0).
    """

    def __init__(self, tuning_model: str = '', soft_token: int = 0) -> None:
        self.tuning_model, self.soft_token = tuning_model, soft_token

# Rebuild the configuration with the default Args values (tuning_model='',
# soft_token=0). `Info` is not defined in this file; presumably it comes from
# one of the project star imports — verify.
args = Args()
info = Info(args)

In [69]:
# model_file = 'gpt-35_.json'
# outputs = json.load(open(os.path.join(info.DIR_OUTPUT, model_file), 'r'))

# outputs_ = {}
# for data_name in outputs:
#     task_type = info.dataset2task[data_name]
#     outputs_[data_name] = {}
#     for instruction, predictions in outputs[data_name].items():
#         outputs_[data_name][instruction] = []
#         for prediction in predictions:
#             if 'content filter' in prediction or 'content_filter' in prediction: prediction_ = ''
#             elif task_type == 'classification': 
#                 if_digit = [letter.isdigit() for letter in prediction]
#                 if True in if_digit: prediction_ = int(prediction[if_digit.index(True)])
#                 else: prediction_ = ''
#             else: prediction_ = prediction
#             outputs_[data_name][instruction].append(prediction_)
            
# model_name = 'gpt-35'
# filename = os.path.join(info.DIR_OUTPUT, f'{model_name}.json')
# json.dump(outputs_, open(filename, 'w'), indent=2)

In [9]:
# Score every saved prediction file against the test answers.
# Resulting layout: scores[dataset][model][instruction] = rounded score.
scores = defaultdict(lambda: defaultdict(dict))
for model_file in os.listdir(info.DIR_OUTPUT):
    # Ignore stray non-JSON entries (hidden files, folders) that would make
    # json.load fail.
    if not model_file.endswith('.json'):
        continue
    model_name = model_file.split('.')[0]
    with open(os.path.join(info.DIR_OUTPUT, model_file), 'r') as f:
        outputs = json.load(f)

    for data_name in outputs:
        print(f'Scoring {model_name} on {data_name}')
        dataset = Dataset(info, data_name)
        answers = [example['answer'] for example in dataset.splits['test']]

        for instruction, predictions in outputs[data_name].items():
            score = round(evaluate(answers, predictions), 3)
            scores[data_name][model_name][instruction] = score

# The context manager guarantees the JSON is flushed and closed.
with open(info.FILE_SCORE, 'w') as f:
    json.dump(scores, f, indent=2)

Scoring llama-2-70b on synonyms
Scoring llama-2-70b on negation
Scoring llama-2-70b on imdb
Scoring llama-2-70b on rhymes
Scoring llama-2-70b on larger_animal
Scoring llama-2-70b on second_word_letter
Scoring llama-2-70b on hellaswag
Scoring llama-2-70b on ag_news
Scoring llama-2-70b on diff
Scoring llama-2-70b on singular_to_plural
Scoring llama-2-70b on cause_and_effect
Scoring llama-2-70b on word_in_context
Scoring llama-2-70b on translation_en-de
Scoring llama-2-70b on active_to_passive
Scoring llama-2-70b on first_word_letter
Scoring llama-2-70b on tweet_emotion
Scoring llama-2-70b on informal_to_formal
Scoring llama-2-70b on trivia_qa
Scoring llama-2-70b on sentence_similarity
Scoring llama-2-70b on sum
Scoring llama-2-70b on translation_en-es
Scoring llama-2-70b on translation_en-fr
Scoring llama-2-70b on letters_list
Scoring llama-2-70b on cosmos_qa
Scoring llama-2-70b on anli
Scoring llama-2-70b on antonyms
Scoring llama-2-70b on num_to_verbal
Scoring llama-2-70b on sentiment


### Soft Instruction: Learn

In [None]:
class Args:
    """Plain container for the two settings passed to ``Info``.

    Attributes are set directly from the constructor arguments:
    ``tuning_model`` (defaults to the empty string) and ``soft_token``
    (defaults to 0).
    """

    def __init__(self, tuning_model: str = '', soft_token: int = 0) -> None:
        self.tuning_model, self.soft_token = tuning_model, soft_token

# Settings for soft-instruction learning: the key of the model to tune and the
# value stored as args.soft_token (used later as the number of soft-token
# vectors per instruction).
# Note: `tuning_model` holds a string here and is rebound to the loaded
# Model_HF wrapper in a later cell.
tuning_model = 'llama-30b'
soft_token = 5
args = Args(tuning_model, soft_token)
info = Info(args)

In [None]:
# Load every dataset once, keyed by file stem.
datasets_all = {
    name: Dataset(info, name)
    for name in (file_name.split('.')[0] for file_name in os.listdir(info.DIR_INPUT))
}

# Flatten {source: {dataset: [instruction, ...]}} into (dataset, instruction)
# pairs. Pairs are not de-duplicated, so an instruction shared by several
# sources appears more than once.
with open(info.FILE_INSTRUCTION, 'r') as f:
    instructions = json.load(f)
instructions = [(dataset, hard) for dataset_instructions_ in instructions.values()
                                for dataset, instructions_ in dataset_instructions_.items()
                                for hard in instructions_]

# Load the model to tune in float16 with automatic device placement.
tuning_model_name = info.model2name[args.tuning_model]
tokenizer_ = AutoTokenizer.from_pretrained(tuning_model_name, use_fast=False)
model_ = AutoModelForCausalLM.from_pretrained(tuning_model_name, low_cpu_mem_usage=True, torch_dtype=torch.float16, device_map='auto')
tuning_model = Model_HF(tokenizer_, model_)

# Freeze all model weights; only the soft tokens are optimized later.
for param in tuning_model.model.parameters():
    param.requires_grad = False

In [None]:
# Learn one soft instruction (args.soft_token embedding vectors) for every
# (dataset, hard instruction) pair and pickle the flattened results.
soft_lr = 0.005
soft_epoch = 100
soft_update_freq = 5  # batches accumulated per optimizer step

# Initialise every soft token from a scrambled Sobol sequence whose dimension
# equals the model's input-embedding width.
dim = tuning_model.model.get_input_embeddings().weight.shape[1]
softs = SobolEngine(dimension=dim, scramble=True).draw(len(instructions) * args.soft_token)
softs = softs.reshape(len(instructions), args.soft_token, dim)

all_softs = {}
# Report roughly 20 times overall. Clamp to 1 so that fewer than 20
# instructions do not cause a modulo-by-zero below.
report_each = max(1, len(instructions) // 20)
for idx, (dataset_, hard_) in enumerate(instructions):

    dataset = datasets_all[dataset_]
    task_type = dataset.task_type

    num_batch = math.ceil(len(dataset.splits['train']) / info.num_demo)
    update_freq = min(num_batch, soft_update_freq)

    soft_ = nn.Parameter(softs[idx].to(info.DEVICE_GPU))
    optimizer = AdamW([soft_], lr=soft_lr)
    optimizer.zero_grad()

    for idx_epoch in range(soft_epoch):
        random.shuffle(dataset.splits['train'])

        for idx_batch in range(num_batch):
            examples = dataset.splits['train'][idx_batch*info.num_demo : (idx_batch+1)*info.num_demo]
            loss = tuning_model.discover_instruction_prepend(examples, soft_, hard_, task_type)
            # Gradient accumulation: scale each batch loss by the group size.
            (loss / update_freq).backward()

            # Step after a full group, and also after the last batch so that
            # leftover gradients are not carried into the next epoch.
            if (idx_batch + 1) % update_freq == 0 or idx_batch + 1 == num_batch:
                optimizer.step()
                optimizer.zero_grad()

    # Keyed by the pair, so a repeated (dataset, instruction) pair overwrites
    # the earlier entry.
    all_softs[(dataset_, hard_)] = soft_.detach().cpu().numpy().flatten()
    if idx % report_each == 0:
        text = f'Finish Discovering the Soft Instruction of the {idx}/{len(instructions)} Hard Instruction'
        print(time.strftime("%Y %b %d %a, %H:%M:%S: ", time.localtime()) + text)

# The context manager guarantees the pickle is flushed and closed.
with open(info.FILE_SOFT, 'wb') as f:
    pk.dump(all_softs, f, -1)