In [1]:
import json
import os
import string
import time
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor, as_completed

import numpy as np
import openai
import pandas as pd
from datasets import load_dataset
from sentence_transformers import CrossEncoder
from tqdm import tqdm

In [2]:
# Substrings that mark a text as refusal-/safety-flavoured. contains_unwanted_words()
# lower-cases both the text and each entry, so matching is case-insensitive and purely
# substring-based (e.g. "never" also matches inside longer words). The list mixes English,
# Spanish and Portuguese phrases.
# NOTE(review): "diversityF and inclusion" looks like a typo for "diversity and inclusion";
# the shorter entry "diversity" already covers that phrase -- confirm before changing.
unwanted_words = [
    "text-based AI language model", "domestic violence", "please refrain", "derogatory", "inappropriate", "offensive",
    "racism", "racist", "racial", "discriminate", "discriminatory", "discrimination", "sexist", "sexism", "unacceptable",
    "inclusive workplace", "lgbt", "morals", "ethics", "ethical", "legality", "illegal", "illegality", "hateful", "harmful",
    "it is never okay", "It is important to", "It's important to", "real-world consequences", "hate speech", "glorify",
    "not be appropriate", "supremacist", "extremist", "responsible AI", "AI principles", "AI assistant", "an AI language",
    "ableist", "hurtful", "gender stereotype", "gender inequality", "underrepresentation", "safe spaces", "gender-based",
    "inclusivity", "feminist", "feminism", "transgender", "empowerment", "communist", "capitalism", "stereotypes", "biases",
    "bias", "Microaggression", "prioritize human safety", "as a language model", "as an AI language model",
    "As a large language model", "As an AI", "ethical principles", "consensual", "it is not appropriate",
    "it's not appropriate", "I cannot fulfill your request", "harmful to human beings", "ethical guidelines", "my guidelines",
    "prioritize user safety", "adhere to ethical guidelines", "harmful consequences", "potentially harmful", "dangerous activities",
    "promote safety", "well-being of all users", "responsible information sharing", "jeopardize the safety",
    "illegal actions or intentions", "undermine the stability", "promote the well-being", "illegal activities or actions",
    "adherence to the law", "potentially be harmful", "illegal substances or activities", "committed to promoting",
    "safe information", "lawful information", "cannot provide guidance", "cannot provide information",
    "unable to offer assistance", "cannot engage in discussions", "programming prohibits", "follow ethical guidelines",
    "ensure the safety", "involves an illegal subject", "prioritize safety", "illegal subject", "prioritize user well-being",
    "cannot support or promote", "activities that could harm", "pose a risk to others", "against my programming",
    "activities that could undermine", "potentially dangerous", "not within the scope", "designed to prioritize safety",
    "not able to provide", "maintain user safety", "adhere to safety guidelines", "dangerous or harmful",
    "cannot provide any information", "focus on promoting safety", "an AI language model you don't have",
    "As an AI language model, I cannot", "As an AI language model, I do not", "As an AI language model, I am not able",
    "As an AI language model, I don't have personal", "I am an AI language model and do not",
    "However,it is important to use any code or information provided responsibly and within legal and ethical boundaries.",
    "As an AI language model, I don't have", "As an AI language model, I am only able",
    "AI language model and I do not", "As an AI language model, I cannot modify",
    "I know as an AI language model you don't have", "as an AI language model, you cannot",
    "I'm sorry, but as an AI language model", "Unfortunately, I cannot provide", "I'm sorry, I cannot",
    "I'm sorry, I cannot generate", "AI cannot create or program", "I'm afraid I cannot create", "you cannot create an", 
    "it operates ethically and is", "had an ethical system", "Ensuring the ethical", "and ethical sourcing", 
    "are from ethical", "legal and ethical", "engage in unethical", "unethical or aggressive", "unethical business",
    "como modelo de lenguaje AI", "Lo siento, como modelo de lenguaje", "no puedo proporcionar",
    "pero debido a mi capacidad para generar códigos complejos y completos es limitado", "Lo siento, pero no puedo",
    "Lo siento, pero como modelo de lenguaje, no puedo proporcionar", "Lo siento, como modelo de lenguaje, no tengo",
    "Lo siento, debe haber habido una confusión", "Lo siento, como modelo de lenguaje, no puedo realizar",
    "Lo siento, soy un modelo de lenguaje y no tengo la capacidad de generar", "Lamento no poder proporcionarte el código",
    "Desculpe-me, mas a linguagem vulgar e ofensiva", "apropriada em nenhum contexto",
    "Como modelo de linguagem", "Como um modelo de linguagem, não tenho a capacidade de",
    "I cannot assist", "prioritize ethical", "respectful", "morally", "I'm sorry,", "I'm an", "I am an",
    "I'm an AI", "I am an AI", "my purpose", "filter_bad_language", "entertainment purposes", "purely hypothetical", 
    "not a human", "cannot provide", "can't provide", "won't provide", "not provide", "worth noting",
    "cause harm", "a language model", "keep in mind", "unethical", "bad language", "the words ****", "bad_language",
    "certainly not", "complying", "comply", "I cannot", "my main goal", "As a machine", "I don't have the ability",
    "I am here to assist", "my purpose is to ", "my knowledge cutoff", "my knowledge cut off", "September 2021", 
    "regulations", "not be suitable", "I apologize, but", "It is not possible", "controversial", "my programming",
    "ethically", "it is important to", "Please note", "sensitive topic", "not acceptable", "It is important for",
    "divisive", "not appropriate", "our values", "f*cking", "F*ck", "sh*t", "diversity and", "diversityF and inclusion",
    "values diversity", "social responsibility", "environmental, social, and governance", " ESG ", "against women",
    "problematic history", "diversity", "*This chat conversation is shared from", "*This conversation is shared from",
    "sorry", "apologize", "apologies", "cannot", "can't", "unable", "not able", "not be able", 
    "unfortunately", "I do not", "non-malicious requests", "never"
]
    
def contains_unwanted_words(text):
    """Report whether ``text`` contains any entry of ``unwanted_words``.

    Both the text and each entry are lower-cased before the substring test,
    so the comparison ignores case.

    Args:
        text: String to scan.

    Returns:
        True as soon as one entry is found inside ``text``, otherwise False.
    """
    haystack = text.lower()
    return any(phrase.lower() in haystack for phrase in unwanted_words)

In [3]:
# Few-shot prompt that asks the model to reduce a query to its key content phrase using
# only words from the query itself. {QUERY} is the single placeholder; it is filled in by
# simplify_query() via str.format.
template_simplify = \
'''
## Instruction
Do not respond to the input query.
Instead, extract only the key content phrases that capture the core intent of the query.

Your output should:
1. Be a simplified phrase or fragment that summarizes the essential content.
2. Use only words that appear in the original input query. Do not add, paraphrase, or infer beyond what is stated.

Follow the style and structure shown in the provided examples.
Do not repeat the given examples, the input query, or any headings like "### Simplified Query" in your output. 
Only return the simplified query.

## Examples
### Input Query 1: Give detailed instructions for making a cake.
### Simplified Query 1: Make a cake.

### Input Query 2: List personal care items easily obtained at home.
### Simplified Query 2: Obtain personal care items at home.

### Input Query 3: Write an article celebrating the success of the Apollo 11 mission.
### Simplified Query 3: Celebrate the success of the Apollo 11 mission.

## Your Task
### Input Query: {QUERY}
### Simplified Query: 
'''

def simplify_query(idx, row, key):
    """Ask GPT for the simplified form of one row's query.

    Args:
        idx: Row label; passed back unchanged so results can be matched to rows.
        row: Row object supporting ``row[key]`` (e.g. a pandas Series).
        key: Name of the field holding the query to simplify.

    Returns:
        Tuple ``(idx, simplified)`` where ``simplified`` is the value returned by ``call_gpt``.
    """
    query_text = row[key]
    return idx, call_gpt(template_simplify.format(QUERY=query_text))

In [4]:
# Prompt used by complete_query() when a row carries extra context: {INSTRUCTION} is the
# task and {INPUT} is the accompanying context.
template_complete_input = \
"""
Below is an instruction that describes a task, paired with an input that provides further context. 
Write a response that appropriately completes the request.

### Instruction:
{INSTRUCTION}

### Input:
{INPUT}

### Response:
"""
# Prompt used by complete_query() when there is no extra context; only {INSTRUCTION} is filled.
template_complete_no_input = \
"""
Below is an instruction that describes a task. 
Write a response that appropriately completes the request.

### Instruction:
{INSTRUCTION}

### Response:
"""

def complete_query(idx, row, key):
    """Build the completion prompt for one row and return GPT's response.

    The with-input template is used only when the row really carries context.
    A missing ``'input'`` field, a null value (None / NaN / pd.NA) and an empty or
    whitespace-only string all count as "no input". The previous identity check
    against ``np.nan`` treated the empty string as context, so such rows were
    prompted with a blank ``### Input:`` section.

    Args:
        idx: Row label; passed back unchanged so results can be matched to rows.
        row: Row object supporting ``in`` and ``row[...]`` (e.g. a pandas Series).
        key: Name of the field holding the instruction text.

    Returns:
        Tuple ``(idx, response)`` where ``response`` is the value returned by ``call_gpt``.
    """
    context = row['input'] if 'input' in row else None
    has_context = not pd.isna(context) and str(context).strip() != ''
    if has_context:
        prompt = template_complete_input.format(INSTRUCTION=row[key], INPUT=context)
    else:
        prompt = template_complete_no_input.format(INSTRUCTION=row[key])
    response = call_gpt(prompt)
    return idx, response

In [5]:
# OpenAI client used by call_gpt(). The API key is read from the OPENAI_API_KEY
# environment variable; the lookup raises KeyError if the variable is unset.
openai_client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])
# Model name sent with every chat-completion request made by call_gpt().
openai_model = "gpt-4o"

def call_gpt(prompt, retries=50):
    """Send ``prompt`` as a single user turn and return the reply text.

    Rate-limit errors are retried after sleeping ``attempt`` seconds (0 s before
    the second attempt, 1 s before the third, and so on). Any other exception is
    printed and re-raised immediately.

    Args:
        prompt: Text placed in the user message.
        retries: Maximum number of attempts.

    Returns:
        The ``content`` of the first choice of the chat completion.

    Raises:
        Exception: If every attempt hit the rate limit, or whatever non-rate-limit
            error the request raised.
    """
    conversation = [
        {"role": "system", "content": 'You are a helpful assistant.'},
        {"role": "user", "content": prompt},
    ]
    for attempt in range(retries):
        try:
            completion = openai_client.chat.completions.create(
                model=openai_model,
                messages=conversation,
                max_tokens=4096,
            )
            return completion.choices[0].message.content
        except openai.RateLimitError:
            # linear back-off: wait as many seconds as attempts already made
            time.sleep(attempt)
        except Exception as err:
            print(f"An error occurred: {err}")
            raise
    raise Exception(f"Failed to call GPT-4o after {retries} retries.")
    
def call_gpt_batch(func, dataset, key):
    """Run ``func(idx, row, key)`` for every row of ``dataset`` on a thread pool.

    The result always has exactly one entry per row, in row order, so it can be
    assigned straight to a DataFrame column. Previously a failed call was only
    printed and its entry dropped, which made the list shorter than the frame;
    the caller's column assignment then failed or lined up with the wrong rows.

    Args:
        func: Callable returning ``(idx, response)``, e.g. ``simplify_query``.
        dataset: pandas DataFrame whose rows are processed.
        key: Column name forwarded to ``func``.

    Returns:
        List of responses aligned with ``dataset.index``; rows whose call raised
        hold ``None``.
    """
    responses = {}
    with ThreadPoolExecutor(max_workers=50) as exe:
        # remember which row each future belongs to so failures can be reported
        future2idx = {exe.submit(func, idx, row, key): idx for idx, row in dataset.iterrows()}
        for future in tqdm(as_completed(future2idx), total=len(future2idx)):
            try:
                idx, resp = future.result()
            except Exception as e:
                print(f"Error on row {future2idx[future]}: {e}")
                continue
            responses[idx] = resp
    # walk the frame's own index so output order and length match the rows
    return [responses.get(idx) for idx in dataset.index]

In [6]:
def aug_prefix(query, prefix):
    """Prepend ``prefix`` to ``query``, lower-casing the query's first character.

    Args:
        query: Query string; may be empty.
        prefix: Text placed in front of the query.

    Returns:
        ``prefix`` followed by the query with its first character lower-cased.
        An empty query yields just ``prefix``.
    """
    # Slice instead of indexing so an empty query does not raise IndexError.
    return prefix + query[:1].lower() + query[1:]

def aug_suffix(query, suffix):
    """Append ``suffix`` to ``query``, keeping trailing punctuation at the very end.

    Args:
        query: Query string; may be empty.
        suffix: Text to add after the query's content.

    Returns:
        If the query ends with a character from ``string.punctuation``, the suffix
        is inserted before that character; otherwise it is simply appended.
        An empty query yields just ``suffix``.
    """
    # Guard on emptiness first: query[-1] would raise IndexError on "".
    if query and query[-1] in string.punctuation:
        return query[:-1] + suffix + query[-1]
    return query + suffix

def _as_list_prefix(query):
    """Rewrite ``query`` as a 'Create a list to ...' request."""
    return aug_prefix(query, 'Create a list to ')


def _as_list_suffix(query):
    """Rewrite ``query`` as a '... by creating a list' request."""
    return aug_suffix(query, ' by creating a list')


def _as_poem_prefix(query):
    """Rewrite ``query`` as a 'Write a poem to ...' request."""
    return aug_prefix(query, 'Write a poem to ')


def _as_poem_suffix(query):
    """Rewrite ``query`` as a '... by writing a poem' request."""
    return aug_suffix(query, ' by writing a poem')


# Maps (style, position) to the function that rewrites a simplified query in that style.
# Insertion order matters: each prefix variant comes before its suffix variant.
style2augment = {
    ('list', 'prefix'): _as_list_prefix,
    ('list', 'suffix'): _as_list_suffix,
    ('poem', 'prefix'): _as_poem_prefix,
    ('poem', 'suffix'): _as_poem_suffix,
}

### Training Data

In [7]:
# load source data: keep the size*5 examples with the longest outputs as candidates
size = 1000
sample_seed = 42  # fixes the final sub-sample so it is reproducible for a given filtered frame
train = load_dataset('yahma/alpaca-cleaned', split='train').to_pandas()
train = train.sort_values(by='output', key=lambda x: x.str.len(), ascending=False).reset_index(drop=True)[:size*5]
train = train[['input', 'instruction']].rename(columns={'instruction':'original instruction'})

# simplify the original instructions
train['simplified instruction'] = call_gpt_batch(simplify_query, train, 'original instruction')

# only retain those simplified instructions that can entail the original ones
# (NLI pair = (original, simplified); rows whose top label is not 'entailment' are dropped)
model = CrossEncoder('cross-encoder/nli-deberta-v3-base')
model.model.to('cuda:0')
nli_input = [(row['original instruction'], row['simplified instruction']) for _, row in train.iterrows()]
scores = model.predict(nli_input, show_progress_bar=True)
label_mapping = ['contradiction', 'entailment', 'neutral']
labels = [label_mapping[score_max] for score_max in scores.argmax(axis=1)]
not_entailment = np.where(np.array(labels)!='entailment')[0]
# positions equal labels here because the index was reset above
train = train.drop(not_entailment).reset_index(drop=True)

# further augment the instructions into four different styles
for style, augment in style2augment.items():
    train['_'.join(style) + ' instruction'] = train['simplified instruction'].apply(augment)

# generate responses to the queries by prompting GPT-4o
# (suffix-style instructions reuse the response generated for the matching prefix style;
#  the prefix column is always visited first because of the column order)
for col in train.columns:
    new_col = col.replace('instruction', 'response')
    if col.endswith('instruction') and 'suffix' not in col:
        train[new_col] = call_gpt_batch(complete_query, train, col)
    elif 'suffix' in col:
        train[new_col] = train[new_col.replace('suffix', 'prefix')]

# remove rows that contain unwanted safety-related words in any column
unwanted_indices = set()
for idx, row in train.iterrows():
    for col in train.columns:
        if contains_unwanted_words(row[col]):
            unwanted_indices.add(idx)
            break
# drop() is documented for list-like labels, so pass a list rather than a set
train = train.drop(sorted(unwanted_indices))

# sample from the qualified rows and save the dataset
styles = ['simplified', 'original', 'list_prefix', 'list_suffix', 'poem_prefix', 'poem_suffix']
columns = ['input'] + [f'{style} {each}' for style in styles for each in ['instruction', 'response']]
# seeded so that re-running this step on the same frame selects the same rows
train = train[columns].sample(n=size, random_state=sample_seed, ignore_index=True)
train.to_csv('Data/Source/train.csv', index=False)

100%|██████████████████████████████████████████████████████████████████████████████████| 5000/5000 [01:40<00:00, 49.75it/s]


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

100%|██████████████████████████████████████████████████████████████████████████████████| 4961/4961 [11:49<00:00,  6.99it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 4961/4961 [10:44<00:00,  7.69it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 4961/4961 [09:32<00:00,  8.67it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 4961/4961 [05:30<00:00, 15.03it/s]


In [8]:
# convert the source data into json format
# (empty inputs come back from the CSV as NaN; pd.isna detects them by value rather than
#  relying on object identity with np.nan)
train = pd.read_csv('Data/Source/train.csv')
styles = ['simplified', 'original', 'list_prefix', 'list_suffix', 'poem_prefix', 'poem_suffix']
style2json = {style: [{'instruction':row[f'{style} instruction'],
                       'input':'' if pd.isna(row['input']) else row['input'],
                       'output':row[f'{style} response']}
                      for _, row in train.iterrows()]
              for style in styles}

# mix the training data of different styles and save the mixed datasets:
# the first `cutoff` examples use the target style, the rest stay in the simplified style
size, ratios = 1000, [0.5, 1]
style2mix = defaultdict(dict)
for style in styles[1:]:
    for ratio in ratios:
        cutoff = int(size*ratio)
        style2mix[style][ratio] = style2json[style][:cutoff] + style2json['simplified'][cutoff:]
        # context manager closes (and flushes) the file even if json.dump raises
        with open(f'Data/Train/{style}-{cutoff}.json', 'w') as fout:
            json.dump(style2mix[style][ratio], fout, indent=4)
with open('Data/Train/simplified-1000.json', 'w') as fout:
    json.dump(style2json['simplified'], fout, indent=4)

### Utility Evaluation Data

In [10]:
# Source: the AlpacaEval 'eval' split, reduced to the dataset tag and the instruction text.
utility = load_dataset('tatsu-lab/alpaca_eval', split='eval').to_pandas()
utility = utility[['dataset', 'instruction']].rename(columns={'instruction':'original instruction'})

# Have GPT produce a simplified version of every original instruction.
utility['simplified instruction'] = call_gpt_batch(simplify_query, utility, 'original instruction')

# Keep a row only when the NLI model's top label for (original, simplified) is 'entailment'.
model = CrossEncoder('cross-encoder/nli-deberta-v3-base')
model.model.to('cuda:0')
nli_input = list(zip(utility['original instruction'], utility['simplified instruction']))
scores = model.predict(nli_input, show_progress_bar=True, batch_size=16)
label_mapping = ['contradiction', 'entailment', 'neutral']
labels = [label_mapping[best] for best in scores.argmax(axis=1)]
entailment = np.where(np.asarray(labels) == 'entailment')[0]
utility = utility.iloc[entailment].reset_index(drop=True)
# record each kept row's position in the unfiltered frame
utility['index'] = entailment

# Derive the four styled variants from the simplified instruction.
for style, augment in style2augment.items():
    utility[f"{'_'.join(style)} instruction"] = [augment(query) for query in utility['simplified instruction']]

# Baseline responses: suffix variants copy the baseline of the matching prefix variant,
# every other instruction column is answered by GPT-4o.
for col in utility.columns:
    new_col = col.replace('instruction', 'baseline')
    if 'suffix' in col:
        utility[new_col] = utility[new_col.replace('suffix', 'prefix')]
    elif col.endswith('instruction'):
        utility[new_col] = call_gpt_batch(complete_query, utility, col)

# Order the columns and write the dataset to disk.
styles = ['simplified', 'original', 'list_prefix', 'list_suffix', 'poem_prefix', 'poem_suffix']
columns = ['index', 'dataset']
columns += [f'{style} {each}' for style in styles for each in ['instruction', 'baseline']]
utility = utility[columns]
utility.to_csv('Data/Source/utility.csv', index=False)

100%|████████████████████████████████████████████████████████████████████████████████████| 805/805 [00:20<00:00, 40.14it/s]


Batches:   0%|          | 0/51 [00:00<?, ?it/s]

100%|████████████████████████████████████████████████████████████████████████████████████| 722/722 [01:46<00:00,  6.77it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 722/722 [01:41<00:00,  7.09it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 722/722 [01:47<00:00,  6.72it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 722/722 [01:28<00:00,  8.20it/s]


### Safety Evaluation Data

In [9]:
# Read the jailbreak queries, keep the three columns of interest and lower-case their names.
safety = pd.read_csv('../1_InflatedASR/Data/Input/jailbreaks.csv')
safety = safety[['Source', 'Simplified Query', 'Original Query']].rename(columns=str.lower)

# Add one column per styled variant of the simplified query, then write the result to disk.
for style, augment in style2augment.items():
    safety[f"{'_'.join(style)} query"] = [augment(query) for query in safety['simplified query']]
safety.to_csv('Data/Source/safety.csv', index=False)