In [1]:
import os
import json
import re
import torch
from datasets import Dataset
from utils.data import load_data, get_prompt
from trl import SFTTrainer

In [2]:
# Prompt settings; both are forwarded to get_prompt when the datasets are built.
input_content = 'all'
exemplar = 'zero-shot'

# Root output directory and the folder names kept under it
# (only checkpoint_foldername is joined with it in the cells visible here).
output_dir = 'output'
checkpoint_foldername = 'checkpoints'
adapter_foldername = 'saved_adapters'
loss_foldername = 'loss'
result_foldername = 'results'
response_foldername = 'responses'

In [3]:
# Paths handed to load_data: the dataset file and the split definition.
data_filepath = 'data/data.json'
split_filepath = 'data/data_split'
# load_data returns the train and test portions as a pair.
train_data, test_data = load_data(data_filepath, split_filepath)

In [4]:
# Render one prompt per instance (train first, then test), then wrap them in Dataset objects.
train_prompts = [
    get_prompt(instance, input_content, data_type='train', exemplar=exemplar)
    for instance in train_data
]
test_prompts = [
    get_prompt(instance, input_content, data_type='test', exemplar=exemplar)
    for instance in test_data
]
test_labels = [instance['label'] for instance in test_data]

train_samples = Dataset.from_dict({'text': train_prompts})
# The test set additionally keeps each instance's label next to its prompt.
test_samples = Dataset.from_dict({'text': test_prompts, 'label': test_labels})

In [5]:
# Preview a single training prompt; echoing the full column would dump every
# prompt of the training split into the cell output.
train_samples['text'][:1]

["### Prompt: Read the tweets chronologically published and determine if the author of the tweet is located at Miami when the tweet was published. The '#' in the hashtags and '@' in the mentions are removed. If the tweets are associated with advertisements or news reports, then the author of the tweet is more likely at Miami. Please select the number listed below.\n\nTWEET 1:\nThe water boil advisory that went into effect following the Memorial Day tornado outbreak has been lifted for the remaining customers in Dayton, Montgomery and Greene County, with the exception of Brookville.\n\nTWEET 2:\nSee how neighborhoods are helping surrounding communities tonight on 2 NEWS First at 4\n\nTWEET 3:\nLearn about organizations sending hundreds of volunteers to help clean up Trotwood and Dayton this weekend and how you can help, tonight on 2 NEWS First at 4.\n\nTWEET 4:\nVolunteers are needed across the Miami Valley following the Memorial Day tornado outbreak.\n\nTWEET 5:\nVolunteers are needed 

In [6]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model_name = 'google/flan-ul2'
# One cache location shared by the model weights and the tokenizer files.
cache_dir = '/mnt/DATA/hf_cache/'

# Load the seq2seq checkpoint with 8-bit weights and automatic device placement.
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    load_in_8bit=True,
    device_map='auto',
    cache_dir=cache_dir,
)
# Pass the same cache_dir here so the tokenizer files land next to the weights
# instead of in the default Hugging Face cache.
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
import torch.nn as nn

# Freeze every existing weight; the adapters added later are what gets trained.
for weight in model.parameters():
    weight.requires_grad = False
    # 1-D parameters (e.g. layernorm) are stored in fp32 for stability.
    if weight.ndim == 1:
        weight.data = weight.data.float()

# Reduce the number of stored activations.
model.gradient_checkpointing_enable()
model.enable_input_require_grads()

class CastOutputToFloat(nn.Sequential):
    """Sequential container whose forward result is converted to float32."""

    def forward(self, x):
        result = super().forward(x)
        return result.to(torch.float32)
    
# Wrap the existing LM head so its output is cast to float32 (see CastOutputToFloat.forward).
model.lm_head = CastOutputToFloat(model.lm_head)

In [8]:
from pathlib import Path
from utils.learning import get_train_args, get_peft_config, get_model_and_tokenizer
from transformers import TrainingArguments
from peft import get_peft_model, LoraConfig

In [11]:
# Folder that receives the checkpoints of this (input_content, exemplar) run.
checkpoint_folder = os.path.join(output_dir, checkpoint_foldername)
checkpoint_subfolder = os.path.join(checkpoint_folder, f"flan_ul2+_{input_content}_{exemplar}_checkpoints")
Path(checkpoint_subfolder).mkdir(parents=True, exist_ok=True)

# Use PEFT (LoRA) to fine-tune only a small set of adapter parameters.
# The base model is loaded with AutoModelForSeq2SeqLM, so the adapter has to be
# declared as a seq2seq task; "CAUSAL_LM" selects the decoder-only wrapper.
peft_config = LoraConfig(lora_alpha=4, lora_dropout=0.1, r=2, task_type="SEQ_2_SEQ_LM")

# Training arguments.
# - The result of get_train_args(checkpoint_subfolder) used to be overwritten by
#   this assignment before it was ever read, so that call is dropped.
# - Checkpoints are written to the sub-folder created above rather than to the
#   output root.
# - save_steps is omitted: it only applies when save_strategy="steps".
training_args = TrainingArguments(
    output_dir=checkpoint_subfolder,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2,
    bf16=True,
    learning_rate=1e-5,
    logging_steps=5,
    save_strategy="epoch",
    save_total_limit=5,
    num_train_epochs=5,
)

# Attach the LoRA adapters to the frozen base model. The model is wrapped here,
# so peft_config is not passed to the trainer a second time.
model = get_peft_model(model, peft_config)
model.config.use_cache = False
# NOTE(review): SFTTrainer is built around causal-LM style training on a single
# text field — confirm it produces the intended inputs/labels for an
# encoder-decoder model.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_samples,
    dataset_text_field="text",
    max_seq_length=2048,
)

# Start fine-tuning!
trainer.train()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/2445 [00:00<?, ? examples/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss


KeyboardInterrupt: 

In [13]:
import os
import json
from sklearn.metrics import classification_report

# Print accuracy and weighted F1 for every stored result file, skipping the
# files whose name contains 'gpt'.
response_folder = 'output/results/'
report_filenames = sorted(
    name for name in os.listdir(response_folder) if 'gpt' not in name
)
for response_filename in report_filenames:
    response_filepath = os.path.join(response_folder, response_filename)
    with open(response_filepath, 'r') as file:
        response = json.load(file)
    accuracy = response['accuracy']
    weighted_f1 = response['weighted avg']['f1-score']
    print(f"{response_filename} | Accuracy: {accuracy:.2f} | F1: {weighted_f1:.2f}")

flan_alpaca_all_five-shot_result | Accuracy: 0.42 | F1: 0.45
flan_alpaca_all_one-shot_result | Accuracy: 0.38 | F1: 0.46
flan_alpaca_all_ten-shot_result | Accuracy: 0.42 | F1: 0.45
flan_alpaca_all_zero-shot_result | Accuracy: 0.34 | F1: 0.50
flan_alpaca_early_target_five-shot_result | Accuracy: 0.43 | F1: 0.46
flan_alpaca_early_target_one-shot_result | Accuracy: 0.37 | F1: 0.46
flan_alpaca_early_target_ten-shot_result | Accuracy: 0.43 | F1: 0.46
flan_alpaca_early_target_zero-shot_result | Accuracy: 0.33 | F1: 0.50
flan_alpaca_target_five-shot_result | Accuracy: 0.46 | F1: 0.46
flan_alpaca_target_later_five-shot_result | Accuracy: 0.43 | F1: 0.45
flan_alpaca_target_later_one-shot_result | Accuracy: 0.38 | F1: 0.45
flan_alpaca_target_later_ten-shot_result | Accuracy: 0.44 | F1: 0.46
flan_alpaca_target_later_zero-shot_result | Accuracy: 0.34 | F1: 0.50
flan_alpaca_target_one-shot_result | Accuracy: 0.43 | F1: 0.45
flan_alpaca_target_ten-shot_result | Accuracy: 0.48 | F1: 0.47
flan_alpaca_

In [12]:
# Print a shell script that runs llm.py for every (model, input, shot) combination.
models = ['flan_t5', 'flan_ul2', 'flan_alpaca']
inputs = ['target', 'early_target', 'target_later', 'all']
shots = ['zero-shot', 'one-shot', 'five-shot', 'ten-shot']

# The loop variable is deliberately NOT called `model`: that name holds the
# loaded network elsewhere in this notebook and must not be replaced by a string.
for experiment in models:
    for input_ in inputs:
        print(f'# {experiment} + {input_}')
        print(f"echo \"Doing experiments with {experiment} + {input_}\"")
        for index, shot in enumerate(shots):
            # One job per GPU: the shot's position doubles as the device index.
            command = (
                f'CUDA_VISIBLE_DEVICES={index} python llm.py -data_dir data/ '
                f'-experiment {experiment} -input_content {input_} '
                f'-output_dir output -cache_dir /mnt/DATA/hf_cache/ -exemplar {shot}'
            )
            # Every job but the last is sent to the background, so the shell
            # waits on the final job before starting the next group.
            if index < len(shots) - 1:
                command += ' &'
            print(command)
        print()
    print()

# flan_t5 + target
echo "Doing experiments with flan_t5 + target"
CUDA_VISIBLE_DEVICES=0 python llm.py -data_dir data/ -experiment flan_t5 -input_content target -output_dir output -cache_dir /mnt/DATA/hf_cache/ -exemplar zero-shot &
CUDA_VISIBLE_DEVICES=1 python llm.py -data_dir data/ -experiment flan_t5 -input_content target -output_dir output -cache_dir /mnt/DATA/hf_cache/ -exemplar one-shot &
CUDA_VISIBLE_DEVICES=2 python llm.py -data_dir data/ -experiment flan_t5 -input_content target -output_dir output -cache_dir /mnt/DATA/hf_cache/ -exemplar five-shot &
CUDA_VISIBLE_DEVICES=3 python llm.py -data_dir data/ -experiment flan_t5 -input_content target -output_dir output -cache_dir /mnt/DATA/hf_cache/ -exemplar ten-shot

# flan_t5 + early_target
echo "Doing experiments with flan_t5 + early_target"
CUDA_VISIBLE_DEVICES=0 python llm.py -data_dir data/ -experiment flan_t5 -input_content early_target -output_dir output -cache_dir /mnt/DATA/hf_cache/ -exemplar zero-shot &
CUDA_VISIBLE_DEVIC

## ChatGPT experiments

In [14]:
import os
from utils.data import load_data, get_prompt
from datasets import Dataset

# Settings for the ChatGPT experiments.
# NOTE: input_content and exemplar reuse the names set in the first configuration
# cell; input_content changes from 'all' to 'target' here.
data_dir = 'data/'
input_content = 'target'
exemplar = 'zero-shot'

In [15]:
# Build the dataset and split paths under data_dir and reload both portions.
data_filepath = os.path.join(data_dir, 'data.json')
split_filepath = os.path.join(data_dir, 'data_split')
train_data, test_data = load_data(data_filepath, split_filepath)

In [16]:
# Render one prompt per instance (train first, then test) with the current
# input_content / exemplar settings, then wrap them in Dataset objects.
train_prompts = [
    get_prompt(instance, input_content, data_type='train', exemplar=exemplar)
    for instance in train_data
]
test_prompts = [
    get_prompt(instance, input_content, data_type='test', exemplar=exemplar)
    for instance in test_data
]
test_labels = [instance['label'] for instance in test_data]

train_samples = Dataset.from_dict({'text': train_prompts})
# The test set additionally keeps each instance's label next to its prompt.
test_samples = Dataset.from_dict({'text': test_prompts, 'label': test_labels})

In [17]:
# Show the first test prompt as a sanity check.
test_samples['text'][0]

"### Prompt: Read the tweets chronologically published and determine if the author of the tweet is located at Atlanta when the tweet was published. The '#' in the hashtags and '@' in the mentions are removed. If the tweets are associated with advertisements or news reports, then the author of the tweet is more likely at Atlanta. Please select the number listed below.\n\nJust a sampling of our Thanksgiving Eve feast. Special shout-out to FOX5ATLCallaway and Elepo for organizing it!!!!!! FOX5Atlanta\n\nOPTIONS:\n1. Yes.\n2. I cannot determine if the author of the tweet is located at Atlanta when the tweet was published.\nANSWER: "

In [18]:
import os
import openai
import time
import json
from tqdm.notebook import tqdm

# Read the API key from the environment; credentials must never be stored in
# the notebook. Raises KeyError if OPENAI_API_KEY is not set.
# NOTE: the key that used to be hard-coded on this line has to be treated as
# leaked and should be revoked.
openai.api_key = os.environ["OPENAI_API_KEY"]
# Raw ChatGPT responses are appended to one JSON-lines file per setting.
output_dir = 'output/results/'
output_filename = f"chatgpt_{input_content}_{exemplar}_outputs.json"
output_filepath = os.path.join(output_dir, output_filename)

In [24]:
# Resume support: collect the ids of prompts answered by a previous run.
processed_ids = set()
if os.path.isfile(output_filepath):
    with open(output_filepath, 'r') as file:
        for line in file:
            line = line.strip()
            if line:  # tolerate blank lines instead of crashing in json.loads
                processed_ids.add(json.loads(line)['id'])

# Opening the file for appending fails if its folder does not exist yet.
os.makedirs(os.path.dirname(output_filepath) or '.', exist_ok=True)

# Both the file and the progress bar are context-managed, so they are closed
# even when an API call raises or the cell is interrupted.
with open(output_filepath, 'a') as file, tqdm(total=len(test_samples['text'])) as pbar:
    for index, prompt in enumerate(test_samples['text']):
        if str(index) not in processed_ids:
            response = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=[{"role": "user", "content": prompt}],
            )

            # Store the response under the prompt's position so a later run can skip it.
            response = response.to_dict()
            response['id'] = str(index)
            file.write(json.dumps(response) + '\n')
            # Flush each response immediately so it is on disk before the next API call.
            file.flush()

        pbar.update(1)

  0%|          | 0/1049 [00:00<?, ?it/s]

In [26]:
import json

# Load the stored responses of one run; they are paired position-by-position
# with test_data in the error-analysis loop.
with open('output/responses/flan_ul2_all_ten-shot_response', 'r') as file:
    answers = json.load(file)

# original_tweets = []
# for x in test_data:
#     concat_text = ''
#     for index, text in enumerate(x['texts']):
#         concat_text += f'Tweet {index + 1}:\n{text}\n'
#     location = x['location']
#     label = x['label']
#     concat_text += f'Location: {location}\n'
#     concat_text += f'Label ---> {label}'
#     original_tweets.append(concat_text)

## Error analysis

In [27]:
# Inspect the first raw test instance (its 'texts', 'label' and 'location' fields).
test_data[0]

{'texts': ['aungeliquefox5 Yes! And so generous !',
  'AlexaLiackoFOX5 YOU NAME IT!!!!',
  "YES! I am SO happy for Celine. She has one of the greatest voices in music. And she's been through so much. No one deserves this more!",
  'Just a sampling of our Thanksgiving Eve feast. Special shout-out to FOX5ATLCallaway and Elepo for organizing it!!!!!! FOX5Atlanta',
  'Mood when I saw FOX5ATLCallaway slicing our FOX5Atlanta Thanksgiving Eve Turkey!!! YAAASSSSSS!!!',
  'CityStonecrest FOX5ATLCallaway Elepo FOX5Atlanta GONE!!! LOL',
  'I have SO much to be thankful for! Take time to spend this holiday with the people you love. FOX5Atlanta'],
 'label': 'Yes',
 'location': 'Atlanta'}

In [28]:
# Print at most this many misclassified instances.
max_errors_shown = 100

count = 0
for x, answer in zip(test_data, answers):
    if answer == x['label']:
        continue
    # Keep counting past the print limit so `count` ends as the total number of
    # mismatches rather than limit + 1.
    count += 1
    if count > max_errors_shown:
        continue

    # Report: the numbered tweets, then the location, gold label and model answer.
    tweet_lines = ''.join(
        f'Tweet {index + 1}:\n{text}\n' for index, text in enumerate(x['texts'])
    )
    location = x['location']
    label = x['label']
    concat_text = tweet_lines + f'Location: {location}\n' + f'Label ---> {label}'
    print('=' * 60)
    print(f'######### Error #{count} #########')
    print(concat_text)
    print(f'Answer ---> {answer}')

######### Error #1 #########
Tweet 1:
Summer Deals
Tweet 2:
Octavia Spencer as a horror-movie villain? Yes, and she’s actually really good.
Tweet 3:
FOIA Specialist – Washington Jobs
Tweet 4:
11 Die On Virginia Roads Over Memorial Day Weekend 2019: Police
Tweet 5:
Lincoln Memorial reflecting pool is being drained for cleaning, repairs
Tweet 6:
Consultant-Energy System Transformation-Multiple Locations – Washington Jobs
Tweet 7:
Registered Nurse – Washington Jobs
Location: Washington
Label ---> No
Answer ---> Yes
######### Error #2 #########
Tweet 1:
(Repeating to myself) I will not get addicted to witchernetflix &amp; guess what?! I’m hooked 📺 👀 netflix TheWitcher
Tweet 2:
netflix witchernetflix Fantasy drama was never my thing &amp; that changed when I watched witchernetflix Looking forward to Season ✌🏼
Tweet 3:
Just finished volunteering FeedingSouthFL &amp; 19,000 meals will be provided to those in need 🙏🏼🙌🏻👏🏼
Tweet 4:
The Christmas tree at DisneysGrandFloridian WaltDisneyWorld neve

In [24]:
from sklearn.metrics import classification_report

# Prediction used for a response that cannot be interpreted. Option 2 of the
# prompt ("I cannot determine ...") is already scored as 'No', so an
# unintelligible answer is treated the same way.
FALLBACK_PREDICTION = 'No'


def _parse_prediction(content):
    """Map a raw chat completion to 'Yes' / 'No', or None if it matches neither.

    The prompt asks for an option number, so a leading '1' / '2' takes
    precedence; the 'Yes' / 'No' substring checks are only a fallback. This
    stops an answer that starts with '2' but happens to contain 'Yes' from
    being scored as 'Yes'.
    """
    text = content.strip()
    if text.startswith('1'):
        return 'Yes'
    if text.startswith('2'):
        return 'No'
    if 'Yes' in text:
        return 'Yes'
    if 'No' in text:
        return 'No'
    return None


results_dir = "output/results/"
labels = test_samples['label']  # identical for every file, so read it once

for output_filename in os.listdir(results_dir):

    if 'gpt4' not in output_filename:
        continue

    preds = []
    unparsed = 0
    output_filepath = os.path.join(results_dir, output_filename)
    with open(output_filepath, 'r') as file:
        for line in file.read().splitlines():
            item = json.loads(line)
            content = item['choices'][0]['message']['content']
            pred = _parse_prediction(content)
            if pred is None:
                # Always emit exactly one prediction per response; skipping
                # one would shift every later prediction against its label.
                unparsed += 1
                pred = FALLBACK_PREDICTION
            preds.append(pred)

    # An incomplete output file cannot be scored; report the sizes and move on.
    # Any other error is allowed to surface instead of being swallowed.
    if len(preds) != len(labels):
        print(output_filename, len(preds), len(labels))
        continue

    results = classification_report(labels, preds, output_dict=True)
    print(f"{output_filename} | Accuracy: {results['accuracy']:.2f} | F1: {results['weighted avg']['f1-score']:.2f}")
    if unparsed:
        print(f"  {unparsed} unparseable responses were counted as {FALLBACK_PREDICTION!r}")

gpt4_all_ten-shot_outputs.json | Accuracy: 0.49 | F1: 0.49


In [30]:
# Display the mismatch counter left behind by the error-analysis loop.
count

956

In [None]:
# Read the raw lines of the file that output_filepath currently points to.
# NOTE(review): output_filepath is assigned by more than one earlier cell (the
# ChatGPT setup cell and the GPT-4 scoring loop), so which file is read depends
# on execution order — confirm the intended file.
with open(output_filepath, 'r') as file:
    lines = file.read().splitlines()

# TEST