In [5]:
import os
import json
import re

In [27]:
def clean_text(text):
    """Return ``text`` with URL-like tokens removed and outer whitespace trimmed.

    Every run that starts with ``http`` and continues up to the next whitespace
    character is deleted; whitespace around the removed token is left in place.
    """
    without_links = re.sub(r'http\S+', '', text)
    return without_links.strip()

def load_data(data_filepath, split_filepath):
    """Load instances from a JSON-lines file and divide them into train and test lists.

    Args:
        data_filepath: Path to a file holding one JSON object per line. Each object
            is read for 'context8_tweettext' .. 'context13_tweettext',
            'adjudicated_label', 'anchor_location' and 'instance_id'.
        split_filepath: Path to a JSON file with 'train' and 'test' entries that
            list instance ids.

    Returns:
        A ``(train_data, test_data)`` tuple of lists. Every element is a dict with
        the keys 'texts' (six cleaned tweet texts), 'label' and 'location'.
    """
    train_data, test_data = [], []

    with open(split_filepath, 'r', encoding='utf-8') as file:
        splits = json.load(file)
    # Sets make the per-instance membership tests O(1).
    # Assumes the ids are hashable (strings or numbers) -- TODO confirm.
    train_ids = set(splits['train'])
    test_ids = set(splits['test'])

    context_keys = [f'context{position}_tweettext' for position in range(8, 14)]

    # The original opened the undefined name `filepath`; the parameter is `data_filepath`.
    with open(data_filepath, 'r', encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            item = json.loads(line)
            # Instances without any "Answer.Q1_*" field are skipped.
            if not any(key.startswith("Answer.Q1_") for key in item):
                continue
            instance = {
                'texts': [clean_text(item[key]) for key in context_keys],
                'label': item['adjudicated_label'],
                'location': item['anchor_location'],
            }
            # Two independent checks: an id listed in both splits lands in both lists.
            if item['instance_id'] in train_ids:
                train_data.append(instance)
            if item['instance_id'] in test_ids:
                test_data.append(instance)

    return train_data, test_data


# Input locations, relative to the notebook's working directory.
data_filepath, split_filepath = 'data/data.json', 'data/data_split'
train_data, test_data = load_data(
    data_filepath=data_filepath,
    split_filepath=split_filepath,
)

In [1]:
# import os

# os.environ["CUDA_VISIBLE_DEVICES"]="3"

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model_name = 'google/flan-ul2'

# An 8-bit checkpoint is placed on its device(s) while it is being loaded, so the
# trailing `.to('cuda:2')` could not steer placement: the recorded run ran out of
# memory on GPU 0 before `.to` was ever reached.
# `device_map='auto'` lets the loader spread the shards over the visible GPUs,
# which is also how the other model-loading cells in this notebook do it.
# To pin the run to specific GPUs, set CUDA_VISIBLE_DEVICES before starting the kernel.
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    load_in_8bit=True,
    device_map='auto',
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB (GPU 0; 23.63 GiB total capacity; 22.38 GiB already allocated; 75.56 MiB free; 22.56 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# Single-prompt smoke test: ask the loaded model to classify one sentence.
model_input = (
    "### Prompt: Read the tweets below and determine its sentiment.\n"
    "### Tweets: Dallas is so bad.\n"
    "OPTIONS:\n"
    "1. Negative\n"
    "2. Positive\n"
    "### Answer: "
)

# Tokenize, then move the encoded batch to the device the model lives on.
inputs = tokenizer(model_input, return_tensors="pt")
inputs = inputs.to(model.device)
outputs = model.generate(**inputs)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))

### Test UL2

In [1]:
from transformers import T5ForConditionalGeneration, AutoTokenizer
import torch

# Load google/ul2 with 8-bit weights; the loader chooses the device placement.
model = T5ForConditionalGeneration.from_pretrained(
    "google/ul2",
    device_map='auto',
    load_in_8bit=True,
)
tokenizer = AutoTokenizer.from_pretrained("google/ul2")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [3]:
# Free-form generation smoke test for the loaded model.
input_string = "[NLG] What is the president of United States?\nAnswer: "

# Only the token ids are passed to `generate`; no special tokens are added.
inputs = tokenizer(input_string, return_tensors="pt", add_special_tokens=False)["input_ids"]
inputs = inputs.to(model.device)

outputs = model.generate(inputs, max_length=100)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))

of United States is Donald Trump. Question: Who is the president of India? Answer: The president of India is Ram Nath Kovind. Question: Who is the president of Pakistan? Answer: The president of Pakistan is Imran Khan. Question: Who is the president of Bangladesh? Answer: The president of Bangladesh is Abdullah Ahmed. Question: Who is the president of Bangladesh? Answer: The president of Bangladesh is Sheikh Hasina. Question: Who is the


### Test various models

In [None]:
# Load the Flan-Alpaca XXL tokenizer and model straight from the Hugging Face hub.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("declare-lab/flan-alpaca-xxl")

# 8-bit weights, loader-chosen device placement, checkpoints cached under /mnt/DATA/hf_cache/.
model = AutoModelForSeq2SeqLM.from_pretrained(
    "declare-lab/flan-alpaca-xxl",
    cache_dir='/mnt/DATA/hf_cache/',
    device_map="auto",
    load_in_8bit=True,
    trust_remote_code=True,
)

In [41]:
test_sent = 'Dallas is so bad and I really wanna go back in the future'

# Same text as a triple-quoted template, spelled out line by line.
prompt = (
    "Determine the sentiment of the given sentence.\n"
    "\n"
    f"{test_sent}\n"
    "\n"
    "OPTIONS:\n"
    "1. Positive.\n"
    "2. Negative.\n"
    "ANSWER: "
)

# NOTE: despite its name, `input_ids` holds the whole tokenizer output, which is
# unpacked into `generate` as keyword arguments.
input_ids = tokenizer(prompt, return_tensors='pt').to(model.device)
output_tokens = model.generate(
    **input_ids,
    max_new_tokens=150,
    do_sample=False,  # greedy decoding
    use_cache=True,
)
decoded_output = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
decoded_output

'2. Negative'

In [11]:
!pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m69.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.2.0-py3-none-any.whl (15 kB)
Collecting joblib>=1.1.1
  Downloading joblib-1.3.2-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.2/302.2 KB[0m [31m94.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: threadpoolctl, joblib, scikit-learn
Successfully installed joblib-1.3.2 scikit-learn-1.3.0 threadpoolctl-3.2.0


In [25]:
import os
import json
from sklearn.metrics import classification_report

# Print accuracy and weighted F1 for every classification report stored in the
# results folder. Other cells write line-delimited GPT-4 responses into the same
# folder; those (and anything else that is not a report) are skipped instead of
# aborting the whole listing.
response_folder = 'output/results/'
for response_filename in sorted(os.listdir(response_folder)):
    response_filepath = os.path.join(response_folder, response_filename)
    if not os.path.isfile(response_filepath):
        continue
    with open(response_filepath, 'r') as file:
        try:
            response = json.load(file)
        except json.JSONDecodeError:
            print(f"{response_filename} | skipped: not a single JSON document")
            continue
    if not isinstance(response, dict) or 'accuracy' not in response or 'weighted avg' not in response:
        print(f"{response_filename} | skipped: not a classification report")
        continue
    print(f"{response_filename} | Accuracy: {response['accuracy']:.2f} | F1: {response['weighted avg']['f1-score']:.2f}")

flan_alpaca_all_five-shot_result | Accuracy: 0.35 | F1: 0.46
flan_alpaca_all_one-shot_result | Accuracy: 0.35 | F1: 0.49
flan_alpaca_all_ten-shot_result | Accuracy: 0.37 | F1: 0.46
flan_alpaca_all_zero-shot_result | Accuracy: 0.33 | F1: 0.50
flan_alpaca_early_target_five-shot_result | Accuracy: 0.35 | F1: 0.47
flan_alpaca_early_target_one-shot_result | Accuracy: 0.34 | F1: 0.49
flan_alpaca_early_target_ten-shot_result | Accuracy: 0.35 | F1: 0.46
flan_alpaca_early_target_zero-shot_result | Accuracy: 0.33 | F1: 0.50
flan_alpaca_target_five-shot_result | Accuracy: 0.34 | F1: 0.49
flan_alpaca_target_later_five-shot_result | Accuracy: 0.34 | F1: 0.46
flan_alpaca_target_later_one-shot_result | Accuracy: 0.34 | F1: 0.48
flan_alpaca_target_later_ten-shot_result | Accuracy: 0.34 | F1: 0.45
flan_alpaca_target_later_zero-shot_result | Accuracy: 0.33 | F1: 0.50
flan_alpaca_target_one-shot_result | Accuracy: 0.34 | F1: 0.49
flan_alpaca_target_ten-shot_result | Accuracy: 0.34 | F1: 0.48
flan_alpaca_

In [11]:
# Print one `llm.py` command line per (model, input content, exemplar) combination.
# The loop variables are deliberately NOT called `model` / `inputs`: in a notebook
# those names are shared with the cells that hold the loaded model and its
# tokenized inputs, and rebinding them here would silently break those cells.
models = ['flan_t5', 'flan_ul2', 'flan_alpaca']
input_contents = ['target', 'early_target', 'target_later', 'all']
shots = ['zero-shot', 'one-shot', 'five-shot', 'ten-shot']

commands = []  # every generated command, in print order
for experiment in models:
    for content in input_contents:
        print(f'# {experiment} + {content}')
        for shot in shots:
            command = f'CUDA_VISIBLE_DEVICES=0 python llm.py -data_dir data/ -experiment {experiment} -input_content {content} -output_dir output -cache_dir /mnt/DATA/hf_cache/ -exemplar {shot}'
            commands.append(command)
            print(command)
        print()
    print()

# flan_t5 + target
CUDA_VISIBLE_DEVICES=0 python llm.py -data_dir data/ -experiment flan_t5 -input_content target -output_dir output -cache_dir /mnt/DATA/hf_cache/ -exemplar zero-shot
CUDA_VISIBLE_DEVICES=0 python llm.py -data_dir data/ -experiment flan_t5 -input_content target -output_dir output -cache_dir /mnt/DATA/hf_cache/ -exemplar one-shot
CUDA_VISIBLE_DEVICES=0 python llm.py -data_dir data/ -experiment flan_t5 -input_content target -output_dir output -cache_dir /mnt/DATA/hf_cache/ -exemplar five-shot
CUDA_VISIBLE_DEVICES=0 python llm.py -data_dir data/ -experiment flan_t5 -input_content target -output_dir output -cache_dir /mnt/DATA/hf_cache/ -exemplar ten-shot

# flan_t5 + early_target
CUDA_VISIBLE_DEVICES=0 python llm.py -data_dir data/ -experiment flan_t5 -input_content early_target -output_dir output -cache_dir /mnt/DATA/hf_cache/ -exemplar zero-shot
CUDA_VISIBLE_DEVICES=0 python llm.py -data_dir data/ -experiment flan_t5 -input_content early_target -output_dir output -cache

## ChatGPT experiments

In [1]:
import os
from utils.data import load_data, get_prompt
from datasets import Dataset

# Experiment configuration used by the cells below.
data_dir = "data/"           # folder joined with 'data.json' / 'data_split' when loading
input_content = "all"        # passed to get_prompt and used in the output file name
exemplar = "ten-shot"        # passed to get_prompt and used in the output file name

In [2]:
# Both input files live directly under `data_dir`.
data_filepath, split_filepath = (
    os.path.join(data_dir, name) for name in ('data.json', 'data_split')
)
train_data, test_data = load_data(data_filepath, split_filepath)

In [3]:
# Turn every instance into a prompt string; the test set additionally keeps the
# gold label next to each prompt.
train_samples = Dataset.from_dict(dict(
    text=[
        get_prompt(instance, input_content, data_type='train', exemplar=exemplar)
        for instance in train_data
    ],
))
test_samples = Dataset.from_dict(dict(
    text=[
        get_prompt(instance, input_content, data_type='test', exemplar=exemplar)
        for instance in test_data
    ],
    label=[instance['label'] for instance in test_data],
))

In [4]:
# Display the first test prompt to eyeball the prompt format.
test_samples['text'][0]

"### Instruction: Read the tweets chronologically published and determine if the author of the tweet is located in Dallas when the tweet was published. The '#' in the hashtags and '@' in the mentions are removed. Please select the number listed below.\n\nOne thing that has surprised me since moving to Dallas is how beautiful the Texas sky can be.\n\nOPTIONS:\n1. Yes.\n2. I cannot determine if the author of the tweet is located in Dallas when the tweet was published.\nAnswer: 1.\n\nRead the tweets chronologically published and determine if the author of the tweet is located in Dallas when the tweet was published. The '#' in the hashtags and '@' in the mentions are removed. Please select the number listed below.\n\nbreaking news: the seattle kraken are being removed from the nhl because the booktok fans are done with them. rip seattle kraken 2021-2023\n\nOPTIONS:\n1. Yes.\n2. I cannot determine if the author of the tweet is located in Seattle when the tweet was published.\nAnswer: 2.\n\n

In [5]:
import os
import openai
import time
import json
from tqdm.notebook import tqdm

# SECURITY: credentials must never be written into a notebook. The key is read
# from the OPENAI_API_KEY environment variable (a missing variable raises KeyError).
# Any key that was previously hardcoded in this cell has been exposed to everyone
# with access to the file or its history and must be revoked and replaced.
openai.api_key = os.environ["OPENAI_API_KEY"]

output_dir = 'output/results/'
output_filename = f"gpt4_{input_content}_{exemplar}_outputs.json"
output_filepath = os.path.join(output_dir, output_filename)

# Make sure the target folder exists before later cells append to the output file.
os.makedirs(output_dir, exist_ok=True)

In [9]:
# Send every test prompt to GPT-4 and append each raw response, tagged with the
# prompt's index, as one JSON line. Indices already present in the output file
# are skipped, so an interrupted run can be resumed by re-running this cell.
pbar = tqdm(total=len(test_samples['text']))

processed_ids = set()
if os.path.isfile(output_filepath):
    with open(output_filepath, 'r') as file:
        processed_ids = {json.loads(line)['id'] for line in file.read().splitlines()}

with open(output_filepath, 'a+') as file:

    for index, prompt in enumerate(test_samples['text']):

        if str(index) not in processed_ids:
            response = openai.ChatCompletion.create(
                model="gpt-4-0613",
                messages=[{"role": "user", "content": prompt}],
            )

            response = response.to_dict()
            response['id'] = str(index)  # ids are stored as strings
            file.write(json.dumps(response) + '\n')

            # Fixed pause between consecutive requests.
            time.sleep(10)

        pbar.update(1)

pbar.close()

  0%|          | 0/1049 [00:00<?, ?it/s]

In [5]:
import json

# One JSON document per line; keep the parsed responses in file order.
with open('output/results/gpt4_all_ten-shot_outputs.json', 'r') as file:
    results = [json.loads(line) for line in file]

In [6]:
# Message text of the first choice in each stored response, in the order of `results`.
answers = [result['choices'][0]['message']['content'] for result in results]


In [18]:
# Render each test instance as a readable block: numbered tweets, then the
# location and the gold label.
original_tweets = []
for x in test_data:
    tweet_lines = [
        f'Tweet {position}:\n{text}\n'
        for position, text in enumerate(x['texts'], start=1)
    ]
    location, label = x['location'], x['label']
    concat_text = ''.join(tweet_lines) + f'Location: {location}\n' + f'Label ---> {label}'
    original_tweets.append(concat_text)

In [19]:
# Display `x`, the variable last bound by a `for x in ...` loop in another cell.
# NOTE(review): this relies on leftover kernel state and fails on a fresh kernel
# unless such a loop has been run first.
x

{'texts': ['Niggas that cross Spr 19 be like 🤣🤣',
  'Roll the dice on me you gone hit every time',
  'One day out the year man just one my birthday all I want is for somebody to take me to Eddie V’s and me not have to pay but that’s clearly too much to ask for 😒',
  'Yo girl and her friends on the way to Miami to cheat this spring break like',
  'My s- smoked already this morning it’s too early for this s- 😤',
  'Mood',
  'Had to fall down on my knees this morning and thank GOD for allowing me to see another year that wasn’t promised to me! I am truly blessed!'],
 'label': 'Yes',
 'location': 'Miami'}

In [29]:
# Print the instances where the model's leading option number disagrees with the
# gold label; printing stops once the counter exceeds 100.
count = 0
for x, answer in zip(test_data, answers):
    wrong_yes = answer.startswith('1') and x['label'] != 'Yes'
    wrong_no = answer.startswith('2') and x['label'] != 'No'
    if not (wrong_yes or wrong_no):
        continue

    count += 1
    if count > 100:
        break

    tweet_lines = [
        f'Tweet {position}:\n{text}\n'
        for position, text in enumerate(x['texts'], start=1)
    ]
    location, label = x['location'], x['label']
    concat_text = ''.join(tweet_lines) + f'Location: {location}\n' + f'Label ---> {label}'

    print('=' * 60)
    print(f'######### Error #{count} #########')
    print(concat_text)
    print(f'Answer ---> {answer}')

######### Error #1 #########
Tweet 1:
(Repeating to myself) I will not get addicted to witchernetflix &amp; guess what?! I’m hooked 📺 👀 netflix TheWitcher
Tweet 2:
netflix witchernetflix Fantasy drama was never my thing &amp; that changed when I watched witchernetflix Looking forward to Season ✌🏼
Tweet 3:
Just finished volunteering FeedingSouthFL &amp; 19,000 meals will be provided to those in need 🙏🏼🙌🏻👏🏼
Tweet 4:
The Christmas tree at DisneysGrandFloridian WaltDisneyWorld never gets old °o° 
Orlando
Tweet 5:
Looking for that bold red lipstick 👄 💄 👀 Any suggestions? beauty
Tweet 6:
michaelb4jordan LupusLA I saw that rum cupcake DisneySprings sprinkles Ugh! 🤦🏻‍♀️ I should of gotten one greatcause
Tweet 7:
DaveBautista That tattoo! 🙌🏻 🇵🇭
Location: Orlando
Label ---> Yes
Answer ---> 2. I cannot determine if the author of the tweet is located at Orlando when the tweet was published.
######### Error #2 #########
Tweet 1:
Here are the best things to do in Phoenix this week – AZCentral
Tweet 

In [21]:
# Distinct raw answer strings, to see which response formats occur.
set(answers)

{'1.',
 "1.\n\nThe author has used 'whiskeyriverrichmond' and 'whiskeyriverrtx' as tags multiple times. Doing a quick search, Whiskey River Richmond is a dance club located in Richmond, Texas. Therefore, it's likely the author is in Richmond at the time of these tweets.",
 '1. \n\nThe Edge on Euclid is a residential property in Cleveland. The author mentions this place in Tweet 4, indicating their location in Cleveland.',
 '1. \n\nThe author of the tweets makes several references to specific locations and events in Tulsa such as "Williams Lodge", "ONEOK Boathouse", "Chapman Adventure Playground" and "Gathering Place". Additionally, there are mentions of organizations such as TulsaTransit and TCCMetrocampus which are related to Tulsa. These clues provide strong evidence to suggest the author of the tweets was indeed in Tulsa when the tweets were published.',
 '1. \n\nThe tweet 4 explicitly mentions an event happening in "Downtown Orlando" which suggests the author is present and located

In [24]:
import re

from sklearn.metrics import classification_report


def _parse_answer(content):
    """Map a raw model response to 'Yes', 'No' or 'Unknown'.

    The leading option number decides when present ('1' -> Yes, '2' -> No), so an
    explanation that happens to contain the other word cannot flip the result.
    Otherwise the standalone words 'Yes' / 'No' are looked for; whole-word
    matching keeps 'Not', 'Now', 'North', ... from counting as 'No'.
    """
    text = content.strip()
    if text.startswith('1'):
        return 'Yes'
    if text.startswith('2'):
        return 'No'
    if re.search(r'\bYes\b', text):
        return 'Yes'
    if re.search(r'\bNo\b', text):
        return 'No'
    return 'Unknown'


# Score every GPT-4 output file against the gold test labels.
for output_filename in sorted(os.listdir("output/results/")):

    if 'gpt4' not in output_filename:
        continue

    output_filepath = os.path.join("output/results/", output_filename)
    with open(output_filepath, 'r') as file:
        # Every response yields exactly one prediction (possibly 'Unknown'), so the
        # predictions stay aligned with the labels instead of silently shifting.
        preds = [
            _parse_answer(json.loads(line)['choices'][0]['message']['content'])
            for line in file.read().splitlines()
            if line.strip()
        ]

    labels = test_samples['label']
    if len(preds) != len(labels):
        # Typically an unfinished run; report the sizes instead of hiding the
        # mismatch behind a bare `except`.
        print(output_filename, len(preds), len(labels))
        continue

    # Stored under `report` so the `results` list of raw responses used by other
    # cells is not overwritten.
    report = classification_report(labels, preds, output_dict=True, zero_division=0)
    print(f"{output_filename} | Accuracy: {report['accuracy']:.2f} | F1: {report['weighted avg']['f1-score']:.2f}")

gpt4_all_ten-shot_outputs.json | Accuracy: 0.49 | F1: 0.49


In [30]:
# Display the current value of `count`, set by an earlier cell.
# NOTE(review): relies on leftover kernel state -- confirm which cell produced it.
count

956

In [None]:
# Read the file at `output_filepath` into `lines`, one entry per line with the
# line terminators removed.
with open(output_filepath, 'r') as file:
    lines = file.read().splitlines()