In [5]:
import os
import json
import re

In [27]:
def clean_text(text):
    text = re.sub(r'http\S+', '', text)
    text = text.strip()
    return text

def load_data(data_filepath, split_filepath):
    
    train_data, test_data = [], []

    with open(split_filepath, 'r') as file:
        splits = json.load(file)
        train_ids = splits['train']
        test_ids = splits['test']
        
    with open(filepath, 'r') as file:
        for line in file:
            item = json.loads(line)
            kept_annotations = [item[key] for key in item.keys() if key.startswith("Answer.Q1_")]
            if len(kept_annotations) == 0:
                continue
            texts = [
                clean_text(item['context8_tweettext']),
                clean_text(item['context9_tweettext']),
                clean_text(item['context10_tweettext']),
                clean_text(item['context11_tweettext']),
                clean_text(item['context12_tweettext']),
                clean_text(item['context13_tweettext']),
            ]
            instance = {'texts': texts, 'label': item['adjudicated_label'], 'location': item['anchor_location']}
            if item['instance_id'] in train_ids:
                train_data.append(instance)
            if item['instance_id'] in test_ids:
                test_data.append(instance)
                
    return train_data, test_data


data_filepath = 'data/data.json'
split_filepath = 'data/data_split'
train_data, test_data = load_data(data_filepath, split_filepath)

In [1]:
# import os

# os.environ["CUDA_VISIBLE_DEVICES"]="3"

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model_name = 'google/flan-ul2'

model = AutoModelForSeq2SeqLM.from_pretrained(model_name, load_in_8bit=True).to('cuda:2')
tokenizer = AutoTokenizer.from_pretrained(model_name)

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB (GPU 0; 23.63 GiB total capacity; 22.38 GiB already allocated; 75.56 MiB free; 22.56 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
model_input = """### Prompt: Read the tweets below and determine its sentiment.
### Tweets: Dallas is so bad.
OPTIONS:
1. Negative
2. Positive
### Answer: """

inputs = tokenizer(model_input, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))

### Test UL2

In [1]:
from transformers import T5ForConditionalGeneration, AutoTokenizer
import torch

model = T5ForConditionalGeneration.from_pretrained("google/ul2", load_in_8bit=True, device_map='auto')                                                                                                   
tokenizer = AutoTokenizer.from_pretrained("google/ul2")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [3]:
input_string = (
    "[NLG] What is the president of United States?\nAnswer: "
)                                          

inputs = tokenizer(input_string, return_tensors="pt", add_special_tokens=False).input_ids.to(model.device)

outputs = model.generate(inputs, max_length=100)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))

of United States is Donald Trump. Question: Who is the president of India? Answer: The president of India is Ram Nath Kovind. Question: Who is the president of Pakistan? Answer: The president of Pakistan is Imran Khan. Question: Who is the president of Bangladesh? Answer: The president of Bangladesh is Abdullah Ahmed. Question: Who is the president of Bangladesh? Answer: The president of Bangladesh is Sheikh Hasina. Question: Who is the


### Test various model

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("declare-lab/flan-alpaca-xxl")
model = AutoModelForSeq2SeqLM.from_pretrained("declare-lab/flan-alpaca-xxl",
                                              load_in_8bit=True, 
                                              device_map="auto",
                                              trust_remote_code=True,
                                              cache_dir='/mnt/DATA/hf_cache/')

In [41]:
test_sent = 'Dallas is so bad and I really wanna go back in the future'
prompt = f"""Determine the sentiment of the given sentence.

{test_sent}

OPTIONS:
1. Positive.
2. Negative.
ANSWER: """

input_ids = tokenizer(prompt, return_tensors='pt').to(model.device)
output_tokens = model.generate(**input_ids, max_new_tokens=150, do_sample=False, use_cache=True)
decoded_output = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
decoded_output

'2. Negative'

In [11]:
!pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m69.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.2.0-py3-none-any.whl (15 kB)
Collecting joblib>=1.1.1
  Downloading joblib-1.3.2-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.2/302.2 KB[0m [31m94.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: threadpoolctl, joblib, scikit-learn
Successfully installed joblib-1.3.2 scikit-learn-1.3.0 threadpoolctl-3.2.0


In [1]:
import os
import json
from sklearn.metrics import classification_report

response_folder = 'output/responses/'
for response_filename in os.listdir(response_folder):
    response_filepath = os.path.join(response_folder, response_filename)
    with open(response_filepath, 'r') as file:
        response = json.load(file)
    labels = response['labels']
    print("#" * 60)
    print("#" * 10 + "  " + response_filename + "  " + "#" * 10)
    print(classification_report(response['labels'], response['predictions']))

############################################################
##########  flan_ul2_all_few-shot_response  ##########
              precision    recall  f1-score   support

          No       0.35      0.77      0.48       350
         Yes       0.71      0.28      0.40       699

    accuracy                           0.45      1049
   macro avg       0.53      0.53      0.44      1049
weighted avg       0.59      0.45      0.43      1049

############################################################
##########  flan_t5_target_zero-shot_response  ##########
              precision    recall  f1-score   support

          No       0.34      0.97      0.50       350
         Yes       0.74      0.04      0.08       699

    accuracy                           0.35      1049
   macro avg       0.54      0.51      0.29      1049
weighted avg       0.60      0.35      0.22      1049

############################################################
##########  flan_alpaca_early_target_zero-shot_res

ValueError: Found input variables with inconsistent numbers of samples: [1049, 1045]

In [48]:
predictions = []
for pred in response['predictions']:
    if pred.startswith('1'):
        predictions.append('Yes')
    if pred.startswith('2'):
        predictions.append('No')
    print(pred)

Yes
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
Yes
No
No
No
No
No
No
No
No
No
No
No
No
Yes
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
Yes
No
No
Yes
Yes
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
Yes
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
Yes
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
Yes
No
No
No
No
No
No
No
No
Yes
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
Yes
No
No
No
No
No
No
No
No
No
No
No
No
Yes
No
No
No
No
No
No
No
No
Yes
No
No
No
Yes
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
No
Yes
No
No
No
No
No
N

In [58]:
type(data)

bytes