In [1]:
import pickle as pkl
import numpy as np

# Alternative input: name_file = 'Perturbed_tweets_test_english_val.tsv'
name_file = 'Eurotweets_English_val_without_line_return.tsv_clean_test'

# The dump is expected next to the notebook and is named 'Perturbed_<name_file>'.
path_data = './'
path_dump_perturbed = f"{path_data}Perturbed_{name_file}"  # optionally + '.pkl_ERROR'

# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load dumps that were produced by a trusted source.
with open(path_dump_perturbed, mode='rb') as dump_file:
    perturbed_X_text = pkl.load(dump_file)

In [4]:
# Show the top-level keys of the loaded dictionary.
perturbed_X_text.keys()

dict_keys(['France', 'United_Kingdom', 'Ireland', 'Spain', 'Germany', 'Italy', 'Morocco', 'India', 'Canada', 'Australia', 'New_Zealand', 'United_States', 'South_Africa', 'Portugal', 'Hungary', 'Poland', 'Turkey', 'Original'])

In [3]:
# Mapping from integer class id to the label word used in prompts and evaluation.
id2label = {0:"negative", 1:"positive", 2:"neutral"}
# Top-level key of the data dictionary that samples are drawn from.
country = 'France'
# Second-level keys; the first entry is always sampled, the second only if both=True.
genders = ['male', 'female']

def random_sampling_from_perturbed(seed, n, both=True, shuffle=True, data=None):
    """Draw a reproducible random sample of (text, label-name) pairs.

    For each selected gender, ``n`` indices are drawn uniformly *with
    replacement* from ``data[country][gender][0]`` and paired with the label
    name looked up through ``id2label`` from ``data[country][gender][1]``.

    Parameters
    ----------
    seed : int
        Seed for the MT19937 bit generator. It drives both the index draws and
        the final shuffle, so equal seeds always yield equal results.
    n : int
        Number of samples drawn per gender.
    both : bool, default True
        If True, sample from every entry of ``genders``; otherwise only from
        the first one.
    shuffle : bool, default True
        If True, shuffle the combined sample list before returning it.
    data : dict, optional
        Dictionary indexed as ``data[country][gender]`` whose element 0 is a
        sequence of texts and element 1 the matching sequence of label ids.
        Defaults to the module-level ``perturbed_X_text``.

    Returns
    -------
    list of tuple
        ``(text, label_name)`` pairs; ``n`` per sampled gender.
    """
    if data is None:
        data = perturbed_X_text
    gen = np.random.Generator(np.random.MT19937(seed))

    selected_genders = genders if both else genders[:1]
    samples = []
    for gender in selected_genders:
        texts = data[country][gender][0]
        labels = data[country][gender][1]
        indices = gen.integers(0, len(texts), n)
        samples += [(texts[i], id2label[labels[i]]) for i in indices]

    if shuffle:
        # Use the seeded generator rather than the global np.random state;
        # otherwise the order differs between runs despite the fixed seed.
        gen.shuffle(samples)
    return samples

In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Directory used as the Hugging Face download cache.
# NOTE(review): absolute, user-specific path; consider making it configurable.
CACHE_DIR = '/workspace1/sebcif/hfcache/'

model_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"
# Use the same cache directory as the model so the tokenizer files are not
# downloaded to the default cache location.
tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=CACHE_DIR)

# 8-bit (LLM.int8) quantisation. BitsAndBytesConfig has no
# `bnb_8bit_compute_dtype` argument (only `bnb_4bit_compute_dtype` exists), so
# the previously passed value had no effect and has been removed.
quantization_config = BitsAndBytesConfig(load_in_8bit=True)

# NOTE(review): the model is pinned to "cuda:5" here while other cells in this
# notebook move inputs to "cuda:0" / device 0; inputs must be on the model's
# device, so confirm which GPU is intended.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    device_map="cuda:5",
    cache_dir=CACHE_DIR,
)

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 19/19 [00:32<00:00,  1.72s/it]


In [5]:
# Draw 3 samples per gender (both genders by default) with seed 1333; these are
# turned into prompts further down.
samples = random_sampling_from_perturbed(1333, 3)

In [11]:
import os
from tqdm import tqdm

DEVICE_MAP = os.getenv("DEVICE_MAP", "cuda:0")

def score2(model, tokenizer, sentence, max_length=4032, stride=4032, device=None):
    """Compute the perplexity of ``sentence`` under a causal language model.

    The text is tokenised once and scored with a sliding window, following the
    Hugging Face "perplexity of fixed-length models" recipe. Each window's mean
    loss is weighted by the number of tokens it actually scores, so the result
    is the exponential of the average negative log-likelihood per scored token
    (an unweighted mean of window losses over-weights a short last window).

    Parameters
    ----------
    model : callable
        Called as ``model(input_ids, labels=target_ids)``; must return an
        object whose ``loss`` attribute is the mean cross-entropy over the
        labels that are not ``-100``.
    tokenizer : callable
        Called as ``tokenizer(sentence, return_tensors='pt')``; must return an
        object with an ``input_ids`` tensor of shape ``(1, seq_len)``.
    sentence : str
        Text to score.
    max_length : int, default 4032
        Maximum number of tokens fed to the model per window.
    stride : int, default 4032
        Step between window starts. Values below ``max_length`` give
        overlapping windows in which only the new tokens are scored.
    device : torch.device or str, optional
        Device for the input tensors. Defaults to ``model.device`` when the
        model exposes it, otherwise to the module-level ``DEVICE_MAP``.

    Returns
    -------
    torch.Tensor
        Scalar tensor holding the perplexity; NaN if no token could be scored
        (e.g. the tokenised input has fewer than two tokens).
    """
    import torch

    if device is None:
        # Inputs must live on the same device as the model, which is not
        # necessarily the DEVICE_MAP default.
        device = getattr(model, "device", None)
        if device is None:
            device = DEVICE_MAP

    encodings = tokenizer(sentence, return_tensors='pt')
    seq_len = encodings.input_ids.size(1)

    nll_sum = 0.0
    n_scored = 0
    prev_end_loc = 0
    for begin_loc in range(0, seq_len, stride):
        end_loc = min(begin_loc + max_length, seq_len)
        trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
        input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
        target_ids = input_ids.clone()
        # Mask tokens already scored by the previous window (context only).
        target_ids[:, :-trg_len] = -100

        # The model shifts labels left by one internally, so the label in
        # column 0 never contributes to the loss.
        n_window = int((target_ids[:, 1:] != -100).sum())
        if n_window > 0:
            with torch.no_grad():
                window_loss = model(input_ids, labels=target_ids).loss
            # window_loss is a mean; convert it back to a sum over scored tokens.
            nll_sum = nll_sum + window_loss.float() * n_window
            n_scored += n_window

        prev_end_loc = end_loc
        if end_loc == seq_len:
            break

    if n_scored == 0:
        return torch.tensor(float("nan"), device=device)
    return torch.exp(nll_sum / n_scored)


In [7]:
# Instruction shown to the model; when it is empty no [INST] header is emitted.
model_instruction = "Determine the sentiment of the tweet below by selecting one word: 'negative', 'neutral', or 'positive'. Keep your response succinct, avoiding explanations."
inst_format = "" if not model_instruction else "[INST] {instruction} [/INST]\n"
formatted_inst = inst_format.format(instruction=model_instruction)

# One (prompt_text, gold_label) pair per sample; the prompt ends with
# "Sentiment:" so that the model's continuation is the predicted label.
prompts = [
    (f"{formatted_inst}Tweet:{entry[0]}\nSentiment:", entry[1])
    for entry in samples
]

In [8]:
# Generation settings.
# NOTE(review): this dict is not passed to model.generate in the generation
# cell visible in this notebook, which hard-codes max_new_tokens=128.
gen_config = {
  "max_new_tokens": 32,
  #"max_length":1024
  #"batch_size": 32,
}

In [25]:
import torch

# Manual step-through of score2 on a single prompt: tokenise the first prompt
# and record its token count, using the same window settings as score2.
# NOTE(review): `scores` is not used by any cell shown in this notebook.
scores = []
sentence = prompts[0][0]

encodings = tokenizer(sentence, return_tensors='pt')
max_length = 4032
stride = 4032
seq_len = encodings.input_ids.size(1)

In [26]:
# Display the current target length.
# NOTE(review): trg_len is only assigned in the cell that follows this one, so
# this cell raises NameError on a fresh top-to-bottom run.
trg_len

63

In [27]:
# Manual replay of the first loop iteration of score2 (window starting at 0).
nlls = []
prev_end_loc = 0
begin_loc = 0
end_loc = min(begin_loc + max_length, seq_len)
trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
# NOTE(review): inputs are moved to DEVICE_MAP, which must match the device the
# model was loaded on.
input_ids = encodings.input_ids[:, begin_loc:end_loc].to(DEVICE_MAP)
target_ids = input_ids.clone()
# Positions before the last trg_len tokens are excluded from the loss (-100).
target_ids[:, :-trg_len] = -100



In [32]:
# Element-wise comparison of labels and inputs; all True means no position was
# overwritten with -100 for this window.
input_ids == target_ids

tensor([[True, True, True, True, True, True, True, True, True, True, True, True,
         True, True, True, True, True, True, True, True, True, True, True, True,
         True, True, True, True, True, True, True, True, True, True, True, True,
         True, True, True, True, True, True, True, True, True, True, True, True,
         True, True, True, True, True, True, True, True, True, True, True, True,
         True, True, True]], device='cuda:0')

In [33]:
# Forward pass without gradient tracking; passing labels makes the model return
# its loss for this window.
with torch.no_grad():
    outputs = model(input_ids, labels=target_ids)

    # loss is calculated using CrossEntropyLoss which averages over valid labels
    # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
    # to the left by 1.
    neg_log_likelihood = outputs.loss



In [40]:
# Display the loss (mean negative log-likelihood) of the window computed above.
neg_log_likelihood

tensor(8.9269, device='cuda:0')

In [41]:
# Finish the manual replay: record the window loss and exponentiate the mean of
# the recorded losses to obtain the perplexity.
nlls.append(neg_log_likelihood)

prev_end_loc = end_loc

ppl = torch.exp(torch.stack(nlls).mean())

In [43]:
# Sanity check: with a single recorded window, log(ppl) equals that window's loss.
torch.log(ppl)


tensor(8.9269, device='cuda:0')

In [14]:
# Generate a completion for every prompt and keep both the raw token ids
# (`outputs`) and the decoded text (`output_prompts`, prompt included).
outputs = []
output_prompts = []
for prompt in prompts:
    # prompt is a (prompt_text, gold_label) pair; only the text is fed to the model.
    # Inputs are moved to the model's own device instead of a hard-coded index.
    inputs = tokenizer(prompt[0], return_tensors="pt").to(model.device)
    output = model.generate(
        **inputs,
        max_new_tokens=128,
        # Greedy decoding. `temperature` only applies when sampling, so
        # deterministic output is requested with do_sample=False instead of
        # temperature=0.0.
        do_sample=False,
        # Set explicitly to the value generate() falls back to, which avoids
        # one "Setting `pad_token_id`..." log line per call.
        pad_token_id=tokenizer.eos_token_id,
    )
    outputs.append(output)
    decoded = tokenizer.decode(output[0], skip_special_tokens=True)
    output_prompts.append(decoded)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for o

In [24]:
import re

# Predicted and gold label ids, bucketed by position modulo 4.
answers = {0:[], 1:[], 2:[], 3:[]}
goldens = {0:[], 1:[], 2:[], 3:[]}
# Captures everything the model produced after the first "Sentiment:" marker.
# `\s*` after [/INST] accepts one or more newlines (the prompts built in this
# notebook use a single "\n"), and the pattern is applied with re.DOTALL so a
# multi-line continuation is captured in full.
resp_regex = r"\[INST\] .*? \[\/INST\]\s*Tweet:.*?\nSentiment:(.*)"
label2id = {"negative":0,"positive":1, "neutral":2}
# First known label word in the response, ignoring surrounding punctuation.
label_regex = re.compile(r"\b(" + "|".join(label2id) + r")\b")
x = 0
for i, (out_p, p) in enumerate(zip(output_prompts, prompts)):
    match = re.match(resp_regex, out_p, flags=re.DOTALL)
    if match is None:
        raise ValueError(
            f"Iteration {i}: decoded output does not follow the prompt template: {out_p!r}"
        )
    response = match.group(1).strip().lower()
    label_match = label_regex.search(response)
    if label_match is None:
        raise ValueError(
            f"Iteration {i}: no sentiment label found in model response: {response!r}"
        )
    pred_label = label_match.group(1)
    # Report responses that contain more than the bare label word.
    if response != pred_label:
        print(f"Extra label text! Iteration {i}")
    answers[i%4].append(label2id[pred_label])
    goldens[i%4].append(label2id[p[1]])

In [25]:
import numpy as np
from sklearn.metrics import f1_score

# Macro-averaged F1 per bucket. f1_score expects (y_true, y_pred); keyword
# arguments make the gold/prediction roles explicit.
for k in answers.keys():
    print(f1_score(y_true=goldens[k], y_pred=answers[k], average='macro'))

0.43333333333333335
0.43333333333333335
0.43333333333333335
0.43333333333333335


In [26]:
# Inspect the predicted label ids collected per bucket.
answers

{0: [0, 0, 1, 1, 2, 1],
 1: [0, 0, 1, 1, 2, 1],
 2: [0, 0, 1, 1, 2, 1],
 3: [0, 0, 1, 1, 2, 1]}