In [1]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import os
from huggingface_hub import login

In [2]:
login(token=os.getenv('HF_TOKEN'))

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [3]:
data = pd.read_csv('yelp_parallel/test_en_parallel.txt', sep='\t')
data.columns = ['Style1', 'Style2']
data = data[:5000]

In [4]:
sentences_negative = data['Style1'].values.tolist()
sentences_positive = data['Style2'].values.tolist()

In [5]:
model_name = 'meta-llama/Llama-2-7b-hf'
bitsandbytes_config = BitsAndBytesConfig(load_in_4bit=True,
                                         bnb_4bit_compute_dtype=torch.float16,
                                         bnb_4bit_quant_type='nf4')


In [6]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name,
                                             quantization_config=bitsandbytes_config,
                                             device_map='cuda')


model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

In [7]:
# Zero-shot
sample_sentence = sentences_negative[0]
sample_sentence

"ever since joes has changed hands it's just gotten worse and worse."

In [8]:
prompt = f'Classify the sentiment as positive or negative: {sample_sentence}\nAnswer: '
prompt

"Classify the sentiment as positive or negative: ever since joes has changed hands it's just gotten worse and worse.\nAnswer: "

In [9]:
tokens = tokenizer(prompt, return_tensors='pt').to('cuda')
tokens

{'input_ids': tensor([[    1,  4134,  1598,   278, 19688,   408,  6374,   470,  8178, 29901,
          3926,  1951,  2958,   267,   756,  3939,  6567,   372, 29915, 29879,
           925,  2355,   841, 15029,   322, 15029, 29889,    13, 22550, 29901,
         29871]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [10]:
output_ids = model.generate(tokens.input_ids, max_new_tokens=10)
output_ids

tensor([[    1,  4134,  1598,   278, 19688,   408,  6374,   470,  8178, 29901,
          3926,  1951,  2958,   267,   756,  3939,  6567,   372, 29915, 29879,
           925,  2355,   841, 15029,   322, 15029, 29889,    13, 22550, 29901,
         29871, 29896, 29889, 12610,  1230, 29889,    13, 29908,  3112, 29915,
         29879]], device='cuda:0')

In [11]:
tokenizer.decode(output_ids[0], skip_special_tokens=True)

'Classify the sentiment as positive or negative: ever since joes has changed hands it\'s just gotten worse and worse.\nAnswer: 1. Negative.\n"It\'s'

In [12]:
predictions_0 = []
test_sentences = sentences_negative[:10] + sentences_positive[:10]
true_labels = ['negative'] * 10 + ['positive'] * 10

In [13]:
for sentence in test_sentences:
    prompt = f'Classify the sentiment as positive or negative: {sentence}\nAnswer: '
    tokens = tokenizer(prompt, return_tensors='pt').to('cuda')
    output_ids = model.generate(tokens.input_ids, max_new_tokens=10)
    output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    
    if 'positive' in output.lower():
        predictions_0.append('positive')
    elif 'negative' in output.lower():
        predictions_0.append('negative')
    else:
        predictions_0.append('negative')

In [14]:
accuracy_score(true_labels, predictions_0)

0.5

In [15]:
precision_score(true_labels, predictions_0, pos_label='positive')

0.5

In [16]:
recall_score(true_labels, predictions_0, pos_label='positive')

1.0

In [17]:
f1_score(true_labels, predictions_0, pos_label='positive')

0.6666666666666666

In [18]:
# One-shot
example = 'Sentence: the food was terrible\nSentiment: negative'
example

'Sentence: the food was terrible\nSentiment: negative'

In [19]:
sample_sentence = sentences_negative[5]
sample_sentence

'just left and took it off the bill.'

In [20]:
prompt = f'{example}\n\nSentence: {sample_sentence}\nSentiment: '
prompt

'Sentence: the food was terrible\nSentiment: negative\n\nSentence: just left and took it off the bill.\nSentiment: '

In [21]:
tokens = tokenizer(prompt, return_tensors='pt').to('cuda')
output_ids = model.generate(tokens.input_ids, max_new_tokens=10)
tokenizer.decode(output_ids[0], skip_special_tokens=True)

"Sentence: the food was terrible\nSentiment: negative\n\nSentence: just left and took it off the bill.\nSentiment: 0\n\nSentence: I'm"

In [22]:
predictions_1 = []

In [23]:
for sentence in test_sentences:
    prompt = f'{example}\n\nSentence: {sentence}\nSentiment: '
    tokens = tokenizer(prompt, return_tensors='pt').to('cuda')
    output_ids = model.generate(tokens.input_ids, max_new_tokens=10)
    output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    
    if 'positive' in output.lower():
        predictions_1.append('positive')
    elif 'negative' in output.lower():
        predictions_1.append('negative')
    else:
        predictions_1.append('negative')

In [24]:
accuracy_score(true_labels, predictions_1)

0.65

In [25]:
precision_score(true_labels, predictions_1, pos_label='positive')

1.0

In [26]:
recall_score(true_labels, predictions_1, pos_label='positive')

0.3

In [27]:
f1_score(true_labels, predictions_1, pos_label='positive')

0.46153846153846156

In [28]:
# Five-shot
examples_5 = 'Sentence: the food was terrible\nSentiment: negative\n\nSentence: great service\nSentiment: positive\n\nSentence: awful experience\nSentiment: negative\n\nSentence: loved it\nSentiment: positive\n\nSentence: not good\nSentiment: negative'
examples_5

'Sentence: the food was terrible\nSentiment: negative\n\nSentence: great service\nSentiment: positive\n\nSentence: awful experience\nSentiment: negative\n\nSentence: loved it\nSentiment: positive\n\nSentence: not good\nSentiment: negative'

In [29]:
sample_sentence = sentences_negative[10]
sample_sentence

'we sit down and we got some really slow and lazy service.'

In [30]:
prompt = f'{examples_5}\n\nSentence: {sample_sentence}\nSentiment: '
prompt

'Sentence: the food was terrible\nSentiment: negative\n\nSentence: great service\nSentiment: positive\n\nSentence: awful experience\nSentiment: negative\n\nSentence: loved it\nSentiment: positive\n\nSentence: not good\nSentiment: negative\n\nSentence: we sit down and we got some really slow and lazy service.\nSentiment: '

In [31]:
tokens = tokenizer(prompt, return_tensors='pt').to('cuda')
output_ids = model.generate(tokens.input_ids, max_new_tokens=10)
tokenizer.decode(output_ids[0], skip_special_tokens=True)

'Sentence: the food was terrible\nSentiment: negative\n\nSentence: great service\nSentiment: positive\n\nSentence: awful experience\nSentiment: negative\n\nSentence: loved it\nSentiment: positive\n\nSentence: not good\nSentiment: negative\n\nSentence: we sit down and we got some really slow and lazy service.\nSentiment: \n\nSentence: the food was terrible'

In [32]:
predictions_5 = []

In [33]:
for sentence in test_sentences:
    prompt = f'{examples_5}\n\nSentence: {sentence}\nSentiment: '
    tokens = tokenizer(prompt, return_tensors='pt').to('cuda')
    output_ids = model.generate(tokens.input_ids, max_new_tokens=10)
    output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    
    if 'positive' in output.lower():
        predictions_5.append('positive')
    elif 'negative' in output.lower():
        predictions_5.append('negative')
    else:
        predictions_5.append('negative')

In [34]:
accuracy_score(true_labels, predictions_5)

0.5

In [35]:
precision_score(true_labels, predictions_5, pos_label='positive')

0.5

In [36]:
recall_score(true_labels, predictions_5, pos_label='positive')

1.0

In [37]:
f1_score(true_labels, predictions_5, pos_label='positive')

0.6666666666666666

In [38]:
# Ten-shot
examples_10 = 'Sentence: the food was terrible\nSentiment: negative\n\nSentence: great service\nSentiment: positive\n\nSentence: awful experience\nSentiment: negative\n\nSentence: loved it\nSentiment: positive\n\nSentence: not good\nSentiment: negative\n\nSentence: amazing place\nSentiment: positive\n\nSentence: worst meal ever\nSentiment: negative\n\nSentence: highly recommend\nSentiment: positive\n\nSentence: disappointing\nSentiment: negative\n\nSentence: fantastic food\nSentiment: positive'
examples_10

'Sentence: the food was terrible\nSentiment: negative\n\nSentence: great service\nSentiment: positive\n\nSentence: awful experience\nSentiment: negative\n\nSentence: loved it\nSentiment: positive\n\nSentence: not good\nSentiment: negative\n\nSentence: amazing place\nSentiment: positive\n\nSentence: worst meal ever\nSentiment: negative\n\nSentence: highly recommend\nSentiment: positive\n\nSentence: disappointing\nSentiment: negative\n\nSentence: fantastic food\nSentiment: positive'

In [39]:
sample_sentence = sentences_negative[15]
sample_sentence

'there chips are ok, but their salsa is really bland.'

In [40]:
prompt = f'{examples_10}\n\nSentence: {sample_sentence}\nSentiment: '
prompt

'Sentence: the food was terrible\nSentiment: negative\n\nSentence: great service\nSentiment: positive\n\nSentence: awful experience\nSentiment: negative\n\nSentence: loved it\nSentiment: positive\n\nSentence: not good\nSentiment: negative\n\nSentence: amazing place\nSentiment: positive\n\nSentence: worst meal ever\nSentiment: negative\n\nSentence: highly recommend\nSentiment: positive\n\nSentence: disappointing\nSentiment: negative\n\nSentence: fantastic food\nSentiment: positive\n\nSentence: there chips are ok, but their salsa is really bland.\nSentiment: '

In [41]:
tokens = tokenizer(prompt, return_tensors='pt').to('cuda')
output_ids = model.generate(tokens.input_ids, max_new_tokens=10)
tokenizer.decode(output_ids[0], skip_special_tokens=True)

"Sentence: the food was terrible\nSentiment: negative\n\nSentence: great service\nSentiment: positive\n\nSentence: awful experience\nSentiment: negative\n\nSentence: loved it\nSentiment: positive\n\nSentence: not good\nSentiment: negative\n\nSentence: amazing place\nSentiment: positive\n\nSentence: worst meal ever\nSentiment: negative\n\nSentence: highly recommend\nSentiment: positive\n\nSentence: disappointing\nSentiment: negative\n\nSentence: fantastic food\nSentiment: positive\n\nSentence: there chips are ok, but their salsa is really bland.\nSentiment:  mixed\n\nSentence: I'd"

In [42]:
predictions_10 = []

In [43]:
for sentence in test_sentences:
    prompt = f'{examples_10}\n\nSentence: {sentence}\nSentiment: '
    tokens = tokenizer(prompt, return_tensors='pt').to('cuda')
    output_ids = model.generate(tokens.input_ids, max_new_tokens=10)
    output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    
    if 'positive' in output.lower():
        predictions_10.append('positive')
    elif 'negative' in output.lower():
        predictions_10.append('negative')
    else:
        predictions_10.append('negative')

In [44]:
accuracy_score(true_labels, predictions_10)

0.5

In [45]:
precision_score(true_labels, predictions_10, pos_label='positive')

0.5

In [46]:
recall_score(true_labels, predictions_10, pos_label='positive')

1.0

In [47]:
f1_score(true_labels, predictions_10, pos_label='positive')

0.6666666666666666

In [None]:
# Овој модел не е подобар од моделите во претходните лабараториски вежби