In [1]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
train = pd.read_csv('yelp/train_en.txt', sep='\t')[['Sentence', 'Style']]
val   = pd.read_csv('yelp/val_en.txt', sep='\t')[['Sentence', 'Style']]
test  = pd.read_csv('yelp/test_en.txt', sep='\t')[['Sentence', 'Style']]

In [3]:
label_map = {'positive': 1, 'negative': 0}
for df in [train, val, test]:
    df['Label'] = df['Style'].map(label_map)

In [4]:
train_small = train.sample(5000, random_state=42)
val_small   = val.sample(2000, random_state=42)
test_small  = test.sample(2000, random_state=42)

In [5]:
few_shot_examples = (
    "Text: I love this restaurant!\nClass: POSITIVE\n"
    "Text: The food was terrible.\nClass: NEGATIVE\n"
    "Text: The waiter was very friendly.\nClass: POSITIVE\n"
    "Text: I will never come back here.\nClass: NEGATIVE\n"
)

In [6]:
def classify_with_model(model_name, sentences, examples=None, max_new_tokens=10, device=None):

    device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
    
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
    
    preds = []
    with torch.no_grad():
        for sent in sentences:
            if examples:
                prompt = f"{examples}\nClassify the following text into either 'POSITIVE' or 'NEGATIVE': {sent}\nClass:"
            else:
                prompt = f"Classify the following text into either 'POSITIVE' or 'NEGATIVE': {sent}"
            
            tokens = tokenizer(prompt, return_tensors="pt", truncation=True).to(device)
            output_ids = model.generate(tokens.input_ids, max_new_tokens=max_new_tokens)
            decoded = tokenizer.decode(output_ids[0], skip_special_tokens=True).strip().lower()
            
            if 'positive' in decoded:
                preds.append(1)
            elif 'negative' in decoded:
                preds.append(0)
            elif 'good' in decoded or 'great' in decoded or 'love' in decoded or 'excellent' in decoded:
                preds.append(1)
            elif 'bad' in decoded or 'terrible' in decoded or 'hate' in decoded or 'poor' in decoded:
                preds.append(0)
            else:
                preds.append(0)
    
    return preds


In [7]:
test_sentences_small = test_small['Sentence'].tolist()
true_labels_small   = test_small['Label'].tolist()

In [8]:
zs_preds_t5 = classify_with_model('t5-base', test_sentences_small, examples=None)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [9]:
fs_preds_t5 = classify_with_model('t5-base', test_sentences_small, examples=few_shot_examples)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [10]:
zs_preds_bart = classify_with_model('facebook/bart-large', test_sentences_small, examples=None)

In [11]:
fs_preds_bart = classify_with_model('facebook/bart-large', test_sentences_small, examples=few_shot_examples)

In [12]:
def evaluate(preds, labels):
    filtered_labels = [l for p, l in zip(preds, labels) if p != -1]
    filtered_preds = [p for p in preds if p != -1]
    return {
        'accuracy': accuracy_score(filtered_labels, filtered_preds),
        'precision': precision_score(filtered_labels, filtered_preds),
        'recall': recall_score(filtered_labels, filtered_preds),
        'f1': f1_score(filtered_labels, filtered_preds)
    }

In [13]:
zs_results_t5 = evaluate(zs_preds_t5, true_labels_small)
fs_results_t5 = evaluate(fs_preds_t5, true_labels_small)

In [14]:
zs_results_bart = evaluate(zs_preds_bart, true_labels_small)
fs_results_bart = evaluate(fs_preds_bart, true_labels_small)

In [15]:
print("T5 Zero-shot:", zs_results_t5)
print("T5 Few-shot:", fs_results_t5)
print("BART Zero-shot:", zs_results_bart)
print("BART Few-shot:", fs_results_bart)


T5 Zero-shot: {'accuracy': 0.696, 'precision': 0.7515078407720145, 'recall': 0.8640776699029126, 'f1': 0.8038709677419354}
T5 Few-shot: {'accuracy': 0.386, 'precision': 0.7018867924528301, 'recall': 0.2579750346740638, 'f1': 0.3772819472616633}
BART Zero-shot: {'accuracy': 0.2795, 'precision': 0.5454545454545454, 'recall': 0.004160887656033287, 'f1': 0.008258774948382657}
BART Few-shot: {'accuracy': 0.718, 'precision': 0.7210473313192346, 'recall': 0.9930651872399445, 'f1': 0.8354725787631272}


In [33]:
# Овој модел е полош од моделот во првата задача и полош од задачите од претходната лабараториска вежба