In [1]:
import re
import torch
import gradio as gr
import torch
import nltk
import random
import numpy as np
import nlpaug.augmenter.word as naw
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from peft import LoraConfig, get_peft_model, TaskType
from datasets import load_dataset
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback, AutoModelForSeq2SeqLM, pipeline
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import confusion_matrix
from nltk.corpus import stopwords
from datasets import Dataset 

In [2]:
# Text cleaning(Remove non-ASCII characters, keep x20~x7E)
def clean_text(text):
    return re.sub(r"[^\x20-\x7E]", "", text)

# Load CSV
file_path = "/Users/tim/Desktop/self-learning/LLM_tune/all-data.csv"

dataset = load_dataset(
    "csv",
    data_files={"full": file_path},
    encoding="ISO-8859-1",
    column_names=["sentiment", "text"]
)["full"]

print("Dataset loaded. First example:")
print(dataset[0])

# Label mapping & cleaning
def map_label_and_clean(example):
    mapping = {"negative": 0, "neutral": 1, "positive": 2}
    example["text"] = clean_text(example["text"])
    example["labels"] = mapping[example["sentiment"]]
    return example

dataset = dataset.map(map_label_and_clean)

# Split data (80% train, 10% val, 10% test)
split_dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split_dataset["train"]
temp_dataset = split_dataset["test"]

temp_split = temp_dataset.train_test_split(test_size=0.5, seed=42)
validation_dataset = temp_split["train"]
test_dataset = temp_split["test"]

print("\nDataset split:")
print("Train rows:", len(train_dataset))
print("Validation rows:", len(validation_dataset))
print("Test rows:", len(test_dataset))


Dataset loaded. First example:
{'sentiment': 'neutral', 'text': 'According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .'}

Dataset split:
Train rows: 3876
Validation rows: 485
Test rows: 485


In [3]:
nltk.data.path.append("/Users/tim/nltk_data")

# Synonym augmentation using WordNet
syn_aug = naw.SynonymAug(
    aug_src='wordnet',
    aug_max=2,  # Max 2 word replacements per sentence
    aug_p=0.3   # 30% probability per word
)

In [4]:
# English to Chinese translation
en2zh_tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-zh")
en2zh_model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-zh")

# Chinese to English translation
zh2en_tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-zh-en")
zh2en_model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-zh-en")

def back_translate_en_zh_en(text):
    # English to Chinese
    inputs = en2zh_tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        zh_ids = en2zh_model.generate(**inputs)
    zh_text = en2zh_tokenizer.decode(zh_ids[0], skip_special_tokens=True)

    # Chinese to English
    inputs = zh2en_tokenizer(zh_text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        en_ids = zh2en_model.generate(**inputs)
    back_translated = zh2en_tokenizer.decode(en_ids[0], skip_special_tokens=True)

    return back_translated



In [None]:
# Download stopwords
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))


# Add Random Swap Augmenter 
random_swap_aug = naw.RandomWordAug(
    action="swap",
    aug_p=0.3,    # 30% probability of swapping words in a sentence
    aug_min=1,    # Perform at least one swap
    aug_max=2     # Perform at most two swaps
)

def random_swap_augmentation(text):
    """Perform word-level augmentation using nlpaug's RandomWordAug(action="swap")."""
    return random_swap_aug.augment(text)

def news_headline_style(text):
    """Convert a sentence into 'news headline style' by removing stopwords."""
    words = text.split()
    important_words = [w for w in words if w.lower() not in stop_words]
    return " ".join(important_words)

# Add a unify_text function to ensure augmented outputs are always strings
def unify_text(aug_text):
    """
    Convert augmented output (possibly a list) into a string.
    If it is any other type (e.g., None), convert forcibly to string to avoid errors
    when using Dataset.from_list().
    """
    if isinstance(aug_text, list):
        aug_text = " ".join(aug_text)
    elif not isinstance(aug_text, str):
        aug_text = str(aug_text)
    return aug_text

# Perform multiple augmentations
augmented_data = []
for example in train_dataset:
    text = example["text"]
    label = example["labels"]

    # Original text
    augmented_data.append({"text": text, "labels": label})

    # Synonym replacement (SynonymAug)
    aug_text_syn = syn_aug.augment(text)
    aug_text_syn = unify_text(aug_text_syn)
    augmented_data.append({"text": aug_text_syn, "labels": label})

    # Back translation (English → Chinese → English)
    aug_text_bt = back_translate_en_zh_en(text)
    aug_text_bt = unify_text(aug_text_bt)
    augmented_data.append({"text": aug_text_bt, "labels": label})

    # Random Swap Augmentation 
    aug_text_swap = random_swap_augmentation(text)
    aug_text_swap = unify_text(aug_text_swap)
    if aug_text_swap != text:
        augmented_data.append({"text": aug_text_swap, "labels": label})

    # News Headline Style
    aug_text_news = news_headline_style(text)
    aug_text_news = unify_text(aug_text_news)
    if aug_text_news != text:
        augmented_data.append({"text": aug_text_news, "labels": label})


[nltk_data] Downloading package stopwords to /Users/tim/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Convert augmented data to dataset
aug_train_dataset = Dataset.from_list(augmented_data)
print("Augmented train dataset size:", len(aug_train_dataset))

Augmented train dataset size: 19363


In [7]:
# Load model & tokenizer
model_name = "meta-llama/Llama-3.2-1B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer.pad_token = tokenizer.eos_token  # Avoid padding error

# Tokenization
def tokenize_function(examples):
    tokenized = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=256  # Reduce memory usage
    )
    tokenized["labels"] = examples["labels"]
    return tokenized

train_tokenized = train_dataset.map(tokenize_function, batched=True)
validation_tokenized = validation_dataset.map(tokenize_function, batched=True)
test_tokenized = test_dataset.map(tokenize_function, batched=True)

print("\nTokenized train first example:")
print(train_tokenized[0])


Map:   0%|          | 0/485 [00:00<?, ? examples/s]


Tokenized train first example:
{'sentiment': 'neutral', 'text': "Under the terms of the agreement , Bunge will acquire Raisio 's Keiju , Makuisa and Pyszny Duet brands and manufacturing plants in Finland and Poland .", 'labels': 1, 'input_ids': [128000, 16648, 279, 3878, 315, 279, 9306, 1174, 426, 14208, 690, 21953, 432, 2852, 822, 364, 82, 6706, 64274, 1174, 40424, 9425, 64, 323, 393, 73445, 3919, 423, 14127, 16097, 323, 15266, 11012, 304, 37355, 323, 28702, 662, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 12

In [8]:

# Baseline Llama evaluation

# Define metrics
def compute_metrics_baseline(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average="weighted")
    return {"accuracy": acc, "f1": f1}

# Load the original Llama model (without LoRA)
baseline_model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
    trust_remote_code=True
)

# Create inference-specific TrainingArguments
baseline_inference_args = TrainingArguments(
    output_dir="./llama_baseline_check",
    do_train=False,
    do_eval=True,
    per_device_eval_batch_size=1,
    logging_steps=10
)

# Create Trainer (only for evaluation, not training)
baseline_trainer = Trainer(
    model=baseline_model,
    args=baseline_inference_args,
    compute_metrics=compute_metrics_baseline
)

# Evaluate the baseline model on the test set
baseline_results = baseline_trainer.evaluate(eval_dataset=test_tokenized)
print("\n=== Baseline Llama 3.2-1B (No Fine-tuning) on Test Set ===")
print(baseline_results)

# Confusion matrix
pred_output = baseline_trainer.predict(test_tokenized)
pred_labels = np.argmax(pred_output.predictions, axis=-1)
true_labels = pred_output.label_ids
cm_base = confusion_matrix(true_labels, pred_labels)
print("\nConfusion Matrix (Baseline Llama):")
print(cm_base)


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



=== Baseline Llama 3.2-1B (No Fine-tuning) on Test Set ===
{'eval_loss': 3.2506046295166016, 'eval_model_preparation_time': 0.001, 'eval_accuracy': 0.1402061855670103, 'eval_f1': 0.07961807454339065, 'eval_runtime': 69.9626, 'eval_samples_per_second': 6.932, 'eval_steps_per_second': 6.932}

Confusion Matrix (Baseline Llama):
[[ 48   0  12]
 [240   0  42]
 [122   1  20]]


In [9]:
# Load Llama 3.2-1B Instruct
base_model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
    trust_remote_code=True,
)

# LoRA Configuration
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_CLS  # Sequence classification
)

model = get_peft_model(base_model, lora_config)
print("LoRA model created for Llama 3.2-1B-Instruct.")


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'
LoRA model created for Llama 3.2-1B-Instruct.


In [10]:
# Define evaluation metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average="weighted")
    return {"accuracy": acc, "f1": f1}

# Training parameters
training_args = TrainingArguments(
    output_dir="./llama3-lora-aug-finetuned",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    per_device_train_batch_size=1,  
    per_device_eval_batch_size=1,
    num_train_epochs=10,  # EarlyStoppingCallback will stop early if needed
    learning_rate=2e-5,
    # weight_decay=0.01,
    fp16=False,  # False for Apple MPS
    push_to_hub=False,
    logging_steps=50,
)

# Early Stopping (stops if no improvement for 2 evaluations)
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=2,
    early_stopping_threshold=0.0
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=validation_tokenized,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping]
)

# Start fine-tuning
train_result = trainer.train()
trainer.save_model("./llama3-lora-aug-finetuned")
print("Training complete. Best model saved.")


No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6426,0.911376,0.839175,0.835858
2,0.5777,0.710816,0.872165,0.872043
3,0.4933,0.752732,0.863918,0.86371
4,0.4176,1.118943,0.861856,0.856818



Invalid credentials in Authorization header - silently ignoring the lookup for the file config.json in meta-llama/Llama-3.2-1B-Instruct.

Invalid credentials in Authorization header - silently ignoring the lookup for the file config.json in meta-llama/Llama-3.2-1B-Instruct.

Invalid credentials in Authorization header - silently ignoring the lookup for the file config.json in meta-llama/Llama-3.2-1B-Instruct.

Invalid credentials in Authorization header - silently ignoring the lookup for the file config.json in meta-llama/Llama-3.2-1B-Instruct.


Training complete. Best model saved.



Invalid credentials in Authorization header - silently ignoring the lookup for the file config.json in meta-llama/Llama-3.2-1B-Instruct.


In [13]:
# Evaluate on test set
test_results = trainer.evaluate(eval_dataset=test_tokenized)
print("\nTest set evaluation results:")
print(test_results)

# Confusion matrix
predictions = trainer.predict(test_tokenized)
pred_labels = np.argmax(predictions.predictions, axis=-1)
true_labels = predictions.label_ids

cm = confusion_matrix(true_labels, pred_labels)
print("\nConfusion Matrix:")
print(cm)



Test set evaluation results:
{'eval_loss': 0.8149625062942505, 'eval_accuracy': 0.8494845360824742, 'eval_f1': 0.8503986345937338, 'eval_runtime': 67.7065, 'eval_samples_per_second': 7.163, 'eval_steps_per_second': 7.163, 'epoch': 4.0}

Confusion Matrix:
[[ 48   9   3]
 [  5 243  34]
 [  2  20 121]]


In [12]:
# Gradio Interface

device = torch.device("mps")  
model.to(device)

# Labels & inference function
label_names = ["negative", "neutral", "positive"]

def predict_sentiment(text):
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=512
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    with torch.no_grad():
        outputs = model(**inputs)
        pred_label = outputs.logits.argmax(dim=-1).item()
    
    return label_names[pred_label]

# Create Gradio interface
demo = gr.Interface(
    fn=predict_sentiment,
    inputs="text",
    outputs="text",
    title="Llama 3.2-1B-Instruct + LoRA + EarlyStopping"
)

# Launch interface
demo.launch(share=True)


* Running on local URL:  http://127.0.0.1:7866


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


* Running on public URL: https://99361ec9ae84643763.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


