# Bert + text augementation

In [1]:
import re
import random
import numpy as np
import nltk
import nlpaug.augmenter.word as naw
import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM ,TrainingArguments, Trainer,  EarlyStoppingCallback, AutoModelForSequenceClassification, pipeline
from datasets import load_dataset, Dataset
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from nltk.corpus import stopwords


In [2]:
# Text cleaning (remove non-ASCII characters, keep x20~x7E)
def clean_text(text):
    return re.sub(r"[^\x20-\x7E]", "", text)

# Load CSV
file_path = "/Users/tim/Desktop/self-learning/LLM_tune/all-data.csv"

dataset = load_dataset(
    "csv",
    data_files={"full": file_path},
    encoding="ISO-8859-1",
    column_names=["sentiment", "text"]
)["full"]

print("Dataset loaded. First example:")
print(dataset[0])

# Label mapping & text cleaning
def map_label_and_clean(example):
    mapping = {"negative": 0, "neutral": 1, "positive": 2}
    example["text"] = clean_text(example["text"])
    example["labels"] = mapping[example["sentiment"]]
    return example

dataset = dataset.map(map_label_and_clean)

# Split data (80% Train, 10% Val, 10% Test)
split_dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split_dataset["train"]
temp_dataset = split_dataset["test"]

temp_split = temp_dataset.train_test_split(test_size=0.5, seed=42)
validation_dataset = temp_split["train"]
test_dataset = temp_split["test"]

print("\nDataset split:")
print("Train rows:", len(train_dataset))
print("Validation rows:", len(validation_dataset))
print("Test rows:", len(test_dataset))

Dataset loaded. First example:
{'sentiment': 'neutral', 'text': 'According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .'}

Dataset split:
Train rows: 3876
Validation rows: 485
Test rows: 485


In [3]:
nltk.data.path.append("/Users/tim/nltk_data")

# Synonym augmentation using WordNet
syn_aug = naw.SynonymAug(
    aug_src='wordnet',
    aug_max=2,  # Max 2 word replacements per sentence
    aug_p=0.3   # 30% probability per word
)

In [4]:
# English to Chinese translation
en2zh_tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-zh")
en2zh_model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-zh")

# Chinese to English translation
zh2en_tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-zh-en")
zh2en_model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-zh-en")

def back_translate_en_zh_en(text):
    # English to Chinese
    inputs = en2zh_tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        zh_ids = en2zh_model.generate(**inputs)
    zh_text = en2zh_tokenizer.decode(zh_ids[0], skip_special_tokens=True)

    # Chinese to English
    inputs = zh2en_tokenizer(zh_text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        en_ids = zh2en_model.generate(**inputs)
    back_translated = zh2en_tokenizer.decode(en_ids[0], skip_special_tokens=True)

    return back_translated



In [5]:
nltk.download("stopwords")
stop_words = set(stopwords.words("english")) 

def bert_masked_augmentation(text):
    """Masked language model augmentation using FinBERT"""
    fill_mask = pipeline("fill-mask", model="bert-base-uncased")
    words = text.split()
    if len(words) < 3:
        return text
    masked_index = random.randint(0, len(words) - 1)
    masked_text = " ".join(words[:masked_index] + ["[MASK]"] + words[masked_index+1:])
    predictions = fill_mask(masked_text)
    return masked_text.replace("[MASK]", predictions[0]["token_str"])

def news_headline_style(text):
    """Convert sentence to news headline style."""
    words = text.split()
    important_words = [word for word in words if word.lower() not in stop_words]
    return " ".join(important_words)

# Data augmentation
augmented_data = []
for example in train_dataset:
    text = example["text"]
    label = example["labels"]
    
    # Original text
    augmented_data.append({"text": text, "labels": label})
    
    # Synonym replacement (SynonymAug)
    aug_text_syn = syn_aug.augment(text)
    if isinstance(aug_text_syn, list):  # Ensure string format
        aug_text_syn = " ".join(aug_text_syn)
    augmented_data.append({"text": aug_text_syn, "labels": label})

    # Back-translation (English → Chinese → English)
    aug_text_bt = back_translate_en_zh_en(text)
    if isinstance(aug_text_bt, list):  # Ensure string format
        aug_text_bt = " ".join(aug_text_bt)
    augmented_data.append({"text": aug_text_bt, "labels": label})

    # BERT Masked Augmentation
    aug_text_bert = bert_masked_augmentation(text)
    if aug_text_bert != text:
        augmented_data.append({"text": aug_text_bert, "labels": label})

    # News Headline Style Augmentation
    aug_text_news = news_headline_style(text)
    if aug_text_news != text:
        augmented_data.append({"text": aug_text_news, "labels": label})

[nltk_data] Downloading package stopwords to /Users/tim/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use mps:0
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cl

In [6]:
# Convert augmented data to dataset
aug_train_dataset = Dataset.from_list(augmented_data)
print("Augmented train dataset size:", len(aug_train_dataset))

Augmented train dataset size: 17683


In [7]:
# Load FinBERT tokenizer
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenization function
def tokenize_function(examples):
    tokenized = tokenizer(
        examples["text"],
        padding="max_length",  # Ensure uniform sequence length
        truncation=True,  # Truncate longer texts
        max_length=512  # Set max token length
    )
    tokenized["labels"] = examples["labels"]  # Preserve labels
    return tokenized

# Apply tokenization to datasets
train_tokenized = aug_train_dataset.map(tokenize_function, batched=True)  # Using augmented data
validation_tokenized = validation_dataset.map(tokenize_function, batched=True)
test_tokenized = test_dataset.map(tokenize_function, batched=True)

print("\nTokenized train first example:")
print(train_tokenized[0])

Map:   0%|          | 0/17683 [00:00<?, ? examples/s]


Tokenized train first example:
{'text': "Under the terms of the agreement , Bunge will acquire Raisio 's Keiju , Makuisa and Pyszny Duet brands and manufacturing plants in Finland and Poland .", 'labels': 1, 'input_ids': [101, 2104, 1996, 3408, 1997, 1996, 3820, 1010, 21122, 3351, 2097, 9878, 15547, 20763, 1005, 1055, 26679, 9103, 1010, 5003, 5283, 14268, 1998, 1052, 7274, 2480, 4890, 11979, 9639, 1998, 5814, 4264, 1999, 6435, 1998, 3735, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [8]:
# Load BERT for 3-class classification
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)
print("BERT model loaded successfully.")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERT model loaded successfully.


In [9]:
# Define evaluation metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average="weighted")
    return {"accuracy": acc, "f1": f1}

# Training parameters
training_args = TrainingArguments(
    output_dir="./finbert-finetuned(aug)",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=10,  
    learning_rate=3e-5,
    warmup_steps=500,  
    # weight_decay=0.01,
    fp16=False,  
    push_to_hub=False,
    logging_steps=50,
    metric_for_best_model="f1",  # Metric for early stopping
)

# Trainer with Early Stopping
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=validation_tokenized,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(
        early_stopping_patience=2,  # Stop if no F1 improvement for 2 epochs
        early_stopping_threshold=0.0
    )]
)

# Start fine-tuning
train_result = trainer.train()
trainer.save_model("./bert-finetuned(aug)")
print("Training complete. Best model saved.")


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3977,0.718986,0.861856,0.862214
2,0.1844,0.857723,0.849485,0.850704
3,0.048,1.195674,0.847423,0.848181


Training complete. Best model saved.


In [10]:
# Evaluate on test set
test_results = trainer.evaluate(eval_dataset=test_tokenized)
print("\nTest set evaluation results:")
print(test_results)

# Compute confusion matrix
predictions = trainer.predict(test_tokenized)
pred_labels = np.argmax(predictions.predictions, axis=-1)
true_labels = predictions.label_ids

cm = confusion_matrix(true_labels, pred_labels)
print("\nConfusion Matrix:")
print(cm)



Test set evaluation results:
{'eval_loss': 1.2452267408370972, 'eval_accuracy': 0.8309278350515464, 'eval_f1': 0.8317147661793209, 'eval_runtime': 13.8917, 'eval_samples_per_second': 34.913, 'eval_steps_per_second': 8.782, 'epoch': 3.0}

Confusion Matrix:
[[ 48  10   2]
 [  6 240  36]
 [  4  24 115]]


In [11]:
# Gradio Interface

device = torch.device("mps")  
model.to(device)

# Labels & inference function
label_names = ["negative", "neutral", "positive"]

def predict_sentiment(text):
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=512
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    with torch.no_grad():
        outputs = model(**inputs)
        pred_label = outputs.logits.argmax(dim=-1).item()
    
    return label_names[pred_label]

# Create Gradio interface
demo = gr.Interface(
    fn=predict_sentiment,
    inputs="text",
    outputs="text",
    title="Finbert + text augementation"
)

# Launch interface
demo.launch(share=True)


* Running on local URL:  http://127.0.0.1:7864


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


* Running on public URL: https://3ff6474044e28ddf1f.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


