In [1]:
pip install transformers[torch] pandas numpy datasets

Collecting pandas
  Downloading pandas-2.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting transformers[torch]
  Downloading transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers[torch])
  Downloading huggingface_hub-0.32.3-py3-none-any.whl.metadata (14 kB)
Collecting regex!=2019.12.17 (from transformers[torch])
  Downloading regex-2024.11.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.5/40.5 kB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.22,>=0.21 (from transformers[torch])
  Downloading tokenizers-0.21.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metada

In [3]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, Dataset
import pandas as pd
import numpy as np

In [6]:
def load_dataset_yelp(csv_path="yelp_academic_dataset_review.csv", test_size=0.2, seed=42):
    """Read the review CSV and split it into train/test partitions.

    Args:
        csv_path: Path of the CSV file to read. Defaults to the file name the
            notebook has always used, so existing zero-argument calls still work.
        test_size: Fraction (or absolute count) of rows placed in the "test" split.
        seed: Seed for the shuffle performed before splitting. A fixed default makes
            the split reproducible across kernel restarts; pass ``None`` to get a
            different random split on every call.

    Returns:
        The result of ``Dataset.train_test_split``: a mapping with "train" and
        "test" datasets built from the CSV rows.
    """
    reviews_df = pd.read_csv(csv_path)
    dataset = Dataset.from_pandas(reviews_df)
    return dataset.train_test_split(test_size=test_size, seed=seed)


In [19]:
def train_bert_model(dataset, num_labels=3, output_dir="./results", num_train_epochs=2):
    """Fine-tune ``bert-base-uncased`` for sequence classification.

    Args:
        dataset: Mapping with "train" and "test" splits whose rows have a "text"
            column. NOTE(review): the label column consumed by the Trainer is not
            visible in this notebook; it is assumed to hold integer class ids in
            ``range(num_labels)`` -- confirm against the CSV.
        num_labels: Size of the classification head (default 3, as before).
        output_dir: Directory handed to ``TrainingArguments`` (default "./results").
        num_train_epochs: Number of passes over the training split (default 2).

    Returns:
        ``(model, tokenizer)``: the fine-tuned model, moved to CUDA when available
        and to CPU otherwise, plus the tokenizer used to encode the text.
    """
    # Function-scope import so this cell does not depend on editing the imports cell.
    from transformers import DataCollatorWithPadding

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    def tokenize(example):
        # Only truncate here. Padding is deferred to the data collator so each batch
        # is padded to its own longest sequence instead of always to the 512-token
        # model maximum, which wastes memory and compute on short reviews.
        return tokenizer(example["text"], truncation=True)

    tokenized_datasets = dataset.map(tokenize, batched=True)
    model = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased", num_labels=num_labels
    )

    # No checkpoints are written and no evaluation runs during training, so the
    # checkpoint-related options (load_best_model_at_end, save_total_limit) would be
    # silent no-ops and are intentionally not set.
    training_args = TrainingArguments(
        output_dir=output_dir,
        eval_strategy="no",
        logging_strategy="epoch",
        save_strategy="no",
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=num_train_epochs,
        weight_decay=0.01,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["test"],
        # Dynamic per-batch padding (replaces padding="max_length" at tokenization).
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    )
    trainer.train()
    return model.to("cuda" if torch.cuda.is_available() else "cpu"), tokenizer

# ---- Step 3: Inference Function ----
def analyze_sentiment(model, tokenizer, review_text):
    """Classify a single review as "Negative", "Neutral" or "Positive".

    Args:
        model: Classifier whose output exposes ``.logits`` with one row per input;
            class indices 0/1/2 map to Negative/Neutral/Positive.
        tokenizer: Callable that encodes ``review_text`` into a batch object
            supporting ``.to(device)`` and ``**`` unpacking.
        review_text: One review string (the result is read with ``.item()``, so
            exactly one prediction is expected).

    Returns:
        The sentiment label for the highest-scoring class.
    """
    # Send the inputs to wherever the model actually lives. Choosing "cuda if
    # available" unconditionally breaks with a device-mismatch error when a
    # CPU-resident model is used on a GPU machine.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        # Parameter-free model: keep the previous device-selection behaviour.
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    inputs = tokenizer(
        review_text, return_tensors="pt", padding=True, truncation=True, max_length=512
    ).to(device)

    model.eval()  # switch to evaluation mode before predicting
    with torch.no_grad():
        outputs = model(**inputs)
        prediction = torch.argmax(outputs.logits, dim=1).item()
    return ["Negative", "Neutral", "Positive"][prediction]


# Script-style driver: load the data, fine-tune the model, then print predictions
# for a few hand-written reviews. The names assigned here (dataset, model,
# tokenizer, ...) land in the notebook's global namespace; the later save cells
# read `model` and `tokenizer` from it.
if __name__ == "__main__":
    # Read the review CSV and split it into train/test partitions.
    dataset = load_dataset_yelp()
    # Fine-tune BERT on the split; returns the trained model and its tokenizer.
    model, tokenizer = train_bert_model(dataset)

    # Smoke-test inputs: one clearly positive, one lukewarm, one clearly negative.
    examples = [
        "I absolutely loved the pizza! Best in town.",
        "The service was okay, nothing special.",
        "Terrible experience. The food was cold and bland."
    ]

    # Classify each example and print the review next to its predicted label.
    for review in examples:
        sentiment = analyze_sentiment(model, tokenizer, review)
        print(f"Review: {review}\nPredicted Sentiment: {sentiment}\n")

Map:   0%|          | 0/7999 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
1000,0.6201
2000,0.3526


Review: I absolutely loved the pizza! Best in town.
Predicted Sentiment: Positive

Review: The service was okay, nothing special.
Predicted Sentiment: Neutral

Review: Terrible experience. The food was cold and bland.
Predicted Sentiment: Negative



In [21]:
# Persist only the fine-tuned parameters (the state dict), not the whole model object.
torch.save(obj=model.state_dict(), f='model_weights.pth')

In [22]:
# Define the target directory in this cell so it runs on a fresh kernel
# (Restart & Run All) instead of relying on a variable left over from a
# cell that no longer exists in the notebook.
save_directory = "./trained_model"
tokenizer.save_pretrained(save_directory)

('./trained_model/tokenizer_config.json',
 './trained_model/special_tokens_map.json',
 './trained_model/vocab.txt',
 './trained_model/added_tokens.json')