# Import required packages

In [None]:
pip install transformers datasets torch scikit-learn


**BertForSequenceClassification** is a model class provided by the Hugging Face Transformers library. It wraps the pre-trained BERT (Bidirectional Encoder Representations from Transformers) encoder with a classification layer on top, so that it can be fine-tuned for sequence classification tasks such as sentiment analysis.

# Read the data

In [None]:
import pandas as pd

# Load the sampled IMDB reviews from the Excel workbook and preview the first rows
data = pd.read_excel(io='/content/IMDB Dataset_sample.xlsx')
data.head(5)

# Preprocess the data

- Text data generally needs preprocessing (cleaning) before it is used for training.

- That step is left as an exercise for you.

# Train test split

In [None]:
from sklearn.model_selection import train_test_split
from datasets import Dataset

# Convert sentiment to numerical labels.
# The mapping is applied only while the column still contains the raw strings,
# so re-running this cell does not turn the already-encoded 0/1 labels into NaN.
label_map = {"positive": 1, "negative": 0}
if data['sentiment'].isin(list(label_map)).any():
    data['sentiment'] = data['sentiment'].map(label_map)

# Fail early on missing or unexpected labels (values outside the mapping become
# NaN after .map) instead of passing them on to training.
invalid_labels = ~data['sentiment'].isin(list(label_map.values()))
if invalid_labels.any():
    raise ValueError(
        f"'sentiment' contains {int(invalid_labels.sum())} value(s) that are not "
        f"one of {list(label_map)}"
    )

# Split the data into train and test sets (fixed seed for a reproducible split)
train_df, test_df = train_test_split(data, test_size=0.2, random_state=42)

# Convert to Hugging Face Dataset format
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

In [None]:
# Show the type of the training split returned by train_test_split
type(train_df)

In [None]:
# Show the training split after conversion with Dataset.from_pandas
train_dataset

In [None]:
# Peek at the first five training rows
train_dataset[:5]

# Tokenize the Text

Use the BERT tokenizer to preprocess the text.

**1. padding="max_length"**

**Purpose:** Ensures that all tokenized sequences have the same length.

**How It Works:**
If a sequence is shorter than max_length, it will be padded with a special padding token (e.g., [PAD]) until it reaches max_length.

If a sequence is longer than max_length, padding leaves it unchanged; shortening it is the job of truncation (described next).

**Why It’s Important:**
Models like BERT require inputs of uniform length for batch processing.
This ensures that all sequences in a batch can be processed simultaneously.

**2. truncation=True**

**Purpose:** Ensures that sequences longer than max_length are truncated to fit within the limit.

**How It Works:**
If a sequence exceeds the max_length, it will be truncated by removing tokens from the end of the sequence.

**Why It’s Important:**
Prevents exceeding the model's maximum input size, which could cause errors or inefficiency.
Useful for handling datasets with highly variable text lengths.

**3. max_length=128**

**Purpose:** Sets the desired length for input sequences.

**How It Works:**
Determines the fixed size for all tokenized sequences after padding/truncation.

**Why It’s Important:**
Most pre-trained models have a maximum token limit (e.g., 512 for BERT).
Reducing max_length to 128 saves computation time and memory, especially if most sequences are shorter than 128 tokens.

In [None]:
from transformers import BertTokenizer

# Load the tokenizer that belongs to the "bert-base-uncased" checkpoint
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def tokenize_function(examples):
    """Tokenize the "review" column of a batch of examples.

    Each review is truncated and padded to a fixed length of 128 tokens
    (see the explanation of padding / truncation / max_length above).

    Args:
        examples: a batch from the dataset, indexable by column name.

    Returns:
        The tokenizer output for the batch's reviews.
    """
    reviews = examples["review"]
    encoded = tokenizer(
        reviews,
        max_length=128,
        truncation=True,
        padding="max_length",
    )
    return encoded


# Run the tokenizer over both splits, one batch of rows at a time
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)
train_dataset

In [None]:
# List the keys available on the first training example after tokenization
train_dataset[0].keys()

In [None]:
# Remove unnecessary columns.
# Only columns that are still present are dropped, so the cell can be re-run
# without raising on columns that were already removed.
unused_columns = ["review", "__index_level_0__"]

train_drop = [name for name in unused_columns if name in train_dataset.column_names]
if train_drop:
    train_dataset = train_dataset.remove_columns(train_drop)

test_drop = [name for name in unused_columns if name in test_dataset.column_names]
if test_drop:
    test_dataset = test_dataset.remove_columns(test_drop)

train_dataset

**Note**

- Transformers models expect the target column to be named **labels**.

- Here our target column is named **sentiment**.

- Before converting the data to PyTorch format, make sure to rename the column from sentiment to labels.

In [None]:
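# Set the format for PyTorch
# (intentionally left commented out: the 'labels' column does not exist until
#  'sentiment' is renamed in the next cell, so this would fail if run here)
# train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
# test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])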

In [None]:
# Rename the sentiment column to labels (the name the model expects for targets).
# Guarded so that re-running the cell after the rename is a no-op, not an error.
if "sentiment" in train_dataset.column_names:
    train_dataset = train_dataset.rename_column("sentiment", "labels")
if "sentiment" in test_dataset.column_names:
    test_dataset = test_dataset.rename_column("sentiment", "labels")

In [None]:
# Return only the model inputs and the target column, as PyTorch tensors
torch_columns = ['input_ids', 'attention_mask', 'labels']
for split in (train_dataset, test_dataset):
    split.set_format(type='torch', columns=list(torch_columns))
train_dataset

In [None]:
# Inspect the first training example after set_format(type='torch', ...)
train_dataset[0]

**Everything so far, all together**

In [None]:
from datasets import Dataset
from transformers import BertTokenizer

# Rebuild the datasets from the pandas splits so that this consolidated cell is
# self-contained: it works whether or not the step-by-step cells above (which
# already removed the "review" column) have been run, and it can be re-run safely.
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenization function
def tokenize_function(examples):
    """Tokenize a batch of reviews, padded/truncated to a fixed length of 128 tokens."""
    return tokenizer(
        examples["review"],
        padding="max_length",
        truncation=True,
        max_length=128
    )

# Tokenize datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Remove unnecessary columns
train_dataset = train_dataset.remove_columns(["review", "__index_level_0__"])
test_dataset = test_dataset.remove_columns(["review", "__index_level_0__"])

# Rename the sentiment column to labels
train_dataset = train_dataset.rename_column("sentiment", "labels")
test_dataset = test_dataset.rename_column("sentiment", "labels")

# Set the format for PyTorch
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])



# Initialize the Model

1.   BertForSequenceClassification
2.   AutoModelForSequenceClassification



Load the BertForSequenceClassification model.

In [None]:
from transformers import BertForSequenceClassification

# Pre-trained "bert-base-uncased" weights with a two-class (binary) classification output
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)


# Train the model

In [None]:
import os

from huggingface_hub import login

# `login` receives the access token through its `token` argument; there is no
# `hf_token` keyword, so the previous call raised a TypeError.
# The token is read from the HF_TOKEN environment variable rather than being
# pasted into the notebook; when it is unset (None), `login` falls back to its
# interactive prompt.
login(token=os.environ.get("HF_TOKEN"))

In [None]:
import dataclasses

from transformers import TrainingArguments, Trainer

# The "evaluate every epoch" option was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases, and the old name is rejected
# by current versions. Use whichever field the installed version defines.
_training_arg_names = {field.name for field in dataclasses.fields(TrainingArguments)}
eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",          # checkpoints are written here
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=1000,
    **{eval_strategy_key: "epoch"},  # run evaluation at the end of every epoch
)
training_args

**Trainer**

- The trainer requires

    - Model

    - arguments

    - Train datasets

    - Eval dataset

In [None]:
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

trainer


In [None]:
# Fine-tune before evaluating: without this call the model that gets evaluated,
# saved and used for prediction below has never been trained on the reviews.
trainer.train()

# Evaluate on the held-out split passed to the Trainer as eval_dataset
results = trainer.evaluate()
print(results)


# Save the model

In [None]:
# `model` is the variable the classifier was assigned to when it was loaded;
# `tokenizer` is the variable the tokenizer was assigned to when it was loaded.
# The tokenizer produces the token ids and the attention mask.
# The model takes the token ids and attention mask, builds the embeddings
# internally, and is the part that gets trained.
# Both are written to the same directory so they can be reloaded together.
save_dir = "./sentiment_model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)


# Load the saved model

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Restore the classifier and its tokenizer from the directory they were saved to
saved_dir = "./sentiment_model"
model = BertForSequenceClassification.from_pretrained(saved_dir)
tokenizer = BertTokenizer.from_pretrained(saved_dir)


# Predict on unseen data

In [None]:
import numpy as np

review = "I really not like this movie but The story was good!"

# Turn the raw text into PyTorch tensors (token ids, attention mask, ...)
inputs = tokenizer(
        review,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=256
    )

print(inputs)

# Model and inputs must live on the same device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # inference mode: disables dropout even if the model was just trained
inputs = {key: value.to(device) for key, value in inputs.items()}

# No gradients are needed for prediction
with torch.no_grad():
    outputs = model(**inputs)
    print(outputs)

# Take the argmax over the class dimension; without `dim` the logits are
# flattened, which only happens to give the class index for a single review.
torch.argmax(outputs.logits, dim=-1)

**Logits and probabilities**

- Logits are the raw output values of the model.

- Logits can be any real values, positive or negative.

- Logits are passed through an activation function, either sigmoid or softmax.

- Those activation functions turn the logits into probabilities.

In [None]:
def predict_sentiment(review, model, tokenizer):
    """Predict the sentiment of one review or of a list of reviews.

    Args:
        review: a single review string, or a list of review strings.
        model: sequence-classification model whose output exposes `.logits`,
            with class index 1 meaning "positive" and anything else "negative".
        tokenizer: callable that turns the text into a mapping of PyTorch tensors.

    Returns:
        "positive" / "negative" for a single string; a list of those labels
        (one per review, in order) when a list of reviews is given.

    Side effects:
        Moves `model` to the GPU when one is available (otherwise the CPU) and
        switches it to evaluation mode.
    """
    single_review = isinstance(review, str)

    # Tokenize the input review(s); padding=True aligns the lengths within a batch
    inputs = tokenizer(
        review,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=256
    )

    # Move tensors to the same device as the model (CPU or GPU)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    # Evaluation mode disables dropout; a model that has just been trained is
    # still in training mode and would otherwise give noisy predictions.
    model.eval()
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Perform prediction (no gradients needed)
    with torch.no_grad():
        logits = model(**inputs).logits
        predictions = torch.argmax(logits, dim=-1)

    # Map predicted class indices to sentiment labels
    sentiments = [
        "positive" if label == 1 else "negative"
        for label in predictions.tolist()
    ]
    return sentiments[0] if single_review else sentiments


In [None]:
# Try the helper on a clearly negative review and show the text next to the prediction
review = "I really not loved this movie. The story was not good!"
sentiment = predict_sentiment(review, model, tokenizer)
print(f"Review: {review}", f"Predicted Sentiment: {sentiment}", sep="\n")


In [None]:
from transformers import AutoModelForSequenceClassification

# AutoModelForSequenceClassification selects the model class from the checkpoint
# name; this replaces `model` with a fresh two-label classifier.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)


In [None]:
import dataclasses

from transformers import Trainer, TrainingArguments

# The "evaluate every epoch" option was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases, and the old name is rejected
# by current versions. Use whichever field the installed version defines.
_training_arg_names = {field.name for field in dataclasses.fields(TrainingArguments)}
eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    **{eval_strategy_key: "epoch"},  # run evaluation at the end of every epoch
)

# Fine-tune the freshly loaded model on the tokenized training split,
# evaluating on the test split
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)
trainer.train()
