<a href="https://colab.research.google.com/github/wangying9/NLP/blob/main/NLP_Bert_class.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Spam detection: fine-tuning BERT (`bert-base-uncased`) on labelled messages

In [None]:
!pip install transformers

In [None]:
!pip install torch

In [None]:
!pip install scikit-learn

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import transformers
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

In [None]:
# Fetch the pre-trained 'bert-base-uncased' checkpoint twice over: once as the tokenizer
# that turns text into token ids, once as the BERT sequence-classification model.
checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = BertForSequenceClassification.from_pretrained(checkpoint)

In [None]:
# Colab-only API for uploading files from the local machine to the runtime.
from google.colab import files

# `uploaded` is indexed by file name in the next cell to get the raw bytes of the CSV.
uploaded = files.upload()

In [None]:
# Load the uploaded spam dataset into the DataFrame `df`.
# Later cells read its 'Messages' (text) and 'Label' ('spam' / 'ham') columns.
import io

CSV_FILENAME = 'spam_class.csv'  # change this to the name of the file you uploaded

if CSV_FILENAME not in uploaded:
    # Same exception type as the plain dict lookup, but with an actionable message.
    raise KeyError(
        f"{CSV_FILENAME!r} was not uploaded; available files: {sorted(uploaded)}"
    )

# `uploaded` maps file name -> raw bytes; decode them as ISO-8859-1 (Latin-1).
df = pd.read_csv(io.BytesIO(uploaded[CSV_FILENAME]), encoding="ISO-8859-1")

# Fail early if the columns used by the following cells are absent.
missing_columns = {'Label', 'Messages'} - set(df.columns)
if missing_columns:
    raise ValueError(
        f"{CSV_FILENAME!r} is missing expected columns: {sorted(missing_columns)}"
    )

In [None]:

# Encode the string labels as integers: spam -> 1, ham -> 0.
label_to_id = {'spam': 1, 'ham': 0}
encoded_labels = df['Label'].map(label_to_id)

# Series.map yields NaN for every value missing from the mapping. NaN would silently turn
# the column into floats and only fail much later, so stop here and name the bad values.
unmapped = encoded_labels.isna()
if unmapped.any():
    unexpected_values = sorted(df.loc[unmapped, 'Label'].astype(str).unique())
    raise ValueError(f"Unexpected values in 'Label' column: {unexpected_values}")

df['label'] = encoded_labels.astype('int64')

In [None]:
# Tokenization settings and accumulators
max_len = 128  # maximum input sequence length handed to the tokenizer
input_ids, attention_masks = [], []  # filled with one tensor per message by the tokenization loop

In [None]:
# Tokenize every message into fixed-length tensors.
# The accumulators are reset here so re-running this cell does not append duplicates.
input_ids = []
attention_masks = []
n_preview = 4  # number of encodings to inspect while the loop runs

n = 0  # stays 0 if the DataFrame is empty; otherwise ends as the number of messages
for n, text in enumerate(df['Messages'], start=1):
    encoded_dict = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',   # replaces the deprecated pad_to_max_length=True
        truncation=True,        # explicitly cut messages longer than max_len
        return_attention_mask=True,
        return_tensors='pt',
    )
    input_ids.append(encoded_dict['input_ids'])             # padding positions are 0
    attention_masks.append(encoded_dict['attention_mask'])  # 1 = real token, 0 = padding

    if n <= n_preview:
        # Show only the current encoding: printing the accumulated lists grows every iteration.
        print(f"In loop number {n}, the keys of encoded_dict are {list(encoded_dict.keys())}\n")
        print(f"In loop number {n}, the input_ids shape is {tuple(encoded_dict['input_ids'].shape)}\n")
        print(f"In loop number {n}, the attention_mask shape is {tuple(encoded_dict['attention_mask'].shape)}\n")


In [None]:
# Shapes of the per-message tensor lists before concatenation, next to the row count of df.
(
    np.array(input_ids).shape,
    np.array(attention_masks).shape,
    len(df),
)

In [None]:
# Join the per-message tensors along the first axis, turning each list into one tensor
# (3 dims when stacked as a list -> 2 dims after concatenation).
input_ids, attention_masks = (
    torch.cat(chunks, dim=0) for chunks in (input_ids, attention_masks)
)

In [None]:
# Shapes of the concatenated tensors, next to the row count of df for comparison.
tuple(input_ids.shape), tuple(attention_masks.shape), len(df)

In [None]:
# Turn the numeric label column into a tensor, one entry per message.
labels = torch.tensor(df['label'].to_numpy())

In [None]:
# Sanity check: `labels` was built with torch.tensor(...) in the previous cell.
type(labels)

In [None]:
# Shape of the label tensor as a plain tuple.
tuple(labels.shape)

In [None]:
# Split the dataset into training (90%) and test (10%) sets; no validation set is created.
# Passing all three tensors to a single call applies the same shuffled indices to each,
# so ids, masks and labels stay aligned by construction instead of by a matching seed.
(
    train_inputs, test_inputs,
    train_masks, test_masks,
    train_labels, test_labels,
) = train_test_split(input_ids, attention_masks, labels, random_state=42, test_size=0.1)

In [None]:
# Shapes of the split input ids and labels (train first, then test).
tuple(t.shape for t in (train_inputs, test_inputs, train_labels, test_labels))

In [None]:
# Shapes of the split attention masks (train, test).
tuple(mask.shape for mask in (train_masks, test_masks))

In [None]:
# Wrap the split tensors in DataLoaders that yield (input_ids, attention_mask, label) batches.
batch_size = 32

# Training split: batches are drawn in random order.
train_data = torch.utils.data.TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = torch.utils.data.RandomSampler(train_data)
train_dataloader = torch.utils.data.DataLoader(
    dataset=train_data,
    batch_size=batch_size,
    sampler=train_sampler,
)

# Test split: batches are read in their stored order.
test_data = torch.utils.data.TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = torch.utils.data.SequentialSampler(test_data)
test_dataloader = torch.utils.data.DataLoader(
    dataset=test_data,
    batch_size=batch_size,
    sampler=test_sampler,
)

In [None]:
# Inspect the dataset class that wraps the training tensors.
type(train_data)

In [None]:
# A DataLoader is not array-like, so wrapping it in np.array() does not describe its contents.
# Report its size directly instead: (number of batches, number of training examples).
len(train_dataloader), len(train_dataloader.dataset)

In [None]:
# Optimizer and schedule length for fine-tuning BERT.
# torch.optim.AdamW replaces the deprecated transformers.AdamW; weight_decay=0.0 keeps the
# old optimizer's default (PyTorch's own default is 0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8, weight_decay=0.0)
num_epochs = 3  # number of full passes over the training set

In [None]:
# Fine-tuning loop: one full pass over train_dataloader per epoch.
for epoch in range(num_epochs):
    model.train()  # training-mode behaviour (e.g. dropout active)
    total_loss = 0.0

    for batch in train_dataloader:
        # Batch-local names keep the notebook-level `input_ids` / `labels` tensors intact,
        # so earlier cells can still be re-run. Tensors are moved to the model's device.
        batch_input_ids, batch_attention_mask, batch_labels = (
            t.to(model.device) for t in batch
        )

        optimizer.zero_grad()  # clear gradients left over from the previous step
        outputs = model(
            batch_input_ids,
            attention_mask=batch_attention_mask,
            labels=batch_labels,  # passing labels makes the model return a loss
        )
        loss = outputs.loss
        total_loss += loss.item()  # accumulate the scalar loss for the epoch average
        loss.backward()            # backpropagate
        optimizer.step()           # apply the parameter update

    avg_train_loss = total_loss / len(train_dataloader)  # mean over the number of batches

    print(f'Epoch {epoch + 1}/{num_epochs}, Average Training Loss: {avg_train_loss}')

In [None]:
# Evaluate the model on the held-out test split.
# eval() switches the model to inference behaviour (for example, dropout is disabled).
model.eval()
preds = []
true_labels = []

for batch in test_dataloader:
    # Batch-local names avoid overwriting the notebook-level `input_ids` / `labels` tensors.
    batch_input_ids, batch_attention_mask, batch_labels = batch

    with torch.no_grad():  # no gradients needed for inference; saves memory and compute
        outputs = model(
            batch_input_ids.to(model.device),
            attention_mask=batch_attention_mask.to(model.device),
        )

    # Logits are the raw per-class scores (before softmax); the index of the largest
    # score along dim 1 is the predicted class.
    logits = outputs.logits
    preds.extend(logits.argmax(dim=1).cpu().numpy())
    true_labels.extend(batch_labels.cpu().numpy())

accuracy = accuracy_score(true_labels, preds)
print(f'Accuracy: {accuracy}')

In [None]:
import torch

def predict_spam(text, model, tokenizer, max_len=128):
    """Classify a single message as 'spam' or 'ham'.

    Args:
        text: The message to classify.
        model: A torch.nn.Module sequence classifier, called as
            ``model(input_ids, attention_mask=...)``, whose output exposes ``.logits``
            with class index 1 meaning spam and 0 meaning ham.
        tokenizer: Tokenizer providing ``encode_plus``.
        max_len: Length the message is padded or truncated to.

    Returns:
        The string 'spam' or 'ham'.
    """
    # Explicit padding/truncation replaces the deprecated pad_to_max_length=True.
    inputs = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    # Run on whichever device holds the model's weights (CPU if it has none).
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    # Predict in eval mode so dropout cannot make the result random, then restore
    # whatever mode the caller had the model in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
    finally:
        model.train(was_training)

    predicted_id = torch.argmax(outputs.logits, dim=1).item()

    # Map the numerical class index back to the original label.
    label_map = {1: 'spam', 0: 'ham'}
    return label_map[predicted_id]

In [None]:
# Try the classifier on one hand-written message.
# Requires the `model` and `tokenizer` created (and fine-tuned) in the cells above.
text_to_predict = (
    "Congratulations! You've won a free vacation. Click here to claim your prize!"
)
predicted_label = predict_spam(text=text_to_predict, model=model, tokenizer=tokenizer)
print("Predicted Label:", predicted_label)