In [2]:
!pip install transformers datasets torch scikit-learn

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.

In [4]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import accuracy_score

# Step 1: Create a Simple Dataset (Manually created)
data = {
    'text': [
        "I loved the movie, it was amazing!",  # Positive sentiment
        "The product is terrible, don't buy it.",  # Negative sentiment
        "What a fantastic experience!",  # Positive sentiment
        "I will never use this service again.",  # Negative sentiment
        "Absolutely wonderful, highly recommend it!",  # Positive sentiment
        "Worst purchase I've ever made."  # Negative sentiment
    ],
    'label': [1, 0, 1, 0, 1, 0]  # 1 for positive, 0 for negative
}

# Convert the data to a pandas DataFrame
import pandas as pd
df = pd.DataFrame(data)

# Convert the DataFrame to HuggingFace Dataset
dataset = Dataset.from_pandas(df)

# Split the dataset into train and test sets (80% train, 20% test)
train_dataset, test_dataset = dataset.train_test_split(test_size=0.2).values()

# Step 2: Load Tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Function to tokenize the dataset
def tokenize_data(dataset):
    return dataset.map(lambda examples: tokenizer(examples['text'], padding="max_length", truncation=True, max_length=512), batched=True)

# Tokenize the datasets
train_dataset = tokenize_data(train_dataset)
test_dataset = tokenize_data(test_dataset)

# Step 3: Load Pre-trained BERT Model for Sequence Classification
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Step 4: Define Metrics Function (Accuracy)
def compute_metrics(pred):
    labels = pred.label_ids
    predictions = pred.predictions.argmax(-1)
    accuracy = accuracy_score(labels, predictions)
    return {"accuracy": accuracy}

# Step 5: Set Up Training Arguments
training_args = TrainingArguments(
    output_dir="./results",          # Output directory for saving results
    evaluation_strategy="epoch",     # Evaluation strategy
    learning_rate=2e-5,              # Learning rate
    per_device_train_batch_size=16,  # Training batch size
    per_device_eval_batch_size=16,   # Evaluation batch size
    num_train_epochs=3,              # Number of epochs
    weight_decay=0.01,               # Weight decay for regularization
)

# Step 6: Initialize Trainer
trainer = Trainer(
    model=model,                         # The model
    args=training_args,                  # The training arguments
    train_dataset=train_dataset,         # The train dataset
    eval_dataset=test_dataset,           # The test dataset
    compute_metrics=compute_metrics      # Function to compute metrics
)

# Step 7: Start the Training Process
trainer.train()

# Step 8: Evaluate the Model
results = trainer.evaluate()
print("Evaluation results:", results)


Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msyedanida-khader[0m ([33msyedanida-khader-san-jose-state-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.665342,0.5
2,No log,0.623754,1.0
3,No log,0.590068,1.0


Evaluation results: {'eval_loss': 0.5900675058364868, 'eval_accuracy': 1.0, 'eval_runtime': 4.2283, 'eval_samples_per_second': 0.473, 'eval_steps_per_second': 0.237, 'epoch': 3.0}
