In [1]:
import pandas as pd
from datasets import DatasetDict, Dataset

# Bind the Hugging Face dataset class under an explicit alias. A later cell
# imports torch.utils.data.Dataset under the same bare name `Dataset`, so
# re-running this cell after that import would otherwise call `from_pandas`
# on the wrong class and fail.
from datasets import Dataset as HFDataset

# Toy regression data: every sentence is paired with a continuous target value.
train_data = {
    "text": [
        "The weather today is sunny and pleasant.",
        "I love reading books on weekends.",
        "The stock market is experiencing volatility.",
        "The movie was thrilling and engaging.",
        "Technology is advancing at an incredible pace."
    ],
    "label": [1.2, 2.5, -0.7, 3.8, 4.1]  # Continuous labels
}

test_data = {
    "text": [
        "Traveling is one of the best ways to learn about cultures.",
        "The economy is growing steadily this year.",
        "My favorite sport is basketball."
    ],
    "label": [3.0, 1.8, 2.7]  # Continuous labels
}

# Build one Hugging Face dataset per split from the in-memory dictionaries.
train_dataset = HFDataset.from_pandas(pd.DataFrame(train_data))
test_dataset = HFDataset.from_pandas(pd.DataFrame(test_data))

# Group both splits under their conventional names.
sample_dataset = DatasetDict({
    "train": train_dataset,
    "test": test_dataset
})

# Show the split names, columns and row counts.
print(sample_dataset)


  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 5
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 3
    })
})


In [3]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.metrics import mean_squared_error
from tqdm import tqdm
from datasets import load_dataset
import random

# Define the dataset class
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item and pairs it with a float label.

    Tokenization happens lazily in ``__getitem__``; nothing is cached.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        """Store the raw inputs.

        Args:
            texts: Indexable, sized collection of input strings.
            labels: Indexable, sized collection of numeric targets, aligned with ``texts``.
            tokenizer: Callable accepting ``(text, max_length=..., padding=...,
                truncation=..., return_tensors=...)`` and returning a mapping with
                ``'input_ids'`` and ``'attention_mask'`` entries.
            max_len: Length every encoded sequence is padded/truncated to.

        Raises:
            ValueError: If ``texts`` and ``labels`` differ in length.
        """
        # Fail at construction time: a mismatch would otherwise surface as an
        # IndexError in the middle of an epoch, or silently ignore extra labels.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )

        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of (text, label) pairs."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at ``idx`` and return it together with its label.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (the tokenizer's
            tensors with the leading dimension squeezed away) and ``'label'``
            (a ``torch.float`` scalar tensor).
        """
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            # The tokenizer is called on a single text, so dimension 0 has size 1;
            # dropping it lets the DataLoader stack items into (batch, max_len).
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(label, dtype=torch.float)
        }

# Define the training loop
def train_model(model, dataloader, optimizer, device):
    """Run one training epoch and return the average loss per sample.

    Args:
        model: Callable taking ``input_ids``, ``attention_mask`` and ``labels``
            keyword arguments and returning an object with a ``loss`` attribute.
            Must also provide ``train()``.
        dataloader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'label'`` tensors.
        optimizer: Object providing ``zero_grad()`` and ``step()``.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        float: Sum of per-batch losses weighted by batch size, divided by the
        total number of samples seen.

    Raises:
        ZeroDivisionError: If the dataloader yields no samples.
    """
    model.train()
    weighted_loss_sum = 0.0
    num_samples = 0

    for batch in tqdm(dataloader, desc="Training", leave=False):
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        loss.backward()
        optimizer.step()

        # Weight each batch by its size so a short final batch does not count
        # as much as a full one. Assumes `outputs.loss` is a mean over the
        # batch (TODO confirm for the model in use); with equally sized
        # batches this equals the plain mean of the batch losses.
        batch_size = labels.size(0)
        weighted_loss_sum += loss.item() * batch_size
        num_samples += batch_size

    return weighted_loss_sum / num_samples

# Define the evaluation loop
def evaluate_model(model, dataloader, device):
    """Score the model on every batch of ``dataloader`` and return the MSE.

    Args:
        model: Callable taking ``input_ids`` and ``attention_mask`` keyword
            arguments and returning an object with a ``logits`` attribute.
            Must also provide ``eval()``.
        dataloader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'label'`` tensors.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        The value of ``mean_squared_error(labels, predictions)`` computed over
        all samples yielded by the dataloader.
    """
    model.eval()
    predictions, targets = [], []

    # Gradients are not needed for scoring.
    with torch.no_grad():
        for batch in dataloader:
            token_ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            batch_targets = batch['label'].to(device)

            logits = model(input_ids=token_ids, attention_mask=mask).logits

            # Drop the trailing dimension of the logits so each sample
            # contributes one prediction, then gather everything on the CPU.
            predictions.extend(logits.squeeze(-1).cpu().numpy())
            targets.extend(batch_targets.cpu().numpy())

    return mean_squared_error(targets, predictions)



In [4]:
# Seed the RNGs before the model is built: the regression head is newly
# initialised and the training DataLoader shuffles, so unseeded runs give
# different losses every time the notebook is executed.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

# Use the toy DatasetDict built earlier in the notebook; swap in a real
# dataset with the same "train"/"test" splits and "text"/"label" columns here.
dataset = sample_dataset
train_data = dataset["train"]
test_data = dataset["test"]


# Extract texts and labels
train_texts = train_data["text"]
train_labels = train_data["label"]
val_texts = test_data["text"]  # Use test set as validation
val_labels = test_data["label"]

# Initialize tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# num_labels=1 gives a single-output head, used here for regression.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1)  # Regression

# Create datasets and dataloaders
max_len = 128
batch_size = 16
train_dataset = TextDataset(train_texts, train_labels, tokenizer, max_len)
val_dataset = TextDataset(val_texts, val_labels, tokenizer, max_len)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# Set up training
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
# Use PyTorch's AdamW rather than the deprecated transformers implementation.
# eps and weight_decay are set explicitly to what are believed to be the
# transformers defaults (TODO confirm) so the optimisation hyper-parameters
# stay as before; torch's own defaults differ.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

# Training loop: one pass over the training data, then score the validation split.
num_epochs = 3
for epoch in range(num_epochs):
    train_loss = train_model(model, train_loader, optimizer, device)
    val_mse = evaluate_model(model, val_loader, device)

    print(f"Epoch {epoch + 1}/{num_epochs}")
    print(f"Training Loss: {train_loss:.4f}")
    print(f"Validation MSE: {val_mse:.4f}")

# Save the model and tokenizer side by side so they can be reloaded together.
model.save_pretrained("fine_tuned_bert_regression")
tokenizer.save_pretrained("fine_tuned_bert_regression")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
                                                       

Epoch 1/3
Training Loss: 7.6319
Validation MSE: 5.3449


                                                       

Epoch 2/3
Training Loss: 6.7984
Validation MSE: 4.8570


                                                       

Epoch 3/3
Training Loss: 6.1837
Validation MSE: 4.2945


('fine_tuned_bert_regression/tokenizer_config.json',
 'fine_tuned_bert_regression/special_tokens_map.json',
 'fine_tuned_bert_regression/vocab.txt',
 'fine_tuned_bert_regression/added_tokens.json')