In [1]:
import torch
from torch.utils.data import DataLoader, TensorDataset, Subset
from torch import nn
from transformers import BertTokenizer, BertModel, BertPreTrainedModel, BertConfig
from torch.optim import AdamW
import pandas as pd
from sklearn.model_selection import KFold, train_test_split
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


## Load and tokenize data

In [2]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
max_length = 128

def load_data(csv_file):
    df = pd.read_csv(csv_file)
    texts = df["text"].tolist()
    labels = torch.tensor(df["label"].tolist(), dtype=torch.float)

    encodings = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )
    dataset = TensorDataset(encodings["input_ids"], encodings["attention_mask"], labels)
    return dataset

dataset = load_data("fineTuning.csv")  # single CSV with all samples

  df = pd.read_csv(csv_file)


ValueError: Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers.

## Define regression model

In [None]:
class BertForRegression(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.bert = BertModel(config)
        self.regressor = nn.Linear(config.hidden_size, 1)
        self.init_weights()

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        return self.regressor(pooled_output).squeeze(-1)

## Train/test split (80/20) + K-Fold CV

In [None]:
def run_training(dataset, k_folds=5, num_epochs=3, batch_size=16):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    kfold = KFold(n_splits=k_folds, shuffle=True, random_state=42)

    fold_results = []

    for fold, (train_ids, test_ids) in enumerate(kfold.split(dataset)):
        print(f"\n----- Fold {fold+1} / {k_folds} -----")

        train_subsampler = Subset(dataset, train_ids)
        test_subsampler = Subset(dataset, test_ids)

        train_loader = DataLoader(train_subsampler, batch_size=batch_size, shuffle=True)
        test_loader = DataLoader(test_subsampler, batch_size=batch_size)

        config = BertConfig.from_pretrained("bert-base-uncased")
        model = BertForRegression.from_pretrained("bert-base-uncased", config=config).to(device)
        optimizer = AdamW(model.parameters(), lr=2e-5)
        criterion = nn.MSELoss()

        # Training
        for epoch in range(num_epochs):
            model.train()
            total_loss = 0
            for input_ids, attention_mask, labels in tqdm(train_loader, desc=f"Fold {fold+1} Epoch {epoch+1}"):
                input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

                optimizer.zero_grad()
                outputs = model(input_ids, attention_mask)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

                total_loss += loss.item()
            print(f"Fold {fold+1} Epoch {epoch+1} Train Loss: {total_loss/len(train_loader):.4f}")

        # Evaluation
        model.eval()
        mse_total = 0
        with torch.no_grad():
            for input_ids, attention_mask, labels in test_loader:
                input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
                outputs = model(input_ids, attention_mask)
                mse_total += criterion(outputs, labels).item() * input_ids.size(0)

        mse_total /= len(test_subsampler)
        fold_results.append(mse_total)
        print(f"Fold {fold+1} Test MSE: {mse_total:.4f}")

    avg_mse = sum(fold_results) / len(fold_results)
    print(f"\n==== Cross-validation MSE across {k_folds} folds: {avg_mse:.4f} ====")

    # Save last model + tokenizer
    model.save_pretrained("./bert_sentiment_regression")
    tokenizer.save_pretrained("./bert_sentiment_regression")

run_training(dataset, k_folds=5, num_epochs=5)

## Test

In [None]:
model = BertForRegression.from_pretrained("./bert_sentiment_regression")
tokenizer = BertTokenizer.from_pretrained("./bert_sentiment_regression")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

text = "holy fucking shit the chiefs are so ass. why did taylor swift agree to marry this bum ass travis kelce? his old slow fat ass can’t do shit!"
inputs = tokenizer(
    text,
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors="pt"
)

# Keep only input_ids and attention_mask
inputs = {k: v.to(device) for k, v in inputs.items() if k in ["input_ids", "attention_mask"]}

with torch.no_grad():
    prediction = model(**inputs)

print("Prediction (0 - 6):", prediction.item())