In [None]:
pip install transformers

In [None]:
pip install torch

In [None]:
pip install scikit-learn

In [None]:
pip install pandas

In [None]:
pip install google.colab

In [None]:
from google.colab import files
uploaded = files.upload()

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import pandas as pd
import re
import string

# Fraction of rows held out for validation, and the seed that makes the
# shuffle-and-split reproducible from run to run.
_VALIDATION_FRACTION = 0.3
_SPLIT_SEED = 42

# Load the whole dataset, then split it into training and validation frames.
data = pd.read_csv('/content/data.csv')
train_data, val_data = train_test_split(
    data,
    test_size=_VALIDATION_FRACTION,
    random_state=_SPLIT_SEED,
)

# Translation table that deletes every ASCII punctuation character. Built once
# here rather than on every call to preprocess_text.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text):
    """Normalize a text field: lowercase it, drop punctuation, tidy whitespace.

    Args:
        text: The raw value to clean. Normally a ``str``. ``None`` and float
            NaN (how pandas represents an empty CSV cell) are treated as an
            empty string; any other non-string value (e.g. a number parsed
            from the CSV) is converted with ``str()`` first.

    Returns:
        The lowercased text with every ``string.punctuation`` character
        removed, no leading or trailing whitespace, and every run of
        whitespace collapsed to a single space.
    """
    # ``text != text`` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = text.lower().translate(_PUNCTUATION_TABLE)

    # split() with no separator discards leading/trailing whitespace and
    # splits on whitespace runs, so re-joining with one space both strips
    # and collapses in a single pass.
    return ' '.join(text.split())

# Define a custom dataset class
class CustomDataset(Dataset):
    """Tokenized (question, document, answer) examples for classification.

    Each example is cleaned with ``preprocess_text`` and tokenized when it is
    accessed. The question and answer are encoded as the first segment of a
    sequence pair and the document as the second. With ``longest_first``
    truncation the tokenizer shortens the longer segment first, so an
    over-long document is cut down instead of the answer being dropped off
    the end of the input.

    Args:
        questions: Indexable collection of question strings.
        document_text: Indexable collection of document strings.
        answers: Indexable collection of answer strings.
        labels: Indexable collection of class labels.
        tokenizer: Callable Hugging Face tokenizer used to encode each pair.
        max_length: Length every encoded example is padded/truncated to.

    The four collections are read with the same index, so they are expected
    to be aligned and of equal length.
    """

    def __init__(self, questions, document_text, answers, labels, tokenizer, max_length):
        self.questions = questions
        self.document_text = document_text
        self.answers = answers
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples (taken from ``questions``)."""
        return len(self.questions)

    def __getitem__(self, index):
        """Return the encoded example at ``index``.

        Returns:
            A dict with ``'input_ids'`` and ``'attention_mask'`` (tensors
            with the tokenizer's batch axis removed) and ``'label'`` (the
            stored label for this index, unchanged).
        """
        question = preprocess_text(self.questions[index])
        answer = preprocess_text(self.answers[index])
        document_text = preprocess_text(self.document_text[index])

        # Calling the tokenizer directly replaces the deprecated encode_plus.
        encoding = self.tokenizer(
            f"{question} {answer}",
            text_pair=document_text,
            add_special_tokens=True,
            truncation='longest_first',
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )

        # return_tensors='pt' adds a leading batch axis of size 1. Remove
        # exactly that axis; a bare squeeze() would also drop any other
        # size-1 axis (e.g. when max_length is 1).
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': self.labels[index],
        }

# Prefer the GPU when one is present, otherwise fall back to the CPU.
torch.backends.cudnn.benchmark = True
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyperparameters
batch_size = 16
max_length = 512
num_epochs = 50
learning_rate = 2e-5

# Pre-trained tokenizer matching the model loaded below.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Columns pulled out of each split, in the order CustomDataset expects them.
_FIELDS = ('question', 'document_text', 'answer', 'label')

# Training arrays
train_questions, train_document_text, train_answers, train_labels = (
    train_data[field].values for field in _FIELDS
)

# Validation arrays
val_questions, val_document_text, val_answers, val_labels = (
    val_data[field].values for field in _FIELDS
)

# Training dataset and loader (reshuffled every epoch).
train_dataset = CustomDataset(
    train_questions,
    train_document_text,
    train_answers,
    train_labels,
    tokenizer,
    max_length,
)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)

# Validation dataset and loader (fixed order).
val_dataset = CustomDataset(
    val_questions,
    val_document_text,
    val_answers,
    val_labels,
    tokenizer,
    max_length,
)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# Three-way classifier on top of pre-trained RoBERTa, moved to the device.
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=3)
model.to(device=device)

# Optimizer and loss function.
# NOTE(review): loss_fn does not appear to be referenced elsewhere in the
# visible code; kept in case a later cell uses it.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
loss_fn = torch.nn.CrossEntropyLoss()

def _run_epoch(model, loader, device, optimizer=None):
    """Run one pass over ``loader`` and collect the loss and predictions.

    When ``optimizer`` is given, the model is put in training mode and its
    weights are updated after every batch. When it is omitted, the model is
    put in evaluation mode and gradient tracking is switched off.

    Args:
        model: Model called as ``model(input_ids, attention_mask=...,
            labels=...)`` whose output exposes ``.loss`` and ``.logits``.
        loader: Iterable of batches, each a dict with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors.
        device: Device every batch tensor is moved to before the forward pass.
        optimizer: Optimizer to step after each batch, or ``None`` to evaluate.

    Returns:
        ``(total_loss, predictions, targets)``: the sum of the per-batch
        losses, and the predicted and true class ids of every example seen,
        in loader order.
    """
    training = optimizer is not None
    model.train(training)  # train(False) is the same as eval()

    total_loss = 0.0
    predictions = []
    targets = []

    with torch.set_grad_enabled(training):
        for batch in loader:
            input_ids = batch['input_ids'].to(device=device)
            attention_mask = batch['attention_mask'].to(device=device)
            labels = batch['label'].to(device=device)

            if training:
                optimizer.zero_grad()

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            if training:
                loss.backward()
                optimizer.step()

            batch_loss = loss.item()
            total_loss += batch_loss
            predictions.extend(torch.argmax(outputs.logits, dim=1).tolist())
            targets.extend(labels.tolist())

            if training:
                print(f"Train Batch: Loss={batch_loss}")
            else:
                print(f"Validation Batch: Loss={batch_loss}")

    return total_loss, predictions, targets


for epoch in range(num_epochs):
    # Training pass: updates the model weights.
    train_loss, train_preds, train_targets = _run_epoch(
        model, train_loader, device, optimizer
    )
    train_accuracy = accuracy_score(train_targets, train_preds)

    # Validation pass: no optimizer, so no weight updates and no gradients.
    val_loss, val_preds, val_targets = _run_epoch(model, val_loader, device)
    val_accuracy = accuracy_score(val_targets, val_preds)

    # Report the mean per-batch loss and the accuracy for both splits.
    print(f"Epoch {epoch+1}:")
    print(f"Train Loss: {train_loss / len(train_loader)}")
    print(f"Train Accuracy: {train_accuracy}")
    print(f"Validation Loss: {val_loss / len(val_loader)}")
    print(f"Validation Accuracy: {val_accuracy}")

# Save the trained model (weights and config) to the current directory.
model.save_pretrained('./')