# In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import os

# Load the tokenizer and the sequence-classification model from the same
# pretrained checkpoint so their vocabularies match.
PRETRAINED_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
model = BertForSequenceClassification.from_pretrained(PRETRAINED_CHECKPOINT)

# Define a function to load data from the text files
def load_data(directory):
    """Load labelled review texts from a split directory.

    ``directory`` must contain ``pos`` and ``neg`` sub-directories. Every
    file in them whose name ends in ``.txt`` is read as one example.

    Args:
        directory: Path of the split directory (e.g. ``'aclImdb/train'``).

    Returns:
        A ``(data, labels)`` tuple of parallel lists: the file contents and
        the integer labels (1 for ``pos``, 0 for ``neg``). All ``pos``
        examples come first, then all ``neg`` examples, each group in
        sorted file-name order.

    Raises:
        OSError: If a sub-directory or file cannot be listed or read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    data = []
    labels = []
    for label_name, label_id in (('pos', 1), ('neg', 0)):
        dir_name = os.path.join(directory, label_name)
        # os.listdir returns entries in arbitrary order; sort so the example
        # order is reproducible across runs and platforms.
        for fname in sorted(os.listdir(dir_name)):
            if not fname.endswith('.txt'):
                continue
            # Explicit encoding keeps decoding independent of the locale, and
            # the context manager closes the file even if read() raises.
            with open(os.path.join(dir_name, fname), encoding='utf-8') as f:
                data.append(f.read())
            labels.append(label_id)
    return data, labels

# Read the raw texts and labels for both splits
train_texts, train_labels = load_data('aclImdb/train')
test_texts, test_labels = load_data('aclImdb/test')

# Tokenize both splits with the same settings (truncation and padding
# enabled, max_length of 512) so train and test inputs are prepared alike.
tokenizer_options = {"truncation": True, "padding": True, "max_length": 512}
train_encodings = tokenizer(train_texts, **tokenizer_options)
test_encodings = tokenizer(test_texts, **tokenizer_options)

# Wrap the tokenized encodings and labels in a PyTorch Dataset
class IMDbDataset(torch.utils.data.Dataset):
    """Dataset pairing tokenizer encodings with their labels.

    ``encodings`` is a mapping from field name to a per-example sequence,
    and ``labels`` is the parallel sequence of labels.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The label sequence defines how many examples the dataset holds.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors with a 'labels' entry."""
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Build one dataset per split from the tokenized encodings and labels
train_dataset = IMDbDataset(encodings=train_encodings, labels=train_labels)
test_dataset = IMDbDataset(encodings=test_encodings, labels=test_labels)

# Hyperparameters read by the training and evaluation code below
training_args = dict(
    epochs=2,
    batch_size=16,
)

# Fine-tune the model, on the GPU when one is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.train()

train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=training_args["batch_size"],
    shuffle=True,
)

optim = torch.optim.Adam(model.parameters(), lr=5e-5)

for epoch in range(training_args["epochs"]):
    for batch in train_loader:
        # Move the tensors used by the forward pass onto the target device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Clear gradients from the previous step before the new backward pass
        optim.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        # With labels supplied, the loss is the first field of the output
        loss = outputs.loss
        loss.backward()
        optim.step()

# Put the model in evaluation mode before measuring accuracy
model.eval()

# Evaluate the model
# Accuracy does not depend on sample order, so the test split is left
# unshuffled; this keeps the evaluation pass deterministic.
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=training_args["batch_size"], shuffle=False)

total = 0
correct = 0

# No gradients are needed for evaluation; enter no_grad once for the whole pass
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # Labels are not passed to the model: only the logits are used here,
        # so computing a loss would be wasted work.
        outputs = model(input_ids, attention_mask=attention_mask)
        # Predicted class is the index of the largest logit per example
        predicted = outputs.logits.argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print('Test Accuracy: ', 100 * correct / total)
