In [1]:
from datasets import load_dataset

# Load the dataset
dataset = load_dataset("tomaarsen/setfit-absa-semeval-laptops")

# Print dataset information
print(dataset)
import pandas as pd

# Convert train split to a pandas DataFrame
train_df = pd.DataFrame(dataset["train"])

# Save train DataFrame to CSV
train_df.to_csv("train_dataset.csv", index=False)

# Convert test split to a pandas DataFrame
test_df = pd.DataFrame(dataset["test"])

# Save test DataFrame to CSV
test_df.to_csv("test_dataset.csv", index=False)


  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['text', 'span', 'label', 'ordinal'],
        num_rows: 2358
    })
    test: Dataset({
        features: ['text', 'span', 'label', 'ordinal'],
        num_rows: 654
    })
})


In [16]:
train_df[['text','span']]

Unnamed: 0,text,span
0,I charge it at night and skip taking the cord ...,cord
1,I charge it at night and skip taking the cord ...,battery life
2,The tech guy then said the service center does...,service center
3,The tech guy then said the service center does...,"""sales"" team"
4,The tech guy then said the service center does...,tech guy
...,...,...
2353,We also use Paralles so we can run virtual mac...,Windows Server Enterprise 2003
2354,We also use Paralles so we can run virtual mac...,Windows Server 2008 Enterprise
2355,"How Toshiba handles the repair seems to vary, ...",repair
2356,"How Toshiba handles the repair seems to vary, ...",repair


In [19]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForTokenClassification, AdamW
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

# Tokenize text using BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

class AspectDataset(Dataset):
    def __init__(self, data, max_length=128):  # Set max_length to desired value
        self.data = data
        self.tokenized_texts = []
        self.labels = []
        self.max_length = max_length
        
        for idx, row in self.data.iterrows():
            text = row['text']
            spans = [asp.strip() for asp in row['span'].split("|")]
            tokenized_text = tokenizer.encode(text, add_special_tokens=True, max_length=self.max_length, padding='max_length', truncation=True)
            self.tokenized_texts.append(tokenized_text)
            label = [1 if tokenizer.decode(tokenized_text[i]) in spans else 0 for i in range(len(tokenized_text))]
            self.labels.append(label)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.tokenized_texts[idx]),
            'labels': torch.tensor(self.labels[idx])
        }

train_dataset = AspectDataset(train_df)
test_dataset = AspectDataset(test_df)

# Define the model
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=2)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define DataLoader
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Define optimizer and loss function
optimizer = AdamW(model.parameters(), lr=5e-5)
criterion = torch.nn.CrossEntropyLoss()

# Training loop
epochs = 3
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids)
        logits = outputs.logits
        loss = criterion(logits.view(-1, 2), labels.view(-1))
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        total_loss += loss.item()
    print(f"Epoch {epoch + 1}, Loss: {total_loss}")

# Evaluation
model.eval()
total_correct = 0
total_samples = 0
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids)
        logits = outputs.logits
        predicted_labels = torch.argmax(logits, dim=2)
        total_correct += torch.sum(predicted_labels == labels).item()
        total_samples += labels.numel()

accuracy = total_correct / total_samples
print(f"Test Accuracy: {accuracy}")


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Epoch 1, Loss: 1.2983441716096422
Epoch 2, Loss: 0.004164424565715308
Epoch 3, Loss: 0.002472554174346442
Test Accuracy: 1.0


In [31]:
def predict_aspect_terms(model, tokenizer, text, device):
    tokenized_text = tokenizer.encode(text, add_special_tokens=True, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(tokenized_text)
    logits = outputs.logits
    predicted_labels = torch.argmax(logits, dim=2).squeeze().tolist()
    predicted_tokens = [tokenizer.decode(tokenized_text[0, i].item()) for i, label in enumerate(predicted_labels) if label == 1]
    return predicted_tokens

# Example usage
input_text = "I charge it at night and skip taking the cord with me because of the good battery life."
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
predicted_aspect_terms = predict_aspect_terms(model, tokenizer, input_text, device)
print("Predicted Aspect Terms:", predicted_aspect_terms)


Predicted Aspect Terms: []


In [28]:
train_df['text'][0]

'I charge it at night and skip taking the cord with me because of the good battery life.'