# Finetuning of german-sentiment-bert

This notebook is intended to be executed in Google Colab.

In [None]:
# Install the Hugging Face transformers package into the notebook runtime.
!pip install transformers

In [None]:
import csv
import os

import torch
import torch.nn.functional as F

from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments

In [None]:
class SentiCSVDataset(torch.utils.data.Dataset):
    """Custom dataset class for sentiment analysis data in a CSV file.

    Tailored towards the pretrained model oliverguhr/german-sentiment-bert:
    all texts are tokenized once, at construction time, with that model's
    tokenizer and padded to a common length.

    Every CSV row must consist of exactly two fields: the text and an integer
    class label. Judging from the built-in samples the label ids are
    presumably 0 = positive, 1 = negative, 2 = neutral -- confirm against the
    model configuration.

    Args:
        csv_path: Path to the CSV file (a leading ``~`` is expanded). Pass
            ``None`` to use a handful of built-in sentences, which are only
            meant for testing.

    Raises:
        ValueError: If a non-blank row does not have exactly two fields, or
            if a label cannot be parsed as an integer.
    """

    _MODEL_NAME = 'oliverguhr/german-sentiment-bert'

    def __init__(self, csv_path):
        self.csv_path = csv_path

        if csv_path is not None:
            raw_texts, raw_labels = self._read_csv(csv_path)
        else:  # Default data for testing
            raw_texts = [
                'Du hirnloser Vollidiot!', 'Ich mag dich sehr.', 'Alles hat ein Ende.', 'Nur die Wurst hat zwei.',
                'So ist das Leben.', 'Der zu frühe Vogel muss auf den Wurm warten.', 'Was für eine Katastrophe.'
            ]
            raw_labels = [1, 0, 2, 2, 2, 2, 1]

        self.raw_texts = raw_texts
        self.raw_labels = raw_labels

        self.tokenizer = AutoTokenizer.from_pretrained(self._MODEL_NAME)
        # padding=True pads every text to the longest one, so the attention
        # mask produced here is needed later to tell real tokens from padding.
        self.encodings = self.tokenizer(self.raw_texts, return_tensors='pt', truncation=True, padding=True)
        self.input_ids = self.encodings['input_ids']

        self.labels = torch.tensor(self.raw_labels, dtype=torch.int64)

    @staticmethod
    def _read_csv(csv_path):
        """Read the (text, label) rows of ``csv_path`` into two parallel lists."""
        texts = []
        labels = []
        # Explicit utf-8 keeps umlauts intact regardless of the platform
        # default; newline='' is what the csv module asks for so that quoted
        # fields containing line breaks are parsed correctly.
        with open(os.path.expanduser(csv_path), 'r', encoding='utf-8', newline='') as f:
            for row in csv.reader(f, delimiter=','):
                if not row:
                    # Blank lines carry no sample; skip them instead of failing.
                    continue
                if len(row) != 2:
                    raise ValueError('Invalid row encountered.')
                texts.append(row[0])
                labels.append(int(row[1]))
        return texts, labels

    def __getitem__(self, idx):
        """Return all tokenizer outputs for sample ``idx`` plus its label.

        Besides ``input_ids`` this includes the ``attention_mask`` (and any
        other field the tokenizer produced), so padded positions are masked
        out when the item is fed to the model.
        """
        item = {name: values[idx] for name, values in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        """Return the number of samples."""
        return len(self.raw_labels)

In [None]:
# CSV files holding the training and evaluation samples. Both paths are left
# empty here and have to be filled in before this cell can load any data.
TRAIN_CSV_PATH = ''
EVAL_CSV_PATH = ''

train_dataset = SentiCSVDataset(TRAIN_CSV_PATH)
eval_dataset = SentiCSVDataset(EVAL_CSV_PATH)

# Pretrained checkpoint that serves as the starting point for fine-tuning.
model = AutoModelForSequenceClassification.from_pretrained(
    'oliverguhr/german-sentiment-bert',
)

In [None]:
# Hyperparameters and output locations for the fine-tuning run.
training_args = TrainingArguments(
    # Output locations; "logs" is the directory TensorBoard is pointed at
    # in the last cell.
    output_dir='./results',
    logging_dir='./logs',
    # Training length and batch sizes.
    num_train_epochs=100,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    # Optimizer and learning-rate schedule.
    learning_rate=1e-5,
    warmup_steps=500,
    weight_decay=0.01,
    # NOTE(review): no evaluation/save strategy is configured here, yet
    # load_best_model_at_end usually needs periodic evaluation whose schedule
    # matches checkpoint saving -- confirm against the installed
    # transformers version.
    load_best_model_at_end=True,
)

# Bundle the model, the settings above and both datasets for training.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [None]:
# Run the fine-tuning, then evaluate on the evaluation dataset. The evaluate()
# call is kept as the last bare expression so the notebook displays its result.
trainer.train()
trainer.evaluate()

In [None]:
# Load the TensorBoard notebook extension and open it on the "logs"
# directory, i.e. the logging_dir passed to TrainingArguments.
%load_ext tensorboard
%tensorboard --logdir logs