# Finetuning of german-sentiment-bert

To be executed in Google Colab

## Set up Colab

In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
basepath = '/content/drive/My Drive/text-analytics/news-sentiment/training'

Mounted at /content/drive


In [2]:
!pip install transformers



In [3]:
import csv
import os
import re
from typing import Dict, Optional

import torch
import torch.nn.functional as F
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments

## Define dataset class

For sentiment analysis data stored in CSV

In [4]:
label_names = {0: 'positive', 1: 'negative', 2: 'neutral'}


class SentiCSVDataset(torch.utils.data.Dataset):
    """Custom dataset class for sentiment analysis data in a CSV file.
    
    Tailored towards the pretrained model oliverguhr/german-sentiment-bert.

    The methods replace_numbers() and clean_text() are based on
    https://github.com/oliverguhr/german-sentiment-lib/blob/4e5158/germansentiment/sentimentmodel.py"""
    def __init__(
            self,
            csv_path: str,
            csv_delimiter: str = '\t',
            label_remap: Optional[Dict[int, int]] = None
    ):
        self.csv_path = csv_path

        self.clean_chars = re.compile(r'[^A-Za-züöäÖÜÄß ]', re.MULTILINE)
        self.clean_http_urls = re.compile(r'https*\S+', re.MULTILINE)
        self.clean_at_mentions = re.compile(r'@\S+', re.MULTILINE)

        if label_remap is None:
            label_remap = {}

        raw_texts = []
        raw_labels = []
        # Load all data during initialization so iteration is faster.
        # This works fine as long as the dataset is extremely large.
        if csv_path is not None:
            with open(os.path.expanduser(csv_path), 'r') as f:
                reader = csv.reader(f, delimiter=csv_delimiter)
                for row in reader:
                    if len(row) != 2:
                        raise ValueError('Invalid row encountered.')
                    text = self.clean_text(row[0])
                    label = int(row[1])
                    # If the label has an entry in the label_remap dict,
                    # it is remapped accordingly. Else, the label is kept.
                    label = label_remap.get(label, label)
                    raw_texts.append(text)
                    raw_labels.append(label)
        else:  # Default data for testing
            raw_texts = [
                'Du hirnloser Vollidiot!', 'Ich mag dich sehr.', 'Alles hat ein Ende.', 'Nur die Wurst hat zwei.',
                'So ist das Leben.', 'Der zu frühe Vogel muss auf den Wurm warten.', 'Was für eine Katastrophe.'
            ]
            raw_labels = [1, 0, 2, 2, 2, 2, 1]
       
        self.raw_texts = raw_texts
        self.raw_labels = raw_labels

        self.tokenizer = AutoTokenizer.from_pretrained('oliverguhr/german-sentiment-bert')
        self.encodings = self.tokenizer(self.raw_texts, return_tensors='pt', truncation=True, padding=True)
        self.input_ids = self.encodings['input_ids']

        self.labels = torch.tensor(self.raw_labels, dtype=torch.int64)

    def replace_numbers(self, text: str) -> str:
        return text.replace("0"," null").replace("1"," eins").replace("2"," zwei")\
            .replace("3"," drei").replace("4"," vier").replace("5"," fünf") \
            .replace("6"," sechs").replace("7"," sieben").replace("8"," acht") \
            .replace("9"," neun")         

    def clean_text(self, text: str) -> str:    
        text = text.replace("\n", " ")        
        text = self.clean_http_urls.sub('', text)
        text = self.clean_at_mentions.sub('', text)        
        text = self.replace_numbers(text)                
        text = self.clean_chars.sub('', text) # use only text chars                          
        text = ' '.join(text.split()) # substitute multiple whitespace with single whitespace   
        text = text.strip().lower()
        return text

    def __getitem__(self, idx):
        item = {
            'input_ids': self.input_ids[idx],
            'labels': self.labels[idx]
        }
        return item

    def __len__(self):
        return len(self.raw_labels)

## Load datasets and model

In [5]:
# Remap "hostile" label (3) to "negative" (1) because the model does not yet support 4 classes
label_remap = {3: 1}

train_dataset = SentiCSVDataset(f'{basepath}/train.csv', label_remap=label_remap)

eval_dataset = SentiCSVDataset(f'{basepath}/validation.csv', label_remap=label_remap)

model = AutoModelForSequenceClassification.from_pretrained('oliverguhr/german-sentiment-bert')

In [6]:
from collections import Counter

train_label_counts = Counter(train_dataset.raw_labels)
print('Label counts in training set:')
for i in range(3):
    ci = train_label_counts[i]
    pct = 100 * ci / len(train_dataset.raw_labels)
    print(f'{label_names[i]} ({i}):\t{ci} samples ({pct:.1f}%)')
print('\nLabel counts in validation set:')
validation_label_counts = Counter(eval_dataset.raw_labels)
for i in range(3):
    ci = validation_label_counts[i]
    pct = 100 * ci / len(eval_dataset.raw_labels)
    print(f'{label_names[i]} ({i}):\t{ci} samples ({pct:.1f}%)')

Label counts in training set:
positive (0):	267 samples (18.0%)
negative (1):	615 samples (41.4%)
neutral (2):	603 samples (40.6%)

Label counts in validation set:
positive (0):	27 samples (16.4%)
negative (1):	65 samples (39.4%)
neutral (2):	73 samples (44.2%)


## Set up training

And evaluate performance before training

In [7]:
output_dir = f'{basepath}/results'
logging_dir = f'{basepath}/logs'

training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=8e-6,
    num_train_epochs=7,
    per_device_train_batch_size=20,
    per_device_eval_batch_size=32,
    warmup_steps=120,
    weight_decay=0.01,
    logging_dir=logging_dir,
    load_best_model_at_end=True,
    evaluation_strategy='epoch',
    dataloader_num_workers=1,
    logging_first_step=True,
    logging_steps=47,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)

# Show how it performs before training
trainer.evaluate()

{'eval_loss': 5.430793762207031,
 'eval_runtime': 1.9356,
 'eval_samples_per_second': 85.246}

## Show tensorboard dashboard

Optional. Note that this results in huge cell outputs which can cause problems with saving the notebook. If saving no longer works or is slow, clear the output of the following cell.

In [None]:
%load_ext tensorboard
%tensorboard --logdir '/content/drive/My Drive/text-analytics/news-sentiment/training/logs'

## Train and evaluate

In [9]:
trainer.train()

# Show how it performs after training
trainer.evaluate()

Epoch,Training Loss,Validation Loss,Runtime,Samples Per Second
1,4.088,1.229975,1.6039,102.875
2,0.9747,1.061316,1.6008,103.076
3,0.9371,0.995773,1.6119,102.363
4,0.799,1.017731,1.608,102.611
5,0.7631,1.077186,1.601,103.06
6,0.6704,1.110236,1.6098,102.496
7,0.6454,1.116556,1.608,102.611


{'epoch': 7.0,
 'eval_loss': 1.1165564060211182,
 'eval_runtime': 1.6133,
 'eval_samples_per_second': 102.276}