In [2]:
# torch libraries
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# pytorch-lightning and torchmetrics libraries
import pytorch_lightning as pl
from torchmetrics import F1Score
from torchmetrics.functional import accuracy, auroc #F1Score #f1
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger

# transformers libraries
from transformers import AutoTokenizer, DebertaV2Model, AdamW, get_linear_schedule_with_warmup

from tqdm.auto import tqdm

import pandas as pd


In [3]:
# Set the working directory to the downloaded project folder so that the
# relative paths used below ("./model/pytorch_model.pt", "./example_data.csv")
# resolve correctly.
import os

# The folder can be overridden without editing the notebook by setting the
# MDEBERTA_PROJECT_DIR environment variable before starting the kernel; when it
# is not set, the original author's local path is used as the fallback.
PROJECT_DIR = os.environ.get(
    "MDEBERTA_PROJECT_DIR",
    '/Users/au486628/Library/CloudStorage/GoogleDrive-tbs.widmann@gmail.com/My Drive/Work/Tools/political-moral-emotional-mDeBERTa',
)
os.chdir(PROJECT_DIR)

In [14]:
# Checkpoint identifier passed to both DebertaV2Model.from_pretrained and
# AutoTokenizer.from_pretrained further down in this cell.
BERT_MODEL_NAME = "microsoft/mdeberta-v3-base"

# Names of the output labels. The order matters: position i in the model's
# output vector is reported under LABEL_COLUMNS[i].
LABEL_COLUMNS = [
    "morality_binary",
    "emotion_binary",
    "positive_binary",
    "negative_binary",
]

class CrowdCodedTagger(pl.LightningModule):
    """Multi-label tagger: a DebertaV2Model encoder followed by one linear layer.

    The linear layer maps the encoder's hidden state at sequence position 0 to
    ``n_classes`` values, which are passed through a sigmoid. Training uses
    binary cross-entropy (``nn.BCELoss``) on those sigmoid outputs.

    Args:
        n_classes: Number of output labels.
        n_training_steps: Total number of scheduler steps (used by
            ``configure_optimizers``).
        n_warmup_steps: Number of warm-up steps for the linear schedule.
    """

    def __init__(self, n_classes: int, n_training_steps=None, n_warmup_steps=None):
        super().__init__()
        # The attribute names below become state_dict key prefixes, so they
        # must stay as they are for saved checkpoints to load.
        self.bert = DebertaV2Model.from_pretrained(BERT_MODEL_NAME, return_dict=True)
        self.classifier = nn.Linear(self.bert.config.hidden_size, n_classes)
        self.n_training_steps = n_training_steps
        self.n_warmup_steps = n_warmup_steps
        self.criterion = nn.BCELoss()

    def forward(self, input_ids, attention_mask, labels=None, token_type_ids=None):
        """Return ``(loss, probabilities)`` for one batch.

        ``loss`` is the integer 0 when ``labels`` is None. ``token_type_ids``
        is accepted (so a tokenizer's output can be unpacked into the call)
        but is not forwarded to the encoder.
        """
        encoded = self.bert(input_ids, attention_mask=attention_mask)
        first_position_state = encoded.last_hidden_state[:, 0]
        probabilities = torch.sigmoid(self.classifier(first_position_state))
        if labels is None:
            return 0, probabilities
        return self.criterion(probabilities, labels), probabilities

    def _run_batch(self, batch):
        """Unpack a batch dict, run the model, and return (loss, outputs, labels)."""
        input_ids, attention_mask, labels = (
            batch[key] for key in ("input_ids", "attention_mask", "labels")
        )
        loss, outputs = self(input_ids, attention_mask, labels)
        return loss, outputs, labels

    def training_step(self, batch, batch_idx):
        """Log ``train_loss`` and return loss, predictions and labels for the epoch-end hook."""
        loss, outputs, labels = self._run_batch(batch)
        self.log("train_loss", loss, prog_bar=True, logger=True)
        return {"loss": loss, "predictions": outputs, "labels": labels}

    def validation_step(self, batch, batch_idx):
        """Log ``val_loss`` and return the loss."""
        loss, _, _ = self._run_batch(batch)
        self.log("val_loss", loss, prog_bar=True, logger=True)
        return loss

    def test_step(self, batch, batch_idx):
        """Log ``test_loss`` and return the loss."""
        loss, _, _ = self._run_batch(batch)
        self.log("test_loss", loss, prog_bar=True, logger=True)
        return loss

    def training_epoch_end(self, outputs):
        """Log one ROC-AUC scalar per label column over the whole training epoch."""
        # Flatten the per-step batches into per-example rows before stacking.
        label_rows = [
            row for step in outputs for row in step["labels"].detach().cpu()
        ]
        prediction_rows = [
            row for step in outputs for row in step["predictions"].detach().cpu()
        ]

        labels = torch.stack(label_rows).int()
        predictions = torch.stack(prediction_rows)

        for column, name in enumerate(LABEL_COLUMNS):
            class_roc_auc = auroc(predictions[:, column], labels[:, column])
            self.logger.experiment.add_scalar(
                f"{name}_roc_auc/Train", class_roc_auc, self.current_epoch
            )

    def configure_optimizers(self):
        """Return AdamW (lr=2e-5) with a linear warm-up schedule stepped every batch."""
        optimizer = AdamW(self.parameters(), lr=2e-5)  # learning rate is fixed here
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.n_warmup_steps,
            num_training_steps=self.n_training_steps,
        )
        return {
            "optimizer": optimizer,
            "lr_scheduler": {"scheduler": scheduler, "interval": "step"},
        }

def process_dataframe_with_transformer(df, model_path="./model/pytorch_model.pt",
                                       device="cpu", batch_size=8, threshold=0.5):
    """Tag every row of ``df`` with binary labels predicted by CrowdCodedTagger.

    Args:
        df: DataFrame with a ``'sentence'`` column holding the texts to classify.
        model_path: Path of the saved state_dict to load into the model.
        device: Torch device string used for the model and the encoded inputs
            (e.g. ``"cpu"`` or ``"cuda"``).
        batch_size: Number of sentences tokenized and scored per forward pass.
        threshold: A sigmoid output ``>= threshold`` is reported as 1, else 0.

    Returns:
        A new DataFrame with a fresh 0..n-1 index containing all columns of
        ``df`` followed by one int64 0/1 column per entry of ``LABEL_COLUMNS``.

    Raises:
        KeyError: If ``df`` has no ``'sentence'`` column.
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    sentences = df["sentence"].tolist()
    base_df = df.reset_index(drop=True)

    # Nothing to score: return the (empty) input with empty label columns
    # instead of failing later in pd.concat, and skip loading the model.
    if not sentences:
        empty_labels = pd.DataFrame(columns=LABEL_COLUMNS).astype("int64")
        return pd.concat([base_df, empty_labels], axis=1)

    model = CrowdCodedTagger(n_classes=len(LABEL_COLUMNS))
    # map_location lets a checkpoint saved on GPU load on a CPU-only machine.
    # torch.load unpickles the file, so only load checkpoints you trust.
    state_dict = torch.load(model_path, map_location=device)
    # NOTE(review): strict=False silently ignores missing/unexpected keys;
    # confirm this is intentional for the shipped checkpoint.
    model.load_state_dict(state_dict, strict=False)
    model.to(device)
    model.eval()  # evaluation mode (disables dropout)

    tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL_NAME)

    probabilities = []
    # no_grad: inference only, so avoid building the autograd graph.
    with torch.no_grad():
        for start in tqdm(range(0, len(sentences), batch_size)):
            batch = sentences[start:start + batch_size]
            encoded_input = tokenizer(batch, padding=True, truncation=True,
                                      max_length=512, return_tensors='pt')
            # forward returns (loss, probabilities); no labels are passed here.
            _, batch_probabilities = model(**encoded_input.to(device))
            probabilities.extend(batch_probabilities.tolist())

    # Threshold all rows at once rather than element-wise per batch.
    labels_df = (
        pd.DataFrame(probabilities, columns=LABEL_COLUMNS) >= threshold
    ).astype("int64")
    return pd.concat([base_df, labels_df], axis=1)

In [15]:
# Load the example sentences; process_dataframe_with_transformer reads the
# 'sentence' column of this frame. The path is relative to the working
# directory set earlier in the notebook.
# NOTE(review): the rendered output shows an 'Unnamed: 0' column, which suggests
# the CSV was written with its index — confirm, and consider index_col=0.
input_df = pd.read_csv("./example_data.csv")
input_df

Unnamed: 0,sentence,date
0,Sehr geehrte Frau Präsidentin!,2023-03-02
1,Liebe Kolleginnen und Kollegen!,2023-03-02
2,"Heute vor einem Jahr, am siebten Tag des russi...",2023-03-02
3,"In solchen Minuten überfällt mich die Angst, u...",2023-03-02
4,In meiner Fantasie spielen sich jetzt schon hu...,2023-03-02
...,...,...
95,Denn wir sind gut durch diesen Winter gekommen...,2023-03-02
96,"Nichts davon ist eingetreten, weil wir entschl...",2023-03-02
97,Und hinter diesem „Wir“ steht unser ganzes Lan...,2023-03-02
98,Dadurch sind die Großhandelspreise gefallen; d...,2023-03-02


In [16]:
# Score every row of input_df; the returned frame keeps the input columns and
# appends one 0/1 column per entry in LABEL_COLUMNS.
result = process_dataframe_with_transformer(input_df)

Some weights of the model checkpoint at microsoft/mdeberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.weight', 'mask_predictions.classifier.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.dense.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.LayerNorm.bias', 'deberta.embeddings.word_embeddings._weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSeque

  0%|          | 0/13 [00:00<?, ?it/s]

In [17]:
# Display the tagged frame (input columns followed by the predicted label columns).
result

Unnamed: 0,sentence,date,morality_binary,emotion_binary,positive_binary,negative_binary
0,Sehr geehrte Frau Präsidentin!,2023-03-02,0,0,0,0
1,Liebe Kolleginnen und Kollegen!,2023-03-02,0,1,0,0
2,"Heute vor einem Jahr, am siebten Tag des russi...",2023-03-02,0,0,0,0
3,"In solchen Minuten überfällt mich die Angst, u...",2023-03-02,1,1,0,0
4,In meiner Fantasie spielen sich jetzt schon hu...,2023-03-02,1,1,0,0
...,...,...,...,...,...,...
95,Denn wir sind gut durch diesen Winter gekommen...,2023-03-02,0,1,0,0
96,"Nichts davon ist eingetreten, weil wir entschl...",2023-03-02,0,0,1,0
97,Und hinter diesem „Wir“ steht unser ganzes Lan...,2023-03-02,0,0,0,0
98,Dadurch sind die Großhandelspreise gefallen; d...,2023-03-02,0,0,0,0


In [20]:
# Write the tagged frame to the current working directory; index=False keeps
# the row index out of the file.
result.to_csv('example_results.csv', index=False)