In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import evaluate
import torch
import random
import numpy as np
import pandas as pd
from IPython.display import display
pd.options.display.max_columns = None
from sklearn.dummy import DummyClassifier
from tqdm import tqdm
from datasets import Dataset
from collections import Counter
from transformers import set_seed, AutoTokenizer, DebertaForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification

In [3]:
import os

# Process-wide switches; set here, right after the imports, before any model code runs.
os.environ.update({
    'CUDA_LAUNCH_BLOCKING': "1",  # synchronous CUDA kernel launches (easier-to-read CUDA errors)
    'TRANSFORMERS_NO_ADVISORY_WARNINGS': 'true',  # silence transformers' advisory warnings
})

In [4]:
from const import *
from utils import *

In [5]:
# Load the seqeval metric once; compute_metrics() below calls seqeval.compute() on this object.
seqeval = evaluate.load("seqeval")

Using the latest cached version of the module from /home/nhwpstam/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--seqeval/541ae017dc683f85116597d48f621abc7b21b88dc42ec937c71af5415f0af63c (last modified on Wed Jul  5 16:55:09 2023) since it couldn't be found locally at evaluate-metric--seqeval, or remotely on the Hugging Face Hub.


# Preparation: metric computation and baseline predictors

In [6]:
def compute_metrics(predictions, labels):
    """Score sequences of tag ids with seqeval.

    Tag ids are turned into tag names by their position in the key order of the
    module-level ``label2id`` mapping (the mapping of the level currently being
    evaluated). Positions whose gold id is -100 are dropped from both the
    predictions and the references before scoring.

    Args:
        predictions: one sequence of predicted tag ids per instance.
        labels: one sequence of gold tag ids per instance, aligned position by
            position with ``predictions``.

    Returns:
        Whatever ``seqeval.compute`` returns for the converted sequences.
    """
    # Build the id -> name lookup once instead of once per token.
    tag_names = list(label2id.keys())

    true_predictions = []
    true_labels = []
    for prediction, label in zip(predictions, labels):
        # Keep only the positions that carry a real gold tag.
        kept = [(p, l) for p, l in zip(prediction, label) if l != -100]
        true_predictions.append([tag_names[p] for p, _ in kept])
        true_labels.append([tag_names[l] for _, l in kept])

    return seqeval.compute(predictions=true_predictions, references=true_labels, scheme='IOB2')

In [7]:
def majority_predict(y_train, y_val):
    """Majority baseline: predict the most frequent training tag everywhere.

    Args:
        y_train: training tag sequences (one sequence per instance).
        y_val: tag sequences to predict for; only their lengths are used.

    Returns:
        One list per sequence in ``y_val``, of the same length, filled with
        the tag that occurs most often across all of ``y_train``.
    """
    tag_counts = Counter(tag for sequence in y_train for tag in sequence)
    top_tag, _ = tag_counts.most_common(1)[0]
    return [[top_tag] * len(sequence) for sequence in y_val]

In [8]:
def random_predict(y_train, y_val, seed=97):
    """Random baseline: draw a tag uniformly at random for every position.

    Args:
        y_train: training tag sequences; their distinct tags (sorted) form the
            pool to draw from.
        y_val: tag sequences to predict for; only their lengths are used.
        seed: seed of the random generator. The same seed always yields the
            same predictions.

    Returns:
        One list per sequence in ``y_val``, of the same length, holding the
        randomly drawn tags.
    """
    # A private generator produces the same draws as seeding the module-level
    # one, but leaves the global random state of the notebook untouched.
    rng = random.Random(seed)
    values = sorted({item for sublist in y_train for item in sublist})
    return [[rng.choice(values) for _ in instance] for instance in y_val]

In [9]:
majority_results = []
random_results = []

# Tag mappings per annotation level. Driving the loop from this table replaces
# an if/elif chain whose fallback only printed a message and then carried on
# with whatever mappings were left over from the previous iteration.
level_mappings = {
    'macro_l1': (id2label_macro_l1, label2id_macro_l1),
    'macro_l2': (id2label_macro_l2, label2id_macro_l2),
    'micro_l1': (id2label_micro_l1, label2id_micro_l1),
    'micro_l2': (id2label_micro_l2, label2id_micro_l2),
}

# id2label / label2id are rebound as notebook globals on every iteration;
# compute_metrics() reads label2id from there.
for level, (id2label, label2id) in tqdm(level_mappings.items()):
    dataset = get_dataset(None)
    dataset = dataset.add_column("labels", dataset[level + '_tags'])

    # Every second fold is a test fold. fold + 1 is left out of training as
    # well -- presumably the validation fold of the trained models (TODO confirm).
    for fold in range(0, 10, 2):
        test_data = dataset.filter(lambda x: x["fold"] == fold)
        train_data = dataset.filter(lambda x: x["fold"] not in [fold, fold+1])

        X_train = train_data['tokens']
        X_test = test_data['tokens']
        y_train = train_data[level + '_tags']
        y_test = test_data[level + '_tags']

        # One result row per (level, fold) and baseline.
        majority_result = compute_metrics(majority_predict(y_train, y_test), y_test)
        majority_result.update(fold=fold, level=level)
        majority_results.append(majority_result)

        random_result = compute_metrics(random_predict(y_train, y_test), y_test)
        random_result.update(fold=fold, level=level)
        random_results.append(random_result)
                

  0%|          | 0/4 [00:00<?, ?it/s]

Filter:   0%|          | 0/1320 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1320 [00:00<?, ? examples/s]

  _warn_prf(average, modifier, msg_start, len(result))


Filter:   0%|          | 0/1320 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1320 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1320 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1320 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1320 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1320 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1320 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1320 [00:00<?, ? examples/s]

 25%|██▌       | 1/4 [00:05<00:15,  5.13s/it]

Filter:   0%|          | 0/1320 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1320 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1320 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1320 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1320 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1320 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1320 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1320 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1320 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1320 [00:00<?, ? examples/s]

 50%|█████     | 2/4 [00:10<00:10,  5.09s/it]

Filter:   0%|          | 0/1320 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1320 [00:00<?, ? examples/s]

  _warn_prf(average, modifier, msg_start, len(result))


Filter:   0%|          | 0/1320 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1320 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1320 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1320 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1320 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1320 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1320 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1320 [00:00<?, ? examples/s]

 75%|███████▌  | 3/4 [00:15<00:05,  5.08s/it]

Filter:   0%|          | 0/1320 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1320 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1320 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1320 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1320 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1320 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1320 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1320 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1320 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1320 [00:00<?, ? examples/s]

100%|██████████| 4/4 [00:20<00:00,  5.04s/it]


In [10]:
# One row per (level, fold); nested per-class score dicts are flattened into
# dotted column names such as 'Hauptteil.f1'.
majority_df, random_df = map(pd.json_normalize, (majority_results, random_results))

# majority classifier: results

In [11]:
# Aggregate scores reported for every annotation level.
cols = ['overall_precision', 'overall_recall', 'overall_f1', 'overall_accuracy']

# Class names per annotation level. The '<class>.f1' and '<class>.number'
# column lists below are derived from these, so the two lists of a level
# cannot drift apart.
ma1_classes = ['Einleitung', 'Hauptteil', 'Konklusion']
ma2_classes = ['Argument', 'Gegenargument']
mi1_classes = ['Claim', 'Gegenthese', 'Modifizierte-These', 'Premise', 'Thema', 'These']
mi2_classes = ['Abwägen', 'Auffordern', 'Begründen', 'Beschreiben', 'Einschränken',
               'Exemplifizieren', 'Konzedieren', 'Positionieren', 'Referieren', 'Schlussfolgern']

cols_ma1 = cols + [f'{name}.f1' for name in ma1_classes]
ma1_num = [f'{name}.number' for name in ma1_classes]
cols_ma2 = cols + [f'{name}.f1' for name in ma2_classes]
ma2_num = [f'{name}.number' for name in ma2_classes]
cols_mi1 = cols + [f'{name}.f1' for name in mi1_classes]
mi1_num = [f'{name}.number' for name in mi1_classes]
cols_mi2 = cols + [f'{name}.f1' for name in mi2_classes]
mi2_num = [f'{name}.number' for name in mi2_classes]

In [12]:
# Majority baseline, macro_l1: scores averaged over the test folds.
# The mean row is selected by label, not by its position in describe().
majority_df.loc[majority_df.level == 'macro_l1', cols_ma1].describe().loc[['mean']]

Unnamed: 0,overall_precision,overall_recall,overall_f1,overall_accuracy,Einleitung.f1,Hauptteil.f1,Konklusion.f1
mean,0.569697,0.469253,0.514581,0.859068,0.0,0.567495,0.0


In [18]:
# Majority baseline, macro_l2: scores averaged over the test folds.
# The mean row is selected by label, not by its position in describe().
majority_df.loc[majority_df.level == 'macro_l2', cols_ma2].describe().loc[['mean']]

Unnamed: 0,overall_precision,overall_recall,overall_f1,overall_accuracy,Argument.f1,Gegenargument.f1
mean,0.00303,0.001504,0.00201,0.559866,0.00202,0.0


In [19]:
# Majority baseline, micro_l1: scores averaged over the test folds.
# The mean row is selected by label, not by its position in describe().
majority_df.loc[majority_df.level == 'micro_l1', cols_mi1].describe().loc[['mean']]

Unnamed: 0,overall_precision,overall_recall,overall_f1,overall_accuracy,Claim.f1,Gegenthese.f1,Modifizierte-These.f1,Premise.f1,Thema.f1,These.f1
mean,0.0,0.0,0.0,0.411902,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# Majority baseline, micro_l2: scores averaged over the test folds.
# The mean row is selected by label, not by its position in describe().
majority_df.loc[majority_df.level == 'micro_l2', cols_mi2].describe().loc[['mean']]

Unnamed: 0,overall_precision,overall_recall,overall_f1,overall_accuracy,Abwägen.f1,Auffordern.f1,Begründen.f1,Beschreiben.f1,Einschränken.f1,Exemplifizieren.f1,Konzedieren.f1,Positionieren.f1,Referieren.f1,Schlussfolgern.f1
mean,0.010606,0.002268,0.003737,0.239666,0.0,0.0,0.0,0.009432,0.0,0.0,0.0,0.0,0.0,0.0


# random classifier: results

In [21]:
# Random baseline, macro_l1: scores averaged over the test folds.
# The mean row is selected by label, not by its position in describe().
random_df.loc[random_df.level == 'macro_l1', cols_ma1].describe().loc[['mean']]

Unnamed: 0,overall_precision,overall_recall,overall_f1,overall_accuracy,Einleitung.f1,Hauptteil.f1,Konklusion.f1
mean,0.0,0.0,0.0,0.142895,0.0,0.0,0.0


In [22]:
# Random baseline, macro_l2: scores averaged over the test folds.
# The mean row is selected by label, not by its position in describe().
random_df.loc[random_df.level == 'macro_l2', cols_ma2].describe().loc[['mean']]

Unnamed: 0,overall_precision,overall_recall,overall_f1,overall_accuracy,Argument.f1,Gegenargument.f1
mean,0.0,0.0,0.0,0.197735,0.0,0.0


In [23]:
# Random baseline, micro_l1: scores averaged over the test folds.
# The mean row is selected by label, not by its position in describe().
random_df.loc[random_df.level == 'micro_l1', cols_mi1].describe().loc[['mean']]

Unnamed: 0,overall_precision,overall_recall,overall_f1,overall_accuracy,Claim.f1,Gegenthese.f1,Modifizierte-These.f1,Premise.f1,Thema.f1,These.f1
mean,0.0,0.0,0.0,0.078979,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
# Random baseline, micro_l2: scores averaged over the test folds.
# The mean row is selected by label, not by its position in describe().
random_df.loc[random_df.level == 'micro_l2', cols_mi2].describe().loc[['mean']]

Unnamed: 0,overall_precision,overall_recall,overall_f1,overall_accuracy,Abwägen.f1,Auffordern.f1,Begründen.f1,Beschreiben.f1,Einschränken.f1,Exemplifizieren.f1,Konzedieren.f1,Positionieren.f1,Referieren.f1,Schlussfolgern.f1
mean,0.0,0.0,0.0,0.044518,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Class numbers (per-class `number` values, summed over the test folds)

In [25]:
# ma1_num, ma2_num, mi1_num and mi2_num are defined together with the score
# column lists further up; they are not repeated here, so each class name is
# maintained in one place only.
num_cols = ma1_num + ma2_num + mi1_num + mi2_num

In [26]:
# macro_l1: per-class '.number' values added up over the test folds
# (read from the random-baseline result rows).
random_df.loc[random_df['level'] == 'macro_l1', ma1_num].sum()

Einleitung.number     51.0
Hauptteil.number     665.0
Konklusion.number     86.0
dtype: float64

In [27]:
# macro_l2: per-class '.number' values added up over the test folds
# (read from the random-baseline result rows).
random_df.loc[random_df['level'] == 'macro_l2', ma2_num].sum()

Argument.number         1319.0
Gegenargument.number      16.0
dtype: float64

In [28]:
# micro_l1: per-class '.number' values added up over the test folds
# (read from the random-baseline result rows).
random_df.loc[random_df['level'] == 'micro_l1', mi1_num].sum()

Claim.number                 1532.0
Gegenthese.number               7.0
Modifizierte-These.number     138.0
Premise.number                501.0
Thema.number                   47.0
These.number                  832.0
dtype: float64

In [29]:
# micro_l2: per-class '.number' values added up over the test folds
# (read from the random-baseline result rows).
random_df.loc[random_df['level'] == 'micro_l2', mi2_num].sum()

Abwägen.number              9.0
Auffordern.number          76.0
Begründen.number          755.0
Beschreiben.number        836.0
Einschränken.number        80.0
Exemplifizieren.number     29.0
Konzedieren.number         62.0
Positionieren.number      871.0
Referieren.number           8.0
Schlussfolgern.number     424.0
dtype: float64