In [1]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch
from collections import defaultdict

class NERPredictor:
    def __init__(self, model_name):
        self.model = AutoModelForTokenClassification.from_pretrained(model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.id2label = self.model.config.id2label

    def _aggregate_tokens(self, tokens, predictions):
        word_predictions = defaultdict(list)
        current_word = ""
        for token, pred in zip(tokens, predictions):
            if token.startswith("##"):
                current_word += token[2:]
            else:
                if current_word:
                    yield current_word, word_predictions[current_word]
                    word_predictions[current_word] = []
                current_word = token
            word_predictions[current_word].append(self.id2label[pred.item()])
        if current_word:
            yield current_word, word_predictions[current_word]

    def _get_most_common(self, labels):
        return max(set(labels), key=labels.count)

    def predict(self, text):
        inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        with torch.no_grad():
            outputs = self.model(**inputs)
        predictions = outputs.logits.argmax(dim=-1)[0]  # Take the first (and only) sequence
        tokens = self.tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
        
        # Filter out special tokens and their corresponding predictions
        filtered_tokens = []
        filtered_predictions = []
        for token, pred in zip(tokens, predictions):
            if token not in self.tokenizer.all_special_tokens:
                filtered_tokens.append(token)
                filtered_predictions.append(pred)
        
        return [self._get_most_common(labels) for _, labels in self._aggregate_tokens(filtered_tokens, filtered_predictions)]

# Usage
D_predictor = NERPredictor("ArjanvD95/by_the_horns_D42G")
T_predictor = NERPredictor("ArjanvD95/by_the_horns_T42G")


In [2]:
import json

#open jsonl file
with open('data/by_the_horns_D/holdout.jsonl') as f:
    data = f.readlines()

#get text attribute from json file
texts = [json.loads(d)['text'] for d in data]

#do predictions on all texts, store in list
D_lllmaaa_predictions = [D_predictor.predict(text) for text in texts]
T_lllmaaa_predictions = [T_predictor.predict(text) for text in texts]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [3]:
D_llm_predictions_path = "llm_annotations_with_D_demos.jsonl"

#open jsonl file
with open(D_llm_predictions_path) as f:
    D_data = f.readlines()

#get tags attribute from jsonl file
D_llm_predictions = [json.loads(d)['tags'] for d in D_data]

In [4]:
T_llm_predictions_path = "llm_annotations_with_T_demos.jsonl"
#open jsonl file
with open(T_llm_predictions_path) as f:
    T_data = f.readlines()

#get tags attribute from jsonl file
T_llm_predictions = [json.loads(d)['tags'] for d in T_data]

In [5]:
D_human_path = "data/by_the_horns_D/holdout.jsonl"

#open jsonl file
with open(D_human_path) as f:
    D_human = f.readlines()

#get tags attribute from jsonl file
D_hum_predictions = [json.loads(d)['tags'] for d in D_human]

In [6]:
T_human_path = "data/by_the_horns_T/holdout.jsonl"

#open jsonl file
with open(T_human_path) as f:
    T_human = f.readlines()

#get tags attribute from jsonl file
T_hum_predictions = [json.loads(d)['tags'] for d in T_human]

In [9]:
from seqeval.metrics import precision_score, recall_score, f1_score, accuracy_score
from seqeval.metrics import classification_report

def evaluate_ner(true_labels, pred_labels):
    # Calculate metrics
    precision_micro = precision_score(true_labels, pred_labels, average='micro')
    recall_micro = recall_score(true_labels, pred_labels, average='micro')
    f1_micro = f1_score(true_labels, pred_labels, average='micro')
    
    precision_macro = precision_score(true_labels, pred_labels, average='macro')
    recall_macro = recall_score(true_labels, pred_labels, average='macro')
    f1_macro = f1_score(true_labels, pred_labels, average='macro')
    
    accuracy = accuracy_score(true_labels, pred_labels)
    
    # Get detailed classification report
    report = classification_report(true_labels, pred_labels)
    
    return {
        'precision_micro': precision_micro,
        'recall_micro': recall_micro,
        'f1_micro': f1_micro,
        'precision_macro': precision_macro,
        'recall_macro': recall_macro,
        'f1_macro': f1_macro,
        'accuracy': accuracy,
        'classification_report': report
    }


# D_lllmaaa_predictions, T_lllmaaa_predictions, D_llm_predictions, T_llm_predictions, T_hum_predictions, D_hum_predictions
# Evaluate Method 1
results_method1 = evaluate_ner(T_hum_predictions, T_lllmaaa_predictions)
print("Results for Method 1:")
for metric, value in results_method1.items():
    if metric != 'classification_report':
        print(f"{metric}: {value:.4f}")
print("\nClassification Report for Method 1:")
print(results_method1['classification_report'])


Results for Method 1:
precision_micro: 0.6049
recall_micro: 0.5904
f1_micro: 0.5976
precision_macro: 0.4695
recall_macro: 0.4690
f1_macro: 0.4652
accuracy: 0.9619

Classification Report for Method 1:
                    precision    recall  f1-score   support

Collective-Literal       0.00      0.00      0.00         0
 Organisms-Literal       0.89      0.89      0.89        27
     Parts-Literal       0.31      0.43      0.36        28
  Products-Literal       0.68      0.56      0.61       111

         micro avg       0.60      0.59      0.60       166
         macro avg       0.47      0.47      0.47       166
      weighted avg       0.65      0.59      0.62       166



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [1]:
from sklearn.metrics import precision_recall_fscore_support
from collections import Counter

def calculate_metrics(y_true, y_pred, exclude_labels=None):
    if exclude_labels is None:
        exclude_labels = []
    
    # Flatten the lists
    y_true_flat = [item for sublist in y_true for item in sublist]
    y_pred_flat = [item for sublist in y_pred for item in sublist]
    
    # Get unique labels
    labels = sorted(set(y_true_flat + y_pred_flat))
    
    # Calculate metrics
    precision, recall, f1, support = precision_recall_fscore_support(y_true_flat, y_pred_flat, labels=labels, average=None)
    
    # Calculate micro and macro averages
    precision_micro, recall_micro, f1_micro, _ = precision_recall_fscore_support(y_true_flat, y_pred_flat, labels=labels, average='micro')
    precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(y_true_flat, y_pred_flat, labels=labels, average='macro')
    
    # Calculate micro average without excluded labels
    filtered_labels = [label for label in labels if label not in exclude_labels]
    precision_micro_filtered, recall_micro_filtered, f1_micro_filtered, _ = precision_recall_fscore_support(
        y_true_flat,
        y_pred_flat,
        labels=filtered_labels,
        average='micro'
    )
    
    # Create classification report
    report = {label: {'precision': p, 'recall': r, 'f1-score': f, 'support': s} 
              for label, p, r, f, s in zip(labels, precision, recall, f1, support)}
    
    report['micro avg'] = {'precision': precision_micro, 'recall': recall_micro, 'f1-score': f1_micro, 'support': len(y_true_flat)}
    report['macro avg'] = {'precision': precision_macro, 'recall': recall_macro, 'f1-score': f1_macro, 'support': len(y_true_flat)}
    report['micro avg (filtered)'] = {'precision': precision_micro_filtered, 'recall': recall_micro_filtered, 'f1-score': f1_micro_filtered, 'support': sum(s for label, s in zip(labels, support) if label not in exclude_labels)}
    
    return report


def print_f1_scores(report):
    print("F1 Scores for each label:")
    print("{:<30} {:<10} {:<10}".format("Label", "F1 Score", "Support"))
    print("-" * 50)
    
    for label, metrics in report.items():
        if label not in ['micro avg', 'macro avg', 'micro avg (filtered)']:
            f1 = metrics['f1-score']
            support = metrics['support']
            print("{:<30} {:<10.4f} {:<10}".format(label, f1, support))
    
    print("\nOverall Scores:")
    print("{:<25} {:<10.4f}".format("Micro F1:", report['micro avg']['f1-score']))
    print("{:<25} {:<10.4f}".format("Macro F1:", report['macro avg']['f1-score']))
    print("{:<25} {:<10.4f}".format("Micro F1 (without 'O'):", report['micro avg (filtered)']['f1-score']))

def evaluate_ner(true_labels, pred_labels, exclude_labels=None):
    report = calculate_metrics(true_labels, pred_labels, exclude_labels)
    return report

In [11]:
results_method1 = calculate_metrics(T_hum_predictions, T_lllmaaa_predictions, "O")
print("Results for Method 1:")
print_f1_scores(results_method1)

Results for Method 1:
F1 Scores for each label:
Label                          F1 Score   Support   
--------------------------------------------------
Animals-Collective-Literal     0.0000     0         
Animals-Organisms-Literal      0.9057     26        
Animals-Parts-Literal          0.3438     28        
Animals-Products-Literal       0.5342     88        
O                              0.9886     2640      
Plants-Collective-Literal      0.0000     0         
Plants-Organisms-Literal       0.0000     1         
Plants-Parts-Literal           0.0000     0         
Plants-Products-Literal        0.5538     27        

Overall Scores:
Micro F1:                 0.9619    
Macro F1:                 0.3696    
Micro F1 (without 'O'):   0.5444    


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
from tabulate import tabulate
from sklearn.metrics import precision_recall_fscore_support
import numpy as np

def compare_ner_models(true_labels, pred_labels_list, method_names, exclude_labels=None):
    if exclude_labels is None:
        exclude_labels = []

    results = []
    for pred_labels in pred_labels_list:
        report = calculate_metrics(true_labels, pred_labels, exclude_labels)
        results.append(report)

    metrics = ['micro avg', 'macro avg', 'micro avg (filtered)']
    tables = []

    for metric in metrics:
        table_data = []
        for i, method in enumerate(method_names):
            row = [method]
            row.append(f"{results[i][metric]['precision']:.4f}")
            row.append(f"{results[i][metric]['recall']:.4f}")
            row.append(f"{results[i][metric]['f1-score']:.4f}")
            table_data.append(row)

        headers = ["Method", "Precision", "Recall", "F1-Score"]
        table = tabulate(table_data, headers=headers, tablefmt="grid")
        tables.append((metric, table))

    return tables

# Example usage:
true_labels = T_hum_predictions  # Your true labels


#
zwartbol_path = "zwartbol.jsonl"
#open jsonl file
with open(zwartbol_path) as f:
    zwartbol_data = f.readlines()

#get tags attribute from jsonl file
zwartbol_predictions = [json.loads(d)['tags'] for d in zwartbol_data]

kwarkbol_path = "kwarkbol.jsonl"
#open jsonl file
with open(kwarkbol_path) as f:
    kwarkbol_data = f.readlines()

#get tags attribute from jsonl file
kwarkbol_predictions = [json.loads(d)['tags'] for d in kwarkbol_data]

#


pred_labels_list = [D_lllmaaa_predictions, T_lllmaaa_predictions, D_llm_predictions, T_llm_predictions, D_hum_predictions, T_hum_predictions, zwartbol_predictions, kwarkbol_predictions]

method_names = ["D_LLLMaAA", "T_LLMaAA", "D_llm", "T_llm", "D_hum", "zwartbol", "kwarkbol"]

tables = compare_ner_models(true_labels, pred_labels_list, method_names, exclude_labels=['O'])

for metric, table in tables:
    print(f"\n{metric.upper()} Table:")
    print(table)
    print("\n")

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



MICRO AVG Table:
+-----------+-------------+----------+------------+
| Method    |   Precision |   Recall |   F1-Score |
| D_LLLMaAA |      0.9648 |   0.9648 |     0.9648 |
+-----------+-------------+----------+------------+
| T_LLMaAA  |      0.9619 |   0.9619 |     0.9619 |
+-----------+-------------+----------+------------+
| D_llm     |      0.9537 |   0.9537 |     0.9537 |
+-----------+-------------+----------+------------+
| T_llm     |      0.9491 |   0.9491 |     0.9491 |
+-----------+-------------+----------+------------+
| D_hum     |      0.9851 |   0.9851 |     0.9851 |
+-----------+-------------+----------+------------+
| zwartbol  |      1      |   1      |     1      |
+-----------+-------------+----------+------------+
| kwarkbol  |      0.9648 |   0.9648 |     0.9648 |
+-----------+-------------+----------+------------+



MACRO AVG Table:
+-----------+-------------+----------+------------+
| Method    |   Precision |   Recall |   F1-Score |
| D_LLLMaAA |      0.4491 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [8]:
import jsonlines
import pandas as pd
from sklearn.metrics import f1_score
import os

# Function to read JSONL file and extract tags
def read_tags(file_path):
    tags = []
    with jsonlines.open(file_path) as reader:
        for obj in reader:
            tags.extend(obj['tags'])
    return tags

# Path to the folder containing the files
folder_path = 'data/by_the_horns_T/predictions_with_demo'
human_annotation_path = 'data/by_the_horns_D/holdout.jsonl'

# Get list of files
files = [os.path.join(folder_path, file) for file in os.listdir(folder_path) if file.endswith('.jsonl')]

# Load tags from all files
all_tags = [read_tags(file) for file in files]

# Load tags from the human annotation file
human_tags = read_tags(human_annotation_path)

# Calculate weighted F1 scores
f1_scores = {}
for i, tags in enumerate(all_tags):
    f1 = f1_score(human_tags, tags, average='weighted', labels=list(set(human_tags + tags)))
    f1_scores[f'File_{i+1}'] = f1

# Create a DataFrame to display the scores
df_f1_scores = pd.DataFrame(list(f1_scores.items()), columns=['File', 'Weighted F1 Score'])

# Print the DataFrame
print(df_f1_scores)


     File  Weighted F1 Score
0  File_1           0.966608
1  File_2           0.964809
2  File_3           0.966147
3  File_4           0.969881
4  File_5           0.969688


In [19]:
import numpy as np
import csv

def pairwise_ner_comparison(pred_labels_list, method_names, exclude_labels=None):
    if exclude_labels is None:
        exclude_labels = [] 

    n_methods = len(pred_labels_list)
    metrics = ['micro avg', 'macro avg', 'micro avg (filtered)']
    results = {metric: np.zeros((n_methods, n_methods)) for metric in metrics}

    for i in range(n_methods):
        for j in range(n_methods):
            if i != j:
                report = calculate_metrics(pred_labels_list[i], pred_labels_list[j], exclude_labels)
                for metric in metrics:
                    results[metric][i, j] = report[metric]['f1-score']

    # Create CSV file
    with open('pegel.csv', 'w', newline='') as csvfile: #!!!
        writer = csv.writer(csvfile)
        writer.writerow(['Metric'] + method_names)

        for metric in metrics:
            writer.writerow([f"\n{metric.upper()}"])
            for i, method in enumerate(method_names):
                row = [method] + [f"{score:.4f}" for score in results[metric][i]]
                writer.writerow(row)

    return results

# Example usage:
pred_labels_list = [D_lllmaaa_predictions, T_lllmaaa_predictions, D_llm_predictions, T_llm_predictions, D_hum_predictions, T_hum_predictions,zwartbol_predictions, kwarkbol_predictions]
pred_labels_list.reverse()
method_names = ["D_LLLMaAA", "T_LLMaAA", "D_llm", "T_llm", "D_hum", "T_hum", "zwartbol", "kwarkbol"]
method_names.reverse()
results = pairwise_ner_comparison(pred_labels_list, method_names, exclude_labels=['O'])

# Print matrices (optional)
for metric, matrix in results.items():
    print(f"\n{metric.upper()} Matrix:")
    print(np.array2string(matrix, precision=4, suppress_small=True))
    print("\n")

print("Results have been saved to 'pegel.csv'")

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr


MICRO AVG Matrix:
[[0.     0.9765 0.9641 0.9658 0.9605 0.9619 0.963  0.9662]
 [0.9765 0.     0.9648 0.9641 0.9633 0.9669 0.9623 0.963 ]
 [0.9641 0.9648 0.     0.9851 0.9491 0.9537 0.9619 0.9648]
 [0.9658 0.9641 0.9851 0.     0.9541 0.958  0.9633 0.9676]
 [0.9605 0.9633 0.9491 0.9541 0.     0.9694 0.953  0.9566]
 [0.9619 0.9669 0.9537 0.958  0.9694 0.     0.9498 0.953 ]
 [0.963  0.9623 0.9619 0.9633 0.953  0.9498 0.     0.9819]
 [0.9662 0.963  0.9648 0.9676 0.9566 0.953  0.9819 0.    ]]



MACRO AVG Matrix:
[[0.     0.6514 0.3466 0.4432 0.4836 0.4467 0.5155 0.4395]
 [0.6514 0.     0.3674 0.482  0.4788 0.5244 0.4106 0.4041]
 [0.3466 0.3674 0.     0.5858 0.2817 0.46   0.3696 0.4381]
 [0.4432 0.482  0.5858 0.     0.3534 0.4831 0.4642 0.5273]
 [0.4836 0.4788 0.2817 0.3534 0.     0.4999 0.374  0.3516]
 [0.4467 0.5244 0.46   0.4831 0.4999 0.     0.3636 0.3828]
 [0.5155 0.4106 0.3696 0.4642 0.374  0.3636 0.     0.6236]
 [0.4395 0.4041 0.4381 0.5273 0.3516 0.3828 0.6236 0.    ]]



MICRO AVG (

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## **Error Analyis**

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch
from collections import defaultdict

class NERPredictor:
    def __init__(self, model_name):
        self.model = AutoModelForTokenClassification.from_pretrained(model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.id2label = self.model.config.id2label

    def _aggregate_tokens(self, tokens, predictions):
        word_predictions = defaultdict(list)
        current_word = ""
        current_start = 0
        for token, pred in zip(tokens, predictions):
            if token.startswith("##"):
                current_word += token[2:]
            else:
                if current_word:
                    yield current_word, word_predictions[current_word], current_start
                    word_predictions[current_word] = []
                    current_start += len(current_word) + 1  # +1 for space
                current_word = token
            word_predictions[current_word].append(self.id2label[pred.item()])
        if current_word:
            yield current_word, word_predictions[current_word], current_start

    def _get_most_common(self, labels):
        return max(set(labels), key=labels.count)

    def predict(self, text):
        inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        with torch.no_grad():
            outputs = self.model(**inputs)
        predictions = outputs.logits.argmax(dim=-1)[0]  # Take the first (and only) sequence
        tokens = self.tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
        
        # Filter out special tokens and their corresponding predictions
        filtered_tokens = []
        filtered_predictions = []
        for token, pred in zip(tokens, predictions):
            if token not in self.tokenizer.all_special_tokens:
                filtered_tokens.append(token)
                filtered_predictions.append(pred)
        
        results = []
        for word, labels, start in self._aggregate_tokens(filtered_tokens, filtered_predictions):
            most_common_label = self._get_most_common(labels)
            if most_common_label != "O":  # Assuming "O" is used for non-entity tokens
                results.append({
                    "entity": word,
                    "label": most_common_label,
                    "start": start,
                    "end": start + len(word)
                })
        return results

# Usage
D_predictor = NERPredictor("ArjanvD95/by_the_horns_D42G")
T_predictor = NERPredictor("ArjanvD95/by_the_horns_T42G")

In [129]:
tokens = [json.loads(d)['tokens'] for d in D_data]
texts = [json.loads(d)['text'] for d in D_data]

D_hum_path = "data/by_the_horns_D/holdout.jsonl"
T_hum_path = "data/by_the_horns_T/holdout.jsonl"

#open jsonl file
with open(D_hum_path) as f:
    D_data = f.readlines()

with open(T_hum_path) as f:
    T_data = f.readlines()


#get tags attribute from jsonl file
D_llm_predictions = [json.loads(d)['tags'] for d in D_data]

with open('llm_annotations_with_D_demos.jsonl') as f:
    direct_D_data = f.readlines()
    
with open('llm_annotations_with_T_demos.jsonl') as f:
    direct_T_data = f.readlines()

In [130]:
labels_D = [json.loads(d)['labels'] for d in D_data]
labels_T = [json.loads(d)['labels'] for d in T_data]
labels_direct_D = [json.loads(d)['labels'] for d in direct_D_data]
labels_direct_T = [json.loads(d)['labels'] for d in direct_T_data]
labels_indirect_D = [[{'span': item['entity'], 'type': item['label']} for item in entry] for entry in [D_predictor.predict(text) for text in texts]]
labels_indirect_T = [[{'span': item['entity'], 'type': item['label']} for item in entry] for entry in [T_predictor.predict(text) for text in texts]]



In [131]:
# Combine all the annotations into a list of dictionaries
combined_annotations = []
for i in range(len(texts)):
    combined_annotations.append({
        'text': texts[i],
        'labels_D': labels_D[i],
        'labels_T': labels_T[i],
        'labels_direct_D': labels_direct_D[i],
        'labels_direct_T': labels_direct_T[i],
        'labels_indirect_D': labels_indirect_D[i],
        'labels_indirect_T': labels_indirect_T[i]
    })

# Create DataFrame
df = pd.DataFrame(combined_annotations)

In [132]:
df.to_csv("error_analysis.csv")

In [133]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch
from collections import defaultdict

class NERPredictor:
    def __init__(self, model_name):
        self.model = AutoModelForTokenClassification.from_pretrained(model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.id2label = self.model.config.id2label

    def _aggregate_tokens(self, tokens, predictions):
        word_predictions = defaultdict(list)
        current_word = ""
        current_start = 0
        for token, pred in zip(tokens, predictions):
            if token.startswith("##"):
                current_word += token[2:]
            else:
                if current_word:
                    yield current_word, word_predictions[current_word], current_start
                    word_predictions[current_word] = []
                    current_start += len(current_word) + 1  # +1 for space
                current_word = token
            word_predictions[current_word].append(self.id2label[pred.item()])
        if current_word:
            yield current_word, word_predictions[current_word], current_start

    def _get_most_common(self, labels):
        return max(set(labels), key=labels.count)

    def predict(self, text):
        inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        with torch.no_grad():
            outputs = self.model(**inputs)
        predictions = outputs.logits.argmax(dim=-1)[0]  # Take the first (and only) sequence
        tokens = self.tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
        
        # Filter out special tokens and their corresponding predictions
        filtered_tokens = []
        filtered_predictions = []
        for token, pred in zip(tokens, predictions):
            if token not in self.tokenizer.all_special_tokens:
                filtered_tokens.append(token)
                filtered_predictions.append(pred)
        
        results = []
        for word, labels, start in self._aggregate_tokens(filtered_tokens, filtered_predictions):
            most_common_label = self._get_most_common(labels)
            if most_common_label != "O":  # Assuming "O" is used for non-entity tokens
                results.append({
                    "entity": word,
                    "label": most_common_label,
                    "start": start,
                    "end": start + len(word)
                })
        return results

# Usage
D_predictor = NERPredictor("ArjanvD95/by_the_horns_D42G")
T_predictor = NERPredictor("ArjanvD95/by_the_horns_T42G")

In [135]:
def all_tools_same(row):
    annotations = [
        row['labels_D'], 
        row['labels_T'], 
        row['labels_direct_D'], 
        row['labels_direct_T'], 
        row['labels_indirect_D'], 
        row['labels_indirect_T']
    ]
    
    # Flatten the list of annotations and convert to a set of tuples
    annotations_flat = [tuple(item.items()) for sublist in annotations for item in sublist]
    
    # Check if all flattened annotations are the same
    return len(set(annotations_flat)) == 1

# Apply the function to each row
df['all_tools_same'] = df.apply(all_tools_same, axis=1)

# Count how often all tools give the same annotation
same_annotation_count = df['all_tools_same'].sum()

print(f"All tools give the same annotation {same_annotation_count} times.")

All tools give the same annotation 2 times.


In [136]:
import pandas as pd
import json

# Assuming you have your DataFrame `df`
# df = pd.DataFrame({...})

# Prepare the text content
text_content = ""

for index, row in df.iterrows():
    text_content += f"For sentence {index+1}:\n"
    text_content += f"Text: {row['text']}\n"
    text_content += "labels_D has: " + json.dumps(row['labels_D'], indent=4) + "\n"
    text_content += "labels_T has: " + json.dumps(row['labels_T'], indent=4) + "\n"
    text_content += "labels_direct_D has: " + json.dumps(row['labels_direct_D'], indent=4) + "\n"
    text_content += "labels_direct_T has: " + json.dumps(row['labels_direct_T'], indent=4) + "\n"
    text_content += "labels_indirect_D has: " + json.dumps(row['labels_indirect_D'], indent=4) + "\n"
    text_content += "labels_indirect_T has: " + json.dumps(row['labels_indirect_T'], indent=4) + "\n"
    text_content += "\n"

# Save the results to a text file
with open('annotation_comparison_results.txt', 'w') as file:
    file.write(text_content)

print("Results saved to annotation_comparison_results.txt")


Results saved to annotation_comparison_results.txt


In [None]:
src/confusion_matrices.py

In [None]:
python src/ner_confusion_matrices.py --d_human_annotations annotations/parsed_annotations/annotations_D.json  --t_human_annotations annotations/parsed_annotations/annotation_T.json --direct_demo1 data/by_the_horns_D/predictions_with_demo --direct_demo2 data/by_the_horns_T/predictions_with_T --output_folder 