
# ASSIGNMENT TASK 3 : INTER ANNOTATOR AGREEMENT
Task 3: Implementing Inter-Annotator Agreement. Export the annotations as either JSON or CSV files. Using Python code, calculate Cohen’s Kappa and Fleiss’ Kappa: use Cohen’s Kappa for the NLP Dataset Task and Fleiss’ Kappa for the CV Dataset Task. For Fleiss’ Kappa, obtain the third annotation from any other team before calculating the score. Output each agreement score and interpret its significance.

In [None]:
import pandas as pd
from sklearn.metrics import cohen_kappa_score

# Cohen's Kappa for NLP Annotations

In [None]:
# Load annotations from CSV files
def load_csv_annotations(file_path, label_column):
    """Load one annotator's span labels from a CSV export.

    Every cell of ``label_column`` is expected to contain a stringified list
    of span dicts, each carrying a ``"start"`` value and a ``"labels"`` list.
    The spans of a row are ordered by ``"start"`` and each span is reduced to
    its first label.

    Args:
        file_path: Path (or file-like object) accepted by ``pandas.read_csv``.
        label_column: Name of the column holding the stringified span lists.

    Returns:
        A list with one entry per CSV row: that row's labels in span order,
        or ``[]`` when the cell is missing, malformed, or lacks the expected
        keys.
    """
    import ast
    import json

    # Errors that mean "this row has no usable labels"; anything else
    # (e.g. a programming error) is allowed to surface.
    unusable_row_errors = (
        ValueError,       # malformed JSON / literal
        SyntaxError,      # cell is not a Python literal
        TypeError,        # NaN cell, or parsed value is not a list of dicts
        KeyError,         # missing column, "start" or "labels" key
        IndexError,       # span with an empty "labels" list
        AttributeError,
        RecursionError,
        MemoryError,
    )

    data = pd.read_csv(file_path)
    extracted_tags = []
    for _, row in data.iterrows():
        try:
            raw = row[label_column]
            # Parse as data only: the cell content comes from an external
            # file, so it must never be executed (no eval()).
            try:
                tags = json.loads(raw)
            except ValueError:
                # Not valid JSON (e.g. single-quoted Python repr).
                tags = ast.literal_eval(raw)
            ordered = sorted(tags, key=lambda span: span["start"])
            extracted_tags.append([span["labels"][0] for span in ordered])
        except unusable_row_errors:
            extracted_tags.append([])  # Row with invalid or missing labels
    return extracted_tags

# Align POS tags between two annotators
def synchronize_csv_annotations(tags_list1, tags_list2):
    """Flatten two annotators' per-row tags into two equal-length lists.

    Rows are paired by position. Within each pair of rows only the leading
    tags that both annotators provided are kept, so the longer row is
    truncated to the length of the shorter one.

    Returns:
        A tuple ``(flat_tags_1, flat_tags_2)`` of flat tag lists.
    """
    flat_first = []
    flat_second = []
    for row_first, row_second in zip(tags_list1, tags_list2):
        # zip() stops at the shorter row, which gives the same truncation
        # as slicing both rows to their common length.
        for tag_first, tag_second in zip(row_first, row_second):
            flat_first.append(tag_first)
            flat_second.append(tag_second)
    return flat_first, flat_second



In [None]:
# Exported NLP annotation files, one per annotator
csv_file_annotator1 = "het_nlp_csv.csv"
csv_file_annotator2 = "shivraj_nlp_csv.csv"

# Parse both exports using the same label column
data_annotator1, data_annotator2 = (
    load_csv_annotations(path, label_column="label")
    for path in (csv_file_annotator1, csv_file_annotator2)
)

# Flatten the per-row tags into two lists that can be compared item by item
aligned_tags_annotator1, aligned_tags_annotator2 = synchronize_csv_annotations(
    data_annotator1,
    data_annotator2,
)

In [None]:
# Show each annotator's tag side by side, one line per aligned position
for index, (tag_first, tag_second) in enumerate(
    zip(aligned_tags_annotator1, aligned_tags_annotator2)
):
    print(f'Tag Index: {index} ----> {tag_first}   :    {tag_second}')

# Cohen's Kappa over the two aligned tag sequences
agreement_score = cohen_kappa_score(
    aligned_tags_annotator1,
    aligned_tags_annotator2,
)

# Report the score to four decimal places
print(f"Cohen's Kappa Score: {agreement_score:.4f}")

Tag Index: 0 ----> PROPN   :    PROPN
Tag Index: 1 ----> ADP   :    NOUN
Tag Index: 2 ----> PROPN   :    PROPN
Tag Index: 3 ----> ADP   :    ADJ
Tag Index: 4 ----> NUM   :    VERB
Tag Index: 5 ----> NOUN   :    NOUN
Tag Index: 6 ----> X   :    X
Tag Index: 7 ----> X   :    DET
Tag Index: 8 ----> DET   :    DET
Tag Index: 9 ----> NOUN   :    NOUN
Tag Index: 10 ----> PART   :    PART
Tag Index: 11 ----> PRON_WH   :    PRON_WH
Tag Index: 12 ----> ADJ   :    ADJ
Tag Index: 13 ----> NOUN   :    NOUN
Tag Index: 14 ----> ADP   :    ADJ
Tag Index: 15 ----> PART   :    PART
Tag Index: 16 ----> X   :    X
Tag Index: 17 ----> NUM   :    DET
Tag Index: 18 ----> CONJ   :    CONJ
Tag Index: 19 ----> PROPN   :    PROPN
Tag Index: 20 ----> NOUN   :    NOUN
Tag Index: 21 ----> PROPN   :    PROPN
Tag Index: 22 ----> ADP   :    ADP
Tag Index: 23 ----> VERB   :    VERB
Tag Index: 24 ----> PRON   :    PRON
Tag Index: 25 ----> NOUN   :    NOUN
Tag Index: 26 ----> X   :    X
Tag Index: 27 ----> PROPN   :    

In [None]:
# Map the agreement score to a verbal band.
# Bands are checked from highest to lowest; the score must be strictly
# greater than a band's lower bound to fall into it.
agreement_bands = (
    (0.8, "Almost Perfect Agreement"),
    (0.6, "Substantial Agreement"),
    (0.4, "Moderate Agreement"),
    (0.2, "Fair Agreement"),
    (0, "Slight Agreement"),
)
score_interpretation = next(
    (label for lower_bound, label in agreement_bands if agreement_score > lower_bound),
    "No Agreement",  # score is not above any band's lower bound
)

print(f"Interpretation: {score_interpretation}")

Interpretation: Moderate Agreement


*  A Cohen's Kappa score of 0.5223 falls within the range of moderate agreement (0.41–0.60).
*  This suggests that there is some degree of discrepancy in the annotations made by the two annotators, despite their reasonable level of agreement.
*  This score indicates that although the annotators agree on a large number of judgments (in this case, POS tagging), there are some points of contention, and the annotation procedures or standards would require more work to reach greater agreement.



# Fleiss's Kappa for CV Annotations

In [None]:
import pandas as pd
import numpy as np
from statsmodels.stats.inter_rater import fleiss_kappa
from collections import defaultdict
import statsmodels

In [None]:
# CV annotation exports, one CSV file per annotator
# (assumed to contain annotations from different annotators)
csv_files = [
    'het_cv_csv.csv',
    'shivraj_cv_csv.csv',
    'TEAM2_cv_csv.csv',
]

def compute_fleiss_kappa(csv_files, label_column="label", image_column="image"):
    """Compute Fleiss' Kappa for several annotators' CSV label exports.

    Each file supplies one annotator's labels. Row ``i`` of every file is
    treated as the same image, so the files must list the images in the same
    order and have the same number of rows.

    Args:
        csv_files: One CSV path per annotator.
        label_column: Name of the column holding the class label.
        image_column: Accepted for interface compatibility; not used, since
            rows are matched by position only.

    Returns:
        The Fleiss' Kappa score computed by ``statsmodels``.

    Raises:
        ValueError: If ``csv_files`` is empty or the files do not all have
            the same number of rows.
    """
    if not csv_files:
        raise ValueError("csv_files must contain at least one annotation file")

    # One list of labels per annotator, in file order.
    annotation_matrix = [
        list(pd.read_csv(file_path)[label_column]) for file_path in csv_files
    ]

    row_counts = [len(annotations) for annotations in annotation_matrix]
    if len(set(row_counts)) != 1:
        raise ValueError(
            f"All annotation files must have the same number of rows, got {row_counts}"
        )

    # Display aligned annotations for each image (any number of annotators).
    for image_number, labels in enumerate(zip(*annotation_matrix), start=1):
        per_annotator = ", ".join(
            f"Annotator {annotator_number} -> {label}"
            for annotator_number, label in enumerate(labels, start=1)
        )
        print(f"Image {image_number}: {per_annotator}")

    # aggregate_raters expects subjects (images) in rows and raters
    # (annotators) in columns, so the per-annotator lists are transposed.
    ratings = np.asarray(annotation_matrix).T

    # Aggregate to per-image category counts: (count table, category labels).
    fleiss_table = statsmodels.stats.inter_rater.aggregate_raters(ratings)
    print("Fleiss Table:\n", fleiss_table)

    # Compute Fleiss' Kappa score from the count table.
    kappa_score = statsmodels.stats.inter_rater.fleiss_kappa(fleiss_table[0], method='fleiss')
    return kappa_score


# Run the agreement computation over every annotator's CV export
fleiss_kappa_result = compute_fleiss_kappa(csv_files=csv_files)

# Report the score to four decimal places
print(f"Fleiss' Kappa Score: {fleiss_kappa_result:.4f}")


Image 1: Annotator 1 -> Truck, Annotator 2 -> No Truck, Annotator 3 -> No Truck
Image 2: Annotator 1 -> Truck, Annotator 2 -> No Truck, Annotator 3 -> No Truck
Image 3: Annotator 1 -> No Truck, Annotator 2 -> No Truck, Annotator 3 -> Truck
Image 4: Annotator 1 -> No Truck, Annotator 2 -> Truck, Annotator 3 -> No Truck
Image 5: Annotator 1 -> Truck, Annotator 2 -> No Truck, Annotator 3 -> No Truck
Image 6: Annotator 1 -> Truck, Annotator 2 -> No Truck, Annotator 3 -> No Truck
Image 7: Annotator 1 -> No Truck, Annotator 2 -> Truck, Annotator 3 -> No Truck
Image 8: Annotator 1 -> Truck, Annotator 2 -> No Truck, Annotator 3 -> Truck
Image 9: Annotator 1 -> Truck, Annotator 2 -> Truck, Annotator 3 -> No Truck
Image 10: Annotator 1 -> No Truck, Annotator 2 -> Truck, Annotator 3 -> No Truck
Image 11: Annotator 1 -> Truck, Annotator 2 -> Truck, Annotator 3 -> Truck
Image 12: Annotator 1 -> Truck, Annotator 2 -> Truck, Annotator 3 -> Truck
Image 13: Annotator 1 -> Truck, Annotator 2 -> Truck, A

*  A Fleiss' Kappa score of 0.0200 indicates only slight agreement, barely better than chance (close to "No Agreement").
*  According to this score, there is little consistency in the annotators' classifications of photos as "Truck" or "No Truck," indicating that they differ greatly.
*  The low level of agreement could result from:
1. Ambiguity in the dataset: Annotators may interpret certain photographs differently if they don't clearly show a truck within a reasonable distance or none at all.
2. Absence of explicit annotation guidelines: It's possible that the annotators used different labeling standards, due to the vague definition of a "Truck". Some heavy vehicles carrying loads like a tempo might be viewed by some as a truck.