# Calculate Interrater Reliability

## Setup

In [2]:
# Load libraries
import pandas as pd
import numpy as np
import itertools
import os
import glob

In [97]:
# Set directories
# Folder containing the coder CSV exports. Later cells glob this folder for
# input and write the dated results file back into it.
# NOTE: machine-specific absolute path — update before running elsewhere.
irr_path = "/Users/tylund/Library/CloudStorage/Dropbox/1. Side Projects/2025.1-Deepfake Threat Landscape/3-tests/irr"

In [98]:
# Load data
# glob returns matches in arbitrary, filesystem-dependent order, so sort the
# paths to bind df_1 / df_2 to the same coder file on every run.
files = sorted(glob.glob(os.path.join(irr_path, '*.csv')))

# Fail with an explicit message instead of a bare IndexError below.
if len(files) < 2:
    raise FileNotFoundError(
        f"Expected at least two coder CSV files in {irr_path!r}, found {len(files)}"
    )

df_1 = pd.read_csv(files[0])
df_2 = pd.read_csv(files[1])

## Data Preparation

In [99]:
import re

def expand_to_binary(df, code_cols, id_col='incident_id'):
    """
    Expand delimited code columns into binary (one-hot) indicator columns.

    Column names of ``df`` are stripped and lowercased first, and ``id_col``
    and ``code_cols`` are normalized the same way, so lookups are insensitive
    to case and surrounding whitespace. Within each code column the cell text
    is split on ``,``, ``;``, ``|`` and ``/``; each code is stripped and
    lowercased, and duplicates within a cell are collapsed.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe with one or more code columns.
    code_cols : list of str
        Names of columns containing delimited code strings.
    id_col : str, default 'incident_id'
        Column name identifying each record (preserved in the output under
        its normalized, lowercase name).

    Returns
    -------
    pd.DataFrame
        The id column followed by one 0/1 column per distinct code, named
        ``"<code column>_<code>"``. Row order and index match ``df``.

    Raises
    ------
    KeyError
        If ``id_col`` or any of ``code_cols`` is not present after
        normalization.
    """
    df = df.copy()

    # Normalize the frame's column names and the requested names identically,
    # BEFORE any lookup, so the id column is found regardless of its casing.
    df.columns = [col.strip().lower() for col in df.columns]
    id_col = id_col.strip().lower()
    code_cols = [col.strip().lower() for col in code_cols]

    out_df = df[[id_col]].copy()

    # Alternate delimiters are rewritten to commas before splitting.
    delimiters = re.compile(r'[;|/]')

    def _parse_codes(cell):
        # Return the sorted, de-duplicated, lowercased codes found in one cell.
        parts = delimiters.sub(',', cell).split(',')
        return sorted({part.strip().lower() for part in parts if part.strip()})

    all_binary_cols = []  # one indicator DataFrame per code column

    for code_col in code_cols:
        # Missing cells become empty strings, i.e. "no codes assigned".
        codes_series = df[code_col].fillna('').astype(str).apply(_parse_codes)

        # Every distinct code that appears anywhere in this column
        unique_codes = sorted({code for codes in codes_series for code in codes})

        # Prefix each indicator with its source column so identical code
        # labels from different columns stay distinct.
        binary_data = pd.DataFrame(
            {
                f"{code_col}_{code}": codes_series.apply(
                    lambda codes, code=code: int(code in codes)
                )
                for code in unique_codes
            },
            index=df.index,
        )
        all_binary_cols.append(binary_data)

    # Concatenate once to avoid repeatedly inserting columns into out_df.
    return pd.concat([out_df] + all_binary_cols, axis=1)

In [100]:
def get_code_cols(df, start_col='Media Type'):
    """
    List the code columns of a coding sheet.

    Every column from ``start_col`` through the last column of ``df`` is
    treated as a code column.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe with one or more code columns.
    start_col : str
        Name of first code column.

    Returns
    -------
    list
        Column labels from ``start_col`` (inclusive) to the end, in order.
    """
    columns = df.columns
    first_code_position = columns.get_loc(start_col)
    return columns[first_code_position:].tolist()

In [101]:
def align_dfs(df1, df2, id_col='incident_id'):
    """
    Align two binary-coded data frames for IRR computation.

    - Locates the id column case-insensitively and renames it to the
      lowercase ``id_col`` in both outputs
    - Gives both data frames the same columns (sorted union of all columns),
      filling columns a frame lacks with 0
    - Sorts both data frames by ``id_col`` and resets the index
    - Verifies that both data frames contain the same record ids, so that
      row *i* refers to the same record in both outputs

    Parameters
    ----------
    df1, df2 : pd.DataFrame
        Binary-coded data frames, one per rater. Inputs are not modified.
    id_col : str, default 'incident_id'
        Name of the record identifier column (matched case-insensitively).

    Returns
    -------
    tuple of pd.DataFrame
        ``(df1_aligned, df2_aligned)`` with identical columns and row order.

    Raises
    ------
    KeyError
        If either data frame has no id column.
    ValueError
        If the two data frames do not contain the same record ids.
    """
    id_col = id_col.lower()  # normalize column name

    def _with_normalized_id(df):
        # Return a copy of df whose identifier column is named exactly id_col.
        if id_col in df.columns:
            return df.copy()
        matches = [c for c in df.columns if c.lower() == id_col]
        if not matches:
            raise KeyError(f"id column {id_col!r} not found in data frame")
        return df.rename(columns={matches[0]: id_col})

    df1 = _with_normalized_id(df1)
    df2 = _with_normalized_id(df2)

    # Sorted union of all binary columns (excluding the id column)
    all_cols = sorted(
        c for c in set(df1.columns) | set(df2.columns) if c.lower() != id_col
    )

    def _conform(df):
        # Add absent code columns as zeros, then order columns and rows.
        missing = [c for c in all_cols if c not in df.columns]
        if missing:
            # One concat rather than per-column inserts, which fragment the
            # frame when many columns are missing.
            zeros = pd.DataFrame(0, index=df.index, columns=missing)
            df = pd.concat([df, zeros], axis=1)
        return df[[id_col] + all_cols].sort_values(id_col).reset_index(drop=True)

    df1 = _conform(df1)
    df2 = _conform(df2)

    # Sorting each frame independently only aligns rows if both frames hold
    # the same ids; otherwise downstream metrics would compare different
    # records without any error.
    if df1[id_col].tolist() != df2[id_col].tolist():
        raise ValueError(
            f"Data frames do not contain the same {id_col!r} values; "
            "cannot align records for IRR computation"
        )

    return df1, df2

In [102]:
# One-hot encode each coder's sheet, using every column from the first code
# column onward as a code column
binary_df_1, binary_df_2 = (
    expand_to_binary(coder_sheet, code_cols=get_code_cols(coder_sheet))
    for coder_sheet in (df_1, df_2)
)

# Give both frames identical columns and row order before the IRR tests
aligned_1, aligned_2 = align_dfs(
    binary_df_1,
    binary_df_2,
    id_col='incident_id',
)

## Calculate IRR

In [103]:
import krippendorff
from sklearn.metrics import cohen_kappa_score, jaccard_score

In [104]:
def compute_irr_metrics(df1, df2, id_col='incident_id'):
    """
    Compute IRR metrics (Cohen's Kappa, Krippendorff's Alpha, Jaccard similarity)
    between two aligned binary-coded DataFrames.

    Cohen's Kappa and Jaccard are computed separately for each code column
    and then averaged (unweighted mean across codes). Krippendorff's Alpha is
    a single pooled value: every (record, code) cell is treated as one unit
    rated by two raters at the nominal level.

    Parameters
    ----------
    df1, df2 : pd.DataFrame
        Two aligned binary-coded DataFrames with the same rows and columns.
        Row *i* must refer to the same record in both; column order may
        differ, since ``df2`` is reordered to match ``df1``.
    id_col : str
        Column name for record identifier (will be excluded from calculations).

    Returns
    -------
    dict
        ``cohen_kappa_per_code`` and ``jaccard_per_code`` (dicts keyed by
        code column), ``average_kappa``, ``average_jaccard`` and the pooled
        ``krippendorff_alpha``.

    Raises
    ------
    ValueError
        If the two DataFrames differ in their code columns or row counts.
    """
    df1 = df1.copy()
    df2 = df2.copy()

    # Normalize column names
    df1.columns = [col.lower() for col in df1.columns]
    df2.columns = [col.lower() for col in df2.columns]
    id_col = id_col.lower()

    # Drop the ID column
    X1 = df1.drop(columns=id_col)
    X2 = df2.drop(columns=id_col)

    code_cols = X1.columns.tolist()

    if set(code_cols) != set(X2.columns):
        raise ValueError(
            "df1 and df2 must contain the same code columns; "
            "align them (e.g. with align_dfs) before computing IRR"
        )
    if len(X1) != len(X2):
        raise ValueError(
            f"df1 and df2 must have the same number of rows "
            f"({len(X1)} != {len(X2)})"
        )

    # Put df2's columns in df1's order: the alpha computation below flattens
    # both frames positionally, so a differing column order would pair up
    # different codes.
    X2 = X2[code_cols]

    # Fill NaN with 0 for Kappa and Jaccard (treat missing as "not assigned")
    X1_filled = X1.fillna(0).astype(int)
    X2_filled = X2.fillna(0).astype(int)

    # Cohen's Kappa per code
    kappa_scores = {
        col: cohen_kappa_score(X1_filled[col], X2_filled[col]) for col in code_cols
    }
    average_kappa = np.mean(list(kappa_scores.values()))

    # Krippendorff's Alpha on the unfilled data, so any NaN stays NaN.
    # Each frame is flattened row by row into one long vector per rater.
    rater1 = X1.to_numpy().ravel()
    rater2 = X2.to_numpy().ravel()

    # Stack into a (2 raters x units) reliability matrix
    data_stack = np.vstack([rater1, rater2])

    kripp_alpha = krippendorff.alpha(
        reliability_data=data_stack, level_of_measurement='nominal'
    )

    # Jaccard similarity per code
    jaccard_scores = {
        col: jaccard_score(X1_filled[col], X2_filled[col]) for col in code_cols
    }
    average_jaccard = np.mean(list(jaccard_scores.values()))

    return {
        'cohen_kappa_per_code': kappa_scores,
        'average_kappa': average_kappa,
        'krippendorff_alpha': kripp_alpha,
        'jaccard_per_code': jaccard_scores,
        'average_jaccard': average_jaccard
    }

In [105]:
# Score agreement between the two aligned coder frames
irr_results = compute_irr_metrics(aligned_1, aligned_2, id_col='incident_id')

# Print the headline figures, one per line
for _label, _key in (
    ("Average Cohen's Kappa:", 'average_kappa'),
    ("Krippendorff's Alpha:", 'krippendorff_alpha'),
    ("Average Jaccard:", 'average_jaccard'),
):
    print(_label, irr_results[_key])

Average Cohen's Kappa: 0.32619007169217346
Krippendorff's Alpha: 0.49624708036402787
Average Jaccard: 0.3195751279887796


In [106]:
# Per-code Kappa values from the compute_irr_metrics() output
kappa_per_code = irr_results['cohen_kappa_per_code']

# Re-key the scores in ascending Kappa order, so the weakest agreement is first
lowest_kappa = {
    name: kappa_per_code[name]
    for name in sorted(kappa_per_code, key=kappa_per_code.get)
}

# Show only the ten codes the raters agreed on least
print("Codes with lowest Cohen's Kappa:")
for code, score in itertools.islice(lowest_kappa.items(), 10):
    print(f"{code}: {score:.2f}")

Codes with lowest Cohen's Kappa:
response_usage policy change: -0.04
audience_private individual - adult: -0.03
threat actor_activist: -0.03
tool_open-source: -0.03
audience_high profile individuals: 0.00
audience_high-profile individuals: 0.00
audience_national or political constituency: 0.00
audience_unknown: 0.00
goal_unknown: 0.00
harm-audience_economic or financial harm: 0.00


In [108]:
from datetime import datetime

# Date stamp (YYYYMMDD) so each day's run gets its own results file
today = f"{datetime.today():%Y%m%d}"

# The results file is written into the same folder as the input CSVs
filename = os.path.join(irr_path, "irr_results_" + today + ".txt")

# Write as UTF-8 explicitly: code labels come from free-text spreadsheet cells
# and may contain non-ASCII characters, and the default encoding of open()
# depends on the platform locale.
with open(filename, "w", encoding="utf-8") as f:
    # One "<code>: <kappa>" line per code column, to three decimals
    f.write("Cohen's Kappa per code:\n")
    for code, score in irr_results['cohen_kappa_per_code'].items():
        f.write(f"{code}: {score:.3f}\n")
    
    # Summary statistics follow the per-code listing
    f.write(f"\nAverage Cohen's Kappa: {irr_results['average_kappa']:.3f}\n")
    f.write(f"Krippendorff's Alpha: {irr_results['krippendorff_alpha']:.3f}\n\n")