In [1]:
import pandas as pd
from pathlib import Path
import sys
sys.path.append('..')  # Add the parent directory to the import search path so `utils` resolves
from utils.code_party_utils import calculate_fleiss_kappa, select_highest_agreement_code

# Load the raw table holding each annotator's individual annotations.
# The unnamed first CSV column is renamed to "" and used as the index.
df_path = Path.cwd().parent.joinpath('data', 'warnme-codeparty-raw.csv')
df = (
    pd.read_csv(df_path)
    .rename(columns={"Unnamed: 0": ""})
    .set_index("")
)

# Positional rows 105-204 (100 rows, all columns) are the subset used for IAA
df_IAA = df.iloc[105:205]
df.shape  # For debugging: dimensions of the full table, not of the IAA slice

(310, 26)

## Calculate Inter-Annotator Agreement

In [5]:
# df_IAA holds the 100 codes that all 4 annotators have annotated.
# Each list below names the per-annotator columns for one code family.
loc_columns = ['LOC_D', 'LOC_S', 'LOC_E', 'LOC_C']
des_columns = [
    'DES1_D', 'DES1_S', 'DES1_E', 'DES1_C',
    'DES2_D', 'DES2_S', 'DES2_E', 'DES2_C',
]
pv_columns = ['PV_D', 'PV_S', 'PV_E', 'PV_C']
ps_columns = ['PS_D', 'PS_S', 'PS_E', 'PS_C']

# One Fleiss' kappa per code family, evaluated in the order LOC, DES, PS, PV.
# NOTE(review): DES passes all 8 DES1/DES2 columns in a single call -- confirm
# that calculate_fleiss_kappa handles the two DES slots as intended.
loc_kappa, des_kappa, ps_kappa, pv_kappa = (
    calculate_fleiss_kappa(df_IAA, columns)
    for columns in (loc_columns, des_columns, ps_columns, pv_columns)
)

print(f"LOC Fleiss' kappa: {loc_kappa}")
print(f"DES Fleiss' kappa: {des_kappa}")
print(f"PV Fleiss' kappa: {pv_kappa}")
print(f"PS Fleiss' kappa: {ps_kappa}")

# Weighted average: LOC, PV and PS each count once, DES counts twice
# because it spans two columns (DES1 and DES2), giving a total weight of 5.
total_codes = 3 + 2
weighted_avg_kappa = (loc_kappa + pv_kappa + ps_kappa + 2 * des_kappa) / total_codes

print("\nWeighted Average Fleiss' kappa:", weighted_avg_kappa)

LOC Fleiss' kappa: 0.4625645019959108
DES Fleiss' kappa: 0.4471799187038619
PV Fleiss' kappa: 0.4517692105867306
PS Fleiss' kappa: 0.6054570870302552

Weighted Average Fleiss' kappa: 0.4828301274041241


## Select Codes with Highest Agreement

In [3]:
# Highest-agreement codes: each call receives the frame returned by the
# previous one, so the per-family results build up on a single table.
LOC_df = select_highest_agreement_code(df, loc_columns, "LOC")
PS_df = select_highest_agreement_code(LOC_df, ps_columns, "PS")
PV_df = select_highest_agreement_code(PS_df, pv_columns, "PV")

# The last step covers both DES slots; afterwards keep only the columns
# whose names end in "_all".
new_df = (
    select_highest_agreement_code(PV_df, des_columns, "DES1", "DES2")
    .filter(regex='_all$')
)

new_df.shape  # Confirm the number of rows matches the original table

(310, 5)

In [32]:
# Place the first six columns of the raw table side by side with the
# highest-agreement codes (rows are aligned on the shared index).
final_df = pd.concat((df.iloc[:, :6], new_df), axis="columns")
# Write the combined table next to the raw data for further analysis
final_df.to_csv(Path.cwd().parent.joinpath('data', 'warnme-codeparty-all.csv'))