## Step 1: Load Libraries

In [None]:
# Install all required packages
!pip install pandas krippendorff statsmodels scikit-learn seaborn matplotlib numpy nltk

In [None]:
import pandas as pd
import krippendorff
from statsmodels.stats.inter_rater import fleiss_kappa
from sklearn.metrics import cohen_kappa_score
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from statsmodels.stats.inter_rater import aggregate_raters, fleiss_kappa
import itertools
import seaborn as sns
from scipy.stats import entropy
from sklearn.cluster import KMeans
from scipy.special import rel_entr
from scipy.stats import entropy
import itertools
from collections import Counter
import pandas as pd
import ast 
from statsmodels.stats.inter_rater import fleiss_kappa
from itertools import product
from nltk.metrics import agreement
from nltk.metrics import masi_distance
from collections import defaultdict
import json

## Step 2: Read and Prepare Data

In [None]:
# Load gold_dataset.csv (path relative to the current working directory) into `df`.
df = pd.read_csv("./gold_dataset.csv")

In [None]:
# Preview the first rows of the raw frame.
df.head()

#### **Data Cleaning and Pre-Processing**

Here, I want to keep HITId (representing speeches), WorkerId, and Answer.manifesto_class.labels.

In [None]:
# Keep only the three columns the analysis uses, give them shorter names,
# and order the rows by speech and then by worker.
# Note: no rows are dropped and no values are converted in this cell.
df_clean = (
    df[['HITId', 'WorkerId', 'Answer.manifesto_class.labels']]
    .rename(columns={
        'HITId': 'speech_id',
        'WorkerId': 'worker_id',
        'Answer.manifesto_class.labels': 'answer',
    })
    .sort_values(by=['speech_id', 'worker_id'])
    .reset_index(drop=True)
)

# Final preview
df_clean

In [None]:
# Sanity check: which speeches do not have exactly three usable annotations?

# Make sure 'answer' holds real lists rather than their string form.
# ast.literal_eval only accepts Python literals, so unlike eval it cannot run
# arbitrary code found in the CSV. The isinstance guard keeps the cell re-runnable.
df_clean['answer'] = df_clean['answer'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)

# Filter out empty annotations
df_non_empty = df_clean[df_clean['answer'].apply(lambda x: isinstance(x, list) and len(x) > 0)]

# Count valid annotations per speech. Reindexing over every speech id keeps
# speeches with zero valid annotations in the result (with a count of 0);
# a plain groupby would silently leave them out.
annotation_counts = (
    df_non_empty.groupby('speech_id').size()
    .reindex(df_clean['speech_id'].unique(), fill_value=0)
)

# Identify speeches that don't have exactly 3 valid annotations
incomplete_speeches = annotation_counts[annotation_counts != 3]

# Print result (the label matches the `!= 3` filter above)
print(f"Speeches without exactly 3 valid annotations: {len(incomplete_speeches)}")
print(incomplete_speeches)


Define the answers and the workers.

In [None]:
# Worker IDs; this list sets the column order when the speech x worker pivot is reindexed.
workers = [
    'A1BQ37ZGUM16XI', 
    'A1NBQ61Y6KO3O9', 
    'A1X47COW2Y9SEL', 
    'A335R4YE2E34H6', 
    'A3GS2NTQ4XU059', 
    'APXGSUB250NVH'
]

In [None]:
# Disabled on purpose: this may only run once, because calling eval on values
# that are already lists raises a TypeError.
# Note that eval is NOT a safe parser: it executes whatever expression the string contains.
# df_clean['answer'] = df_clean['answer'].apply(eval)  # Parse list from string

Add a True/False column to `df` for each category.

In [None]:

# Collect every category label that appears in any annotation.
# 'answer' may hold raw strings (straight from the CSV) or lists (once the
# sanity-check cell has parsed them). ast.literal_eval raises on a list, so
# parse only the values that are still strings.
all_answers = df_clean['answer'].dropna().apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)
all_categories = [category for sublist in all_answers for category in sublist]
unique_categories = pd.Series(all_categories).unique()

print(unique_categories) #check 

In [None]:
# Write the category labels to unique_categories.json in the working directory.
with open("unique_categories.json", "w") as f:
    # .tolist() converts the array returned by Series.unique() into a plain list for json
    json.dump(unique_categories.tolist(), f)

In [None]:
# Add one boolean column per category to the raw frame: True when the category
# name occurs as a substring of that row's label string.
for category in unique_categories:
    df[category] = df['Answer.manifesto_class.labels'].str.contains(
        category,
        regex=False,  # category names are literal text, not regular expressions
        na=False,     # a missing label string means "category not applied", not NaN
    )

Create a secondary matrix with unique speech rows and worker columns

In [211]:
# One row per speech, one column per worker (in the order given by `workers`).
# Speech/worker pairs without an annotation are filled with the string '[]'.
df_pivot = (
    df_clean
    .pivot(index='speech_id', columns='worker_id', values='answer')
    .reindex(columns=workers)
    .fillna('[]')  # maybe change
)

df_pivot

worker_id,A1BQ37ZGUM16XI,A1NBQ61Y6KO3O9,A1X47COW2Y9SEL,A335R4YE2E34H6,A3GS2NTQ4XU059,APXGSUB250NVH
speech_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
301KG0KXAZ355GRD9LSX6LBA2CHH2H,[],"[Market regulation, Middle Class & Professiona...",[Market regulation],[Market regulation],[],[]
302OLP89EMP3TBB1087B1CXS184CAA,[],"[Government and administrative efficiency, Eco...",[Economic Goals],[],[],[]
306996CF7J2OPI6VQPINFRY7FTYB1E,[],"[Anti-Growth Economy, Market regulation]",[Government and administrative efficiency],"[Economic Planning, Economic Growth]",[],[]
3087LXLJ79YEVAXTZ80JKC6IPZKF0X,[],"[Market regulation, Economic Planning, Economi...","[Free Market Economy, Economic Growth]","[Market regulation, Economic Growth]",[],[]
308KJXFUKEOKFSDCMU21QMXWPU4ATR,[],"[Economic Planning, Market regulation, Economi...",[Economic Goals],"[Market regulation, Economic Planning]",[],[]
...,...,...,...,...,...,...
3YZ7A3YHSSBMUMOG2Y6U86LOVAD5S5,[],"[Government and administrative efficiency, Mar...","[Government and administrative efficiency, Fre...",[],[Free Market Economy],[]
3ZUE82NE1XJWDZ2WUCDYDRC08AKF8Y,[],"[Corporatism / Mixed Economy, Anti-Growth Econ...",[],[Corporatism / Mixed Economy],[],"[Economic Goals, Government and administrative..."
3ZURAPD29V5E2ZIBVR4JYHPSURQF10,[],"[Middle Class & Professional Groups, Incentive...",[Technology and Infrastructure],"[Labor Groups Positive, Technology and Infrast...",[],[]
3ZXNP4Z3AE3EDD796IOQU4LPUM0L7M,[],"[Middle Class & Professional Groups, Market re...","[Market regulation, Government and administrat...",[],[Free Market Economy],[]


Separate the DataFrame by the categories in the answers

In [214]:
def build_category_dfs(pivot_lists, categories, n_annotators=3):
    """Build one binary vote table per category.

    Parameters
    ----------
    pivot_lists : pandas.DataFrame
        Speech x worker frame whose cells are lists of category labels.
        Cells that are not a non-empty list are treated as "no annotation".
    categories : iterable of str
        Category labels to build tables for.
    n_annotators : int, default 3
        Only speeches with exactly this many non-empty annotations are kept.

    Returns
    -------
    dict[str, pandas.DataFrame]
        Maps each category to a frame indexed by speech id with columns
        'worker1' ... 'worker<n_annotators>'. A cell is 1 if that annotation
        contains the category, else 0. The column slots are positional:
        'worker1' is the first non-empty annotation of the speech, not a
        fixed worker id.
    """
    # The set of usable annotations per speech does not depend on the category,
    # so it is collected once and reused for every category.
    complete_speeches = {}
    for speech_id, row in pivot_lists.iterrows():
        annotations = [cell for cell in row.values if isinstance(cell, list) and cell]
        if len(annotations) == n_annotators:
            complete_speeches[speech_id] = annotations

    slot_names = [f'worker{i}' for i in range(1, n_annotators + 1)]
    tables = {}
    for category in categories:
        rows = [
            [1 if category in labels else 0 for labels in annotations]
            for annotations in complete_speeches.values()
        ]
        tables[category] = pd.DataFrame(rows, index=list(complete_speeches), columns=slot_names)
    return tables


# Step 1: Ensure each cell is a list (not a string).
# ast.literal_eval only parses literals (unlike eval, it cannot execute code), and
# Series.map per column avoids the deprecated DataFrame.applymap.
df_pivot_cleaned = df_pivot.apply(
    lambda column: column.map(
        lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
    )
)

# Step 2: Create binary DataFrames per category
category_dfs = build_category_dfs(df_pivot_cleaned, unique_categories)

category_dfs['Market regulation'] # test one category

  df_pivot_cleaned = df_pivot.applymap(lambda x: eval(x) if isinstance(x, str) else x)


Unnamed: 0,worker1,worker2,worker3
301KG0KXAZ355GRD9LSX6LBA2CHH2H,1,1,1
306996CF7J2OPI6VQPINFRY7FTYB1E,1,0,0
3087LXLJ79YEVAXTZ80JKC6IPZKF0X,1,0,1
308KJXFUKEOKFSDCMU21QMXWPU4ATR,1,0,1
30EMX9PEW71PC59J9LA0AV46S08KS5,1,0,0
...,...,...,...
3YZ7A3YHSSBMUMOG2Y6U86LOVAD5S5,1,0,0
3ZUE82NE1XJWDZ2WUCDYDRC08AKF8Y,0,0,0
3ZURAPD29V5E2ZIBVR4JYHPSURQF10,0,0,0
3ZXNP4Z3AE3EDD796IOQU4LPUM0L7M,1,1,0


## Try 2: run Fleiss Kappa on each separate category

In [None]:
# Fleiss' kappa per category.
# The loop binds each table to a plain local name; using `category_dfs[category]`
# as the loop target would write into the dictionary while it is being iterated.
for category, df_binary in category_dfs.items():
    try:
        # One row per speech: [number of 0 votes, number of 1 votes]
        counts_array = np.column_stack([
            df_binary.eq(0).sum(axis=1),
            df_binary.eq(1).sum(axis=1),
        ])

        # Compute Fleiss' Kappa
        kappa = fleiss_kappa(counts_array)
        print(f"{category}: Fleiss’ Kappa = {kappa:.3f}")

    except Exception as e:
        # Best effort: report the failing category and continue with the others.
        print(f"Error computing Fleiss’ Kappa for {category}: {e}")

This is a bad number likely due to sparse annotations

In [None]:
# Total number of positive (1) votes in each category's table.
list(
    (label, votes.sum().sum())
    for label, votes in category_dfs.items()
)

## Try 3: run Krippendorff’s Alpha on annotations

Krippendorff’s Alpha does not require every annotator to label every item, so this measure will work for us. We can either run it on the entire dataset, or separate the data by category and run it on each one.

In [None]:
# first attempt to run krippendorff on the entire dataset
# NOTE(review): `binary_matrix` is not defined in any cell visible in this notebook,
# so this cell raises NameError on a fresh kernel unless it is defined elsewhere — TODO confirm.
alpha_value = krippendorff.alpha(
    reliability_data=binary_matrix,
    level_of_measurement='nominal'  # Change to 'interval', 'ordinal', etc. if needed
)

print(f"Krippendorff’s Alpha: {alpha_value:.3f}")




This number is not very good; therefore, I will run Krippendorff’s Alpha on every category. First, I need to make new dataframes for each category.

In [None]:
# Spot-check: show the first rows of one category's table.
category_dfs['Market regulation'].head()

Now we calculate each score. It is important to note we are measuring agreement on whether a single label was applied, not overall multi-label set overlap.

In [None]:
# Krippendorff's alpha per category.
for category, df_binary in category_dfs.items():
    try:
        # krippendorff.alpha expects reliability_data shaped (raters, units).
        # The table is stored as speeches (units) x worker slots (raters),
        # so it must be transposed before being passed in.
        data_matrix = df_binary.to_numpy().T.tolist()

        # Compute Krippendorff's alpha
        alpha_score = krippendorff.alpha(
            reliability_data=data_matrix,
            level_of_measurement='nominal'
        )

        print(f"Category: {category} → Krippendorff’s alpha: {alpha_score:.3f}")

    except Exception as e:
        # Best effort: report the failing category and continue with the others.
        print(f"Error for category {category}: {e}")

Interpretation: very low agreement, try to analyze more

In [None]:
# How sparse are the positive votes for one category?
# A dedicated name is used so the raw DataFrame bound to `df` is not overwritten;
# earlier cells that read `df` stay re-runnable.
market_regulation_votes = category_dfs['Market regulation']
total_cells = market_regulation_votes.shape[0] * market_regulation_votes.shape[1]
positive = market_regulation_votes.sum().sum()
# Guard against an empty table (no speech with the required number of annotations).
sparsity = positive / total_cells if total_cells else float('nan')

print(f"Total annotations: {total_cells}")
print(f"Positive annotations: {positive}")
print(f"Sparsity (positive rate): {sparsity:.2%}")

## Interpretations

**Original Fleiss Kappa with nulls as 0s**

<p>Market regulation: Fleiss’ Kappa = 0.051<br>
Middle Class & Professional Groups: Fleiss’ Kappa = -0.003<br>
Government and administrative efficiency: Fleiss’ Kappa = 0.024<br>
Economic Planning: Fleiss’ Kappa = 0.053<br>
Economic Goals: Fleiss’ Kappa = 0.013<br>
Anti-Growth Economy: Fleiss’ Kappa = 0.023<br>
Economic Growth: Fleiss’ Kappa = 0.155<br>
Free Market Economy: Fleiss’ Kappa = 0.087<br>
Incentives: Fleiss’ Kappa = 0.079<br>
Keynesian Demand Management: Fleiss’ Kappa = 0.078<br>
Corporatism / Mixed Economy: Fleiss’ Kappa = 0.018<br>
Technology and Infrastructure: Fleiss’ Kappa = 0.131<br>
Labor Groups Positive: Fleiss’ Kappa = 0.104<br>
Labor Groups Negative: Fleiss’ Kappa = -0.001</p>

**Fleiss Kappa with arbitrary workers**
*These use fewer speeches, since 65 speeches had two or fewer answers*

<p>Market regulation: Fleiss’ Kappa = 0.289<br>
Middle Class & Professional Groups: Fleiss’ Kappa = 0.001<br>
Government and administrative efficiency: Fleiss’ Kappa = 0.179<br>
Economic Planning: Fleiss’ Kappa = 0.173<br>
Economic Goals: Fleiss’ Kappa = 0.065<br>
Anti-Growth Economy: Fleiss’ Kappa = 0.092<br>
Economic Growth: Fleiss’ Kappa = 0.460<br>
Free Market Economy: Fleiss’ Kappa = 0.280<br>
Incentives: Fleiss’ Kappa = 0.236<br>
Keynesian Demand Management: Fleiss’ Kappa = 0.203<br>
Corporatism / Mixed Economy: Fleiss’ Kappa = 0.058<br>
Technology and Infrastructure: Fleiss’ Kappa = 0.409<br>
Labor Groups Positive: Fleiss’ Kappa = 0.305<br>
Labor Groups Negative: Fleiss’ Kappa = -0.002</p>

## Cutting Workers Pt. 2

First, we will cut workers from df_clean

In [178]:
# Number of rows in df_clean per worker, most frequent first.
df_clean['worker_id'].value_counts()

worker_id
A1NBQ61Y6KO3O9    186
A335R4YE2E34H6    152
A1X47COW2Y9SEL    128
A3GS2NTQ4XU059    104
APXGSUB250NVH      83
A1BQ37ZGUM16XI      7
Name: count, dtype: int64

In [207]:
# Remove the selected worker's column from the pivot before agreement is recomputed.
# NOTE(review): the previous comment here named worker A1BQ37ZGUM16XI, but the
# code removes A1NBQ61Y6KO3O9 — confirm which worker is meant to be cut.
workers_to_remove = ['A1NBQ61Y6KO3O9']

# Rebind instead of dropping in place, and ignore columns that are already gone,
# so re-running this cell does not raise a KeyError.
df_pivot = df_pivot.drop(columns=workers_to_remove, errors='ignore')
df_pivot

worker_id,A1BQ37ZGUM16XI,A1X47COW2Y9SEL,A335R4YE2E34H6,A3GS2NTQ4XU059,APXGSUB250NVH
speech_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
301KG0KXAZ355GRD9LSX6LBA2CHH2H,[],[Market regulation],[Market regulation],[],[]
302OLP89EMP3TBB1087B1CXS184CAA,[],[Economic Goals],[],[],[]
306996CF7J2OPI6VQPINFRY7FTYB1E,[],[Government and administrative efficiency],"[Economic Planning, Economic Growth]",[],[]
3087LXLJ79YEVAXTZ80JKC6IPZKF0X,[],"[Free Market Economy, Economic Growth]","[Market regulation, Economic Growth]",[],[]
308KJXFUKEOKFSDCMU21QMXWPU4ATR,[],[Economic Goals],"[Market regulation, Economic Planning]",[],[]
...,...,...,...,...,...
3YZ7A3YHSSBMUMOG2Y6U86LOVAD5S5,[],"[Government and administrative efficiency, Fre...",[],[Free Market Economy],[]
3ZUE82NE1XJWDZ2WUCDYDRC08AKF8Y,[],[],[Corporatism / Mixed Economy],[],"[Economic Goals, Government and administrative..."
3ZURAPD29V5E2ZIBVR4JYHPSURQF10,[],[Technology and Infrastructure],"[Labor Groups Positive, Technology and Infrast...",[],[]
3ZXNP4Z3AE3EDD796IOQU4LPUM0L7M,[],"[Market regulation, Government and administrat...",[],[Free Market Economy],[]


Then, we will re-run the script to pivot the dataframe and recompute the dictionary of different categorical dataframes

In [212]:
# Step 1: Ensure each cell is a list (not a string).
# ast.literal_eval only parses literals (unlike eval, it cannot execute code), and
# Series.map per column avoids the deprecated DataFrame.applymap.
df_pivot_cleaned = df_pivot.apply(
    lambda column: column.map(
        lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
    )
)

# Step 2: Create binary DataFrames per category
category_dfs = {}

for category in unique_categories:
    rows = []
    speech_ids = []

    for speech_id, row in df_pivot_cleaned.iterrows():
        # One vote per non-empty annotation: 1 if it contains the category, else 0.
        votes = [
            1 if category in annotations else 0
            for annotations in row.values
            if isinstance(annotations, list) and annotations
        ]

        # Only include rows with exactly 3 annotations
        if len(votes) == 3:
            rows.append(votes)
            speech_ids.append(speech_id)

    # Column slots are positional (first/second/third non-empty annotation),
    # not fixed worker ids.
    df_binary = pd.DataFrame(rows, index=speech_ids, columns=['worker1', 'worker2', 'worker3'])
    category_dfs[category] = df_binary

category_dfs['Market regulation'] # test one category

  df_pivot_cleaned = df_pivot.applymap(lambda x: eval(x) if isinstance(x, str) else x)


Unnamed: 0,worker1,worker2,worker3
301KG0KXAZ355GRD9LSX6LBA2CHH2H,1,1,1
306996CF7J2OPI6VQPINFRY7FTYB1E,1,0,0
3087LXLJ79YEVAXTZ80JKC6IPZKF0X,1,0,1
308KJXFUKEOKFSDCMU21QMXWPU4ATR,1,0,1
30EMX9PEW71PC59J9LA0AV46S08KS5,1,0,0
...,...,...,...
3YZ7A3YHSSBMUMOG2Y6U86LOVAD5S5,1,0,0
3ZUE82NE1XJWDZ2WUCDYDRC08AKF8Y,0,0,0
3ZURAPD29V5E2ZIBVR4JYHPSURQF10,0,0,0
3ZXNP4Z3AE3EDD796IOQU4LPUM0L7M,1,1,0


Finally, we will re-compute the Fleiss Kappa scores and look at any differences

In [213]:
# Fleiss' kappa per category, plus the unweighted mean across categories.
all_kappas = {}

# The loop binds each table to a plain local name; using `category_dfs[category]`
# as the loop target would write into the dictionary while it is being iterated.
for category, df_binary in category_dfs.items():
    try:
        # One row per speech: [number of 0 votes, number of 1 votes]
        counts_array = np.column_stack([
            df_binary.eq(0).sum(axis=1),
            df_binary.eq(1).sum(axis=1),
        ])

        # Compute Fleiss' Kappa
        kappa = fleiss_kappa(counts_array)
        all_kappas[category] = kappa
        print(f"{category}: Fleiss’ Kappa = {kappa:.3f}")

    except Exception as e:
        # Best effort: a failing category is reported and left out of the average.
        print(f"Error computing Fleiss’ Kappa for {category}: {e}")

if all_kappas:
    avg_kappa = np.mean(list(all_kappas.values()))
    print(f"\nAverage Fleiss’ Kappa across categories: {avg_kappa:.3f}")

Market regulation: Fleiss’ Kappa = 0.289
Middle Class & Professional Groups: Fleiss’ Kappa = 0.001
Government and administrative efficiency: Fleiss’ Kappa = 0.179
Economic Planning: Fleiss’ Kappa = 0.173
Economic Goals: Fleiss’ Kappa = 0.065
Anti-Growth Economy: Fleiss’ Kappa = 0.092
Economic Growth: Fleiss’ Kappa = 0.460
Free Market Economy: Fleiss’ Kappa = 0.280
Incentives: Fleiss’ Kappa = 0.236
Keynesian Demand Management: Fleiss’ Kappa = 0.203
Corporatism / Mixed Economy: Fleiss’ Kappa = 0.058
Technology and Infrastructure: Fleiss’ Kappa = 0.409
Labor Groups Positive: Fleiss’ Kappa = 0.305
Labor Groups Negative: Fleiss’ Kappa = -0.002

Average Fleiss’ Kappa across categories: 0.196


**Old with last worker (for comparisons):**

<p>Market regulation: Fleiss’ Kappa = 0.289<br>
Middle Class & Professional Groups: Fleiss’ Kappa = 0.001<br>
Government and administrative efficiency: Fleiss’ Kappa = 0.179<br>
Economic Planning: Fleiss’ Kappa = 0.173<br>
Economic Goals: Fleiss’ Kappa = 0.065<br>
Anti-Growth Economy: Fleiss’ Kappa = 0.092<br>
Economic Growth: Fleiss’ Kappa = 0.460<br>
Free Market Economy: Fleiss’ Kappa = 0.280<br>
Incentives: Fleiss’ Kappa = 0.236<br>
Keynesian Demand Management: Fleiss’ Kappa = 0.203<br>
Corporatism / Mixed Economy: Fleiss’ Kappa = 0.058<br>
Technology and Infrastructure: Fleiss’ Kappa = 0.409<br>
Labor Groups Positive: Fleiss’ Kappa = 0.305<br>
Labor Groups Negative: Fleiss’ Kappa = -0.002</p>