<a href="https://colab.research.google.com/github/zyferion/privacyguardians/blob/main/notebooks/inter_rater_score.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook computes the inter-rater agreement score (Fleiss' kappa) for the Acceptable/Unacceptable labels assigned by four raters.


In [34]:
# Read in required packages
import pandas as pd
import csv
from sklearn.metrics import cohen_kappa_score

In [166]:
# Location of the raw pre-labelled sample in the project's GitHub repository
url = 'https://raw.githubusercontent.com/zyferion/privacyguardians/main/data/pre_label_sample2.csv'

# Load the CSV straight from the URL into a pandas DataFrame
df = pd.read_csv(filepath_or_buffer=url)

In [169]:
# Preview the last 10 rows of the loaded label table
df.tail(10)

Unnamed: 0,id,erika_103976052,sahil_104191542,yash_104353261,pavan_103818552
72,A11471,Acceptable,Acceptable,Acceptable,Acceptable
73,A11517,Acceptable,Acceptable,Acceptable,Acceptable
74,A11880,Acceptable,Acceptable,Acceptable,Unacceptable
75,A12163,Unacceptable,Unacceptable,Unacceptable,Acceptable
76,A12200,Acceptable,Acceptable,Acceptable,Acceptable
77,A12552,Acceptable,Acceptable,Acceptable,Acceptable
78,A12579,Acceptable,Acceptable,Acceptable,Acceptable
79,A12840,Acceptable,Acceptable,Acceptable,Acceptable
80,A13090,Acceptable,Acceptable,Acceptable,Acceptable
81,A13162,Acceptable,Acceptable,Unacceptable,Unacceptable


In [170]:
# Encode the text labels as integers: Acceptable -> 1, Unacceptable -> 0.
# Note that 'Ignore' is also encoded as 1, so it is counted together with Acceptable.
label_codes = {'Acceptable': 1, 'Unacceptable': 0, 'Ignore': 1}

# Apply the replacements one label at a time, in the order listed above
for label, code in label_codes.items():
    df = df.replace(label, code)

In [171]:
# Show the same last 10 rows again to check the labels after the replacements
df.tail(10)

Unnamed: 0,id,erika_103976052,sahil_104191542,yash_104353261,pavan_103818552
72,A11471,1,1,1,1
73,A11517,1,1,1,1
74,A11880,1,1,1,0
75,A12163,0,0,0,1
76,A12200,1,1,1,1
77,A12552,1,1,1,1
78,A12579,1,1,1,1
79,A12840,1,1,1,1
80,A13090,1,1,1,1
81,A13162,1,1,0,0


In [172]:
# Names of the columns that hold each rater's labels (one column per rater)
rater_columns = [
    'erika_103976052',
    'sahil_104191542',
    'yash_104353261',
    'pavan_103818552',
]


In [173]:
# Create df with just ratings
# .copy() makes `ratings` an independent frame, so adding columns below does not
# assign onto a slice of `df` (which raises SettingWithCopyWarning).
ratings = df[rater_columns].copy()

# Row-wise sum over the rater columns = number of raters whose label is coded 1 for that item.
# The built-in DataFrame.sum is used so this cell does not depend on numpy, which the
# notebook never imports.
ratings['observed_acceptable'] = ratings[rater_columns].sum(axis=1)

# The remaining raters are the ones whose label is coded 0
ratings['observed_unacceptable'] = len(rater_columns) - ratings['observed_acceptable']

In [73]:
# Display the ratings frame together with the two derived count columns
ratings

Unnamed: 0,erika_103976052,sahil_104191542,yash_104353261,pavan_103818552,observed_acceptable,observed_unacceptable
0,1,1,1,1,4,0
1,1,1,1,1,4,0
2,1,1,1,1,4,0
3,1,1,1,1,4,0
4,1,1,1,1,4,0
...,...,...,...,...,...,...
77,1,1,1,1,4,0
78,1,1,1,1,4,0
79,1,1,1,1,4,0
80,1,1,1,1,4,0


In [174]:
# Copy the per-item vote counts onto the main frame so they sit alongside each item's other columns
df['observed_acceptable'] = ratings['observed_acceptable']
df['observed_unacceptable'] = ratings['observed_unacceptable']

# Create a new df2 for observing major differences:
# sort by the number of Unacceptable votes (highest first), then drop the unanimous rows,
# i.e. those with 0 Unacceptable votes or with as many Unacceptable votes as there are raters.
# The rater count is taken from rater_columns instead of being hard-coded.
df2 = df.sort_values('observed_unacceptable', ascending=False)
df2 = df2[
    (df2['observed_unacceptable'] != 0)
    & (df2['observed_unacceptable'] != len(rater_columns))
]
df2

Unnamed: 0,id,erika_103976052,sahil_104191542,yash_104353261,pavan_103818552,observed_acceptable,observed_unacceptable
75,A12163,0,0,0,1,1,3
27,A5044,0,0,0,1,1,3
81,A13162,1,1,0,0,2,2
30,A5596,0,1,1,0,2,2
31,A5685,1,0,0,1,2,2
35,A6134,1,0,0,1,2,2
39,A6390,1,0,0,1,2,2
74,A11880,1,1,1,0,3,1
53,A8796,1,1,1,0,3,1
55,A9052,1,1,1,0,3,1


In [175]:
# Manual calculation of Fleiss' Kappa, following the steps at https://datatab.net/tutorial/fleiss-kappa

n_items = len(ratings)
n_raters = len(rater_columns)

# Expected agreement (pe): squared share of each category among all ratings, summed
num_acceptable = sum(ratings['observed_acceptable'])
num_unacceptable = sum(ratings['observed_unacceptable'])
total_observed = num_acceptable + num_unacceptable
prop_a = num_acceptable / total_observed
prop_ua = num_unacceptable / total_observed
pe = prop_a**2 + prop_ua**2

# Observed agreement (po): (sum of squared per-item category counts - N*n) / (N*n*(n-1))
normaliser = (n_items * n_raters * (n_raters - 1)) ** -1
squared_counts = (ratings['observed_acceptable'] ** 2) + (ratings['observed_unacceptable'] ** 2)
squared_count_total = sum(squared_counts)
total_ratings = n_items * n_raters
po = normaliser * (squared_count_total - total_ratings)

# Kappa rescales the observed agreement by what is left after removing expected agreement
kappa = (po - pe) / (1 - pe)

print(f"Fleiss' Kappa Score: {kappa}")

Fleiss' Kappa Score: 0.4830277169107756


In [177]:
# Function version of fleiss' kappa calculation

def fleiss_kappa(dataframe, rater_columns):
    """Compute Fleiss' kappa for the ratings stored in ``rater_columns``.

    Each row of ``dataframe`` is treated as one rated item, and each column named
    in ``rater_columns`` holds one rater's category label for that item. Any other
    columns in ``dataframe`` are ignored.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table containing at least the columns listed in ``rater_columns``.
    rater_columns : list of str
        Column names, one per rater. At least two are required.

    Returns
    -------
    float
        ``(P_o - P_e) / (1 - P_e)``, where ``P_o`` is the mean per-item agreement
        and ``P_e`` is the sum of squared overall category proportions.
        The result is ``nan`` when every rating falls in the same single category,
        because ``P_e`` is then 1 and the ratio is 0/0.

    Raises
    ------
    ValueError
        If fewer than two rater columns are given, if ``dataframe`` has no rows,
        or if any rating is missing. The formula assumes every item was rated by
        every rater, so missing values would otherwise skew the result silently.
    """
    n_raters = len(rater_columns)
    if n_raters < 2:
        raise ValueError("Fleiss' kappa requires at least two raters")

    item_ratings = dataframe[rater_columns]
    n_items = len(item_ratings)
    if n_items == 0:
        raise ValueError("Fleiss' kappa requires at least one rated item")
    if item_ratings.isna().any().any():
        raise ValueError("Fleiss' kappa requires a rating from every rater for every item")

    # Create a DataFrame containing the counts of each response for each item
    # (rows = items, columns = categories; categories absent from a row count as 0)
    counts_df = item_ratings.apply(pd.Series.value_counts, axis=1).fillna(0)

    # Calculate the proportion of agreement for each item:
    # agreeing rater pairs divided by all possible rater pairs
    agreement_proportions = (counts_df * (counts_df - 1)).sum(axis=1) / (n_raters * (n_raters - 1))

    # Calculate the overall proportion of agreement
    overall_agreement = agreement_proportions.mean()

    # Calculate the expected proportion of agreement from the squared share of each category
    p = (counts_df.sum(axis=0) / (n_items * n_raters)) ** 2
    expected_agreement = p.sum()

    # Calculate Fleiss' Kappa
    fleiss_kappa_score = (overall_agreement - expected_agreement) / (1 - expected_agreement)

    return fleiss_kappa_score


# Score the full labelled dataset with fleiss_kappa
kappa_score = fleiss_kappa(dataframe=df, rater_columns=rater_columns)
print(f"Fleiss' Kappa Score: {kappa_score}")

Fleiss' Kappa Score: 0.4830277169107756


In [191]:
# Test on different sample size
# A fixed random_state makes the 40-row subsample, and therefore the printed score,
# reproducible across kernel restarts.
sample_df = df.sample(n=40, random_state=42)
kappa_score = fleiss_kappa(sample_df, rater_columns)
print(f"Fleiss' Kappa Score: {kappa_score}")


Fleiss' Kappa Score: 0.6812260536398459
