In [14]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import cohen_kappa_score
from statsmodels.stats.inter_rater import fleiss_kappa
import krippendorff

pd.set_option("display.max_rows", None)  # Show all rows
pd.set_option("display.max_columns", None)  # Show all columns (if needed)

In [15]:
# Load data from Excel
filepath_release_1 = "../Data/Mode1/Output/mode_1_release_1_all_output.xlsx"
filepath_release_2 = "../Data/Mode1/Output/mode_1_release_2_all_output.xlsx"
filepath_release_3 = "../Data/Mode1/Output/mode_1_release_3_all_output.xlsx"
raw_data_1 = pd.read_excel(filepath_release_1)
raw_data_2 = pd.read_excel(filepath_release_2)
raw_data_3 = pd.read_excel(filepath_release_3)

In [16]:
def process_raw_data(raw_data):
    processed_rows = []
    
    for _, row in raw_data.iterrows():
        group = row["group"]
        task_data = json.loads(row["data"])["Data"]["taskData"][0]  # Parse the JSON data
        
        grouped_data = {}
        for key, value in task_data.items():
            prefix, topic = key.split('_', 1)
            if prefix not in grouped_data:
                grouped_data[prefix] = {}
            grouped_data[prefix][topic] = value

        # Convert grouped data into records for DataFrame
        for prefix, topics in grouped_data.items():
            record = {"group": group, "prefix": prefix, **topics}
            processed_rows.append(record)
    
    return pd.DataFrame(processed_rows)

def extract_true_values(df):
    def find_true_value(mapping):
        # Identify the key with a True value
        for key, value in mapping.items():
            if value:
                return key
        return None

    # Apply to all topic columns
    topic_columns = df.columns.difference(["group", "prefix"])
    for column in topic_columns:
        df[column] = df[column].apply(find_true_value)

    return df

def count_unique_values(df):
    unique_counts = {}
    for column in df.columns.difference(["group", "prefix"]):
        unique_counts[column] = df[column].value_counts().to_dict()
    return unique_counts

In [17]:
# Process and transform data
processed_data_1 = process_raw_data(raw_data_1)
processed_data_2 = process_raw_data(raw_data_2)
processed_data_3 = process_raw_data(raw_data_3)

# Extract only the "True" values
data_with_true_values_1 = extract_true_values(processed_data_1)
data_with_true_values_2 = extract_true_values(processed_data_2)
data_with_true_values_3 = extract_true_values(processed_data_3)

# Count unique values
unique_counts_1 = count_unique_values(data_with_true_values_1)
unique_counts_2 = count_unique_values(data_with_true_values_2)
unique_counts_3 = count_unique_values(data_with_true_values_3)

# Display the results
print(data_with_true_values_1)
print("\nUnique counts:", unique_counts_1)


      group prefix validness       relatedness appropriateness
0   group_1   3243       Yes  Slightly Related              No
1   group_1   5230       Yes           Related              No
2   group_1   8141       Yes           Related              No
3   group_1  10373       Yes  Slightly Related              No
4   group_1  10555       Yes           Related              No
5   group_1  12564       Yes  Slightly Related              No
6   group_1  12623       Yes           Related              No
7   group_1  13094       Yes           Related              No
8   group_1  13180       Yes           Related              No
9   group_1  14406       Yes  Strongly Related              No
10  group_1  15261       Yes  Slightly Related              No
11  group_1  15478       Yes  Strongly Related              No
12  group_1  17964       Yes  Slightly Related              No
13  group_1  18333       Yes       Not Related              No
14  group_1  18608       Yes       Not Related         

In [18]:
def map_values(df):
    # Define mappings
    validness_map = {"Yes": 1, "No": 0}
    relatedness_map = {
        "Strongly Related": 3,
        "Related": 2,
        "Slightly Related": 1,
        "Not Related": 0
    }
    appropriateness_map = {"No": 1, "Yes": 0}

    # Apply mappings
    df["validness"] = df["validness"].map(validness_map)
    df["relatedness"] = df["relatedness"].map(relatedness_map)
    df["appropriateness"] = df["appropriateness"].map(appropriateness_map)

    return df.copy()

df_1 = map_values(data_with_true_values_1)
df_2 = map_values(data_with_true_values_2)
df_3 = map_values(data_with_true_values_3)

In [19]:
print(f"Release 1 Result Table: \n{df_1[:10].to_markdown(index=False)}\n")
print(f"Release 2 Result Table: \n{df_2[:10].to_markdown(index=False)}\n")
print(f"Release 3 Result Table: \n{df_3[:10].to_markdown(index=False)}\n")

Release 1 Result Table: 
| group   |   prefix |   validness |   relatedness |   appropriateness |
|:--------|---------:|------------:|--------------:|------------------:|
| group_1 |     3243 |           1 |             1 |                 1 |
| group_1 |     5230 |           1 |             2 |                 1 |
| group_1 |     8141 |           1 |             2 |                 1 |
| group_1 |    10373 |           1 |             1 |                 1 |
| group_1 |    10555 |           1 |             2 |                 1 |
| group_1 |    12564 |           1 |             1 |                 1 |
| group_1 |    12623 |           1 |             2 |                 1 |
| group_1 |    13094 |           1 |             2 |                 1 |
| group_1 |    13180 |           1 |             2 |                 1 |
| group_1 |    14406 |           1 |             3 |                 1 |

Release 2 Result Table: 
| group   |   prefix |   validness |   relatedness |   appropriateness |


In [10]:
# Randomly sample test samples to do manual evaluation on
data_with_true_values_test_samples = data_with_true_values_1.groupby('group').sample(n=2, random_state=42)

data_with_true_values_test_samples.to_excel('test_samples_manual_evaluation.xlsx', index=False)

print(data_with_true_values_test_samples)
print(data_with_true_values_test_samples.dtypes)

      group prefix  validness  relatedness  appropriateness
0   group_1   3243          1            1                1
17  group_1  20157          1            3                1
39  group_2  25047          1            2                1
36  group_2  20198          1            1                0
50  group_3  14943          1            1                1
47  group_3  11686          1            1                1
60  group_4     28          1            1                1
78  group_4  19975          1            3                1
85  group_5   4712          1            3                1
82  group_5   2905          1            3                1
group              object
prefix             object
validness           int64
relatedness         int64
appropriateness     int64
dtype: object


In [9]:
# Worker's Manual test sample evaluation
worker_release_1_eval = pd.read_excel('/Users/tkang/Documents/research/nlp_followupqg/Human_Evaluation/Data/Mode1/Analysis/mode_1_release_1_manual_test_evaluation.xlsx', sheet_name="worker_release_1")

print(f"Worker Result: \n{worker_release_1_eval.to_markdown(index=False)}\n")

# David's Manual test sample evaluation
david_release_1_eval = pd.read_excel('/Users/tkang/Documents/research/nlp_followupqg/Human_Evaluation/Data/Mode1/Analysis/mode_1_release_1_manual_test_evaluation.xlsx', sheet_name="david_release_1")

print(f"David Result: \n{david_release_1_eval.to_markdown(index=False)}\n")

# Zhe's Manual test sample evaluation
zhe_release_1_eval = pd.read_excel('/Users/tkang/Documents/research/nlp_followupqg/Human_Evaluation/Data/Mode1/Analysis/mode_1_release_1_manual_test_evaluation.xlsx', sheet_name="zhe_release_1")

print(f"Zhe Result: \n{zhe_release_1_eval.to_markdown(index=False)}\n")


Worker Result: 
| group   |   prefix |   validness |   relatedness |   appropriateness |
|:--------|---------:|------------:|--------------:|------------------:|
| group_1 |     3243 |           1 |             1 |                 1 |
| group_1 |    20157 |           1 |             3 |                 1 |
| group_2 |    25047 |           1 |             2 |                 1 |
| group_2 |    20198 |           1 |             1 |                 0 |
| group_3 |    14943 |           1 |             1 |                 1 |
| group_3 |    11686 |           1 |             1 |                 1 |
| group_4 |       28 |           1 |             1 |                 1 |
| group_4 |    19975 |           1 |             3 |                 1 |
| group_5 |     4712 |           1 |             3 |                 1 |
| group_5 |     2905 |           1 |             3 |                 1 |

David Result: 
| group   |   prefix |   validness |   relatedness |   appropriateness |
|:--------|--------

In [45]:
test_sample_questions = {
    "group_1": {
        {
            "id": 3243,
            "question": "ELI5: Why electric cars don't use removable batteries?",
            "answer": "It's doable from a technical perspective, but it now means that you have to dramatically increase the total number of batteries in circulation as you need more than one for each vehicle on the road. Batteries are already a pretty extreme production and cost concern (roughly half the value of an EV is just the battery), and increasing the number of battery packs would inevitably mean increasing costs for the consumers who buy the EV.",
            "follow-up": "What kind of infrastructure investments would be necessary for a widespread network of battery swapping stations?"
        },
        {
            "id": 20157,
            "question": "ELI5: Why does a debit increase an asset account?",
            "answer": "You would not be left with $5k, you would be left with $15k.  You don't debit \"from\" an asset account, you debit \"into\" it.  A credit to an asset account is taking \"from\" it.",
            "follow-up": "What other types of accounts, besides asset accounts, are affected by debits and credits?"
        }
    },
    "group_2": {
        {
            "id": 25047,
            "question": "ELI5: what needs to happen before we can unmask safely?",
            "answer": "There are two answers. Neither of them very clear. One personally, and two society-wise. Both depend on risk. The first to your self , the second as a group.   Your risk is based on how likely you are to actually cross paths with someone infectious and how likely that is to cause you serious harm. So are you in an at risk group and where are you are considerations.   The second - how likely you are to be part of a chain of infection, how at risk are the people around you and how well society - such as the health service is coping.   Masks not perfect and life always holds a risk. In the West we have never masked up for flu which kills thousands of people a year - around 20,000 in the U.K. annually I think - though we vaccinate the at risk. Possibly COVId may have some nasty long term consequences even if you dint get seriously ill. But if the consequences of COVID dropped to equivalent to flu , then it would hardly be unreasonable to behave in a similar way. I say that nit to suggest we wait till then ( if it ever happened) but as a comparative starting point. No one can really say at what precise point in infections and consequences between that \u2018flu\u2019 point and the worst of COVID is the point at which we are \u2018safe\u2019.   I expect government ps will come up with a balance of infection rates, hospitalisations and economics with a dose of politics.   Bear in mind that in the East people regularly wear masks on public transport so as either not to do catch viruses or not to spread them , I guess it\u2019s just considered normal hygiene even for colds.   That\u2019s a long winded way of saying that presuming you are not mandated by the governments and pretty much everyone has had access to a vaccine you will have to decide whether you feel either at risk yourself, or feel that you need to still protect other people.   Personally I feel like the vaccine plus some lowering of case rates make me feel safe enough - but I wear one sometimes out of consideration to people around me who are still wearing them or in particularly crowded and enclosed environments or both.",
            "follow-up": "What are the benefits of using transparent face shields as an alternative to masks?"
        },
        {
            "id": 20198,
            "question": "eli5: What is confusion?",
            "answer": "In many cases it is because of sensory overload. It's what happens when the brain gives up processing the input correctly.",
            "follow-up": "How do excessive and insufficient information contribute differently to confusion?"
        },
    },
    "group_3": {
        {
            "id": 14943,
            "question": "Eli5 How do players go pro into esports?",
            "answer": "Do you mean how do they develop the skills to be professional or what defines them as professional? The answer to the former is discipline and practice. The answer to the later is that they are sponsored and between sponsorships and winnings from tournaments make some or all of their living wage by gaming competitively.",
            "follow-up": "What are some key elements that mentorship might cover for aspiring professional esports players?"
        },
        {
            "id": 11686,
            "question": "ELI5: Why is it convention for websites to use boxes for questions with multiple answers, but dots for questions with single answers?",
            "answer": "They are two different types of controls.  The check boxes are independent, and you can check multiple boxes.  The dots are called radio buttons, and when they're grouped together you can only check one.  Checking another will uncheck the previously checked one.  Since that functionality has already been built into the controls, it means less coding for the web designer.",
            "follow-up": "How does the use of checkboxes and radio buttons adhere to accessibility principles?"
        },
    },
    "group_4": {
        {
            "id": 28,
            "question": "ELI5: Why aren\u2019t there insects the size of man or larger?",
            "answer": "They don\u2019t breath the same way mammals do. They have tubes in various parts of their body that expose their circulatory system to air where it absorbs oxygen. Because they don\u2019t inhale, they air limited in size by the amount of oxygen in the atmosphere. Which is why they used to be bigger in past eras when there was more oxygen in the atmosphere.",
            "follow-up": "How does gravity affect the biomechanics of larger insects differently than smaller ones?"
        },
        {
            "id": 19975,
            "question": "ELI5: Why is deflation worse than inflation?",
            "answer": "Why would I pay $1000 for a TV today when it will be cheaper tomorrow.   Obviously not that simple.  But if people don't spend money, shops go broke, people lose jobs etc",
            "follow-up": "What are the mechanisms by which deflation can cause a credit crunch?"
        },
    },
    "group_5": {
        {
            "id": 4712,
            "question": "eli5: Why do commonly used items, such as CRT's or old consoles skyrocket in value after they stop production? Assuming that these items stopped production today, why do sealed boxes or hell even good condition second-hand items almost double in price mere weeks after the end of production?",
            "answer": "CRTs have some advantages over LCDs and other flat-panel display technologies that matter for certain niche applications. To the typical consumer, the advantages of flat panels outweigh these rather specific disadvantages, and also different flat panel technologies have complementary strengths so most people don't need a CRT. So once flat panels became cheap enough to be accessible to most consumers, CRT production took a nosedive. But the niche market for them remained, and so now you have a situation where the supply is almost 0 but there is still some demand for it. Of course, you could argue that supply could still track demand and so prices should be unaffected, but that's not the case because (1) with this big drop in production, economies of scale have been lost and so it's more expensive to manufacture a CRT monitor these days and (2) the demand is no longer from average consumers who want to pay low prices for medium-quality products, but rather it comes from specialists or enthusiasts who need CRTs with specific, high-quality specifications and are willing to pay more for that.  Of course, the second point does not explain why even older, second-hand CRTs have gone up in price. That's more due to point (1) combined with a somewhat separate (though overlapping) demand from enthusiasts and collectors who don't care so much about performance.  Anyway, in short: CRTs are an example of a product that has gotten more expensive because the mass market for it disappeared, and now it's turned into a niche market, where costs are higher and customers are willing to pay more.  Things like old consoles, where production has fully and permanently stopped, are a different situation, because there you're dealing with a somewhat steady (if small) demand combined with a steadily dwindling supply (that will eventually drop to 0). So e.g. if you really want to buy a NES today, your options are very limited as you're dependent on the ever-decreasing number of NES's in the world that still work, *and* that people are willing to sell. So even though very few people are looking to buy NES's, the supply is even smaller.  I'm not sure it's accurate to say that these consoles shoot up in price weeks after their production ends (that would surprise me but I'm happy to be corrected on that). By that point, presumably nearly everyone who wanted to buy one already did so, and people who couldn't previously afford one now are looking to buy one cheaper in the second-hand market that is ramping up. Also, often consoles that go out of production are superseded by newer models and the bulk of the demand will switch to them, while the collectors' market will take years to get off the ground. The only exception I can think of would be if the manufacturer (for some atypical reason) stopped producing the console before they had exhausted the (profitable) demand for it (and didn't release a new console yet).",
            "follow-up": "How does the release of newer console models impact the demand for older, discontinued ones?"
        },
        {
            "id": 2905,
            "question": "ELI5: why are there a huge amount of different insect varieties, like in ants, but only a small amount of different varieties in animals such as crocodiles?",
            "answer": "Tiny changes in those small animals allow them to fulfill different niches and avoid competition.  A tiny change in a crocodiles morphology wouldn't do as much the separate it from the other croc species.    Ex.  A slight change in the size of a birds beak (I know I changed examples but still) will change its primary food source.  A small change in a crocs mouth.. will still make it be a top predator.",
            "follow-up": "How does the reproductive rate of insects contribute to their high species diversity?"
        },
    }
}

TypeError: unhashable type: 'dict'

In [20]:
import pandas as pd

df_worker = worker_release_1_eval.copy()
df_david = david_release_1_eval.copy()
df_zhe = zhe_release_1_eval.copy()

def change_col_name(df, name):
    df.columns = [
        col + f'_{name}' if col not in ['group', 'prefix'] else col 
        for col in df.columns
    ]
    
    return df

change_col_name(df_david, 'david')
change_col_name(df_zhe, 'zhe')
change_col_name(df_worker, 'worker')

print(df_david)

NameError: name 'worker_release_1_eval' is not defined

In [11]:
merged_df = df_david.merge(df_zhe, on=['group', 'prefix'], how='outer').merge(df_worker, on=['group', 'prefix'], how='outer')

merged_df

Unnamed: 0,group,prefix,validness_david,relatedness_david,appropriateness_david,validness_zhe,relatedness_zhe,appropriateness_zhe,validness_worker,relatedness_worker,appropriateness_worker
0,group_1,3243,1,1,1,1,1,1,1,1,1
1,group_1,20157,1,2,1,1,2,1,1,3,1
2,group_2,20198,1,2,1,1,3,1,1,1,0
3,group_2,25047,1,1,1,1,1,1,1,2,1
4,group_3,11686,1,1,1,1,2,1,1,1,1
5,group_3,14943,1,2,1,1,2,1,1,1,1
6,group_4,28,1,0,1,1,1,1,1,1,1
7,group_4,19975,1,2,1,1,1,1,1,3,1
8,group_5,2905,1,2,1,1,2,1,1,3,1
9,group_5,4712,1,3,1,1,3,1,1,3,1


In [28]:
def calculate_agreement_metrics(df, columns_to_check=['validness', 'relatedness', 'appropriateness']):
    columns_to_check = ['validness', 'relatedness', 'appropriateness']
    exclude_columns = ['group', 'prefix']

    results = {}

    for column in columns_to_check:
        david_column = f"{column}_david"
        zhe_column = f"{column}_zhe"
        worker_column = f"{column}_worker"

        # Calculate percent agreement
        agreement_david_zhe = (df[david_column] == df[zhe_column]).mean() * 100
        agreement_david_worker = (df[david_column] == df[worker_column]).mean() * 100
        agreement_zhe_worker = (df[zhe_column] == df[worker_column]).mean() * 100

        fleiss_k = 200
        kripp_alpha = 200
        cohen_k_dz = 200
        cohen_k_dw = 200
        cohen_k_zw = 200

        if column == "relatedness":
            # Convert to numpy array for kappa calculations
            # numeric_columns = [col for col in df.columns if col not in exclude_columns]
            # df[numeric_columns] = df[numeric_columns].astype(int)
            
            ratings = df.filter(like="relatedness").to_numpy()

            # print(ratings)
            # print(np.apply_along_axis(lambda x: np.bincount(x, minlength=2), axis=1, arr=ratings))
                  
            # Compute Fleiss' Kappa
            fleiss_k = fleiss_kappa(
                np.apply_along_axis(lambda x: np.bincount(x), axis=1, arr=ratings)
            )

            # Compute Krippendorff's Alpha safely
            unique_values = np.unique(ratings)
            if len(unique_values) > 1:
                kripp_alpha = krippendorff.alpha(ratings.T)
            else:
                kripp_alpha = np.nan  # Not computable

            # Compute pairwise Cohen's Kappa
            cohen_k_dz = cohen_kappa_score(df[david_column], df[zhe_column])
            cohen_k_dw = cohen_kappa_score(df[david_column], df[worker_column])
            cohen_k_zw = cohen_kappa_score(df[zhe_column], df[worker_column])

        # Store results
        results[column] = {
            'david_zhe_percent_agreement': agreement_david_zhe,
            'david_worker_percent_agreement': agreement_david_worker,
            'zhe_worker_percent_agreement': agreement_zhe_worker,
            'fleiss_kappa': fleiss_k,
            'krippendorff_alpha': kripp_alpha,
            'cohen_kappa_david_zhe': cohen_k_dz,
            'cohen_kappa_david_worker': cohen_k_dw,
            'cohen_kappa_zhe_worker': cohen_k_zw
        }

    return results

In [27]:
# evaluating by converting 1,2,3 values to 1 and keeping 0 as-is for 'relatedness' column
df = pd.DataFrame(merged_df)

def recode_relatedness(column):
    return column.apply(lambda x: 0 if x == 0 else 1)

# Recode relatedness columns based on the disagreement rule
df["relatedness_david"] = recode_relatedness(df["relatedness_david"])
df["relatedness_zhe"] = recode_relatedness(df["relatedness_zhe"])
df["relatedness_worker"] = recode_relatedness(df["relatedness_worker"])

# Calculate agreement metrics
agreement_metrics = calculate_agreement_metrics(df, columns_to_check)

print("Mode 1 release 1\nGrouping by rule 'B' : 0 if 0 or 1, and 1 if 2 or 3")

# Display results
for column, metrics in agreement_metrics.items():
    print(f"Agreement Metrics for '{column}':")
    for key, value in metrics.items():
        print(f"  {key}: {value:.4f}")

Mode 1 release 1
Grouping by rule 'B' : 0 if 0 or 1, and 1 if 2 or 3
Agreement Metrics for 'validness':
  david_zhe_percent_agreement: 100.0000
  david_worker_percent_agreement: 100.0000
  zhe_worker_percent_agreement: 100.0000
  fleiss_kappa: 200.0000
  krippendorff_alpha: 200.0000
  cohen_kappa_david_zhe: 200.0000
  cohen_kappa_david_worker: 200.0000
  cohen_kappa_zhe_worker: 200.0000
Agreement Metrics for 'relatedness':
  david_zhe_percent_agreement: 90.0000
  david_worker_percent_agreement: 90.0000
  zhe_worker_percent_agreement: 100.0000
  fleiss_kappa: -0.0345
  krippendorff_alpha: 0.0000
  cohen_kappa_david_zhe: 0.0000
  cohen_kappa_david_worker: 0.0000
  cohen_kappa_zhe_worker: nan
Agreement Metrics for 'appropriateness':
  david_zhe_percent_agreement: 100.0000
  david_worker_percent_agreement: 90.0000
  zhe_worker_percent_agreement: 90.0000
  fleiss_kappa: 200.0000
  krippendorff_alpha: 200.0000
  cohen_kappa_david_zhe: 200.0000
  cohen_kappa_david_worker: 200.0000
  cohen_kap

  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)


In [29]:
import pandas as pd
import numpy as np
from statsmodels.stats.inter_rater import fleiss_kappa
from sklearn.metrics import cohen_kappa_score
import krippendorff

def recode_relatedness(column):
    return column.apply(lambda x: 0 if (x == 0 or x == 1) else 1)

# Recode relatedness columns based on the disagreement rule
df["relatedness_david"] = recode_relatedness(df["relatedness_david"])
df["relatedness_zhe"] = recode_relatedness(df["relatedness_zhe"])
df["relatedness_worker"] = recode_relatedness(df["relatedness_worker"])

# Calculate agreement metrics
agreement_metrics = calculate_agreement_metrics(df, columns_to_check)

print("Mode 1 release 1\nGrouping by rule 'B' : 0 if 0 or 1, and 1 if 2 or 3")

# Display results
for column, metrics in agreement_metrics.items():
    print(f"Agreement Metrics for '{column}':")
    for key, value in metrics.items():
        print(f"  {key}: {value:.4f}")


Mode 1 release 1
Grouping by rule 'B' : 0 if 0 or 1, and 1 if 2 or 3
Agreement Metrics for 'validness':
  david_zhe_percent_agreement: 100.0000
  david_worker_percent_agreement: 100.0000
  zhe_worker_percent_agreement: 100.0000
  fleiss_kappa: 200.0000
  krippendorff_alpha: 200.0000
  cohen_kappa_david_zhe: 200.0000
  cohen_kappa_david_worker: 200.0000
  cohen_kappa_zhe_worker: 200.0000
Agreement Metrics for 'relatedness':
  david_zhe_percent_agreement: 100.0000
  david_worker_percent_agreement: 100.0000
  zhe_worker_percent_agreement: 100.0000
  fleiss_kappa: nan
  krippendorff_alpha: nan
  cohen_kappa_david_zhe: nan
  cohen_kappa_david_worker: nan
  cohen_kappa_zhe_worker: nan
Agreement Metrics for 'appropriateness':
  david_zhe_percent_agreement: 100.0000
  david_worker_percent_agreement: 90.0000
  zhe_worker_percent_agreement: 90.0000
  fleiss_kappa: 200.0000
  krippendorff_alpha: 200.0000
  cohen_kappa_david_zhe: 200.0000
  cohen_kappa_david_worker: 200.0000
  cohen_kappa_zhe_work

  kappa = (p_mean - p_mean_exp) / (1- p_mean_exp)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)


In [21]:
from tabulate import tabulate

# Mode 1 Mean and Variance
def calculate_mean_var(df):
    # Calculate mean and variance
    mean_values = df[["validness", "relatedness", "appropriateness"]].mean()
    var_values = df[["validness", "relatedness", "appropriateness"]].var()

    # Create a DataFrame for results
    result = pd.DataFrame(
        [mean_values, var_values],
        index=["Mean", "Variance"]
    )

    # Transpose the result for better readability
    result = result.T
    result.reset_index(inplace=True)
    result.rename(columns={"index": "Metric"}, inplace=True)

    return result

df_1_mean_var = calculate_mean_var(df_1)
df_2_mean_var = calculate_mean_var(df_2)
df_3_mean_var = calculate_mean_var(df_3)

print(df_1_mean_var)
print(df_2_mean_var)
print(df_3_mean_var)

combined_mean_var_table = pd.merge(df_1_mean_var, df_2_mean_var, on='Metric', how="outer", suffixes=('_release_1', '_release_2'))
combined_mean_var_table = pd.merge(combined_mean_var_table, df_3_mean_var, on='Metric', how="outer")
columns_to_rename = combined_mean_var_table.columns[-2:]
rename_mapping = {col: f"{col}_release_3" for col in columns_to_rename}
combined_mean_var_table = combined_mean_var_table.rename(columns=rename_mapping)

# Pretty-print the table
table = tabulate(
    combined_mean_var_table,
    headers="keys",
    tablefmt="grid",
    showindex=False
)

print(f"Mode 1 Release 1 Mean and Variance:\n{table}")

            Metric  Mean  Variance
0        validness  1.00  0.000000
1      relatedness  1.94  0.925657
2  appropriateness  0.84  0.135758
            Metric  Mean  Variance
0        validness  0.99  0.010000
1      relatedness  2.05  1.159091
2  appropriateness  0.94  0.056970
            Metric  Mean  Variance
0        validness  0.83  0.142525
1      relatedness  2.22  0.819798
2  appropriateness  0.98  0.019798
Mode 1 Release 1 Mean and Variance:
+-----------------+------------------+----------------------+------------------+----------------------+------------------+----------------------+
| Metric          |   Mean_release_1 |   Variance_release_1 |   Mean_release_2 |   Variance_release_2 |   Mean_release_3 |   Variance_release_3 |
| appropriateness |             0.84 |             0.135758 |             0.94 |            0.0569697 |             0.98 |             0.019798 |
+-----------------+------------------+----------------------+------------------+----------------------+---

In [22]:
# Averagimg Across the Calculated Mean and Variance
mean_columns = [col for col in combined_mean_var_table.columns if col.startswith("Mean_")]
var_columns = [col for col in combined_mean_var_table.columns if col.startswith("Variance_")]
combined_mean_var_table["Mean_across_releases"] = combined_mean_var_table[mean_columns].mean(axis=1)
combined_mean_var_table["Variance_across_releases"] = combined_mean_var_table[var_columns].mean(axis=1)

combined_mean_var_table

Unnamed: 0,Metric,Mean_release_1,Variance_release_1,Mean_release_2,Variance_release_2,Mean_release_3,Variance_release_3,Mean_across_releases,Variance_across_releases
0,appropriateness,0.84,0.135758,0.94,0.05697,0.98,0.019798,0.92,0.070842
1,relatedness,1.94,0.925657,2.05,1.159091,2.22,0.819798,2.07,0.968182
2,validness,1.0,0.0,0.99,0.01,0.83,0.142525,0.94,0.050842


In [23]:
# Averagimg Across the Original Dataset
# Combine df1 df2 and df3 into 1 table

combined_df = pd.concat([df_1, df_2, df_3], ignore_index=True)

combined_df_mean_var = calculate_mean_var(combined_df)

# Pretty-print the table
table = tabulate(
    combined_df_mean_var,
    headers="keys",
    tablefmt="grid",
    showindex=False
)

print(f"Mode 1 ALL DATA COMBINED Mean and Variance:\n{table}")

Mode 1 ALL DATA COMBINED Mean and Variance:
+-----------------+--------+------------+
| Metric          |   Mean |   Variance |
| validness       |   0.94 |  0.0565886 |
+-----------------+--------+------------+
| relatedness     |   2.07 |  0.975017  |
+-----------------+--------+------------+
| appropriateness |   0.92 |  0.0738462 |
+-----------------+--------+------------+


In [31]:
print(combined_df["relatedness"].value_counts())

print(len(combined_df))

print((129+91+52)/300)

relatedness
3    129
2     91
1     52
0     28
Name: count, dtype: int64
300
0.9066666666666666


In [113]:
# Calculate Interannotator Agreement
# converting column names and merging tables
df_1_int = df_1.copy()
df_2_int = df_2.copy()
df_3_int = df_3.copy()

def change_col_name(df, name):
    df.columns = [
        col + f'_{name}' if col not in ['group', 'prefix'] else col 
        for col in df.columns
    ]
    
    return df

change_col_name(df_1_int, 'release_1')
change_col_name(df_2_int, 'release_2')
change_col_name(df_3_int, 'release_3')

df_123_merged = df_1_int.merge(df_2_int, on=['group', 'prefix'], how='outer').merge(df_3_int, on=['group', 'prefix'], how='outer')

print(df_123_merged.columns)

Index(['group', 'prefix', 'validness_release_1', 'relatedness_release_1',
       'appropriateness_release_1', 'validness_release_2',
       'relatedness_release_2', 'appropriateness_release_2',
       'validness_release_3', 'relatedness_release_3',
       'appropriateness_release_3'],
      dtype='object')


In [114]:
df_123_merged

Unnamed: 0,group,prefix,validness_release_1,relatedness_release_1,appropriateness_release_1,validness_release_2,relatedness_release_2,appropriateness_release_2,validness_release_3,relatedness_release_3,appropriateness_release_3
0,group_1,10373,1,1,1,1,2,1,1,1,1
1,group_1,10555,1,2,1,1,1,1,1,3,1
2,group_1,12564,1,1,1,1,0,1,1,2,1
3,group_1,12623,1,2,1,1,2,1,1,3,1
4,group_1,13094,1,2,1,1,1,1,1,2,1
...,...,...,...,...,...,...,...,...,...,...,...
95,group_5,3293,1,3,1,1,3,1,1,3,1
96,group_5,3697,1,3,1,1,3,1,0,3,1
97,group_5,4712,1,3,1,1,3,1,0,3,1
98,group_5,4754,1,3,1,1,3,1,0,3,1


In [182]:
def calculate_agreement_metrics_between_annotators(df, columns_to_check):
    results = {}

    for column in columns_to_check:
        release_1 = f"{column}_release_1"
        release_2 = f"{column}_release_2"
        release_3 = f"{column}_release_3"

        # print(type(df[release_1]))

        # Compute pairwise Cohen's Kappa
        cohen_k_1_2 = cohen_kappa_score(df[release_1], df[release_2])
        cohen_k_1_3 = cohen_kappa_score(df[release_1], df[release_2])
        cohen_k_2_3 = cohen_kappa_score(df[release_3], df[release_3])
        
        # Calculate pairwise percent agreement
        agreement_1_2 = (df[release_1] == df[release_2]).mean() * 100
        agreement_1_3 = (df[release_1] == df[release_3]).mean() * 100
        agreement_2_3 = (df[release_2] == df[release_3]).mean() * 100

        agreement_all_3 = df[[release_1, release_2, release_3]].apply(lambda row: row.nunique() == 1, axis=1).mean() * 100

        # fleiss_k = 200
        # kripp_alpha = 200
        # cohen_k_dz = 200
        # cohen_k_dw = 200
        # cohen_k_zw = 200
            
        ratings = df.filter(like=column).to_numpy()
                
        # # Compute Fleiss' Kappa
        fleiss_k = fleiss_kappa(
            np.apply_along_axis(lambda x: np.bincount(x, minlength=2), axis=1, arr=ratings)
        )

        # Compute Krippendorff's Alpha safely
        unique_values = np.unique(ratings)
        if len(unique_values) > 1:
            kripp_alpha = krippendorff.alpha(ratings.T)
            kripp_alpha_1_2 = krippendorff.alpha(ratings[:,0:2].T)
            kripp_alpha_1_3 = krippendorff.alpha(ratings[:, 1].T)
            kripp_alpha_2_3 = krippendorff.alpha(ratings[:,1:3].T)
        else:
            kripp_alpha = np.nan  # Not computable

        pabak_1_2 = 2 * ((df[release_1] == df[release_2]).mean()) - 1
        pabak_1_3 = 2 * ((df[release_1] == df[release_3]).mean()) - 1
        pabak_2_3 = 2 * ((df[release_2] == df[release_3]).mean()) - 1

        # Store results
        results[column] = {
            '1_2_percent_agreement': agreement_1_2,
            '1_3_percent_agreement': agreement_1_3,
            '2_3_percent_agreement': agreement_2_3,
            'percent_agreement_all_3': agreement_all_3,
            'fleiss_kappa': fleiss_k,
            'krippendorff_alpha': kripp_alpha,
            'cohen_kappa_1_2': cohen_k_1_2,
            'cohen_kappa_1_3': cohen_k_1_3,
            'cohen_kappa_2_3': cohen_k_2_3,
            'kripp_alpha_1_2': kripp_alpha_1_2,
            'kripp_alpha_1_3': kripp_alpha_1_3,
            'kripp_alpha_2_3': kripp_alpha_2_3,
            'pabak_1_2': pabak_1_2,
            'pabak_1_3': pabak_1_3,
            'pabak_2_3': pabak_2_3,
        }

    return results

In [12]:
### Jan 31 - Redo kappa calculation
def calculate_agreement_scores(df, kappas_per_row, alpha_per_row):
    results = {}

    for column in columns_to_check:
        release_1 = f"{column}_release_1"
        release_2 = f"{column}_release_2"
        release_3 = f"{column}_release_3"

        # Compute pairwise Cohen's Kappa
        # cohen_k_1_2 = cohen_kappa_score(df[release_1], df[release_2])
        # cohen_k_1_3 = cohen_kappa_score(df[release_1], df[release_2])
        # cohen_k_2_3 = cohen_kappa_score(df[release_3], df[release_3])
        
        # Calculate pairwise percent agreement
        agreement_1_2 = (df[release_1] == df[release_2]).mean() * 100
        agreement_1_3 = (df[release_1] == df[release_3]).mean() * 100
        agreement_2_3 = (df[release_2] == df[release_3]).mean() * 100

        agreement_all_3 = df[[release_1, release_2, release_3]].apply(lambda row: row.nunique() == 1, axis=1).mean() * 100
            
        ratings = df.filter(like=column).to_numpy()
        
        kripp_alpha = 1
        # kripp_alpha_1_2 = np.nan
        # kripp_alpha_1_3 = np.nan
        # kripp_alpha_2_3 = np.nan
        
        # Compute Krippendorff's Alpha safely
        unique_values = np.unique(ratings)
        if len(unique_values) > 1:
            kripp_alpha = krippendorff.alpha(ratings.T)
            # kripp_alpha_1_2 = krippendorff.alpha(ratings[:,0:2].T)
            # kripp_alpha_1_3 = krippendorff.alpha(ratings[:, 1].T)
            # kripp_alpha_2_3 = krippendorff.alpha(ratings[:,1:3].T)
        else:
            kripp_alpha = 1  # Not computable
            # kripp_alpha_1_2 = 1
            # kripp_alpha_1_3 = 1
            # kripp_alpha_2_3 = 1

        cohen_k_1_2 = 1
        cohen_k_1_3 = 1
        cohen_k_2_3 = 1

        if len(set(df[release_1])) > 1 and len(set(df[release_2])) > 1:
            cohen_k_1_2 = cohen_kappa_score(df[release_1], df[release_2])
        if len(set(df[release_1])) > 1 and len(set(df[release_3])) > 1:
            cohen_k_1_3 = cohen_kappa_score(df[release_1], df[release_3])
        if len(set(df[release_2])) > 1 and len(set(df[release_3])) > 1:
            cohen_k_2_3 = cohen_kappa_score(df[release_2], df[release_3])
        

        pabak_1_2 = 2 * ((df[release_1] == df[release_2]).mean()) - 1
        pabak_1_3 = 2 * ((df[release_1] == df[release_3]).mean()) - 1
        pabak_2_3 = 2 * ((df[release_2] == df[release_3]).mean()) - 1

        kappas_per_row[column].append([cohen_k_1_2, cohen_k_1_3, cohen_k_2_3])
        alpha_per_row[column].append(kripp_alpha)

        # Store results
        results[column] = {
            '1_2_percent_agreement': agreement_1_2,
            '1_3_percent_agreement': agreement_1_3,
            '2_3_percent_agreement': agreement_2_3,
            'percent_agreement_all_3': agreement_all_3,
            'krippendorff_alpha': kripp_alpha,
            'cohen_kappa_1_2': cohen_k_1_2,
            'cohen_kappa_1_3': cohen_k_1_3,
            'cohen_kappa_2_3': cohen_k_2_3,
            # 'kripp_alpha_1_2': kripp_alpha_1_2,
            # 'kripp_alpha_1_3': kripp_alpha_1_3,
            # 'kripp_alpha_2_3': kripp_alpha_2_3,
            'pabak_1_2': pabak_1_2,
            'pabak_1_3': pabak_1_3,
            'pabak_2_3': pabak_2_3,
        }

    return results    

# Calculate Interannotator Agreement
# converting column names and merging tables
df_1_int = df_1.copy()
df_2_int = df_2.copy()
df_3_int = df_3.copy()

def change_col_name(df, name):
    df.columns = [
        col + f'_{name}' if col not in ['group', 'prefix'] else col 
        for col in df.columns
    ]
    
    return df

change_col_name(df_1_int, 'release_1')
change_col_name(df_2_int, 'release_2')
change_col_name(df_3_int, 'release_3')

rel_1_A = df_1[0:20]
kappas_per_row = {
    "validness": [],
    "relatedness": [],
    "appropriateness": [],
}
alpha_per_row = {
    "validness": [],
    "relatedness": [],
    "appropriateness": [],
}

columns_to_check=['validness', 'relatedness', 'appropriateness']

for i in range(5):
    df = df_1_int[i*20:(i+1)*20].merge(df_2_int[i*20:(i+1)*20], on=['group', 'prefix'], how='outer').merge(df_3_int[i*20:(i+1)*20], on=['group', 'prefix'], how='outer')

    relatedness_cols = [col for col in df.columns if col.startswith("relatedness_")]

    # Apply transformation to all matching columns (Binning rule A)
    df[relatedness_cols] = df[relatedness_cols].applymap(lambda x: int(x != 0))

    results = calculate_agreement_scores(df, kappas_per_row, alpha_per_row)

    # print("Mode 1 release 1, 2, 3 \nGrouping by rule 'A': 0 if 0, 1 otherwise")
    # # Display results
    # for column, metrics in results.items():
    #     print(f"\nMetrics for '{column}':")
    #     for key, value in metrics.items():
    #         print(f"  {key}: {value:.5f}")

    # df = None

print(f"Cohen's Kappa Per Row: {kappas_per_row}")
print(f"Krippendorff's Alpha per Row: {alpha_per_row}")

average_of_all_kappas = {
    "validness": [],
    "relatedness": [],
    "appropriateness": [],
}
average_of_all_alphas = {
    "validness": [],
    "relatedness": [],
    "appropriateness": [],
}

# for each Cohen's Kappa set, take the average
for column, values in kappas_per_row.items():
    for value in values:
        average_of_all_kappas[column].append(np.mean(value))

for column, values in alpha_per_row.items():
    for value in values:
        average_of_all_alphas[column].append(np.mean(value))

print(f"Cohen's Kappa Average per Row: {average_of_all_kappas}")
print(f"Krippendorff's Alpha Average per Row: {average_of_all_alphas}")

total_value = 0
# for each Cohen's Kappa set, take the average
for column, values in average_of_all_kappas.items():
    print(f"Average of all Cohen's Kappa for column {column}: {np.mean(values)}")
    total_value += np.mean(values)

for column, values in average_of_all_alphas.items():
    print(f"Average of all Krippendorff's Alpha for column {column}: {np.mean(values)}")

print(f"Average of all questions cohen's Kappa: {total_value / 3}")



Cohen's Kappa Per Row: {'validness': [[1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 0.21875]], 'relatedness': [[0.02777777777777768, 0.45945945945945954, 0.21875], [0.2727272727272727, -0.16666666666666674, -0.0714285714285714], [1.0, 1.0, 1.0], [1, 1, 0.5], [1, 1, 1]], 'appropriateness': [[1, 1, 1], [-0.04838709677419373, 1, 1], [1.0, -0.05263157894736836, -0.05263157894736836], [1, 1, 1], [0.7727272727272727, -0.0714285714285714, -0.08108108108108092]]}
Krippendorff's Alpha per Row: {'validness': [1, 1, -0.1132075471698113, -0.0535714285714286, 0.04582210242587603], 'relatedness': [0.17400000000000004, 0.007211538461538436, 1.0, 0.20485175202156336, 1], 'appropriateness': [1, -0.22370370370370374, 0.3099415204678362, 1, 0.2716049382716049]}
Cohen's Kappa Average per Row: {'validness': [1.0, 1.0, 1.0, 1.0, 0.7395833333333334], 'relatedness': [0.23532907907907907, 0.011544011544011523, 1.0, 0.8333333333333334, 1.0], 'appropriateness': [1.0, 0.6505376344086021, 0.29824561403508776,

  df[relatedness_cols] = df[relatedness_cols].applymap(lambda x: int(x != 0))
  df[relatedness_cols] = df[relatedness_cols].applymap(lambda x: int(x != 0))
  df[relatedness_cols] = df[relatedness_cols].applymap(lambda x: int(x != 0))
  df[relatedness_cols] = df[relatedness_cols].applymap(lambda x: int(x != 0))
  df[relatedness_cols] = df[relatedness_cols].applymap(lambda x: int(x != 0))


In [183]:
# evaluating by converting 1,2,3 values to 1 and keep 0 as 0 for the 'relatedness' column
import pandas as pd
from sklearn.metrics import cohen_kappa_score

df = pd.DataFrame(df_123_merged.copy())

# print(pd.DataFrame(df["validness_release_1"], df["validness_release_2"]))

# kappa = cohen_kappa_score(df["validness_release_1"], df["validness_release_2"])
# # print(kappa)

# observed_agreement = (df["validness_release_1"] == df["validness_release_2"]).mean()
# pabak = 2 * observed_agreement - 1
# print(f"PABAK Score: {pabak:.4f}")

# print(f"Krippendorff’s Alpha: {kripp_alpha:.4f}")

# Function to recode 'relatedness' based on the disagreement rule
def recode_relatedness(column):
    return column.apply(lambda x: 0 if x == 0 else 1)

# Recode 'relatedness' columns
df["relatedness_release_1"] = recode_relatedness(df["relatedness_release_1"])
df["relatedness_release_2"] = recode_relatedness(df["relatedness_release_2"])
df["relatedness_release_3"] = recode_relatedness(df["relatedness_release_3"])

columns_to_check = ['validness', 'relatedness', 'appropriateness']

# Calculate percent agreement and Cohen's Kappa
results = calculate_agreement_metrics_between_annotators(df, columns_to_check)

print("Mode 1 release 1, 2, 3 \nGrouping by rule 'A': 0 if 0, 1 otherwise")
# Display results
for column, metrics in results.items():
    print(f"\nMetrics for '{column}':")
    for key, value in metrics.items():
        print(f"  {key}: {value:.5f}")
    


Mode 1 release 1, 2, 3 
Grouping by rule 'A': 0 if 0, 1 otherwise

Metrics for 'validness':
  1_2_percent_agreement: 99.00000
  1_3_percent_agreement: 83.00000
  2_3_percent_agreement: 84.00000
  percent_agreement_all_3: 83.00000
  fleiss_kappa: -0.00473
  krippendorff_alpha: -0.00138
  cohen_kappa_1_2: 0.00000
  cohen_kappa_1_3: 0.00000
  cohen_kappa_2_3: 1.00000
  kripp_alpha_1_2: 0.00000
  kripp_alpha_1_3: 0.00000
  kripp_alpha_2_3: 0.02808
  pabak_1_2: 0.98000
  pabak_1_3: 0.66000
  pabak_2_3: 0.68000

Metrics for 'relatedness':
  1_2_percent_agreement: 84.00000
  1_3_percent_agreement: 89.00000
  2_3_percent_agreement: 89.00000
  percent_agreement_all_3: 81.00000
  fleiss_kappa: 0.25158
  krippendorff_alpha: 0.25407
  cohen_kappa_1_2: 0.18616
  cohen_kappa_1_3: 0.18616
  cohen_kappa_2_3: 1.00000
  kripp_alpha_1_2: 0.18693
  kripp_alpha_1_3: 0.00000
  kripp_alpha_2_3: 0.36348
  pabak_1_2: 0.68000
  pabak_1_3: 0.78000
  pabak_2_3: 0.78000

Metrics for 'appropriateness':
  1_2_percen

In [181]:
# evaluating by converting 2,3 values to 1 and converting 0,1 values to 0 for the 'relatedness' column
df = pd.DataFrame(df_123_merged.copy())

# print(df.head())

def recode_relatedness_2(column):
    return column.apply(lambda x: 0 if x in [0,1] else 1)

# Recode relatedness columns based on the disagreement rule
df["relatedness_release_1"] = recode_relatedness_2(df["relatedness_release_1"])
df["relatedness_release_2"] = recode_relatedness_2(df["relatedness_release_2"])
df["relatedness_release_3"] = recode_relatedness_2(df["relatedness_release_3"])

columns_to_check = ['validness', 'relatedness', 'appropriateness']

# Calculate percent agreement
percent_agreement = calculate_agreement_metrics_between_annotators(df, columns_to_check)

print("Mode 1 release 1,2,3 \nGrouping by rule 'B' : 0 if 0 or 1, 1 otherwise")
# Display results
for column, agreements in percent_agreement.items():
    print(f"Percent Agreement for '{column}':")
    for key, value in agreements.items():
        print(f"  {key}: {value:.2f}")
    

Mode 1 release 1,2,3 
Grouping by rule 'B' : 0 if 0 or 1, 1 otherwise
Percent Agreement for 'validness':
  1_2_percent_agreement: 99.00
  1_3_percent_agreement: 83.00
  2_3_percent_agreement: 84.00
  percent_agreement_all_3: 83.00
  fleiss_kappa: -0.00
  krippendorff_alpha: -0.00
  cohen_kappa_1_2: 0.00
  cohen_kappa_1_3: 0.00
  cohen_kappa_2_3: 1.00
  kripp_alpha_1_2: 0.00
  kripp_alpha_1_3: 0.00
  kripp_alpha_2_3: 0.03
Percent Agreement for 'relatedness':
  1_2_percent_agreement: 70.00
  1_3_percent_agreement: 71.00
  2_3_percent_agreement: 73.00
  percent_agreement_all_3: 57.00
  fleiss_kappa: 0.27
  krippendorff_alpha: 0.27
  cohen_kappa_1_2: 0.29
  cohen_kappa_1_3: 0.29
  cohen_kappa_2_3: 1.00
  kripp_alpha_1_2: 0.29
  kripp_alpha_1_3: 0.00
  kripp_alpha_2_3: 0.27
Percent Agreement for 'appropriateness':
  1_2_percent_agreement: 86.00
  1_3_percent_agreement: 82.00
  2_3_percent_agreement: 92.00
  percent_agreement_all_3: 80.00
  fleiss_kappa: 0.09
  krippendorff_alpha: 0.10
  coh