In [1]:
import os
import pandas as pd
import simpledorff
from sklearn.metrics import cohen_kappa_score
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Folder holding the human-labeled CSV exports.
# NOTE(review): machine-specific absolute path -- adjust before running elsewhere.
folder_path = r'C:\Users\nithi\Downloads\LGBQTweet-master\LGBQTweet-master\dataset\labeled'

# Collapse the five-point annotation scale into the two classes used downstream.
# Built once here instead of on every loop iteration.
annotation_map = {
    '1 (extremely positive)': 'non-hateful',
    '2 (positive)': 'non-hateful',
    '3 (neutral)': 'non-hateful',
    '4 (hateful)': 'hateful',
    '5 (extremely hateful)': 'hateful'
}

# One cleaned dataframe per CSV file
dataframes = []

# Sort the listing: os.listdir returns entries in arbitrary order, and the row
# order of the combined dataframe should be the same on every run.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.csv'):
        continue

    # Read the CSV file into a dataframe
    filepath = os.path.join(folder_path, filename)
    df = pd.read_csv(filepath)

    # Keep only the columns used downstream; .copy() makes the column
    # assignment below operate on an independent frame.
    df = df[['id', 'text', 'annotation', 'annotation_agent']].copy()

    # Map the raw annotation values onto 'hateful' / 'non-hateful'
    df['annotation'] = df['annotation'].replace(annotation_map)

    # Drop rows whose annotation is not one of the two classes after mapping
    df = df[df['annotation'].isin(['hateful', 'non-hateful'])]

    dataframes.append(df)

# pd.concat raises a generic ValueError on an empty list; raise the same
# exception type with a message that points at the actual cause.
if not dataframes:
    raise ValueError(f"No .csv files found in {folder_path!r}")

# Concatenate all the per-file dataframes into a single dataframe
combined_df = pd.concat(dataframes, ignore_index=True)

# Display the combined dataframe
combined_df

Unnamed: 0,id,text,annotation,annotation_agent
0,2501,i am so excited to gay it up today damN,non-hateful,argilla
1,2502,Had the pleasure of Speaking at my colleges 9t...,non-hateful,argilla
2,2503,"""If you're black, if you're white, if you're g...",non-hateful,argilla
3,2504,Thinking &amp; speaking mindfully &amp; authen...,non-hateful,argilla
4,2505,The MTF transsexual on Springer had no idea sh...,hateful,argilla
...,...,...,...,...
1824,2244,"Born intersex, I am so glad I got a generous s...",non-hateful,argilla
1825,2245,Tell me: yay or nay to this title for a grant ...,hateful,argilla
1826,2246,Georgina Sparks should’ve been bisexual,hateful,argilla
1827,2247,I understand being mad that Darren is performi...,non-hateful,argilla


In [3]:
# Load the GPT-labeled sentences from file
labeled_sentences = pd.read_json(r'updated_10k.json')

# Normalize the raw labels: lowercase them, then remove every character that is
# not an ASCII letter or a space (so 'non-hateful' becomes 'nonhateful').
# regex=True is passed explicitly because the pattern is a regular expression;
# from pandas 2.0 onward Series.str.replace treats the pattern as a literal
# string by default, which would leave the labels untouched.
labeled_sentences['label'] = (
    labeled_sentences['label']
    .str.lower()
    .str.replace('[^a-zA-Z ]', '', regex=True)
)

# Map the normalized values to 'hateful' or 'non-hateful'; any other value
# becomes NaN.
labeled_sentences['label'] = labeled_sentences['label'].map({'hateful': 'hateful', 'nonhateful': 'non-hateful'})

# Considering only some columns
labeled_sentences = labeled_sentences[['id', 'text', 'label', 'reason']]

# Display the resulting dataframe
labeled_sentences

Unnamed: 0,id,text,label,reason
0,1,if you need to soothe a flustered Caucasian th...,hateful,The sentence is classified as Hateful because ...
1,2,Afaik I’m not intersex and I don’t know enough...,hateful,"hateful - The sentence contains a slur (""c*ntb..."
2,3,"Queer &amp; trans kids, I know it's not this e...",hateful,Hateful. The sentence incites fear and discrim...
3,4,I got called 'gay' (as if it's an insult to be...,hateful,hateful
4,5,Valid to unfollow me at any time but if you do...,hateful,Hateful. The sentence contains a threat and an...
...,...,...,...,...
11713,11714,'Dillon Callum Jackson' is bisexual ahahahah I...,hateful,hateful - The sentence contains a derogatory t...
11714,11715,Every STRAIGHT girl has a ATLEAST one GAY frie...,non-hateful,The sentence is non-hateful because it does no...
11715,11716,who’s the pos in a thousand oaks with a shitty...,hateful,The sentence contains explicit references to t...
11716,11717,lesbian character : gets screen time\nme : !!\...,hateful,The sentence contains offensive language and e...


In [4]:
# Join the human and GPT labels on the shared 'id' column; the two 'text'
# columns collide and are disambiguated with suffixes. The label columns are
# then renamed after the annotator that produced them.
merged_df = (
    combined_df
    .merge(labeled_sentences, on='id', suffixes=('_combined', '_labeled'))
    .rename(columns={'annotation': 'human_annotation', 'label': 'gpt_annotation'})
)
merged_df.head(3)

Unnamed: 0,id,text_combined,human_annotation,annotation_agent,text_labeled,gpt_annotation,reason
0,2501,i am so excited to gay it up today damN,non-hateful,argilla,i am so excited to gay it up today damN,hateful,The sentence should be classified as hateful b...
1,2502,Had the pleasure of Speaking at my colleges 9t...,non-hateful,argilla,Had the pleasure of Speaking at my colleges 9t...,non-hateful,The sentence is non-hateful because it refers ...
2,2503,"""If you're black, if you're white, if you're g...",non-hateful,argilla,"""If you're black, if you're white, if you're g...",non-hateful,The sentence promotes inclusivity and love tow...


In [5]:
# Keep only the columns needed for the agreement analysis and expose the
# text under the plain name 'text'
analysis_columns = ['id', 'text_combined', 'human_annotation', 'gpt_annotation', 'reason']
merged_df = merged_df[analysis_columns].rename(columns={'text_combined': 'text'})

# Display the merged dataframe
merged_df

Unnamed: 0,id,text,human_annotation,gpt_annotation,reason
0,2501,i am so excited to gay it up today damN,non-hateful,hateful,The sentence should be classified as hateful b...
1,2502,Had the pleasure of Speaking at my colleges 9t...,non-hateful,non-hateful,The sentence is non-hateful because it refers ...
2,2503,"""If you're black, if you're white, if you're g...",non-hateful,non-hateful,The sentence promotes inclusivity and love tow...
3,2504,Thinking &amp; speaking mindfully &amp; authen...,non-hateful,non-hateful,The sentence is non-hateful because it promote...
4,2505,The MTF transsexual on Springer had no idea sh...,hateful,hateful,The sentence contains a direct reference to a ...
...,...,...,...,...,...
1824,2244,"Born intersex, I am so glad I got a generous s...",non-hateful,non-hateful,The reason for the non-hateful classification ...
1825,2245,Tell me: yay or nay to this title for a grant ...,hateful,non-hateful,The sentence is non-hateful because it is a re...
1826,2246,Georgina Sparks should’ve been bisexual,hateful,non-hateful,The sentence does not contain any explicit ref...
1827,2247,I understand being mad that Darren is performi...,non-hateful,non-hateful,The sentence does not contain any explicit ref...


In [6]:
# Remove every row that contains a null value in any column
merged_df = merged_df.dropna()

In [7]:
# Display the merged dataframe after dropping null values
merged_df

Unnamed: 0,id,text,human_annotation,gpt_annotation,reason
0,2501,i am so excited to gay it up today damN,non-hateful,hateful,The sentence should be classified as hateful b...
1,2502,Had the pleasure of Speaking at my colleges 9t...,non-hateful,non-hateful,The sentence is non-hateful because it refers ...
2,2503,"""If you're black, if you're white, if you're g...",non-hateful,non-hateful,The sentence promotes inclusivity and love tow...
3,2504,Thinking &amp; speaking mindfully &amp; authen...,non-hateful,non-hateful,The sentence is non-hateful because it promote...
4,2505,The MTF transsexual on Springer had no idea sh...,hateful,hateful,The sentence contains a direct reference to a ...
...,...,...,...,...,...
1824,2244,"Born intersex, I am so glad I got a generous s...",non-hateful,non-hateful,The reason for the non-hateful classification ...
1825,2245,Tell me: yay or nay to this title for a grant ...,hateful,non-hateful,The sentence is non-hateful because it is a re...
1826,2246,Georgina Sparks should’ve been bisexual,hateful,non-hateful,The sentence does not contain any explicit ref...
1827,2247,I understand being mad that Darren is performi...,non-hateful,non-hateful,The sentence does not contain any explicit ref...


In [8]:
# Rows on which the human and GPT annotations agree
labels_agree = merged_df['human_annotation'] == merged_df['gpt_annotation']
matched_rows = merged_df[labels_agree]
matched_rows

Unnamed: 0,id,text,human_annotation,gpt_annotation,reason
1,2502,Had the pleasure of Speaking at my colleges 9t...,non-hateful,non-hateful,The sentence is non-hateful because it refers ...
2,2503,"""If you're black, if you're white, if you're g...",non-hateful,non-hateful,The sentence promotes inclusivity and love tow...
3,2504,Thinking &amp; speaking mindfully &amp; authen...,non-hateful,non-hateful,The sentence is non-hateful because it promote...
4,2505,The MTF transsexual on Springer had no idea sh...,hateful,hateful,The sentence contains a direct reference to a ...
5,2506,i cant believe my BOYFRIEND ASHER is FUCKING GAY,hateful,hateful,hateful
...,...,...,...,...,...
1821,2241,Y'know Pitch Perfect was amazing but there was...,non-hateful,non-hateful,The sentence is non-hateful because it is expr...
1823,2243,DON'T YOU MARRY THE FOREIGN QUEER JENNIFER,hateful,hateful,The sentence contains a direct reference to th...
1824,2244,"Born intersex, I am so glad I got a generous s...",non-hateful,non-hateful,The reason for the non-hateful classification ...
1827,2247,I understand being mad that Darren is performi...,non-hateful,non-hateful,The sentence does not contain any explicit ref...


In [9]:
# Rows on which the human and GPT annotations disagree
labels_differ = merged_df['human_annotation'] != merged_df['gpt_annotation']
unmatched_rows = merged_df[labels_differ]
unmatched_rows

Unnamed: 0,id,text,human_annotation,gpt_annotation,reason
0,2501,i am so excited to gay it up today damN,non-hateful,hateful,The sentence should be classified as hateful b...
8,2512,tbh im so...SO so done over people being judge...,non-hateful,hateful,The sentence should be classified as hateful b...
12,2519,straight people Love to make life hell for LGB...,non-hateful,hateful,hateful - The sentence contains an explicit re...
13,2520,It's fucking 2017 and folk still think mocking...,non-hateful,hateful,hateful
16,2524,"Doesn't matter if someone gay, bisexual, or le...",non-hateful,hateful,hateful. The sentence contains an explicit ref...
...,...,...,...,...,...
1817,2235,Fucking Jared leto ruined my lesbian fantasy a...,non-hateful,hateful,The sentence is classified as hateful because ...
1819,2237,Crap I think my header represents lesbians....?,hateful,non-hateful,The sentence does not contain any explicit ref...
1822,2242,great trope: people asking juzo if he's some s...,hateful,non-hateful,The sentence does not contain any explicit ref...
1825,2245,Tell me: yay or nay to this title for a grant ...,hateful,non-hateful,The sentence is non-hateful because it is a re...


In [10]:
# Both annotation columns share one label set; cast each to a categorical
# with that fixed set of categories
categories = ['non-hateful', 'hateful']
for column in ('human_annotation', 'gpt_annotation'):
    merged_df[column] = pd.Categorical(merged_df[column], categories=categories)

# Cohen's kappa between the human and the GPT annotations
kappa = cohen_kappa_score(
    merged_df['human_annotation'],
    merged_df['gpt_annotation'],
    labels=categories,
)

print(f"Cohen's kappa score: {kappa}")

Cohen's kappa score: 0.2828614726916402


In [11]:
# Reshape merged_df into long format: one row per (document, annotator) pair,
# with the human row followed by the GPT row for each document.
# The rows are collected in a list and the DataFrame is built once at the end:
# DataFrame.append was removed in pandas 2.0, and appending row by row copies
# the whole frame on every iteration.
records = []
for doc_id, human_label, gpt_label in zip(
    merged_df.index, merged_df["human_annotation"], merged_df["gpt_annotation"]
):
    # The document id is merged_df's index label, shared by both annotators' rows
    records.append({"document_id": doc_id, "annotator_id": "human", "annotation": human_label})
    records.append({"document_id": doc_id, "annotator_id": "gpt", "annotation": gpt_label})

# Passing columns= keeps the expected columns even when merged_df is empty
data = pd.DataFrame(records, columns=["document_id", "annotator_id", "annotation"])

# Display the resulting DataFrame
data

Unnamed: 0,document_id,annotator_id,annotation
0,0,human,non-hateful
1,0,gpt,hateful
2,1,human,non-hateful
3,1,gpt,non-hateful
4,2,human,non-hateful
...,...,...,...
3651,1826,gpt,non-hateful
3652,1827,human,non-hateful
3653,1827,gpt,non-hateful
3654,1828,human,non-hateful


In [12]:
# Krippendorff's alpha over the long-format annotation table; the keyword
# arguments name the columns holding the document, the annotator and the label
alpha_columns = dict(
    experiment_col="document_id",
    annotator_col="annotator_id",
    class_col="annotation",
)
alpha_score = simpledorff.calculate_krippendorffs_alpha_for_df(data, **alpha_columns)

# Report the alpha score
print(f"Krippendorff's alpha: {alpha_score}")

Krippendorff's alpha: 0.2301178537267713
