In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.preprocessing import MultiLabelBinarizer

# MFTC

In [2]:
mftc_df = pd.read_csv(os.path.join(os.path.dirname(os.getcwd()), "raw", "moral_values_prediction", "mftc.csv"), usecols=["tweet_text"] + [f'annotation_{i}' for i in range(1, 9)])
mftc_df

Unnamed: 0,tweet_text,annotation_1,annotation_2,annotation_3,annotation_4,annotation_5,annotation_6,annotation_7,annotation_8
0,@fergusonoctober @FOX2now #AllLivesMatter Peac...,care,"care,purity","care,purity",care,,,,
1,Wholeheartedly support these protests & acts o...,subversion,subversion,loyalty,"loyalty,subversion",,,,
2,This Sandra Bland situation man no disrespect ...,"harm,cheating",fairness,cheating,"care,cheating",,,,
3,"Commitment to peace, healing and loving neighb...",care,purity,"care,purity",care,,,,
4,Injustice for one is an injustice for all #All...,"fairness,cheating",loyalty,"care,loyalty,purity",cheating,,,,
...,...,...,...,...,...,...,...,...,...
34982,AT_USER Proud of all your efforts to help thos...,"harm,loyalty",care,"authority,harm",,,,,
34983,While those affected by sandy continue to suff...,"subversion,harm","subversion,harm",cheating,,,,,
34984,After losing the election to 2 unisex names ma...,fairness,"subversion,fairness",fairness,,,,,
34985,AT_USER Price gouging looting and rage Sandy c...,cheating,"subversion,cheating",cheating,,,,,


In [3]:
mftc_df['tweet_text'] = mftc_df['tweet_text'].str.strip()
mftc_df

Unnamed: 0,tweet_text,annotation_1,annotation_2,annotation_3,annotation_4,annotation_5,annotation_6,annotation_7,annotation_8
0,@fergusonoctober @FOX2now #AllLivesMatter Peac...,care,"care,purity","care,purity",care,,,,
1,Wholeheartedly support these protests & acts o...,subversion,subversion,loyalty,"loyalty,subversion",,,,
2,This Sandra Bland situation man no disrespect ...,"harm,cheating",fairness,cheating,"care,cheating",,,,
3,"Commitment to peace, healing and loving neighb...",care,purity,"care,purity",care,,,,
4,Injustice for one is an injustice for all #All...,"fairness,cheating",loyalty,"care,loyalty,purity",cheating,,,,
...,...,...,...,...,...,...,...,...,...
34982,AT_USER Proud of all your efforts to help thos...,"harm,loyalty",care,"authority,harm",,,,,
34983,While those affected by sandy continue to suff...,"subversion,harm","subversion,harm",cheating,,,,,
34984,After losing the election to 2 unisex names ma...,fairness,"subversion,fairness",fairness,,,,,
34985,AT_USER Price gouging looting and rage Sandy c...,cheating,"subversion,cheating",cheating,,,,,


In [4]:
# lowercase everything in the annotation_x columns
for col in mftc_df.columns:
    if "annotation" in col:
        mftc_df[col] = mftc_df[col].str.lower()

mftc_df

Unnamed: 0,tweet_text,annotation_1,annotation_2,annotation_3,annotation_4,annotation_5,annotation_6,annotation_7,annotation_8
0,@fergusonoctober @FOX2now #AllLivesMatter Peac...,care,"care,purity","care,purity",care,,,,
1,Wholeheartedly support these protests & acts o...,subversion,subversion,loyalty,"loyalty,subversion",,,,
2,This Sandra Bland situation man no disrespect ...,"harm,cheating",fairness,cheating,"care,cheating",,,,
3,"Commitment to peace, healing and loving neighb...",care,purity,"care,purity",care,,,,
4,Injustice for one is an injustice for all #All...,"fairness,cheating",loyalty,"care,loyalty,purity",cheating,,,,
...,...,...,...,...,...,...,...,...,...
34982,AT_USER Proud of all your efforts to help thos...,"harm,loyalty",care,"authority,harm",,,,,
34983,While those affected by sandy continue to suff...,"subversion,harm","subversion,harm",cheating,,,,,
34984,After losing the election to 2 unisex names ma...,fairness,"subversion,fairness",fairness,,,,,
34985,AT_USER Price gouging looting and rage Sandy c...,cheating,"subversion,cheating",cheating,,,,,


In [5]:
# every value under annotation_x columns is either NaN or a comma separated string or a string.
# Using the following mapping, convert each token in the string to the corresponding label:
# harm -> care
# cheating -> fairness
# betrayal -> loyalty
# subversion -> authority
# degradation -> purity
# nh -> non-moral
# nm -> non-moral
# After mapping, convert the strings to list of strings
# If the value is NaN, make it an empty list
for col in mftc_df.columns:
    if "annotation" in col:
        mftc_df[col] = mftc_df[col].fillna("")
        mftc_df[col] = mftc_df[col].str.replace("harm", "care")
        mftc_df[col] = mftc_df[col].str.replace("cheating", "fairness")
        mftc_df[col] = mftc_df[col].str.replace("betrayal", "loyalty")
        mftc_df[col] = mftc_df[col].str.replace("subversion", "authority")
        mftc_df[col] = mftc_df[col].str.replace("degradation", "purity")
        mftc_df[col] = mftc_df[col].str.replace("nh", "non-moral")
        mftc_df[col] = mftc_df[col].str.replace("nm", "non-moral")
        mftc_df[col] = mftc_df[col].str.split(",")
        mftc_df[col] = mftc_df[col].apply(lambda x: [] if x == [''] else x)

mftc_df

Unnamed: 0,tweet_text,annotation_1,annotation_2,annotation_3,annotation_4,annotation_5,annotation_6,annotation_7,annotation_8
0,@fergusonoctober @FOX2now #AllLivesMatter Peac...,[care],"[care, purity]","[care, purity]",[care],[],[],[],[]
1,Wholeheartedly support these protests & acts o...,[authority],[authority],[loyalty],"[loyalty, authority]",[],[],[],[]
2,This Sandra Bland situation man no disrespect ...,"[care, fairness]",[fairness],[fairness],"[care, fairness]",[],[],[],[]
3,"Commitment to peace, healing and loving neighb...",[care],[purity],"[care, purity]",[care],[],[],[],[]
4,Injustice for one is an injustice for all #All...,"[fairness, fairness]",[loyalty],"[care, loyalty, purity]",[fairness],[],[],[],[]
...,...,...,...,...,...,...,...,...,...
34982,AT_USER Proud of all your efforts to help thos...,"[care, loyalty]",[care],"[authority, care]",[],[],[],[],[]
34983,While those affected by sandy continue to suff...,"[authority, care]","[authority, care]",[fairness],[],[],[],[],[]
34984,After losing the election to 2 unisex names ma...,[fairness],"[authority, fairness]",[fairness],[],[],[],[],[]
34985,AT_USER Price gouging looting and rage Sandy c...,[fairness],"[authority, fairness]",[fairness],[],[],[],[],[]


In [6]:
# For each of the annotator columns, prevent duplicates by converting the list to a set and then back to a list
for col in mftc_df.columns:
    if "annotation" in col:
        mftc_df[col] = mftc_df[col].apply(lambda x: list(set(x)))

mftc_df

Unnamed: 0,tweet_text,annotation_1,annotation_2,annotation_3,annotation_4,annotation_5,annotation_6,annotation_7,annotation_8
0,@fergusonoctober @FOX2now #AllLivesMatter Peac...,[care],"[purity, care]","[purity, care]",[care],[],[],[],[]
1,Wholeheartedly support these protests & acts o...,[authority],[authority],[loyalty],"[authority, loyalty]",[],[],[],[]
2,This Sandra Bland situation man no disrespect ...,"[fairness, care]",[fairness],[fairness],"[fairness, care]",[],[],[],[]
3,"Commitment to peace, healing and loving neighb...",[care],[purity],"[purity, care]",[care],[],[],[],[]
4,Injustice for one is an injustice for all #All...,[fairness],[loyalty],"[purity, care, loyalty]",[fairness],[],[],[],[]
...,...,...,...,...,...,...,...,...,...
34982,AT_USER Proud of all your efforts to help thos...,"[care, loyalty]",[care],"[care, authority]",[],[],[],[],[]
34983,While those affected by sandy continue to suff...,"[care, authority]","[care, authority]",[fairness],[],[],[],[],[]
34984,After losing the election to 2 unisex names ma...,[fairness],"[fairness, authority]",[fairness],[],[],[],[],[]
34985,AT_USER Price gouging looting and rage Sandy c...,[fairness],"[fairness, authority]",[fairness],[],[],[],[],[]


In [7]:
# I want to know all of the unique labels in the dataset
for col in mftc_df.columns:
    if "annotation" in col:
        print(mftc_df[col].explode().unique())

['care' 'authority' 'fairness' 'non-moral' 'loyalty' 'purity']
['purity' 'care' 'authority' 'fairness' 'loyalty' 'non-moral']
['purity' 'care' 'loyalty' 'fairness' 'non-moral' 'authority']
['care' 'authority' 'loyalty' 'fairness' nan 'non-moral' 'purity']
[nan 'authority' 'loyalty' 'non-moral' 'care' 'fairness' 'purity']
[nan 'non-moral' 'fairness' 'loyalty' 'purity' 'care' 'authority']
[nan 'purity' 'fairness' 'authority' 'non-moral' 'care']
[nan 'authority' 'loyalty']


In [8]:
# Show the rows where a single or more annotator labeled both 'non-moral' and another label for the same tweet
mftc_df[mftc_df.apply(lambda row: any('non-moral' in x and any(label != 'non-moral' for label in x) for x in row[['annotation_1', 'annotation_2', 'annotation_3', 'annotation_4', 'annotation_5', 'annotation_6', 'annotation_7', 'annotation_8']]), axis=1)]

Unnamed: 0,tweet_text,annotation_1,annotation_2,annotation_3,annotation_4,annotation_5,annotation_6,annotation_7,annotation_8
25514,"@JoyAnnReid So Eminem, Coach Pop, and Robert D...",[fairness],[care],"[non-moral, fairness]",[purity],[fairness],"[authority, fairness, loyalty]",[],[]
25599,"Seems like your use of the word \""lying\"" is l...",[fairness],[fairness],"[non-moral, fairness]","[purity, fairness]",[non-moral],"[authority, loyalty]",[],[]
26224,Learn German with Babbel - Babbel is supported...,[non-moral],"[non-moral, care, loyalty]",[non-moral],[non-moral],[],[],[],[]
26527,@realDonaldTrump @FLOTUS - what a corrupt fami...,"[purity, fairness]",[fairness],"[non-moral, authority, loyalty, purity]",[fairness],[],[],[],[]
26987,#SandraBullock is a patriot who supports Ameri...,[loyalty],[authority],"[non-moral, authority]",[],[],[],[],[]
26988,@realDonaldTrump will protect the sanctity of ...,"[purity, authority]","[purity, authority]","[non-moral, authority]",[],[],[],[],[]
26989,"Leftists cannot admit the kindness of the man,...","[authority, care]","[authority, fairness, care]","[non-moral, authority, care]",[],[],[],[],[]
26992,recruit the youngans to obey me. @realDonaldTrump,[authority],[authority],"[non-moral, authority]",[],[],[],[],[]
26995,"@POTUS @realDonaldTrump Hmm, that might hurt t...","[fairness, care]",[authority],"[non-moral, authority]",[],[],[],[],[]
27008,@realDonaldTrump oh but the charges are comin ...,[authority],[authority],"[non-moral, authority]",[],[],[],[],[]


In [9]:
# Let's have two columns having values of 'moral_count' and 'non_moral_count'. For each of the annotation columns (annotation_1 to annotation_8), if the annotation column only had 'non-moral', the non_moral_count increments.
# If the annotation column had one or more of the other (moral) labels and not 'non-moral' label, the moral_count_increments.
# If the annotation column had both 'non-moral' and other labels in it just skip that column and don't increment anything.
# If the annotation column was an empty list again skip that column and don't increment anything.
mftc_df['moral_count'] = 0
mftc_df['non_moral_count'] = 0

for col in mftc_df.columns:
    if "annotation" in col:
        mftc_df['moral_count'] += mftc_df[col].apply(lambda x: 1 if len(set(x)) >= 1 and 'non-moral' not in x else 0)
        mftc_df['non_moral_count'] += mftc_df[col].apply(lambda x: 1 if len(set(x)) == 1 and 'non-moral' in x else 0)

mftc_df


Unnamed: 0,tweet_text,annotation_1,annotation_2,annotation_3,annotation_4,annotation_5,annotation_6,annotation_7,annotation_8,moral_count,non_moral_count
0,@fergusonoctober @FOX2now #AllLivesMatter Peac...,[care],"[purity, care]","[purity, care]",[care],[],[],[],[],4,0
1,Wholeheartedly support these protests & acts o...,[authority],[authority],[loyalty],"[authority, loyalty]",[],[],[],[],4,0
2,This Sandra Bland situation man no disrespect ...,"[fairness, care]",[fairness],[fairness],"[fairness, care]",[],[],[],[],4,0
3,"Commitment to peace, healing and loving neighb...",[care],[purity],"[purity, care]",[care],[],[],[],[],4,0
4,Injustice for one is an injustice for all #All...,[fairness],[loyalty],"[purity, care, loyalty]",[fairness],[],[],[],[],4,0
...,...,...,...,...,...,...,...,...,...,...,...
34982,AT_USER Proud of all your efforts to help thos...,"[care, loyalty]",[care],"[care, authority]",[],[],[],[],[],3,0
34983,While those affected by sandy continue to suff...,"[care, authority]","[care, authority]",[fairness],[],[],[],[],[],3,0
34984,After losing the election to 2 unisex names ma...,[fairness],"[fairness, authority]",[fairness],[],[],[],[],[],3,0
34985,AT_USER Price gouging looting and rage Sandy c...,[fairness],"[fairness, authority]",[fairness],[],[],[],[],[],3,0


In [10]:
assert not mftc_df[(mftc_df['moral_count'] == 0) & (mftc_df['non_moral_count'] == 0)].any(axis=None)

In [11]:
# Have a column 'binary_label' which is 'moral' if the 'moral_count' is greater than 'non_moral_count' and 'non-moral' if the 'non_moral_count' is greater than 'moral_count'.
# If the moral_count and non_moral_count are equal, the binary_label is 'tie'.
mftc_df['binary_label'] = mftc_df.apply(lambda row: 'moral' if row['moral_count'] > row['non_moral_count'] else 'non-moral' if row['non_moral_count'] > row['moral_count'] else 'tie', axis=1)
mftc_df

Unnamed: 0,tweet_text,annotation_1,annotation_2,annotation_3,annotation_4,annotation_5,annotation_6,annotation_7,annotation_8,moral_count,non_moral_count,binary_label
0,@fergusonoctober @FOX2now #AllLivesMatter Peac...,[care],"[purity, care]","[purity, care]",[care],[],[],[],[],4,0,moral
1,Wholeheartedly support these protests & acts o...,[authority],[authority],[loyalty],"[authority, loyalty]",[],[],[],[],4,0,moral
2,This Sandra Bland situation man no disrespect ...,"[fairness, care]",[fairness],[fairness],"[fairness, care]",[],[],[],[],4,0,moral
3,"Commitment to peace, healing and loving neighb...",[care],[purity],"[purity, care]",[care],[],[],[],[],4,0,moral
4,Injustice for one is an injustice for all #All...,[fairness],[loyalty],"[purity, care, loyalty]",[fairness],[],[],[],[],4,0,moral
...,...,...,...,...,...,...,...,...,...,...,...,...
34982,AT_USER Proud of all your efforts to help thos...,"[care, loyalty]",[care],"[care, authority]",[],[],[],[],[],3,0,moral
34983,While those affected by sandy continue to suff...,"[care, authority]","[care, authority]",[fairness],[],[],[],[],[],3,0,moral
34984,After losing the election to 2 unisex names ma...,[fairness],"[fairness, authority]",[fairness],[],[],[],[],[],3,0,moral
34985,AT_USER Price gouging looting and rage Sandy c...,[fairness],"[fairness, authority]",[fairness],[],[],[],[],[],3,0,moral


In [12]:
# What is the distribution of the binary_label column?
mftc_df['binary_label'].value_counts()

binary_label
moral        19939
non-moral    12607
tie           2441
Name: count, dtype: int64

In [13]:
# Show some of the rows where the binary_label is 'tie'
mftc_df[mftc_df['binary_label'] == 'tie']

Unnamed: 0,tweet_text,annotation_1,annotation_2,annotation_3,annotation_4,annotation_5,annotation_6,annotation_7,annotation_8,moral_count,non_moral_count,binary_label
10,Yes RT @arthur_affect:Do ppl who change #Black...,[non-moral],[fairness],[fairness],[non-moral],[],[],[],[],2,2,tie
69,#UniteBlue: Some Whites aren't Pure Evil https...,[non-moral],[loyalty],"[fairness, loyalty]",[non-moral],[],[],[],[],2,2,tie
94,"Don't want a baby, be a lady! #DefundPP #AllLi...",[non-moral],[purity],[fairness],[non-moral],[],[],[],[],2,2,tie
4445,#SOSBLAKAUSTRALIA #SolidarityWithBaltimore #Oa...,[authority],[non-moral],[loyalty],[non-moral],[],[],[],[],2,2,tie
4512,Exclusive Video: Violence Inside Rikers Youth ...,"[care, authority]",[non-moral],[non-moral],[care],[],[],[],[],2,2,tie
...,...,...,...,...,...,...,...,...,...,...,...,...
33651,humanity hoboken newjersey sandy URL,[purity],[care],[non-moral],[non-moral],[],[],[],[],2,2,tie
34390,SandyHelp Jon BonJovi living on a prayer The d...,[purity],[non-moral],[loyalty],[non-moral],[],[],[],[],2,2,tie
34405,Nature is invincible sandy,[purity],[non-moral],[non-moral],[authority],[],[],[],[],2,2,tie
34455,prosperity ifoundgas HurricaneSandy URL,[care],[non-moral],[purity],[non-moral],[],[],[],[],2,2,tie


In [14]:
# Drop the rows having binary_label as 'tie'
mftc_df = mftc_df[mftc_df['binary_label'] != 'tie'].reset_index(drop=True)
mftc_df

Unnamed: 0,tweet_text,annotation_1,annotation_2,annotation_3,annotation_4,annotation_5,annotation_6,annotation_7,annotation_8,moral_count,non_moral_count,binary_label
0,@fergusonoctober @FOX2now #AllLivesMatter Peac...,[care],"[purity, care]","[purity, care]",[care],[],[],[],[],4,0,moral
1,Wholeheartedly support these protests & acts o...,[authority],[authority],[loyalty],"[authority, loyalty]",[],[],[],[],4,0,moral
2,This Sandra Bland situation man no disrespect ...,"[fairness, care]",[fairness],[fairness],"[fairness, care]",[],[],[],[],4,0,moral
3,"Commitment to peace, healing and loving neighb...",[care],[purity],"[purity, care]",[care],[],[],[],[],4,0,moral
4,Injustice for one is an injustice for all #All...,[fairness],[loyalty],"[purity, care, loyalty]",[fairness],[],[],[],[],4,0,moral
...,...,...,...,...,...,...,...,...,...,...,...,...
32541,AT_USER Proud of all your efforts to help thos...,"[care, loyalty]",[care],"[care, authority]",[],[],[],[],[],3,0,moral
32542,While those affected by sandy continue to suff...,"[care, authority]","[care, authority]",[fairness],[],[],[],[],[],3,0,moral
32543,After losing the election to 2 unisex names ma...,[fairness],"[fairness, authority]",[fairness],[],[],[],[],[],3,0,moral
32544,AT_USER Price gouging looting and rage Sandy c...,[fairness],"[fairness, authority]",[fairness],[],[],[],[],[],3,0,moral


# MFRC

In [15]:
mfrc_df = pd.read_csv(os.path.join(os.path.dirname(os.getcwd()), "raw", "moral_values_prediction", "mfrc.csv"), usecols=["text", "annotator", "annotation", "confidence"])
mfrc_df

Unnamed: 0,text,annotator,annotation,confidence
0,That particular part of the debate is especial...,annotator03,Non-Moral,Confident
1,That particular part of the debate is especial...,annotator01,Purity,Confident
2,That particular part of the debate is especial...,annotator02,Thin Morality,Confident
3,"/r/france is pretty lively, with it's own ling...",annotator03,Non-Moral,Confident
4,"/r/france is pretty lively, with it's own ling...",annotator00,Non-Moral,Somewhat Confident
...,...,...,...,...
61221,Well I can discern from your vehemence toward ...,annotator05,Equality,Confident
61222,Kick! Punch! It's all in the mind. If you wann...,annotator05,Thin Morality,Somewhat Confident
61223,Reddit can’t help you this is some seriously t...,annotator05,Thin Morality,Confident
61224,Yes. Disordered eating is insidious. And Rita ...,annotator05,Non-Moral,Somewhat Confident


In [16]:
# unique list of annotations
mfrc_df['annotation'].unique()

array(['Non-Moral', 'Purity', 'Thin Morality', 'Equality', 'Authority',
       'Loyalty,Equality', 'Care', 'Care,Loyalty', 'Loyalty',
       'Purity,Loyalty,Authority', 'Purity,Loyalty', 'Proportionality',
       'Purity,Authority', 'Care,Loyalty,Authority,Proportionality',
       'Care,Equality,Proportionality', 'Purity,Care,Equality',
       'Authority,Proportionality', 'Care,Loyalty,Equality',
       'Proportionality,Care,Equality', 'Loyalty,Proportionality',
       'Care,Purity,Equality,Proportionality', 'Loyalty,Authority',
       'Care,Authority', 'Care,Equality',
       'Care,Authority,Proportionality',
       'Authority,Equality,Proportionality',
       'Care,Loyalty,Purity,Equality', 'Care,Purity',
       'Care,Proportionality', 'Care,Purity,Equality',
       'Proportionality,Equality', 'Purity,Care',
       'Care,Loyalty,Authority', 'Purity,Loyalty,Proportionality',
       'Authority,Equality', 'Care,Purity,Authority,Equality',
       'Care,Loyalty,Proportionality',
       'P

In [17]:
# lowercase everything in the annotation column
mfrc_df['annotation'] = mfrc_df['annotation'].str.lower()
mfrc_df

Unnamed: 0,text,annotator,annotation,confidence
0,That particular part of the debate is especial...,annotator03,non-moral,Confident
1,That particular part of the debate is especial...,annotator01,purity,Confident
2,That particular part of the debate is especial...,annotator02,thin morality,Confident
3,"/r/france is pretty lively, with it's own ling...",annotator03,non-moral,Confident
4,"/r/france is pretty lively, with it's own ling...",annotator00,non-moral,Somewhat Confident
...,...,...,...,...
61221,Well I can discern from your vehemence toward ...,annotator05,equality,Confident
61222,Kick! Punch! It's all in the mind. If you wann...,annotator05,thin morality,Somewhat Confident
61223,Reddit can’t help you this is some seriously t...,annotator05,thin morality,Confident
61224,Yes. Disordered eating is insidious. And Rita ...,annotator05,non-moral,Somewhat Confident


In [18]:
# Convert the annotation column to a list of strings by splitting on commas
mfrc_df['annotation'] = mfrc_df['annotation'].str.split(",")
mfrc_df

Unnamed: 0,text,annotator,annotation,confidence
0,That particular part of the debate is especial...,annotator03,[non-moral],Confident
1,That particular part of the debate is especial...,annotator01,[purity],Confident
2,That particular part of the debate is especial...,annotator02,[thin morality],Confident
3,"/r/france is pretty lively, with it's own ling...",annotator03,[non-moral],Confident
4,"/r/france is pretty lively, with it's own ling...",annotator00,[non-moral],Somewhat Confident
...,...,...,...,...
61221,Well I can discern from your vehemence toward ...,annotator05,[equality],Confident
61222,Kick! Punch! It's all in the mind. If you wann...,annotator05,[thin morality],Somewhat Confident
61223,Reddit can’t help you this is some seriously t...,annotator05,[thin morality],Confident
61224,Yes. Disordered eating is insidious. And Rita ...,annotator05,[non-moral],Somewhat Confident


In [19]:
# I want to know the unique labels in the annotation column
mfrc_df['annotation'].explode().unique()

array(['non-moral', 'purity', 'thin morality', 'equality', 'authority',
       'loyalty', 'care', 'proportionality'], dtype=object)

In [20]:
# replace equality and proportionality with fairness
mfrc_df['annotation'] = mfrc_df['annotation'].apply(lambda x: [label.replace("equality", "fairness") if label == "equality" else label for label in x])
mfrc_df['annotation'] = mfrc_df['annotation'].apply(lambda x: [label.replace("proportionality", "fairness") if label == "proportionality" else label for label in x])
mfrc_df

Unnamed: 0,text,annotator,annotation,confidence
0,That particular part of the debate is especial...,annotator03,[non-moral],Confident
1,That particular part of the debate is especial...,annotator01,[purity],Confident
2,That particular part of the debate is especial...,annotator02,[thin morality],Confident
3,"/r/france is pretty lively, with it's own ling...",annotator03,[non-moral],Confident
4,"/r/france is pretty lively, with it's own ling...",annotator00,[non-moral],Somewhat Confident
...,...,...,...,...
61221,Well I can discern from your vehemence toward ...,annotator05,[fairness],Confident
61222,Kick! Punch! It's all in the mind. If you wann...,annotator05,[thin morality],Somewhat Confident
61223,Reddit can’t help you this is some seriously t...,annotator05,[thin morality],Confident
61224,Yes. Disordered eating is insidious. And Rita ...,annotator05,[non-moral],Somewhat Confident


In [21]:
# I want to know the unique labels in the annotation column
mfrc_df['annotation'].explode().unique()

array(['non-moral', 'purity', 'thin morality', 'fairness', 'authority',
       'loyalty', 'care'], dtype=object)

In [22]:
# Find the rows where the annotation column has both 'non-moral' and other labels
mfrc_df[mfrc_df.apply(lambda row: 'non-moral' in row['annotation'] and any(label != 'non-moral' for label in row['annotation']), axis=1)]

Unnamed: 0,text,annotator,annotation,confidence
4491,I think they were mostly neutral in public (be...,annotator01,"[loyalty, non-moral]",Confident
6826,They would rather not vote for a full retard l...,annotator03,"[non-moral, fairness]",Confident
40039,"Yes, we can. It's great to be against police b...",annotator02,"[care, non-moral]",Confident
46889,McConnell is always uncertain about things tha...,annotator02,"[non-moral, fairness]",Confident


In [23]:
# Find the rows where the annotation column has both 'thin morality' and other labels
mfrc_df[mfrc_df.apply(lambda row: 'thin morality' in row['annotation'] and any(label != 'thin morality' for label in row['annotation']), axis=1)]

Unnamed: 0,text,annotator,annotation,confidence
16107,just as much as people diskliked the communist...,annotator04,"[care, thin morality, fairness, fairness]",Confident
16362,&gt; The two big traditional parties (Socialis...,annotator04,"[thin morality, fairness]",
17891,Sometimes a nazi is a nazi. Le Pen came from a...,annotator04,"[care, thin morality, fairness]",Confident
18457,"Man, you really fucking hate Muslims. It's alw...",annotator04,"[care, thin morality, fairness, fairness]",Confident
21897,I think that’s not appropriate and a 28 year o...,annotator04,"[care, thin morality, fairness]",Confident
33737,I suspect that your husband draws power from y...,annotator04,"[fairness, care, thin morality, fairness]",Confident
33857,This man is trying to force you in to sex acts...,annotator04,"[thin morality, fairness, care, loyalty, fairn...",
37331,Would have been nice for him to simply say to ...,annotator04,"[thin morality, fairness]",Confident
49452,There's no dignity in work. But life is about ...,annotator04,"[care, thin morality, fairness, fairness]",Confident
57374,Would have been nice for him to simply say to ...,annotator04,"[thin morality, fairness]",Confident


In [24]:
# Find the empty lists or NaNs in the annotation column
mfrc_df[mfrc_df['annotation'].apply(lambda x: x == [''] or x == ['nan'] or x == [])]

Unnamed: 0,text,annotator,annotation,confidence


In [25]:
# unique list of annotators
mfrc_df['annotator'].unique()

array(['annotator03', 'annotator01', 'annotator02', 'annotator00',
       'annotator04', 'annotator05'], dtype=object)

In [26]:
# Create 5 columns for each annotator: array(['annotator03', 'annotator01', 'annotator02', 'annotator00', 'annotator04', 'annotator05'])
# Each column is going to have the corresponding annotation column value
mfrc_df['annotation_00'] = mfrc_df.apply(lambda row: row['annotation'] if row['annotator'] == 'annotator00' else [], axis=1)
mfrc_df['annotation_01'] = mfrc_df.apply(lambda row: row['annotation'] if row['annotator'] == 'annotator01' else [], axis=1)
mfrc_df['annotation_02'] = mfrc_df.apply(lambda row: row['annotation'] if row['annotator'] == 'annotator02' else [], axis=1)
mfrc_df['annotation_03'] = mfrc_df.apply(lambda row: row['annotation'] if row['annotator'] == 'annotator03' else [], axis=1)
mfrc_df['annotation_04'] = mfrc_df.apply(lambda row: row['annotation'] if row['annotator'] == 'annotator04' else [], axis=1)
mfrc_df['annotation_05'] = mfrc_df.apply(lambda row: row['annotation'] if row['annotator'] == 'annotator05' else [], axis=1)
mfrc_df

Unnamed: 0,text,annotator,annotation,confidence,annotation_00,annotation_01,annotation_02,annotation_03,annotation_04,annotation_05
0,That particular part of the debate is especial...,annotator03,[non-moral],Confident,[],[],[],[non-moral],[],[]
1,That particular part of the debate is especial...,annotator01,[purity],Confident,[],[purity],[],[],[],[]
2,That particular part of the debate is especial...,annotator02,[thin morality],Confident,[],[],[thin morality],[],[],[]
3,"/r/france is pretty lively, with it's own ling...",annotator03,[non-moral],Confident,[],[],[],[non-moral],[],[]
4,"/r/france is pretty lively, with it's own ling...",annotator00,[non-moral],Somewhat Confident,[non-moral],[],[],[],[],[]
...,...,...,...,...,...,...,...,...,...,...
61221,Well I can discern from your vehemence toward ...,annotator05,[fairness],Confident,[],[],[],[],[],[fairness]
61222,Kick! Punch! It's all in the mind. If you wann...,annotator05,[thin morality],Somewhat Confident,[],[],[],[],[],[thin morality]
61223,Reddit can’t help you this is some seriously t...,annotator05,[thin morality],Confident,[],[],[],[],[],[thin morality]
61224,Yes. Disordered eating is insidious. And Rita ...,annotator05,[non-moral],Somewhat Confident,[],[],[],[],[],[non-moral]


In [27]:
# Drop the duplicates for rows having the same text and annotator
mfrc_df = mfrc_df.drop_duplicates(subset=['text', 'annotator'], keep='first').reset_index(drop=True)
mfrc_df

Unnamed: 0,text,annotator,annotation,confidence,annotation_00,annotation_01,annotation_02,annotation_03,annotation_04,annotation_05
0,That particular part of the debate is especial...,annotator03,[non-moral],Confident,[],[],[],[non-moral],[],[]
1,That particular part of the debate is especial...,annotator01,[purity],Confident,[],[purity],[],[],[],[]
2,That particular part of the debate is especial...,annotator02,[thin morality],Confident,[],[],[thin morality],[],[],[]
3,"/r/france is pretty lively, with it's own ling...",annotator03,[non-moral],Confident,[],[],[],[non-moral],[],[]
4,"/r/france is pretty lively, with it's own ling...",annotator00,[non-moral],Somewhat Confident,[non-moral],[],[],[],[],[]
...,...,...,...,...,...,...,...,...,...,...
53530,Well I can discern from your vehemence toward ...,annotator05,[fairness],Confident,[],[],[],[],[],[fairness]
53531,Kick! Punch! It's all in the mind. If you wann...,annotator05,[thin morality],Somewhat Confident,[],[],[],[],[],[thin morality]
53532,Reddit can’t help you this is some seriously t...,annotator05,[thin morality],Confident,[],[],[],[],[],[thin morality]
53533,Yes. Disordered eating is insidious. And Rita ...,annotator05,[non-moral],Somewhat Confident,[],[],[],[],[],[non-moral]


In [28]:
# show the row which contains text: "That particular part of the debate is espe"
mfrc_df[mfrc_df['text'].str.contains("That particular part of the debate is espe")]

Unnamed: 0,text,annotator,annotation,confidence,annotation_00,annotation_01,annotation_02,annotation_03,annotation_04,annotation_05
0,That particular part of the debate is especial...,annotator03,[non-moral],Confident,[],[],[],[non-moral],[],[]
1,That particular part of the debate is especial...,annotator01,[purity],Confident,[],[purity],[],[],[],[]
2,That particular part of the debate is especial...,annotator02,[thin morality],Confident,[],[],[thin morality],[],[],[]


In [29]:
# show the row which contains text: "/r/france is pretty lively, with it's own l"
mfrc_df[mfrc_df['text'].str.contains("/r/france is pretty lively, with it's own l")]

Unnamed: 0,text,annotator,annotation,confidence,annotation_00,annotation_01,annotation_02,annotation_03,annotation_04,annotation_05
3,"/r/france is pretty lively, with it's own ling...",annotator03,[non-moral],Confident,[],[],[],[non-moral],[],[]
4,"/r/france is pretty lively, with it's own ling...",annotator00,[non-moral],Somewhat Confident,[non-moral],[],[],[],[],[]
5,"/r/france is pretty lively, with it's own ling...",annotator02,[non-moral],Confident,[],[],[non-moral],[],[],[]


In [30]:
# Group the rows by the text column and within each group, copy the annotation_00 to annotation_05 columns in a single row
mfrc_df = mfrc_df.groupby('text').agg({'annotation_00': 'sum', 'annotation_01': 'sum', 'annotation_02': 'sum', 'annotation_03': 'sum', 'annotation_04': 'sum', 'annotation_05': 'sum'}).reset_index()
mfrc_df

Unnamed: 0,text,annotation_00,annotation_01,annotation_02,annotation_03,annotation_04,annotation_05
0,"\n\nDr. Robert Jay Lifton, distinguished profe...",[thin morality],[],[non-moral],[],[authority],[]
1,\n\nIf you prefer not to click on Daily Mail s...,[non-moral],[non-moral],[non-moral],[],[],[]
2,\n&gt;Ben Judah details Emmanuel Macron's nasc...,[authority],[authority],[],[],[fairness],[]
3,"\n&gt;Ergo, he supports Macron but doesn't wan...",[],[],[non-moral],[thin morality],[loyalty],[]
4,\n&gt;He looks exactly the same in Richie Rich...,[],[],[non-moral],[non-moral],[thin morality],[]
...,...,...,...,...,...,...,...
17881,🏅 Take this poor bitch's gold! I CAN'T STOP LA...,[non-moral],[],[non-moral],[non-moral],[],[]
17882,"😂 republicans response to Cohen, “This guys a ...",[],[authority],[non-moral],[non-moral],[],[]
17883,"😆 yes, full name is Lucy Clawless: Princess Wa...",[non-moral],[non-moral],[],[],[non-moral],[]
17884,😱 \n\nNo but seriously now. It doesn't take a ...,[],[non-moral],[non-moral],[loyalty],[],[]


In [31]:
# show the row which contains text: "That particular part of the debate is espe"
mfrc_df[mfrc_df['text'].str.contains("That particular part of the debate is espe")]

Unnamed: 0,text,annotation_00,annotation_01,annotation_02,annotation_03,annotation_04,annotation_05
12625,That particular part of the debate is especial...,[],[purity],[thin morality],[non-moral],[],[]


In [32]:
# show the row which contains text: "/r/france is pretty lively, with it's own l"
mfrc_df[mfrc_df['text'].str.contains("/r/france is pretty lively, with it's own l")]

Unnamed: 0,text,annotation_00,annotation_01,annotation_02,annotation_03,annotation_04,annotation_05
1284,"/r/france is pretty lively, with it's own ling...",[non-moral],[],[non-moral],[non-moral],[],[]


In [33]:
# Let's have two columns having values of 'moral_count' and 'non_moral_count'. For each of the annotation columns (annotation_00 to annotation_05), if the annotation column only had 'non-moral', the non_moral_count increments.
# If the annotation column had one or more of the other (moral) labels and not 'non-moral' label, the moral_count_increments.
# If the annotation column had both 'non-moral' and other labels in it just skip that column and don't increment anything.
# If the annotation column was an empty list again skip that column and don't increment anything.
mfrc_df['moral_count'] = 0
mfrc_df['non_moral_count'] = 0

for col in mfrc_df.columns:
    if "annotation_" in col:
        mfrc_df['moral_count'] += mfrc_df[col].apply(lambda x: 1 if len(set(x)) >= 1 and 'non-moral' not in x else 0)
        mfrc_df['non_moral_count'] += mfrc_df[col].apply(lambda x: 1 if len(set(x)) == 1 and 'non-moral' in x else 0)

mfrc_df

Unnamed: 0,text,annotation_00,annotation_01,annotation_02,annotation_03,annotation_04,annotation_05,moral_count,non_moral_count
0,"\n\nDr. Robert Jay Lifton, distinguished profe...",[thin morality],[],[non-moral],[],[authority],[],2,1
1,\n\nIf you prefer not to click on Daily Mail s...,[non-moral],[non-moral],[non-moral],[],[],[],0,3
2,\n&gt;Ben Judah details Emmanuel Macron's nasc...,[authority],[authority],[],[],[fairness],[],3,0
3,"\n&gt;Ergo, he supports Macron but doesn't wan...",[],[],[non-moral],[thin morality],[loyalty],[],2,1
4,\n&gt;He looks exactly the same in Richie Rich...,[],[],[non-moral],[non-moral],[thin morality],[],1,2
...,...,...,...,...,...,...,...,...,...
17881,🏅 Take this poor bitch's gold! I CAN'T STOP LA...,[non-moral],[],[non-moral],[non-moral],[],[],0,3
17882,"😂 republicans response to Cohen, “This guys a ...",[],[authority],[non-moral],[non-moral],[],[],1,2
17883,"😆 yes, full name is Lucy Clawless: Princess Wa...",[non-moral],[non-moral],[],[],[non-moral],[],0,3
17884,😱 \n\nNo but seriously now. It doesn't take a ...,[],[non-moral],[non-moral],[loyalty],[],[],1,2


In [34]:
assert not mfrc_df[(mfrc_df['moral_count'] == 0) & (mfrc_df['non_moral_count'] == 0)].any(axis=None)

In [35]:
# Have a column 'binary_label' which is 'moral' if the 'moral_count' is greater than 'non_moral_count' and 'non-moral' if the 'non_moral_count' is greater than 'moral_count'.
# If the moral_count and non_moral_count are equal, the binary_label is 'tie'.
mfrc_df['binary_label'] = mfrc_df.apply(lambda row: 'moral' if row['moral_count'] > row['non_moral_count'] else 'non-moral' if row['non_moral_count'] > row['moral_count'] else 'tie', axis=1)
mfrc_df

Unnamed: 0,text,annotation_00,annotation_01,annotation_02,annotation_03,annotation_04,annotation_05,moral_count,non_moral_count,binary_label
0,"\n\nDr. Robert Jay Lifton, distinguished profe...",[thin morality],[],[non-moral],[],[authority],[],2,1,moral
1,\n\nIf you prefer not to click on Daily Mail s...,[non-moral],[non-moral],[non-moral],[],[],[],0,3,non-moral
2,\n&gt;Ben Judah details Emmanuel Macron's nasc...,[authority],[authority],[],[],[fairness],[],3,0,moral
3,"\n&gt;Ergo, he supports Macron but doesn't wan...",[],[],[non-moral],[thin morality],[loyalty],[],2,1,moral
4,\n&gt;He looks exactly the same in Richie Rich...,[],[],[non-moral],[non-moral],[thin morality],[],1,2,non-moral
...,...,...,...,...,...,...,...,...,...,...
17881,🏅 Take this poor bitch's gold! I CAN'T STOP LA...,[non-moral],[],[non-moral],[non-moral],[],[],0,3,non-moral
17882,"😂 republicans response to Cohen, “This guys a ...",[],[authority],[non-moral],[non-moral],[],[],1,2,non-moral
17883,"😆 yes, full name is Lucy Clawless: Princess Wa...",[non-moral],[non-moral],[],[],[non-moral],[],0,3,non-moral
17884,😱 \n\nNo but seriously now. It doesn't take a ...,[],[non-moral],[non-moral],[loyalty],[],[],1,2,non-moral


In [36]:
mfrc_df['binary_label'].value_counts()

binary_label
non-moral    9736
moral        8098
tie            52
Name: count, dtype: int64

In [37]:
# Show some of the rows where the binary_label is 'tie'
mfrc_df[mfrc_df['binary_label'] == 'tie']

Unnamed: 0,text,annotation_00,annotation_01,annotation_02,annotation_03,annotation_04,annotation_05,moral_count,non_moral_count,binary_label
169,"""That bum definitely spent my dollar on the bu...",[non-moral],[],[],[],[thin morality],[],1,1,tie
359,&gt; Did you just make your own laws?\n\nNo.\n...,[],[],"[care, fairness]",[non-moral],[],[],1,1,tie
1099,"&gt;Well, that's going to be really inconvenie...",[authority],[],[non-moral],[],[],[],1,1,tie
1688,All those right-wingers basically want back in...,[],"[purity, authority]",[],[non-moral],[],[],1,1,tie
2161,As an outsider Hamon was actually my favourite...,[non-moral],"[care, authority]",[],[],[],[],1,1,tie
2318,Because 99.9% of the time anything bad about T...,[],[authority],[non-moral],[],[],[],1,1,tie
3625,"Even if you did something you enjoyed, it woul...",[],[non-moral],[],[],[fairness],[],1,1,tie
3713,Exactly. Does Jamie's Dad want him arrested fo...,[],[],[],[non-moral],[thin morality],[],1,1,tie
4018,France should warn the Americans against inter...,[],[],[],[non-moral],[thin morality],[],1,1,tie
4840,"Hmmm, what about Napoleon III, Robbespierre, L...",[],[],[non-moral],[],"[care, loyalty, purity]",[],1,1,tie


In [38]:
# Drop the rows having binary_label as 'tie'
mfrc_df = mfrc_df[mfrc_df['binary_label'] != 'tie'].reset_index(drop=True)
mfrc_df

Unnamed: 0,text,annotation_00,annotation_01,annotation_02,annotation_03,annotation_04,annotation_05,moral_count,non_moral_count,binary_label
0,"\n\nDr. Robert Jay Lifton, distinguished profe...",[thin morality],[],[non-moral],[],[authority],[],2,1,moral
1,\n\nIf you prefer not to click on Daily Mail s...,[non-moral],[non-moral],[non-moral],[],[],[],0,3,non-moral
2,\n&gt;Ben Judah details Emmanuel Macron's nasc...,[authority],[authority],[],[],[fairness],[],3,0,moral
3,"\n&gt;Ergo, he supports Macron but doesn't wan...",[],[],[non-moral],[thin morality],[loyalty],[],2,1,moral
4,\n&gt;He looks exactly the same in Richie Rich...,[],[],[non-moral],[non-moral],[thin morality],[],1,2,non-moral
...,...,...,...,...,...,...,...,...,...,...
17829,🏅 Take this poor bitch's gold! I CAN'T STOP LA...,[non-moral],[],[non-moral],[non-moral],[],[],0,3,non-moral
17830,"😂 republicans response to Cohen, “This guys a ...",[],[authority],[non-moral],[non-moral],[],[],1,2,non-moral
17831,"😆 yes, full name is Lucy Clawless: Princess Wa...",[non-moral],[non-moral],[],[],[non-moral],[],0,3,non-moral
17832,😱 \n\nNo but seriously now. It doesn't take a ...,[],[non-moral],[non-moral],[loyalty],[],[],1,2,non-moral


# Merge MFTC and MFRC

In [39]:
# From mftc only keep the tweet_text and binary_label columns, rename the tweet_text column to text
mftc_df = mftc_df[['tweet_text', 'binary_label']]
mftc_df = mftc_df.rename(columns={'tweet_text': 'text'})
mftc_df

Unnamed: 0,text,binary_label
0,@fergusonoctober @FOX2now #AllLivesMatter Peac...,moral
1,Wholeheartedly support these protests & acts o...,moral
2,This Sandra Bland situation man no disrespect ...,moral
3,"Commitment to peace, healing and loving neighb...",moral
4,Injustice for one is an injustice for all #All...,moral
...,...,...
32541,AT_USER Proud of all your efforts to help thos...,moral
32542,While those affected by sandy continue to suff...,moral
32543,After losing the election to 2 unisex names ma...,moral
32544,AT_USER Price gouging looting and rage Sandy c...,moral


In [40]:
# From mfrc only keep the text and binary_label columns
mfrc_df = mfrc_df[['text', 'binary_label']]
mfrc_df

Unnamed: 0,text,binary_label
0,"\n\nDr. Robert Jay Lifton, distinguished profe...",moral
1,\n\nIf you prefer not to click on Daily Mail s...,non-moral
2,\n&gt;Ben Judah details Emmanuel Macron's nasc...,moral
3,"\n&gt;Ergo, he supports Macron but doesn't wan...",moral
4,\n&gt;He looks exactly the same in Richie Rich...,non-moral
...,...,...
17829,🏅 Take this poor bitch's gold! I CAN'T STOP LA...,non-moral
17830,"😂 republicans response to Cohen, “This guys a ...",non-moral
17831,"😆 yes, full name is Lucy Clawless: Princess Wa...",non-moral
17832,😱 \n\nNo but seriously now. It doesn't take a ...,non-moral


In [41]:
# Concatenate mftc_df and mfrc_df
df = pd.concat([mftc_df, mfrc_df], ignore_index=True)
df

Unnamed: 0,text,binary_label
0,@fergusonoctober @FOX2now #AllLivesMatter Peac...,moral
1,Wholeheartedly support these protests & acts o...,moral
2,This Sandra Bland situation man no disrespect ...,moral
3,"Commitment to peace, healing and loving neighb...",moral
4,Injustice for one is an injustice for all #All...,moral
...,...,...
50375,🏅 Take this poor bitch's gold! I CAN'T STOP LA...,non-moral
50376,"😂 republicans response to Cohen, “This guys a ...",non-moral
50377,"😆 yes, full name is Lucy Clawless: Princess Wa...",non-moral
50378,😱 \n\nNo but seriously now. It doesn't take a ...,non-moral


In [42]:
df['binary_label'].value_counts()

binary_label
moral        28037
non-moral    22343
Name: count, dtype: int64

In [43]:
# Count the number of samples in each class
class_counts = df['binary_label'].value_counts()

# Determine the minimum count among all classes
min_count = class_counts.min()

# Sample an equal number of samples from each class
balanced_df = df.groupby('binary_label').apply(lambda x: x.sample(n=min_count)).reset_index(drop=True)
balanced_df

Unnamed: 0,text,binary_label
0,I am #ConservativeBecause I am smart and belie...,moral
1,AT_USER you need to get involved and stop the ...,moral
2,RT @abdoolrazerq: All human beings are born f...,moral
3,trans person here— this isnt transphobic. peop...,moral
4,@Snellzilla4 @GlasnowPls slide in my DMs if yo...,moral
...,...,...
44681,It Just Happened #Obama Turns #DallasMemorial ...,non-moral
44682,@ShannonTrimble @dj_burek @andieiamwhoiam So y...,non-moral
44683,&gt;pro athletes or successful musicians?\n\nT...,non-moral
44684,"“My focus now turns to ensuring a smooth, orde...",non-moral


In [44]:
balanced_df['binary_label'].value_counts()

binary_label
moral        22343
non-moral    22343
Name: count, dtype: int64