In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, train_test_split, KFold,StratifiedKFold
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from gensim.models import Word2Vec
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm
from sklearn.linear_model import SGDClassifier

### Datasets:

In [None]:
# Download the corpus here: https://osf.io/k5n7y/ 
# Concatenate all the subsets and use one-hot-encoding for multi-label mft annotations
mftc_df = pd.read_csv('path/to/downloaded/MFTC')

In [21]:
# General domain:
mftc_df['domain'] = 0
mftc_df['non-moral'] = mftc_df['new_label'].apply(lambda x: 1 if x == 'nm' else 0)
mftc_df.shape

(20628, 17)

In [22]:
mftc_df.head(3)

Unnamed: 0,text,cleaned_text,annotations,new_label,subdomain,care,harm,fairness,cheating,loyalty,betrayal,authority,subversion,purity,degradation,domain,non-moral
0,The courage to be impatient with evil and pati...,The courage to be impatient with evil and pati...,"[{'annotation': 'fairness', 'annotator': 'anno...",fairness,BLM,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
1,#NotAllCops but OMFG already. 😡 Protect and se...,but OMFG already. enraged_face Protect and ser...,"[{'annotation': 'care', 'annotator': 'annotato...",harm,BLM,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
2,"stop shaving, it's your manly dignity #blackje...","stop shaving, it's your manly dignity","[{'annotation': 'nm', 'annotator': 'annotator0...",nm,BLM,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1


### Reddit Data:

In [23]:
# Download the corpus here:https://huggingface.co/datasets/USC-MOLA-Lab/MFRC
# Use one-hot-encoding for multi-label mft annotations
mfrc_df = pd.read_csv('path/to/downloaded/MFRC')

In [24]:
mfrc_df['domain'] = 1
mfrc_df = mfrc_df[~mfrc_df['final_annotation'].isin(['inconclusive', 'Thin Morality'])]
mfrc_df['non-moral'] = mfrc_df['final_annotation'].apply(lambda x: 1 if x == 'Non-Moral' else 0)
mfrc_df.shape

(13995, 25)

In [25]:
mfrc_df.head(3)

Unnamed: 0,text,cleaned_text,subdomain,bucket,annotation,final_annotation,care,fairness,loyalty,authority,...,degradation,equality,proportionality,thin morality,non-moral,inconclusive,vader_neg,vader_neu,vader_pos,domain
1,"/r/france is pretty lively, with it's own ling...","/r/france is pretty lively, with it's own ling...",europe,French politics,"{'annotator03': {'annotation': 'Non-Moral', 'c...",Non-Moral,0,0,0,0,...,0,0,0,0,1,0,0.142,0.679,0.18,1
3,it really is a very unusual situation isn't it...,it really is a very unusual situation isn't it...,europe,French politics,"{'annotator03': {'annotation': 'Non-Moral', 'c...",Non-Moral,0,0,0,0,...,0,0,0,0,1,0,0.118,0.772,0.11,1
5,"Macrons face just screams\n""I do not know her,...","Macrons face just screams ""I do not know her, ...",europe,French politics,"{'annotator03': {'annotation': 'Non-Moral', 'c...",Non-Moral,0,0,0,0,...,0,0,0,0,1,0,0.135,0.865,0.0,1


### Facebook Data
- For the facebook data, please contact the authors of this paper: https://dl.acm.org/doi/10.1145/3543507.3583865


In [26]:
mffp_df = pd.read_csv('path/to/downloaded/FbVaccinationPosts/')

In [27]:
mffp_df['domain'] = 2
mffp_df.shape

(1510, 21)

In [28]:
cat_columns = mffp_df.columns[6:-5]
cat_columns = ["text", "cleaned_text"]+cat_columns.tolist()+["subdomain", "domain"]
cat_columns

['text',
 'cleaned_text',
 'care',
 'harm',
 'fairness',
 'cheating',
 'loyalty',
 'betrayal',
 'authority',
 'subversion',
 'purity',
 'degradation',
 'subdomain',
 'domain']

In [29]:

df = pd.concat([mftc_df[cat_columns], mfrc_df[cat_columns], mffp_df[cat_columns]], axis=0)



In [30]:
df = df.dropna(subset = ['cleaned_text'])
df = df.drop_duplicates(subset = ['cleaned_text'])
df = df[~(df[['care', 'fairness', 'loyalty', 'authority', 'purity', "harm", "subversion", "degradation", "cheating", "betrayal"]] == 2).any(axis=1)]
df.reset_index(drop = True, inplace = True)

In [31]:
# Shuffle the data:
df = df.sample(frac=1).reset_index(drop=True)
df.shape

(34301, 14)

In [32]:
df.loc[:, 'care':'degradation'] = df.loc[:, 'care':'degradation'].astype(int)

In [33]:
df

Unnamed: 0,text,cleaned_text,care,harm,fairness,cheating,loyalty,betrayal,authority,subversion,purity,degradation,subdomain,domain
0,I didn't even know what SNL was until my twent...,I didn't even know what SNL was until my twent...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,nostalgia,1
1,Don't disrespect people because of what faith ...,Don't disrespect people because of what faith ...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,Elections,0
2,Le pen cant win. Our right wing is as retarded...,Le pen cant win. Our right wing is as retarded...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,worldnews,1
3,RT @skabdallahdrz: #FreeBatoulFromPrison we ne...,RT @user: we need justice @user @user @user @user,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Elections,0
4,Holy fucking shit. This was a domestic (white)...,Holy fucking shit. This was a domestic (white)...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,worldnews,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34296,RT @AntiJokeJamal: If you honk your horn .4 se...,RT @user: If you honk your horn .4 seconds aft...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Davidson,0
34297,RT @Razarumi: Mother in #Baltimore who doesn’t...,RT @user: Mother in who doesn t want her son t...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Baltimore,0
34298,@cleggzta Ho ho ho.,@user Ho ho ho.,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Davidson,0
34299,Bet Charlie Strong has contacted Ridley,Bet Charlie Strong has contacted Ridley,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Davidson,0


In [34]:
df.columns

Index(['text', 'cleaned_text', 'care', 'harm', 'fairness', 'cheating',
       'loyalty', 'betrayal', 'authority', 'subversion', 'purity',
       'degradation', 'subdomain', 'domain'],
      dtype='object')

In [35]:
35887*0.2

7177.400000000001

In [36]:
df.isna().sum()

text            0
cleaned_text    0
care            0
harm            0
fairness        0
cheating        0
loyalty         0
betrayal        0
authority       0
subversion      0
purity          0
degradation     0
subdomain       0
domain          0
dtype: int64

In [37]:
6178/2

3089.0

#### Assigning the test domain:

In [38]:
twitter_sample = df[(df['domain'] == 0) & ((df['subdomain'] == 'Elections') | (df['subdomain'] == 'BLM'))]
twitter_sample = twitter_sample.drop_duplicates(subset = ["cleaned_text"])
twitter_sample = twitter_sample.sample(n=3384)   
twitter_sample

Unnamed: 0,text,cleaned_text,care,harm,fairness,cheating,loyalty,betrayal,authority,subversion,purity,degradation,subdomain,domain
24666,"One nation indivisible, with liberty and justi...","One nation indivisible, with liberty and justi...",0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BLM,0
32644,"Holy shit, #BlackLivesMatter!","Holy shit, !",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BLM,0
13402,RT @HillaryClinton: Proud to celebrate a histo...,RT @user: Proud to celebrate a historic victor...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Elections,0
10041,@HillaryClinton @realDonaldTrump but you condo...,@user @user but you condone the violence and c...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BLM,0
24742,@NuBlackVision it is a part of The disregard a...,@user it is a part of The disregard and disres...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,BLM,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26488,#JeremyMcDole say his name. No sanctuary for t...,say his name. No sanctuary for the hopeless. N...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BLM,0
694,non-thug privilege: the ability to obey the la...,non-thug privilege: the ability to obey the la...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,BLM,0
27112,@MorganBrittany4 @jpyoung27 Because no one who...,@user @user Because no one who can enforce the...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,Elections,0
6573,Our world today objectifies women subjecting t...,Our world today objectifies women subjecting t...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Elections,0


In [39]:
# mftc_df

In [40]:
reddit_sample = pd.read_csv('../../LREC-COLING-2024/Data/MFRC/MFRC_posts_MFT_10.csv')

In [41]:
reddit_sample.head()

Unnamed: 0,text,cleaned_text,subdomain,bucket,annotation,final_annotation,care,fairness,loyalty,authority,...,subversion,degradation,equality,proportionality,thin morality,non-moral,inconclusive,vader_neg,vader_neu,vader_pos
0,That particular part of the debate is especial...,That particular part of the debate is especial...,europe,French politics,"{'annotator03': {'annotation': 'Non-Moral', 'c...",inconclusive,0,0,0,0,...,0,0,0,0,0,0,1,0.169,0.725,0.106
1,"/r/france is pretty lively, with it's own ling...","/r/france is pretty lively, with it's own ling...",europe,French politics,"{'annotator03': {'annotation': 'Non-Moral', 'c...",Non-Moral,0,0,0,0,...,0,0,0,0,0,1,0,0.142,0.679,0.18
2,TBH Marion Le Pen would be better. Closet fasc...,TBH Marion Le Pen would be better. Closet fasc...,neoliberal,French politics,"{'annotator03': {'annotation': 'Non-Moral', 'c...",inconclusive,0,0,0,0,...,0,0,0,0,0,0,1,0.358,0.498,0.144
3,it really is a very unusual situation isn't it...,it really is a very unusual situation isn't it...,europe,French politics,"{'annotator03': {'annotation': 'Non-Moral', 'c...",Non-Moral,0,0,0,0,...,0,0,0,0,0,1,0,0.118,0.772,0.11
4,The Le Pen brand of conservatism and classical...,The Le Pen brand of conservatism and classical...,europe,French politics,"{'annotator03': {'annotation': 'Authority', 'c...",inconclusive,0,0,0,0,...,0,0,0,0,0,0,1,0.0,0.795,0.205


In [42]:
reddit_sample = reddit_sample[:3379]

In [43]:
reddit_sample['domain']=1
reddit_sample = reddit_sample[~reddit_sample['final_annotation'].isin(['inconclusive', 'Thin Morality'])]
reddit_sample['non-moral'] = reddit_sample['final_annotation'].apply(lambda x: 1 if x == 'Non-Moral' else 0)
reddit_sample = reddit_sample.drop_duplicates(subset = ["cleaned_text"])
reddit_sample.shape

(2793, 25)

In [44]:
reddit_sample

Unnamed: 0,text,cleaned_text,subdomain,bucket,annotation,final_annotation,care,fairness,loyalty,authority,...,degradation,equality,proportionality,thin morality,non-moral,inconclusive,vader_neg,vader_neu,vader_pos,domain
1,"/r/france is pretty lively, with it's own ling...","/r/france is pretty lively, with it's own ling...",europe,French politics,"{'annotator03': {'annotation': 'Non-Moral', 'c...",Non-Moral,0,0,0,0,...,0,0,0,0,1,0,0.142,0.679,0.180,1
3,it really is a very unusual situation isn't it...,it really is a very unusual situation isn't it...,europe,French politics,"{'annotator03': {'annotation': 'Non-Moral', 'c...",Non-Moral,0,0,0,0,...,0,0,0,0,1,0,0.118,0.772,0.110,1
5,"Macrons face just screams\n""I do not know her,...","Macrons face just screams ""I do not know her, ...",europe,French politics,"{'annotator03': {'annotation': 'Non-Moral', 'c...",Non-Moral,0,0,0,0,...,0,0,0,0,1,0,0.135,0.865,0.000,1
6,"Clinton lead polls by 4%, well within a reason...","Clinton lead polls by 4%, well within a reason...",worldnews,French politics,"{'annotator03': {'annotation': 'Non-Moral', 'c...",Non-Moral,0,0,0,0,...,0,0,0,0,1,0,0.094,0.833,0.073,1
7,"Hey, fuck you. Us leftists will never support ...","Hey, fuck you. Us leftists will never support ...",worldnews,French politics,"{'annotator03': {'annotation': 'Loyalty,Equali...",Equality,0,0,0,0,...,0,1,0,0,0,0,0.391,0.609,0.000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3374,"I very much agree with the other commentor, ce...","I very much agree with the other commentor, ce...",europe,French politics,"{'annotator03': {'annotation': 'Non-Moral', 'c...",Non-Moral,0,0,0,0,...,0,0,0,0,1,0,0.000,0.905,0.095,1
3375,Dont forget there are people who describe Le P...,Dont forget there are people who describe Le P...,europe,French politics,"{'annotator03': {'annotation': 'Non-Moral', 'c...",Non-Moral,0,0,0,0,...,0,0,0,0,1,0,0.000,0.844,0.156,1
3376,Macron could use a bit of Reagan/Thatcher righ...,Macron could use a bit of Reagan/Thatcher righ...,neoliberal,French politics,"{'annotator03': {'annotation': 'Non-Moral', 'c...",Non-Moral,0,0,0,0,...,0,0,0,0,1,0,0.000,1.000,0.000,1
3377,I have no idea what the European steel industr...,I have no idea what the European steel industr...,neoliberal,French politics,"{'annotator03': {'annotation': 'Non-Moral', 'c...",Non-Moral,0,0,0,0,...,0,0,0,0,1,0,0.071,0.817,0.112,1


In [45]:
facebook_sample = df[df['domain']==2]
facebook_sample = facebook_sample.drop_duplicates(subset = ["cleaned_text"])
facebook_sample

Unnamed: 0,text,cleaned_text,care,harm,fairness,cheating,loyalty,betrayal,authority,subversion,purity,degradation,subdomain,domain
5,My cat will be 15 years old in August and ever...,My cat will be 15 years old in August and ever...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,vaccination,2
40,I commend this man for speaking this when he k...,I commend this man for speaking this when he k...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,vaccination,2
62,Neonates ani child under 5 are under major ris...,Neonates ani child under 5 are under major ris...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,vaccination,2
142,"Yeah, I mean cmon anti-vaxers, this gives you ...","Yeah, I mean cmon anti-vaxers, this gives you ...",0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,vaccination,2
158,It is sad when children are seriously ill and ...,It is sad when children are seriously ill and ...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,vaccination,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34211,That's exactly purpose of vaccination . To mak...,That's exactly purpose of vaccination . To mak...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,vaccination,2
34232,"Ryan, BECAUSE IT WORKS. Here is a summary of ...","Ryan, BECAUSE IT WORKS. Here is a summary of d...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,vaccination,2
34237,Won't be long before people see the connection...,Won't be long before people see the connection...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,vaccination,2
34267,Very sad! I didn't even realize it was an opti...,Very sad! I didn't even realize it was an opti...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,vaccination,2


In [46]:
test_data = pd.concat([twitter_sample[['text', 'cleaned_text', 'care', 'harm', 'fairness', 'cheating', 'loyalty',
       'betrayal', 'authority', 'subversion', 'purity', 'degradation',
       'subdomain', 'domain']],
           reddit_sample[['text','cleaned_text', 'care', 'harm', 'fairness', 'cheating', 'loyalty',
       'betrayal', 'authority', 'subversion', 'purity', 'degradation',
       'subdomain', 'domain']], 
          facebook_sample[['text', 'cleaned_text', 'care', 'harm', 'fairness', 'cheating', 'loyalty',
       'betrayal', 'authority', 'subversion', 'purity', 'degradation',
       'subdomain', 'domain']]
          ])

In [47]:
test_data = test_data.reset_index(drop = True)

In [48]:
test_data

Unnamed: 0,text,cleaned_text,care,harm,fairness,cheating,loyalty,betrayal,authority,subversion,purity,degradation,subdomain,domain
0,"One nation indivisible, with liberty and justi...","One nation indivisible, with liberty and justi...",0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BLM,0
1,"Holy shit, #BlackLivesMatter!","Holy shit, !",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BLM,0
2,RT @HillaryClinton: Proud to celebrate a histo...,RT @user: Proud to celebrate a historic victor...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Elections,0
3,@HillaryClinton @realDonaldTrump but you condo...,@user @user but you condone the violence and c...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BLM,0
4,@NuBlackVision it is a part of The disregard a...,@user it is a part of The disregard and disres...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,BLM,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7681,That's exactly purpose of vaccination . To mak...,That's exactly purpose of vaccination . To mak...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,vaccination,2
7682,"Ryan, BECAUSE IT WORKS. Here is a summary of ...","Ryan, BECAUSE IT WORKS. Here is a summary of d...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,vaccination,2
7683,Won't be long before people see the connection...,Won't be long before people see the connection...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,vaccination,2
7684,Very sad! I didn't even realize it was an opti...,Very sad! I didn't even realize it was an opti...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,vaccination,2


In [49]:
test_data[['care', 'harm', 'fairness', 'cheating', 'loyalty',
       'betrayal', 'authority', 'subversion', 'purity', 'degradation']].sum()

care           684.0
harm           664.0
fairness       659.0
cheating       737.0
loyalty        302.0
betrayal       140.0
authority      306.0
subversion     422.0
purity         259.0
degradation    199.0
dtype: float64

In [50]:
test_data.isna().sum()

text            0
cleaned_text    0
care            0
harm            0
fairness        0
cheating        0
loyalty         0
betrayal        0
authority       0
subversion      0
purity          0
degradation     0
subdomain       0
domain          0
dtype: int64

In [51]:
# test_data.to_csv('../Results_datasets/7686_test_data_Twitter_Reddit_FB.csv', index = None)

In [52]:
test_data = pd.read_csv('../Results_datasets/7686_test_data_Twitter_Reddit_FB.csv')

In [53]:
test_data = test_data.loc[:, "cleaned_text":]


In [54]:
test_data

Unnamed: 0,cleaned_text,care,harm,fairness,cheating,loyalty,betrayal,authority,subversion,purity,degradation,subdomain,domain
0,I've never experienced a hurt that has me lost...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BLM,0
1,RT @user: That 's the spirit my dear winking_f...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Elections,0
2,@user @user Instead of dGeneration whose futur...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,Elections,0
3,It is of paramount importance that we strive t...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Elections,0
4,They're all cheating tonight. First there's th...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,Elections,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7681,"Exactly, I can never post this enough in my gr...",1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,vaccination,2
7682,This is just a back door way in to mandatory v...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,vaccination,2
7683,So it is ok to be concerned about GMOs in food...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,vaccination,2
7684,Thank you for being bold and spreading this in...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,vaccination,2


In [55]:
# test_data.drop_duplicates()

#### Kfold implementation for all 5 MFT Values:

In [56]:
df.shape

(34301, 14)

In [57]:
df = df.drop_duplicates(subset = ['cleaned_text'])

In [58]:
df.shape

(34301, 14)

In [59]:
df.reset_index(drop = True, inplace = True)

In [60]:
# df[~df.cleaned_text.isin(test_data['cleaned_text'])].reset_index(drop=True)

In [61]:
# len(df_test)+len(df_train)==len(df)

In [62]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from sklearn.model_selection import train_test_split, GridSearchCV


nltk.download('punkt')
nltk.download('stopwords')


# df_train = df[~df.cleaned_text.isin(test_data['cleaned_text'])]
# df_test = df[df.cleaned_text.isin(test_data['cleaned_text'])]

df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)



stop_words = set(stopwords.words('english'))

def preprocess(text):
    text = text.lower()
    text = ''.join([char for char in text if char not in string.punctuation])
    tokens = word_tokenize(text)
    return ' '.join([word for word in tokens if word not in stop_words])

def vectorize(sentence, w2v_model):
    words = sentence.split()
    words_vecs = [w2v_model.wv[word] for word in words if word in w2v_model.wv]
    return np.zeros(100) if len(words_vecs) == 0 else np.mean(words_vecs, axis=0)

# Cleaning the data
X_train = df_train["cleaned_text"].apply(preprocess)
X_test = df_test["cleaned_text"].apply(preprocess)

#Word2Vec embeddings
sentences = [sentence.split() for sentence in X_train]
w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=5, workers=4)

X_train = np.array([vectorize(sentence, w2v_model) for sentence in X_train])
X_test = np.array([vectorize(sentence, w2v_model) for sentence in X_test])

predictions_df = pd.DataFrame(index=df_test.index)

mft_category = ['care', 'harm', 'fairness', 'cheating', 'loyalty',
                'betrayal', 'authority', 'subversion', 'purity', 'degradation']

for mft_value in mft_category:
    y_train = df_train[mft_value]
    y_test = df_test[mft_value]

    # Train a classification model with hyperparameter tuning
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_features': ['auto', 'sqrt', 'log2'],
        'max_depth': [10, 20, 30],
        'criterion': ['gini', 'entropy']
    }
    
    clf = RandomForestClassifier(class_weight='balanced', 
                                 n_estimators = 100,
#                                  max_features = 'auto',
                                 criterion = 'entropy', 
                                 max_depth = 30)
    grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
    clf.fit(X_train, y_train)
    
    best_clf = grid_search.best_estimator_

    # Predict on the test data
    y_pred = clf.predict(X_test)

    predictions_df[mft_value + '_pred'] = y_pred

    # Print report on model performance
    print('Predictions for:', mft_value)
    print(classification_report(y_test, y_pred))

# Concatenate true values and predictions
results_df = pd.concat([df_test[mft_category].reset_index(drop=True), predictions_df.reset_index(drop=True)], axis=1)
print(results_df)


[nltk_data] Downloading package punkt to
[nltk_data]     /data/home/eey254/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /data/home/eey254/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Predictions for: care
              precision    recall  f1-score   support

         0.0       0.92      0.99      0.96      6287
         1.0       0.47      0.08      0.14       574

    accuracy                           0.92      6861
   macro avg       0.69      0.54      0.55      6861
weighted avg       0.88      0.92      0.89      6861

Predictions for: harm
              precision    recall  f1-score   support

         0.0       0.91      0.99      0.95      6212
         1.0       0.39      0.04      0.07       649

    accuracy                           0.90      6861
   macro avg       0.65      0.52      0.51      6861
weighted avg       0.86      0.90      0.87      6861

Predictions for: fairness
              precision    recall  f1-score   support

         0.0       0.95      1.00      0.97      6411
         1.0       0.82      0.22      0.35       450

    accuracy                           0.95      6861
   macro avg       0.89      0.61      0.66      6861
weig

### Bootstraping Technique:

In [64]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.utils import resample
import numpy as np
import pandas as pd

possible_labels = ["care", "harm", "fairness", "cheating", "loyalty", "betrayal",
                   "authority", "subversion", "purity", "degradation"]



n_bootstrap_iters = 1000  # Number of bootstrap iterations
bootstrap_results = {label: {metric: [] for metric in ["F1 (Binary)",  "F1 (Macro)", "F1 (Weighted)",
                                                       "Precision (Binary)", "Precision (Macro)", "Precision (Weighted)",
                                                       "Recall (Binary)", "Recall (Macro)", "Recall (Weighted)", "Accuracy"]} for label in possible_labels}

# Bootstrap loop
for _ in range(n_bootstrap_iters):
    for lab in possible_labels:
        # resampling with replacement
        sample_indices = resample(np.arange(len(results_df)), replace=True)
        true = results_df.loc[sample_indices, lab].values
        candidate = results_df.loc[sample_indices, f"{lab}_pred"].values
        
        # computing metrics for bootstrap sample
        bootstrap_results[lab]["F1 (Binary)"].append(f1_score(true, candidate, average="binary", zero_division=0))
        bootstrap_results[lab]["F1 (Macro)"].append(f1_score(true, candidate, average="macro", zero_division=0))
        bootstrap_results[lab]["F1 (Weighted)"].append(f1_score(true, candidate, average="weighted", zero_division=0))
        bootstrap_results[lab]["Precision (Binary)"].append(precision_score(true, candidate, average="binary", zero_division=0))
        bootstrap_results[lab]["Precision (Macro)"].append(precision_score(true, candidate, average="macro", zero_division=0))
        bootstrap_results[lab]["Precision (Weighted)"].append(precision_score(true, candidate, average="weighted", zero_division=0))
        bootstrap_results[lab]["Recall (Binary)"].append(recall_score(true, candidate, average="binary", zero_division=0))
        bootstrap_results[lab]["Recall (Macro)"].append(recall_score(true, candidate, average="macro", zero_division=0))
        bootstrap_results[lab]["Recall (Weighted)"].append(recall_score(true, candidate, average="weighted", zero_division=0))
        bootstrap_results[lab]["Accuracy"].append(accuracy_score(true, candidate))

# standard deviations calculations from bootstrap results
std_devs = {label: {metric: np.std(values) for metric, values in metrics.items()} for label, metrics in bootstrap_results.items()}

# original metrics calculations with standard deviations
final_results = []
for lab in possible_labels:
    result = {"Moral Value": lab}
    true = results_df[lab].values
    candidate = results_df[f"{lab}_pred"].values
    
    # Original metrics
    result["F1 Score (Binary)"] = f"{f1_score(true, candidate, average='binary', zero_division=0):.2f} ± {std_devs[lab]['F1 (Binary)']:.2f}"
    result["F1 Score (Macro)"] = f"{f1_score(true, candidate, average='macro', zero_division=0):.2f} ± {std_devs[lab]['F1 (Macro)']:.2f}"  
    result["F1 Score (Weighted)"] = f"{f1_score(true, candidate, average='weighted', zero_division=0):.2f} ± {std_devs[lab]['F1 (Weighted)']:.2f}"
    
    result["Precision Score (Binary)"] = f"{precision_score(true, candidate, average='binary', zero_division=0):.2f} ± {std_devs[lab]['Precision (Binary)']:.2f}"
    result["Precision Score (Macro)"] = f"{precision_score(true, candidate, average='macro', zero_division=0):.2f} ± {std_devs[lab]['Precision (Macro)']:.2f}"    
    result["Precision Score (Weighted)"] = f"{precision_score(true, candidate, average='weighted', zero_division=0):.2f} ± {std_devs[lab]['Precision (Weighted)']:.2f}"
    result["Recall Score (Binary)"] = f"{recall_score(true, candidate, average='binary', zero_division=0):.2f} ± {std_devs[lab]['Recall (Binary)']:.2f}"
    result["Recall Score (Macro)"] = f"{recall_score(true, candidate, average='macro', zero_division=0):.2f} ± {std_devs[lab]['Recall (Macro)']:.2f}"
    result["Recall Score (Weighted)"] = f"{recall_score(true, candidate, average='weighted', zero_division=0):.2f} ± {std_devs[lab]['Recall (Weighted)']:.2f}"
    result["Accuracy"] = f"{accuracy_score(true, candidate):.2f} ± {std_devs[lab]['Accuracy']:.2f}"
    
    final_results.append(result)

results_df = pd.DataFrame(final_results)

In [65]:
results_df

Unnamed: 0,Moral Value,F1 Score (Binary),F1 Score (Macro),F1 Score (Weighted),Precision Score (Binary),Precision Score (Macro),Precision Score (Weighted),Recall Score (Binary),Recall Score (Macro),Recall Score (Weighted),Accuracy
0,care,0.14 ± 0.02,0.55 ± 0.01,0.89 ± 0.00,0.47 ± 0.05,0.69 ± 0.03,0.88 ± 0.01,0.08 ± 0.01,0.54 ± 0.01,0.92 ± 0.00,0.92 ± 0.00
1,harm,0.07 ± 0.01,0.51 ± 0.01,0.87 ± 0.01,0.39 ± 0.06,0.65 ± 0.03,0.86 ± 0.01,0.04 ± 0.01,0.52 ± 0.00,0.90 ± 0.00,0.90 ± 0.00
2,fairness,0.35 ± 0.03,0.66 ± 0.01,0.93 ± 0.00,0.82 ± 0.04,0.89 ± 0.02,0.94 ± 0.00,0.22 ± 0.02,0.61 ± 0.01,0.95 ± 0.00,0.95 ± 0.00
3,cheating,0.15 ± 0.02,0.55 ± 0.01,0.89 ± 0.00,0.61 ± 0.05,0.77 ± 0.03,0.90 ± 0.01,0.08 ± 0.01,0.54 ± 0.01,0.92 ± 0.00,0.92 ± 0.00
4,loyalty,0.28 ± 0.03,0.63 ± 0.02,0.96 ± 0.00,0.88 ± 0.05,0.92 ± 0.02,0.97 ± 0.00,0.16 ± 0.02,0.58 ± 0.01,0.97 ± 0.00,0.97 ± 0.00
5,betrayal,0.13 ± 0.02,0.55 ± 0.01,0.93 ± 0.00,0.46 ± 0.07,0.71 ± 0.03,0.93 ± 0.00,0.08 ± 0.02,0.54 ± 0.01,0.95 ± 0.00,0.95 ± 0.00
6,authority,0.22 ± 0.03,0.60 ± 0.02,0.96 ± 0.00,0.67 ± 0.07,0.82 ± 0.04,0.96 ± 0.00,0.13 ± 0.02,0.56 ± 0.01,0.97 ± 0.00,0.97 ± 0.00
7,subversion,0.10 ± 0.03,0.54 ± 0.01,0.96 ± 0.00,0.77 ± 0.12,0.87 ± 0.06,0.97 ± 0.00,0.05 ± 0.02,0.53 ± 0.01,0.97 ± 0.00,0.97 ± 0.00
8,purity,0.06 ± 0.03,0.53 ± 0.01,0.97 ± 0.00,0.56 ± 0.18,0.77 ± 0.09,0.97 ± 0.00,0.03 ± 0.01,0.52 ± 0.01,0.98 ± 0.00,0.98 ± 0.00
9,degradation,0.12 ± 0.03,0.55 ± 0.01,0.95 ± 0.00,0.25 ± 0.05,0.61 ± 0.03,0.95 ± 0.00,0.08 ± 0.02,0.54 ± 0.01,0.96 ± 0.00,0.96 ± 0.00


In [66]:
results_df.to_csv("save_results.csv")