In [1]:
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import GroupShuffleSplit, train_test_split
pd.set_option('display.max_colwidth', None)

In [2]:
moral_aspects_df = pd.read_csv('/workspace/ceph_data/moral-based-argumentation/moral-frames/top_aspects.csv')
moral_aspects_df = moral_aspects_df[moral_aspects_df.moral != '-']

In [3]:
moral_aspects_df.head()

Unnamed: 0,aspect,moral
4,protect,care
5,control,authority
6,safe,care
8,justice,authority
9,safety,care


In [4]:
moral_aspects_dict = pd.Series(moral_aspects_df.moral.values,index=moral_aspects_df.aspect).to_dict()

In [5]:
df = pd.read_csv('/workspace/ceph_data/moral-based-argumentation/moral-frames/argsme_moral_aspects_with_morals_top_500_aspects.csv')
df['moral_frame'] = df.aspects.apply(lambda x: moral_aspects_dict[x] if x in moral_aspects_dict else 'None')
df = df[df.moral_frame != 'None']
df = df[['moral_frame', 'premise','aspects']]
df = df.rename(columns={'moral_frame': 'label', 'premise': 'full_sentence'})

In [6]:
df.head()

Unnamed: 0,label,full_sentence,aspects
0,authority,Liszt was one of the first musicians to believe that a genius had the obligation to give back to nature what nature has given to him.,obligation
1,authority,"Also I've said that Liszt believed that a genius had the obligation to give back to nature what nature has given to him, and that was not very accurate.",obligation
2,care,"Things black people in our society are oppressed, taunted, and even killed over.",oppressed
3,care,"But hate crimes are not their own specific category of crime, they're more of an identification to keep minorities and the people who could be attacked safe.",safe
5,care,"For that information that is protected via copyright, the property owner of that information has every right not to have their intellectual property stolen from them via downloading.",protected


In [7]:
df.label.value_counts()

care         3391
authority    2305
fairness     1680
loyalty      1371
purity        765
Name: label, dtype: int64

In [8]:
df.to_csv('/workspace/ceph_data/moral-based-argumentation/moral-frames/argsme_moral_aspects_with_morals_top_500_aspects_processed.csv')

In [9]:
gss = GroupShuffleSplit(n_splits=1, train_size=.9, random_state=42)

for train_idx, test_idx in gss.split(df['full_sentence'], df['label'], df['aspects']):
    df_train1 = df.iloc[train_idx]
    df_val1 = df.iloc[test_idx]

In [10]:
df_train1['masked_full_sentence'] = df_train1.apply(lambda row: row['full_sentence'].replace(row['aspects'], 'UNK'), axis=1)
df_val1['masked_full_sentence'] = df_val1.apply(lambda row: row['full_sentence'].replace(row['aspects'], 'UNK'), axis=1)

df_train1[['label', 'full_sentence']].to_csv('/workspace/ceph_data/moral-based-argumentation/bert-emfd-moral-frames/train_aspects_without_emfd.csv', index=False)
df_val1[['label', 'full_sentence']].to_csv('/workspace/ceph_data/moral-based-argumentation/bert-emfd-moral-frames/val_aspects_without_emfd.csv', index=False)

df_train1[['label', 'masked_full_sentence']].rename(columns={'masked_full_sentence': 'full_sentence'}).to_csv('/workspace/ceph_data/moral-based-argumentation/bert-emfd-moral-frames/masked_train_aspects_without_emfd.csv', index=False)
df_val1[['label', 'masked_full_sentence']].rename(columns={'masked_full_sentence': 'full_sentence'}).to_csv('/workspace/ceph_data/moral-based-argumentation/bert-emfd-moral-frames/masked_val_aspects_without_emfd.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


---------

### Data from Moral-Concepts:

In [17]:
moral_concepts_df = pd.read_csv('/workspace/ceph_data/moral-based-argumentation/moral-frames/argsme_moral_aspects_with_morals_based_on_concepts_balanced.csv')

In [18]:
len(moral_concepts_df)

15655

In [19]:
moral_concepts_df.aspects.nunique()

688

In [20]:
moral_concepts_df = moral_concepts_df.rename(columns={'moral_frame': 'label', 'premise': 'full_sentence'})

In [21]:
moral_concepts_df.label.value_counts()

authority    3131
fairness     3131
purity       3131
care         3131
loyalty      3131
Name: label, dtype: int64

In [22]:
training_aspects, test_aspects = train_test_split(moral_concepts_df.aspects.unique().tolist(), test_size=0.1)

moral_concepts_df_train = moral_concepts_df[moral_concepts_df.aspects.isin(training_aspects)]
moral_concepts_df_test = moral_concepts_df[moral_concepts_df.aspects.isin(test_aspects)]

In [23]:
gss = GroupShuffleSplit(n_splits=1, train_size=.9, random_state=42)

for train_idx, test_idx in gss.split(moral_concepts_df['full_sentence'], moral_concepts_df['label'], moral_concepts_df['aspects']):
    moral_concepts_df_train = moral_concepts_df.iloc[train_idx]
    moral_concepts_df_test = moral_concepts_df.iloc[test_idx]

In [24]:
moral_concepts_df_train.label.value_counts()

fairness     3040
loyalty      3034
care         2871
authority    2801
purity       2651
Name: label, dtype: int64

In [25]:
moral_concepts_df_test.label.value_counts()

purity       480
authority    330
care         260
loyalty       97
fairness      91
Name: label, dtype: int64

In [26]:
moral_concepts_df_train[['label', 'full_sentence']].to_csv('/workspace/ceph_data/moral-based-argumentation/bert-emfd-moral-frames/train_moral_concetps_balanced.csv', index=False)
moral_concepts_df_test[['label', 'full_sentence']].to_csv('/workspace/ceph_data/moral-based-argumentation/bert-emfd-moral-frames/val_moral_concept_balanced.csv', index=False)

----------------

### Data from Reddit:

In [12]:
train_df = pd.read_pickle('/workspace/ceph_data/data/aspect-controlled-argument-generation/reddit_data_with_morals_train.pkl')
test_df  = pd.read_pickle('/workspace/ceph_data/data/aspect-controlled-argument-generation/reddit_data_with_morals_test.pkl')

In [13]:
test_df.head()

Unnamed: 0,id,doc_sent_id,stance,sent,doc_id,doc_metadata_id,doc_url,doc_score,index,aspect_string,aspect_pos,moral_frame,topic
2,122467,0,Argument_against,Seems a little silly when you could probably just order hundreds of them or clones thereof from China .,soePDWwBJVi6mb4Y9_TP,d9ttl52,https://www.reddit.com//comments/5c2f6d//d9ttl52,8.48156,redditcomments-en,"[order, clones]","[[9, 9], [14, 14]]",authority,cloning
14,303294,3,Argument_for,"No trading of hacked , cloned , or illegal Pokémon .",unHZI2wBJVi6mb4Y8TKS,dip0cgd,https://www.reddit.com//comments/6gboxd//dip0cgd,12.383832,redditcomments-en,"[trading, hacked, illegal Pokémon]","[[1, 1], [3, 3], [8, 9]]",authority,cloning
16,207617,3,Argument_for,"No trading of hacked , cloned , or illegal Pokémon .",Of3cHGwBJVi6mb4YTzOG,eanbvtt,https://www.reddit.com/r/pokemontrades/comments/a17i2o/ft_x1_hl_na_x1_na_zeraora_and_x2_gk_na_codes_lf/eanbvtt/,12.383832,redditcomments-en,"[trading, hacked, illegal Pokémon]","[[1, 1], [3, 3], [8, 9]]",authority,cloning
21,255364,1,Argument_against,"Codex-Clone/Bomb/Teleport prevention and if you flashbang the controller , it instantly breaks mind control .",eI73TGwBJVi6mb4Y85F5,d695uo3,https://www.reddit.com//comments/4wp5wa//d695uo3,16.523973,redditcomments-en,"[breaks, mind control]","[[11, 11], [12, 13]]",authority,cloning
37,234685,2,Argument_for,"No trading of hacked , cloned , or illegal Pokémon .",Y5jg92sBJVi6mb4YFQzG,d9713v5,https://www.reddit.com//comments/59b8yk//d9713v5,13.763745,redditcomments-en,"[trading, hacked, illegal Pokémon]","[[1, 1], [3, 3], [8, 9]]",authority,cloning


In [None]:
for topic in train_df.topic.unique():
    x = train_df[train_df.topic==topic].moral_frame.value_counts().to_dict()
    total = sum(x.values())
    print(topic)
    r = [(k,round(v/total,2)) for k,v in x.items()]
    
    print('& {}\% & {}\% & {}\% & {}\% & {}\%'.format(*[v[0] for v in sorted(r, key=lambda x: x[0])]))
    print('& {}\% & {}\% & {}\% & {}\% & {}\%'.format(*[int(v[1]*100) for v in sorted(r, key=lambda x: x[0])]))

marijuana_legalization
& authority\% & care\% & fairness\% & loyalty\% & purity\%
& 54\% & 14\% & 13\% & 9\% & 10\%
gun_control
& authority\% & care\% & fairness\% & loyalty\% & purity\%
& 25\% & 31\% & 26\% & 13\% & 5\%
abortion
& authority\% & care\% & fairness\% & loyalty\% & purity\%
& 21\% & 19\% & 28\% & 14\% & 17\%
death_penalty
& authority\% & care\% & fairness\% & loyalty\% & purity\%
& 7\% & 13\% & 22\% & 21\% & 36\%
minimum_wage
& authority\% & care\% & fairness\% & loyalty\% & purity\%
& 8\% & 16\% & 23\% & 34\% & 19\%
nuclear_energy
& authority\% & care\% & fairness\% & loyalty\% & purity\%
& 2\% & 32\% & 9\% & 20\% & 37\%


In [None]:
for topic in test_df.topic.unique():
    x = test_df[test_df.topic==topic].moral_frame.value_counts().to_dict()
    total = sum(x.values())
    print(topic)
    r = [(k,round(v/total,2)) for k,v in x.items()]
    
    print('& {}\% & {}\% & {}\% & {}\% & {}\%'.format(*[v[0] for v in sorted(r, key=lambda x: x[0])]))
    print('& {}\% & {}\% & {}\% & {}\% & {}\%'.format(*[int(v[1]*100) for v in sorted(r, key=lambda x: x[0])]))

cloning
& authority\% & care\% & fairness\% & loyalty\% & purity\%
& 25\% & 20\% & 13\% & 24\% & 17\%
school_uniforms
& authority\% & care\% & fairness\% & loyalty\% & purity\%
& 8\% & 10\% & 16\% & 38\% & 28\%


In [14]:
train_df = train_df[['moral_frame', 'sent']]
train_df = train_df.rename(columns={'moral_frame':'label', 'sent': 'full_sentence'})

In [15]:
test_df = test_df[['moral_frame', 'sent']]
test_df = test_df.rename(columns={'moral_frame':'label', 'sent': 'full_sentence'})

In [16]:
import random

In [17]:
len(train_df)

199632

In [None]:
for i in range(0, 5):
    tmp_train_df = train_df.sample(50000, random_state=random.randint(1, 100000))
    tmp_train_df.to_csv('/workspace/ceph_data/moral-based-argumentation/bert-emfd-moral-frames/reddit_data/train_data_50k_{}.csv'.format(i), index=False)

In [18]:
for i in range(0, 5):
    tmp_train_df = train_df.sample(100000, random_state=random.randint(1, 100000))
    tmp_train_df.to_csv('/workspace/ceph_data/moral-based-argumentation/bert-emfd-moral-frames/reddit_data/train_data_100k_{}.csv'.format(i), index=False)

In [None]:
len(train_df)

199632

In [None]:
train_df = train_df.sample(50000)

In [27]:
train_df.label.value_counts()

fairness     41818
care         41393
authority    40952
purity       39137
loyalty      36332
Name: label, dtype: int64

In [28]:
train_df.to_csv('/workspace/ceph_data/moral-based-argumentation/bert-emfd-moral-frames/reddit_data/train_data_199k.csv', index=False)

In [8]:
test_df = test_df[['moral_frame', 'sent']]
test_df = test_df.rename(columns={'moral_frame':'label', 'sent': 'full_sentence'})

In [9]:
test_df.sample(1000).to_csv('/workspace/ceph_data/moral-based-argumentation/bert-emfd-moral-frames/reddit_data/test_data.csv', index=False)

### Create a sample to manually evaluate:

In [33]:
moral_df = train_df.sample(100).copy()
moral_df = moral_df.rename(columns={'label':'moral', 'full_sentence': 'claim'})
moral_df.to_csv('../../data/sample-of-automatic-moral-tagging.csv')

In [68]:
timon_results = pd.read_csv('../../data/manual-evaluations/sample-of-automatic-moral-tagging-timon.csv')
milad_results = pd.read_csv('../../data/manual-evaluations/sample-of-automatic-moral-tagging-milad.csv')

In [69]:
timon_results = timon_results.fillna('No')
timon_results['correct moral'] = timon_results['correct moral'].apply(lambda x: 'Yes' if x=='x' or x=='X' else 'No')

In [70]:
timon_results.head()

Unnamed: 0.1,Unnamed: 0,moral,claim,correct moral
0,7872,authority,"Gun control , in the US does not work .",Yes
1,207273,purity,Which is why people working in the Maquiladoras make 1/6 US minimum wage and it 's a decent wage down there .,No
2,95624,fairness,"Solar is absolutely , POSITIVELY a constant source of energy .",No
3,13776,authority,"The laws against cannabis are bad for this country , not least because they inspire contempt for law enforcement and for the government in general .",Yes
4,114083,fairness,Geo thermal and nuclear hold our key at any chance of moving beyond earth at any reasonable scale .,No


In [71]:
milad_results.head()

Unnamed: 0.1,Unnamed: 0,moral,claim,correct moral
0,7872,authority,"Gun control , in the US does not work .",Yes
1,207273,purity,Which is why people working in the Maquiladoras make 1/6 US minimum wage and it 's a decent wage down there .,Yes
2,95624,fairness,"Solar is absolutely , POSITIVELY a constant source of energy .",No
3,13776,authority,"The laws against cannabis are bad for this country , not least because they inspire contempt for law enforcement and for the government in general .",Yes
4,114083,fairness,Geo thermal and nuclear hold our key at any chance of moving beyond earth at any reasonable scale .,Yes


In [72]:
milad_results['correct moral'].value_counts()

Yes    73
No     27
Name: correct moral, dtype: int64

In [73]:
timon_results['correct moral'].value_counts()

No     54
Yes    46
Name: correct moral, dtype: int64

In [74]:
milad_results['timon_scores'] = timon_results['correct moral'].tolist()

In [75]:
milad_results['and_scores'] = milad_results.apply(lambda row: 1 if row['timon_scores'] == 'Yes' and row['correct moral'] == 'Yes' else 0, axis=1)
milad_results['or_scores'] = milad_results.apply(lambda row: 1 if row['timon_scores'] == 'Yes' or row['correct moral'] == 'Yes' else 0, axis=1)

In [78]:
milad_results['and_scores'].value_counts()

0    58
1    42
Name: and_scores, dtype: int64

In [77]:
milad_results['or_scores'].value_counts()

1    77
0    23
Name: or_scores, dtype: int64

In [57]:
from sklearn.metrics import cohen_kappa_score

In [58]:
cohen_kappa_score(timon_results['correct moral'].tolist(), milad_results['correct moral'].tolist())

0.32484567901234573

---------

### Training:

In [24]:
!export CUDA_VISIBLE_DEVICES=3
!python3 /workspace/ceph_data/moral-based-argumentation/repos/transformers/examples/text-classification/run_mfd.py \
  --model_name_or_path bert-base-cased \
  --do_train \
  --do_eval \
  --max_seq_length 128 \
  --per_device_train_batch_size 16 \
  --learning_rate 3e-5 \
  --num_train_epochs 3 \
  --overwrite_output_dir \
  --output_dir /workspace/ceph_data/moral-based-argumentation/bert-emfd-moral-frames/aspects_model \
  --train_file /workspace/ceph_data/moral-based-argumentation/bert-emfd-moral-frames/train_aspects.csv \
  --validation_file /workspace/ceph_data/moral-based-argumentation/bert-emfd-moral-frames/val_aspects.csv
  #--test_file /workspace/ceph_data/moral-based-argumentation/bert-emfd/test.csv
  #--max_steps 5 \

Traceback (most recent call last):
  File "/workspace/ceph_data/moral-based-argumentation/repos/transformers/examples/text-classification/run_mfd.py", line 26, in <module>
    from transformers.trainer_utils import get_last_checkpoint, is_main_process
ImportError: cannot import name 'get_last_checkpoint'


In [None]:
export CUDA_VISIBLE_DEVICES=7
python /workspace/ceph_data/moral-based-argumentation/repos/transformers/examples/text-classification/run_mfd.py \
  --model_name_or_path /workspace/ceph_data/moral-based-argumentation/bert-emfd-moral-frames/aspects_model \
  --do_eval \
  --max_seq_length 128 \
  --per_device_train_batch_size 32 \
  --learning_rate 3e-5 \
  --num_train_epochs 3 \
  --overwrite_output_dir \
  --output_dir /workspace/ceph_data/moral-based-argumentation/bert-emfd-moral-frames/tmp \
  --train_file /workspace/ceph_data/moral-based-argumentation/dagstuhl_morality/test_data.csv \
  --validation_file /workspace/ceph_data/moral-based-argumentation/dagstuhl_morality/test_data.csv

In [None]:
export CUDA_VISIBLE_DEVICES=3
python3 /workspace/ceph_data/moral-based-argumentation/repos/transformers/examples/text-classification/run_mfd.py \
  --model_name_or_path bert-base-cased \
  --do_train \
  --do_eval \
  --max_seq_length 128 \
  --per_device_train_batch_size 16 \
  --learning_rate 3e-5 \
  --num_train_epochs 3 \
  --overwrite_output_dir \
  --output_dir /workspace/ceph_data/moral-based-argumentation/bert-emfd-moral-frames/reddit_model_0 \
  --train_file /workspace/ceph_data/moral-based-argumentation/bert-emfd-moral-frames/reddit_data/train_data_50k_0.csv \
  --validation_file /workspace/ceph_data/moral-based-argumentation/bert-emfd-moral-frames/reddit_data/test_data_1k_0.csv
  #--test_file /workspace/ceph_data/moral-based-argumentation/bert-emfd/test.csv
  #--max_steps 5 \

In [None]:
export CUDA_VISIBLE_DEVICES=4
python3 /workspace/ceph_data/moral-based-argumentation/repos/transformers/examples/text-classification/run_mfd.py \
  --model_name_or_path bert-base-cased \
  --do_train \
  --do_eval \
  --max_seq_length 128 \
  --per_device_train_batch_size 16 \
  --learning_rate 3e-5 \
  --num_train_epochs 3 \
  --overwrite_output_dir \
  --output_dir /workspace/ceph_data/moral-based-argumentation/bert-emfd-moral-frames/reddit_model_1 \
  --train_file /workspace/ceph_data/moral-based-argumentation/bert-emfd-moral-frames/reddit_data/train_data_50k_1.csv \
  --validation_file /workspace/ceph_data/moral-based-argumentation/bert-emfd-moral-frames/reddit_data/test_data_1k_1.csv
  #--test_file /workspace/ceph_data/moral-based-argumentation/bert-emfd/test.csv
  #--max_steps 5 \

In [None]:
export CUDA_VISIBLE_DEVICES=5
python3 /workspace/ceph_data/moral-based-argumentation/repos/transformers/examples/text-classification/run_mfd.py \
  --model_name_or_path bert-base-cased \
  --do_train \
  --do_eval \
  --max_seq_length 128 \
  --per_device_train_batch_size 16 \
  --learning_rate 3e-5 \
  --num_train_epochs 3 \
  --overwrite_output_dir \
  --output_dir /workspace/ceph_data/moral-based-argumentation/bert-emfd-moral-frames/reddit_model_2 \
  --train_file /workspace/ceph_data/moral-based-argumentation/bert-emfd-moral-frames/reddit_data/train_data_50k_2.csv \
  --validation_file /workspace/ceph_data/moral-based-argumentation/bert-emfd-moral-frames/reddit_data/test_data_1k_2.csv
  #--test_file /workspace/ceph_data/moral-based-argumentation/bert-emfd/test.csv
  #--max_steps 5 \

In [None]:
export CUDA_VISIBLE_DEVICES=6
python3 /workspace/ceph_data/moral-based-argumentation/repos/transformers/examples/text-classification/run_mfd.py \
  --model_name_or_path bert-base-cased \
  --do_train \
  --do_eval \
  --max_seq_length 128 \
  --per_device_train_batch_size 16 \
  --learning_rate 3e-5 \
  --num_train_epochs 3 \
  --overwrite_output_dir \
  --output_dir /workspace/ceph_data/moral-based-argumentation/bert-emfd-moral-frames/reddit_model_3 \
  --train_file /workspace/ceph_data/moral-based-argumentation/bert-emfd-moral-frames/reddit_data/train_data_50k_3.csv \
  --validation_file /workspace/ceph_data/moral-based-argumentation/bert-emfd-moral-frames/reddit_data/test_data_1k_3.csv
  #--test_file /workspace/ceph_data/moral-based-argumentation/bert-emfd/test.csv
  #--max_steps 5 \

In [None]:
export CUDA_VISIBLE_DEVICES=0
python3 /workspace/ceph_data/moral-based-argumentation/repos/transformers/examples/text-classification/run_mfd.py \
  --model_name_or_path bert-base-cased \
  --do_train \
  --do_eval \
  --max_seq_length 128 \
  --per_device_train_batch_size 16 \
  --learning_rate 3e-5 \
  --num_train_epochs 3 \
  --overwrite_output_dir \
  --output_dir /workspace/ceph_data/moral-based-argumentation/bert-emfd-moral-frames/reddit_model_4 \
  --train_file /workspace/ceph_data/moral-based-argumentation/bert-emfd-moral-frames/reddit_data/train_data_50k_4.csv \
  --validation_file /workspace/ceph_data/moral-based-argumentation/bert-emfd-moral-frames/reddit_data/test_data_1k_4.csv
  #--test_file /workspace/ceph_data/moral-based-argumentation/bert-emfd/test.csv
  #--max_steps 5 \