In [1]:
import random

import pandas as pd
from datasets import Dataset
from sklearn.model_selection import GroupShuffleSplit, train_test_split
# Show full cell contents (no truncation) when DataFrames are displayed.
pd.options.display.max_colwidth = None

In [2]:
# Root of the data directory, given relative to this notebook's location.
# The value ends with '/', and the cells below append sub-paths that start
# with '/', so the joined paths contain a double slash.
data_path = '../../../moral-debater-data/'

### Data from Reddit:

In [3]:
# Load the train and test splits of the Reddit data (in that order).
# NOTE: read_pickle executes arbitrary code on load; only use trusted files.
train_df, test_df = (
    pd.read_pickle(data_path + pickle_name)
    for pickle_name in (
        '/aspect-controlled-argument-generation/reddit_data_with_morals_train.pkl',
        '/aspect-controlled-argument-generation/reddit_data_with_morals_test.pkl',
    )
)

In [4]:
# Number of test sentences per topic.
test_df['topic'].value_counts()

cloning            16520
school_uniforms    15823
Name: topic, dtype: int64

In [22]:
# Peek at the first five rows of the test split.
test_df.head(n=5)

Unnamed: 0,id,doc_sent_id,stance,sent,doc_id,doc_metadata_id,doc_url,doc_score,index,aspect_string,aspect_pos,moral_frame,topic
2,122467,0,Argument_against,Seems a little silly when you could probably just order hundreds of them or clones thereof from China .,soePDWwBJVi6mb4Y9_TP,d9ttl52,https://www.reddit.com//comments/5c2f6d//d9ttl52,8.48156,redditcomments-en,"[order, clones]","[[9, 9], [14, 14]]",authority,cloning
14,303294,3,Argument_for,"No trading of hacked , cloned , or illegal Pokémon .",unHZI2wBJVi6mb4Y8TKS,dip0cgd,https://www.reddit.com//comments/6gboxd//dip0cgd,12.383832,redditcomments-en,"[trading, hacked, illegal Pokémon]","[[1, 1], [3, 3], [8, 9]]",authority,cloning
16,207617,3,Argument_for,"No trading of hacked , cloned , or illegal Pokémon .",Of3cHGwBJVi6mb4YTzOG,eanbvtt,https://www.reddit.com/r/pokemontrades/comments/a17i2o/ft_x1_hl_na_x1_na_zeraora_and_x2_gk_na_codes_lf/eanbvtt/,12.383832,redditcomments-en,"[trading, hacked, illegal Pokémon]","[[1, 1], [3, 3], [8, 9]]",authority,cloning
21,255364,1,Argument_against,"Codex-Clone/Bomb/Teleport prevention and if you flashbang the controller , it instantly breaks mind control .",eI73TGwBJVi6mb4Y85F5,d695uo3,https://www.reddit.com//comments/4wp5wa//d695uo3,16.523973,redditcomments-en,"[breaks, mind control]","[[11, 11], [12, 13]]",authority,cloning
37,234685,2,Argument_for,"No trading of hacked , cloned , or illegal Pokémon .",Y5jg92sBJVi6mb4YFQzG,d9713v5,https://www.reddit.com//comments/59b8yk//d9713v5,13.763745,redditcomments-en,"[trading, hacked, illegal Pokémon]","[[1, 1], [3, 3], [8, 9]]",authority,cloning


##### Printing stats on the data:

In [17]:
# Per-topic distribution of moral frames in the training split, printed as two
# LaTeX table rows per topic: the frame names, then their share in percent.
for topic in train_df.topic.unique():
    frame_counts = train_df[train_df.topic == topic].moral_frame.value_counts().to_dict()
    total = sum(frame_counts.values())
    # Alphabetical order keeps the columns aligned across topics.
    frames = sorted(frame_counts)
    # Round the percentage itself. The previous int(round(p, 2) * 100) truncated
    # values such as 0.29 * 100 == 28.999999999999996 down to 28.
    percentages = [round(100 * frame_counts[frame] / total) for frame in frames]

    print(topic)
    # Rows are built with join so a topic with fewer (or more) than five frames
    # no longer breaks the formatting; raw strings keep LaTeX's literal "\%".
    print('& ' + ' & '.join(r'{}\%'.format(frame) for frame in frames))
    print('& ' + ' & '.join(r'{}\%'.format(pct) for pct in percentages))

marijuana_legalization
& authority\% & care\% & fairness\% & loyalty\% & purity\%
& 54\% & 14\% & 13\% & 9\% & 10\%
gun_control
& authority\% & care\% & fairness\% & loyalty\% & purity\%
& 25\% & 31\% & 26\% & 13\% & 5\%
abortion
& authority\% & care\% & fairness\% & loyalty\% & purity\%
& 21\% & 19\% & 28\% & 14\% & 17\%
death_penalty
& authority\% & care\% & fairness\% & loyalty\% & purity\%
& 7\% & 13\% & 22\% & 21\% & 36\%
minimum_wage
& authority\% & care\% & fairness\% & loyalty\% & purity\%
& 8\% & 16\% & 23\% & 34\% & 19\%
nuclear_energy
& authority\% & care\% & fairness\% & loyalty\% & purity\%
& 2\% & 32\% & 9\% & 20\% & 37\%


In [18]:
# Per-topic distribution of moral frames in the test split, printed as two
# LaTeX table rows per topic: the frame names, then their share in percent.
for topic in test_df.topic.unique():
    frame_counts = test_df[test_df.topic == topic].moral_frame.value_counts().to_dict()
    total = sum(frame_counts.values())
    # Alphabetical order keeps the columns aligned across topics.
    frames = sorted(frame_counts)
    # Round the percentage itself. The previous int(round(p, 2) * 100) truncated
    # values such as 0.29 * 100 == 28.999999999999996 down to 28.
    percentages = [round(100 * frame_counts[frame] / total) for frame in frames]

    print(topic)
    # Rows are built with join so a topic with fewer (or more) than five frames
    # no longer breaks the formatting; raw strings keep LaTeX's literal "\%".
    print('& ' + ' & '.join(r'{}\%'.format(frame) for frame in frames))
    print('& ' + ' & '.join(r'{}\%'.format(pct) for pct in percentages))

cloning
& authority\% & care\% & fairness\% & loyalty\% & purity\%
& 25\% & 20\% & 13\% & 24\% & 17\%
school_uniforms
& authority\% & care\% & fairness\% & loyalty\% & purity\%
& 8\% & 10\% & 16\% & 38\% & 28\%


### Prepare data to train BERT:

Use only the text and moral columns, and create 5 random samples of 50k instances each (plus 5 samples of 100k instances).

In [14]:
# Keep only the moral label and the sentence text, renamed to
# 'label' and 'full_sentence'.
train_df = (
    train_df
    .loc[:, ['moral_frame', 'sent']]
    .rename(columns={'moral_frame': 'label', 'sent': 'full_sentence'})
)

In [15]:
# Keep only the moral label and the sentence text, renamed to
# 'label' and 'full_sentence'.
test_df = (
    test_df
    .loc[:, ['moral_frame', 'sent']]
    .rename(columns={'moral_frame': 'label', 'sent': 'full_sentence'})
)

In [None]:
# Write five random 50k-row samples of the training data, one CSV each.
# The per-sample random_state values are drawn from a seeded generator, so a
# fresh Restart & Run All regenerates exactly the same files (the unseeded
# random.randint used before produced different samples on every run).
# NOTE(review): data_path already ends in 'moral-debater-data/', so this writes
# into '<data_path>/moral-debater-data/reddit_data/', while the training command
# reads from '<path_to_data>/reddit_data/' -- confirm the nested directory is intended.
sample_size = 50000
seed_source = random.Random(sample_size)
for i in range(0, 5):
    tmp_train_df = train_df.sample(sample_size, random_state=seed_source.randint(1, 100000))
    tmp_train_df.to_csv(data_path + '/moral-debater-data/reddit_data/train_data_50k_{}.csv'.format(i), index=False)

In [18]:
# Write five random 100k-row samples of the training data, one CSV each.
# The per-sample random_state values are drawn from a seeded generator, so a
# fresh Restart & Run All regenerates exactly the same files (the unseeded
# random.randint used before produced different samples on every run).
# NOTE(review): data_path already ends in 'moral-debater-data/', so this writes
# into '<data_path>/moral-debater-data/reddit_data/', while the training command
# reads from '<path_to_data>/reddit_data/' -- confirm the nested directory is intended.
sample_size = 100000
seed_source = random.Random(sample_size)
for i in range(0, 5):
    tmp_train_df = train_df.sample(sample_size, random_state=seed_source.randint(1, 100000))
    tmp_train_df.to_csv(data_path + '/moral-debater-data/reddit_data/train_data_100k_{}.csv'.format(i), index=False)

### Create a sample to manually evaluate:

In [33]:
# Draw 100 training sentences for a manual check of the automatic moral tags.
# A fixed random_state keeps the sample stable: without it, re-running this
# cell overwrote the CSV with a different set of sentences.
# sample() and rename() both return new frames, so no explicit copy is needed.
moral_df = (
    train_df
    .sample(100, random_state=42)
    .rename(columns={'label': 'moral', 'full_sentence': 'claim'})
)
# The index is written on purpose (no index=False): it keeps the original row ids.
moral_df.to_csv('../../data/sample-of-automatic-moral-tagging.csv')

In [68]:
# Load the two annotators' judgements of the sampled sentences.
ann1_results, ann2_results = map(
    pd.read_csv,
    (
        '../../data/manual-evaluations/sample-of-automatic-moral-tagging-ann1.csv',
        '../../data/manual-evaluations/sample-of-automatic-moral-tagging-ann2.csv',
    ),
)

In [69]:
# Normalise annotator 1's 'correct moral' column to 'Yes'/'No': an 'x'/'X' mark
# means the tag is correct, anything else (including empty cells) means it is not.
# 'Yes' is accepted as well so that re-running this cell on already-normalised
# data is a no-op instead of flipping every answer to 'No'.
ann1_results = ann1_results.fillna('No')
ann1_results['correct moral'] = ann1_results['correct moral'].apply(
    lambda mark: 'Yes' if mark in ('x', 'X', 'Yes') else 'No'
)

In [70]:
# First five judgements of annotator 1 after normalisation.
ann1_results.head(n=5)

Unnamed: 0.1,Unnamed: 0,moral,claim,correct moral
0,7872,authority,"Gun control , in the US does not work .",Yes
1,207273,purity,Which is why people working in the Maquiladoras make 1/6 US minimum wage and it 's a decent wage down there .,No
2,95624,fairness,"Solar is absolutely , POSITIVELY a constant source of energy .",No
3,13776,authority,"The laws against cannabis are bad for this country , not least because they inspire contempt for law enforcement and for the government in general .",Yes
4,114083,fairness,Geo thermal and nuclear hold our key at any chance of moving beyond earth at any reasonable scale .,No


In [71]:
# First five judgements of annotator 2.
ann2_results.head(n=5)

Unnamed: 0.1,Unnamed: 0,moral,claim,correct moral
0,7872,authority,"Gun control , in the US does not work .",Yes
1,207273,purity,Which is why people working in the Maquiladoras make 1/6 US minimum wage and it 's a decent wage down there .,Yes
2,95624,fairness,"Solar is absolutely , POSITIVELY a constant source of energy .",No
3,13776,authority,"The laws against cannabis are bad for this country , not least because they inspire contempt for law enforcement and for the government in general .",Yes
4,114083,fairness,Geo thermal and nuclear hold our key at any chance of moving beyond earth at any reasonable scale .,Yes


In [72]:
# How often annotator 2 judged the automatic tag correct.
ann2_results.loc[:, 'correct moral'].value_counts()

Yes    73
No     27
Name: correct moral, dtype: int64

In [73]:
# How often annotator 1 judged the automatic tag correct.
ann1_results.loc[:, 'correct moral'].value_counts()

No     54
Yes    46
Name: correct moral, dtype: int64

In [74]:
# Copy annotator 1's judgements next to annotator 2's. The values are attached
# by position (not by index label), so both files must list the rows in the same order.
ann2_results['ann1_scores'] = ann1_results['correct moral'].to_numpy()

In [75]:
# Combine both annotators' judgements per sentence:
#   and_scores = 1 when both say the tag is correct,
#   or_scores  = 1 when at least one says so.
# Both columns live in ann2_results ('ann1_scores' holds annotator 1's answers,
# 'correct moral' annotator 2's). The previous version applied the lambda to
# ann1_results, which has no 'ann1_scores' column and therefore raised a KeyError.
ann1_yes = ann2_results['ann1_scores'] == 'Yes'
ann2_yes = ann2_results['correct moral'] == 'Yes'
ann2_results['and_scores'] = (ann1_yes & ann2_yes).astype(int)
ann2_results['or_scores'] = (ann1_yes | ann2_yes).astype(int)

In [78]:
# Sentences that both annotators accepted (1) versus the rest (0).
ann2_results.loc[:, 'and_scores'].value_counts()

0    58
1    42
Name: and_scores, dtype: int64

In [77]:
# Sentences that at least one annotator accepted (1) versus none (0).
ann2_results.loc[:, 'or_scores'].value_counts()

1    77
0    23
Name: or_scores, dtype: int64

In [57]:
from sklearn.metrics import cohen_kappa_score

In [58]:
# Inter-annotator agreement (Cohen's kappa) on the 'correct moral' judgements.
# The frames are named ann1_results / ann2_results in this notebook; the
# timon_results / milad_results names used before are never defined here.
cohen_kappa_score(ann1_results['correct moral'].tolist(), ann2_results['correct moral'].tolist())

0.32484567901234573

---------

### Training:

Command used to train BERT

In [None]:
# Absolute, machine-specific location of the data directory.
# Presumably the value to substitute for <path_to_data> in the training
# command below -- TODO confirm; no Python cell in this notebook reads it.
path_to_data = '/home/sile2804/data-ceph/moral-debater-data'

In [None]:
# Shell command, to be run in a terminal (it is not valid Python).
# Replace the <path_to_transformers_library> and <path_to_data> placeholders
# before running.
# NOTE(review): --validation_file points at the same CSV as --train_file, so
# --do_eval scores the model on its own training data -- confirm this is intended.
export CUDA_VISIBLE_DEVICES=0
python3 <path_to_transformers_library>/repos/transformers/examples/text-classification/run_mfd.py \
  --model_name_or_path bert-base-cased \
  --do_train \
  --do_eval \
  --max_seq_length 128 \
  --per_device_train_batch_size 16 \
  --learning_rate 3e-5 \
  --num_train_epochs 3 \
  --overwrite_output_dir \
  --output_dir <path_to_data>/reddit_models/reddit_model_4 \
  --train_file <path_to_data>/reddit_data/train_data_50k_4.csv \
  --validation_file <path_to_data>/reddit_data/train_data_50k_4.csv