In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import os
import shutil
import json
import re

# Disable column-width truncation so long text cells are displayed in full.
pd.set_option('display.max_colwidth', None)

In [None]:
# Map every gold target line to the generated line found at the same position
# in the prediction file, merged over the numbered sub-directories 0-4.
# If the same gold line occurs more than once, the last pairing read wins.
preds_template = '../../../ceph_data/output/bart-AAE-v2-only-dot-direct-cola-au-full-mask-gen/{}/aee.preds'
targets_template = '../../../ceph_data/intermediate/bart-AAE-v2-only-dot-direct-cola-au-full-mask-gen/{}/test.target'

preds_dict = {}
for i in range(5):
    path = preds_template.format(i)
    path2 = targets_template.format(i)
    with open(path, encoding='utf-8') as preds_file:
        preds = preds_file.readlines()
    with open(path2, encoding='utf-8') as targets_file:
        gts = targets_file.readlines()
    # zip pairs lines positionally and stops at the shorter of the two files.
    preds_dict.update(
        (gt.replace('\n', ''), pred.replace('\n', ''))
        for pred, gt in zip(preds, gts)
    )

In [None]:
# Locations of the raw UKP input files and of the pre-processed essay corpus.
ukp_input_dir = '../../../ceph_data/input/UKP-InsufficientArguments_v1.0'
aae_corpus_dir = '../../../ceph_data/intermediate/corpus-ukp-argument-annotated-essays-v2'

df_aae_full = pd.read_csv(
    ukp_input_dir + '/data-tokenized.tsv',
    sep='\t',
    index_col=False,
    encoding='latin-1',
)
df_aae = pd.read_json(aae_corpus_dir + '/aae3.json')
df_aae_invalid = pd.read_json(aae_corpus_dir + '/aae3_invalid.json')
df_aae_fixed = pd.read_json(aae_corpus_dir + '/aae3_fixed.json')

# The splitting file is read with explicit column names: 'index' followed by
# '0' .. '99'.
split_columns = ['index'] + list(map(str, range(100)))
df_split = pd.read_csv(
    ukp_input_dir + '/data-splitting.tsv',
    sep='\t',
    names=split_columns,
    index_col=False,
)


def _argument_id(record):
    """Return the id 'essayNNN_A' built from a row's ESSAY (zero-padded to 3) and ARGUMENT."""
    return 'essay{}_{}'.format(str(record['ESSAY']).zfill(3), record['ARGUMENT'])


def _sufficiency_label(annotation):
    """Return 0 for the annotation 'insufficient' and 1 for anything else."""
    if annotation == 'insufficient':
        return 0
    return 1


df_aae_full['index'] = df_aae_full.apply(_argument_id, axis=1)
df_aae_full['local_sufficency'] = df_aae_full['ANNOTATION'].apply(_sufficiency_label)


In [None]:
# Load the masked-text table. The path is relative to the kernel's working
# directory, unlike the other inputs, which are read from ../../../ceph_data.
df_aae_full_mask = pd.read_csv('df_aae_full_mask.csv')

In [None]:
# Preview the first rows of the masked-text table.
df_aae_full_mask.head()

In [None]:
# Ids present in the full table but in neither the parsed nor the invalid
# corpus frame.
full_ids = set(df_aae_full['index'])
parsed_ids = set(df_aae['index'])
invalid_ids = set(df_aae_invalid['index'])
only_claims = list(full_ids - parsed_ids - invalid_ids)

# Keep just the rows of the full table that carry one of those ids.
is_only_claim = df_aae_full['index'].isin(only_claims)
df_aae_full_only_claims = df_aae_full[is_only_claim]

In [None]:
# Inspect the column names of the parsed corpus frame.
df_aae.columns

In [None]:
# Inspect the column names of the rows whose ids are missing from both corpus frames.
df_aae_full_only_claims.columns

In [None]:
# Preview the first rows whose ids are missing from both corpus frames.
df_aae_full_only_claims.head()

In [None]:
# Inspect the column names of the invalid-corpus frame.
df_aae_invalid.columns

In [None]:
# Collect, per argument id, the list of conclusions in row order. Rows from
# the parsed and the invalid corpus contribute their 'conclusion' value; rows
# that exist only in the full table contribute an empty string.
conclusions_dict = {}
conclusion_sources = (
    (df_aae, lambda rec: rec['conclusion']),
    (df_aae_invalid, lambda rec: rec['conclusion']),
    (df_aae_full_only_claims, lambda rec: ''),
)
for frame, conclusion_of in conclusion_sources:
    for _, rec in frame.iterrows():
        # setdefault creates the list the first time an id is seen.
        conclusions_dict.setdefault(rec['index'], []).append(conclusion_of(rec))

In [None]:
# Build, for every argument id in df_aae_full_mask, the text in which the
# '<mask>' placeholder is filled with the recorded conclusion followed by the
# generated text stored for it in preds_dict.
replaced_text_dict = {}

# groupby visits each distinct id exactly once (ids in first-appearance order,
# rows in their original order) instead of re-filtering the whole frame for
# every row, which repeated the same work for ids that occur several times.
# NOTE: groupby skips rows whose id is NaN, so no entry is stored for them.
for index, rel_df in df_aae_full_mask.groupby('index', sort=False):
    masked_text = None
    for j, (_, row) in enumerate(rel_df.iterrows()):
        if masked_text is None:
            # Only the first row of the group supplies the template text.
            masked_text = row['TEXT_MASK']

        # The j-th row of the group is paired with the j-th conclusion
        # collected for this id; raises KeyError/IndexError if none exists.
        conclusion = conclusions_dict[row['index']][j]
        if conclusion in preds_dict:
            # str.replace substitutes every '<mask>' currently in the text.
            masked_text = masked_text.replace(
                '<mask>',
                '</s> ' + conclusion + ' </s> ' + preds_dict[conclusion] + ' </s>',
            )
        else:
            # No generated text for this conclusion: discard whatever has been
            # built so far and fall back to the row's plain TEXT.
            masked_text = '</s> ' + row['TEXT'] + ' </s>'

        # Any '<unk>' becomes '<mask>' so a following row can fill it.
        # NOTE(review): presumably '<unk>' marks the slots of later
        # conclusions in TEXT_MASK — confirm against how that column is built.
        masked_text = masked_text.replace('<unk>', '<mask>')
    replaced_text_dict[index] = masked_text

In [None]:
# Attach the filled-in text to every row by id. The lookup goes through
# dict.__getitem__, so an id missing from replaced_text_dict raises KeyError.
df_aae_full['REPLACED_TEXT'] = df_aae_full['index'].map(replaced_text_dict.__getitem__)

In [None]:
# Show 20 randomly drawn rows; no random_state is given, so the selection
# changes from run to run.
df_aae_full.sample(20)

In [None]:
# Show the rows of the full table whose id also appears in the invalid-corpus frame.
df_aae_full[df_aae_full['index'].isin(df_aae_invalid['index'])]

In [None]:
def _recreate_dir(directory):
    """Delete ``directory`` (with its contents) if it exists, then create it empty."""
    if os.path.exists(directory):
        shutil.rmtree(directory)
    os.mkdir(directory)


# Three filler columns, each holding the running row number.
for placeholder in ('placeholder1', 'placeholder2', 'placeholder3'):
    df_aae_full[placeholder] = np.arange(len(df_aae_full))

# Start from empty intermediate and output directories.
# WARNING: anything already stored under these two paths is deleted.
path = '../../../ceph_data/intermediate/bert-AAE-v2-only-dot-direct-cola-au-full-both'
_recreate_dir(path)

path2 = '../../../ceph_data/output/bert-AAE-v2-only-dot-direct-cola-au-full-both'
_recreate_dir(path2)

# Column order of the exported TSV files.
export_columns = ['local_sufficency', 'placeholder1', 'placeholder2', 'REPLACED_TEXT', 'placeholder3']
# Value in the 'split' column -> name of the file that receives those rows.
split_files = (('TRAIN', 'train.tsv'), ('DEV', 'dev.tsv'), ('TEST', 'test.tsv'))

for i in range(100):
    _recreate_dir(path + '/{}'.format(i))
    _recreate_dir(path2 + '/{}'.format(i))

    # Column str(i) of df_split gives the split assignment of every id for
    # this run; an id missing from df_split raises KeyError.
    split_dict = dict(zip(df_split['index'], df_split[str(i)]))
    df_aae_full['split'] = df_aae_full['index'].apply(lambda x: split_dict[x])

    for split_name, file_name in split_files:
        subset = df_aae_full[df_aae_full['split'] == split_name][export_columns]
        subset.to_csv('{}/{}/{}'.format(path, i, file_name), sep='\t', index=False)
