In [100]:
import pandas as pd
import numpy as np

### Original Dataset from Logical Fallacy Detection paper

In [232]:
# Read each split of the original dataset from its `edu_<split>.csv` file
# into a dict keyed by split name.
old_datasets = {
    split_name: pd.read_csv(f'edu_{split_name}.csv')
    for split_name in ('train', 'dev', 'test')
}

In [233]:
# Drop every column other than the three listed here; columns that are
# kept retain their original order.
kept_columns = ['source_article', 'updated_label', 'masked_articles']
old_datasets = {
    split_name: frame.drop(columns=frame.columns.difference(kept_columns))
    for split_name, frame in old_datasets.items()
}

In [234]:
# Remove rows that are exact duplicates (all columns equal) within each split.
old_datasets = dict(
    (split_name, frame.drop_duplicates())
    for split_name, frame in old_datasets.items()
)

In [235]:
# Give each split a fresh 0..n-1 index (the old one is discarded) and name
# that index 'id'.
old_datasets = {
    split_name: frame.reset_index(drop=True).rename_axis('id')
    for split_name, frame in old_datasets.items()
}

### Updated dataset with the prompts 

In [248]:
# Read each split of the updated dataset from its `fallacy_<split>.csv` file
# into a dict keyed by split name.
new_datasets = {
    split_name: pd.read_csv(f'fallacy_{split_name}.csv')
    for split_name in ('train', 'dev', 'test')
}

In [249]:
# Within each split, keep only the first row for every distinct
# (sentence, prompt, fine_class) combination.
dedup_columns = ['sentence', 'prompt', 'fine_class']
new_datasets = dict(
    (split_name, frame.drop_duplicates(subset=dedup_columns))
    for split_name, frame in new_datasets.items()
)

In [250]:
# Replace each split's index with a fresh 0..n-1 range; the old index is
# discarded rather than kept as a column.
new_datasets = dict(zip(
    new_datasets.keys(),
    (frame.reset_index(drop=True) for frame in new_datasets.values()),
))

In [251]:
# Add a 'sent_id' column to every split, initialised to None in each row.
for frame in new_datasets.values():
    frame['sent_id'] = None

In [252]:
from collections import defaultdict

# For every split of the original dataset, map each 'source_article' text to
# the index value of its row. When the same text occurs on several rows, the
# last row's index wins (plain dict construction semantics).
sentence_to_id_dicts = defaultdict(dict)
sentence_to_id_dicts.update(
    (
        split_name,
        dict(zip(frame['source_article'].tolist(), frame.index.tolist())),
    )
    for split_name, frame in old_datasets.items()
)

In [253]:
# Accumulator for (split, sentence, label) records that could not be matched.
sentences_not_found = []

In [254]:
from IPython import embed

In [255]:
# For every row of every split in the updated dataset, look up its sentence
# in the same split's sentence -> id mapping and store the result in
# 'sent_id'. Rows whose sentence has no match keep None and are recorded in
# `sentences_not_found` as (split, sentence, label).
#
# The column is built as a list and assigned in one step. The previous
# chained form `frame['sent_id'][i] = ...` writes through a temporary Series,
# which triggers SettingWithCopyWarning and is a silent no-op once pandas
# Copy-on-Write is active. Iterating the columns positionally also removes
# the dependence on the frame having a default 0..n-1 index.
for key, dataset in new_datasets.items():
    # `.get` works for both a plain dict and a defaultdict, and treats a
    # split with no mapping as "nothing matches" instead of raising.
    lookup = sentence_to_id_dicts.get(key, {})
    matched_ids = []
    for sentence, label in zip(dataset['sentence'], dataset['fine_class']):
        try:
            matched_ids.append(lookup[sentence])
        except KeyError:
            # Only a missing sentence is expected here; any other error
            # should surface rather than be swallowed.
            matched_ids.append(None)
            sentences_not_found.append((key, sentence, label))
    # dtype=object keeps integer ids and None side by side, as before,
    # instead of coercing the column to float with NaN.
    dataset['sent_id'] = pd.Series(matched_ids, index=dataset.index, dtype=object)

In [257]:
# Turn the list of (split, sentence, label) records into a three-column
# table, one row per unmatched sentence.
not_found_columns = ('split', 'sentence', 'label')
sentences_not_found_df = pd.DataFrame({
    column: [record[position] for record in sentences_not_found]
    for position, column in enumerate(not_found_columns)
})

In [258]:
# Write the unmatched-sentence table to disk, omitting the row index.
sentences_not_found_df.to_csv(path_or_buf='not_found_sentences.csv', index=False)