In [17]:
def extract_sentences(data):
    """Group ``word tag`` lines into space-joined sentences and tag sequences.

    Args:
        data: Iterable of text lines. Every non-blank line must contain
            exactly two whitespace-separated fields (``word tag``). A blank
            line (empty or whitespace-only) terminates the current sentence.

    Returns:
        A ``(sentences, labels)`` tuple of two equal-length lists. Entry ``i``
        of ``sentences`` is the words of sentence ``i`` joined by single
        spaces; entry ``i`` of ``labels`` is the matching tags joined the
        same way.

    Raises:
        ValueError: If a non-blank line does not split into exactly two
            fields.
    """
    sentences = []
    labels = []
    words = []
    tags = []
    for line in data:
        # Any whitespace-only line is a separator, not just a bare '\n';
        # previously such a line crashed on the two-value unpack below.
        if not line.strip():
            # Each separator closes a sentence, so consecutive blank lines
            # still yield empty entries exactly as before.
            sentences.append(' '.join(words))
            labels.append(' '.join(tags))
            words = []
            tags = []
            continue
        word, tag = line.split()
        words.append(word)
        tags.append(tag)
    # Flush the last sentence when the input does not end with a blank line;
    # without this the final sentence was silently dropped.
    if words:
        sentences.append(' '.join(words))
        labels.append(' '.join(tags))
    return sentences, labels

import pandas as pd

# Tag inventory: "O" plus B-/I- prefixed Object, Aspect and Predicate tags.
# A tag's position in this list is the integer index written to the CSVs.
LABEL_LIST = [
    "O",
    "B-Object",
    "I-Object",
    "B-Aspect",
    "I-Aspect",
    "B-Predicate",
    "I-Predicate",
]
# Reverse lookup: tag string -> its position in LABEL_LIST.
label_to_index = dict(zip(LABEL_LIST, range(len(LABEL_LIST))))

def format_list_as_string(lst):
    """Render a whitespace-separated string as a bracketed list literal.

    Args:
        lst: A string whose items are separated by arbitrary whitespace.

    Returns:
        The items joined by ``', '`` and wrapped in square brackets, e.g.
        ``'a b c'`` becomes ``'[a, b, c]'``; an empty or blank string
        becomes ``'[]'``.
    """
    items = lst.split()
    body = ', '.join(items)
    return f'[{body}]'

def map_labels_to_indices(labels):
    """Translate a whitespace-separated tag string into integer indices.

    Args:
        labels: Tags separated by whitespace, each a key of
            ``label_to_index``.

    Returns:
        A list with the ``label_to_index`` value of every tag, in order.

    Raises:
        KeyError: If a tag is not present in ``label_to_index``.
    """
    indices = []
    for tag in labels.split():
        indices.append(label_to_index[tag])
    return indices

# Index merge applied to the exported labels: B-Predicate (5) and
# I-Predicate (6) are folded into the B-Aspect (3) and I-Aspect (4) indices.
# Remapping integers (instead of str.replace on the rendered text) stays
# correct even if an index ever has more than one digit.
_MERGED_INDEX = {5: 3, 6: 4}


def _read_lines(path):
    """Return every line of the text file at *path*, decoded as UTF-8."""
    # Explicit encoding: the default depends on the platform locale.
    with open(path, 'r', encoding='utf-8') as file:
        return file.readlines()


def _format_indices(indices):
    """Render a sequence of integers as a ``'[i, j, k]'`` string."""
    return format_list_as_string(' '.join(map(str, indices)))


def _convert_split(lines, csv_path):
    """Convert raw ``word tag`` lines into a words/labels CSV.

    Args:
        lines: Raw lines of one dataset split.
        csv_path: Destination of the CSV file (written without the index).

    Returns:
        A ``(sentences, labels, frame)`` tuple: the bracketed word strings,
        the bracketed label-index strings *before* the Predicate/Aspect
        merge, and the DataFrame that was written (its ``labels`` column
        holds the merged indices).
    """
    raw_sentences, raw_labels = extract_sentences(lines)
    sentences = [format_list_as_string(sentence) for sentence in raw_sentences]
    index_rows = [map_labels_to_indices(tags) for tags in raw_labels]
    labels = [_format_indices(row) for row in index_rows]
    merged = [
        _format_indices([_MERGED_INDEX.get(idx, idx) for idx in row])
        for row in index_rows
    ]
    frame = pd.DataFrame({'words': sentences, 'labels': merged})
    frame.to_csv(csv_path, index=False)
    return sentences, labels, frame


train = _read_lines('../../Raw Datasets/Chekalina-OAI/train.tsv')
train_sentences, train_labels, df = _convert_split(train, 'train.csv')

test = _read_lines('../../Raw Datasets/Chekalina-OAI/test.tsv')
test_sentences, test_labels, df = _convert_split(test, 'test.csv')

# The dev split is exported under the name val.csv.
dev = _read_lines('../../Raw Datasets/Chekalina-OAI/dev.tsv')
dev_sentences, dev_labels, df = _convert_split(dev, 'val.csv')
