In [3]:
import pandas as pd

def extract_sentences(data):
    """Group token/tag lines into space-joined sentences and tag sequences.

    Each non-blank line is split on whitespace into exactly two fields,
    ``word`` and ``tag``; a blank (whitespace-only) line ends the current
    sentence. Tokens for which ``str.isalnum()`` is false are dropped
    together with their tag.

    Args:
        data: Iterable of text lines.

    Returns:
        A ``(sentences, labels)`` tuple of two equally long lists of strings.
        ``sentences[i]`` holds the kept words joined by single spaces and
        ``labels[i]`` the matching tags joined the same way.

    Raises:
        ValueError: If a non-blank line does not split into exactly two
            whitespace-separated fields.
    """
    sentences, labels = [], []
    words, tags = [], []

    def flush():
        # Emit the buffered sentence (if it has any kept word) and reset.
        if words:
            sentences.append(' '.join(words))
            labels.append(' '.join(tags))
            words.clear()
            tags.clear()

    for raw in data:
        if not raw.strip():
            # Blank line: sentence boundary.
            flush()
            continue
        token, tag = raw.split()
        if token.isalnum():
            words.append(token)
            tags.append(tag)

    # Input may not end with a blank line; emit whatever is still buffered.
    flush()
    return sentences, labels

def format_list_as_string(lst):
    """Render a whitespace-separated string as a quoted, list-like string.

    Args:
        lst: A string of tokens separated by whitespace.

    Returns:
        A string such as ``"['a', 'b']"``: each token wrapped in single
        quotes, separated by ``", "`` and enclosed in square brackets.
        An input with no tokens yields ``"['']"``.
    """
    tokens = lst.split()
    quoted = "', '".join(tokens)
    return f"['{quoted}']"

def format_labels_as_list_of_integers(lst):
    """Render an iterable of strings as an unquoted, list-like string.

    Args:
        lst: Iterable of strings (e.g. ``['0', '1']``).

    Returns:
        The items joined by ``", "`` and wrapped in square brackets,
        e.g. ``"[0, 1]"``; an empty iterable yields ``"[]"``.
    """
    joined = ', '.join(lst)
    return f'[{joined}]'

# Tag vocabulary; a tag's position in this list is its integer id.
LABEL_LIST = [
    "O",
    "B-Object", "I-Object",
    "B-Aspect", "I-Aspect",
    "B-Predicate", "I-Predicate",
]
# Reverse lookup: tag name -> position in LABEL_LIST.
label_to_index = dict(zip(LABEL_LIST, range(len(LABEL_LIST))))

def map_labels_to_indices(labels):
    """Convert a whitespace-separated tag string into stringified tag ids.

    Args:
        labels: A string of tag names separated by whitespace.

    Returns:
        A list with one string per tag, each the decimal id of that tag
        in ``LABEL_LIST``.

    Raises:
        KeyError: If a tag is not present in ``LABEL_LIST``.
    """
    tags = labels.split()
    return [str(label_to_index[tag]) for tag in tags]

# Convert each raw split (train, test, dev) from token-per-line TSV into a
# CSV with two string columns: 'words' (quoted, list-like) and 'labels'
# (list-like tag ids). The dev split is finally stored as val.csv.
import os

datasets = ['train', 'test', 'dev']
for dataset in datasets:
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale default, and close the file as soon as it is read.
    with open(f'../../Raw Datasets/Chekalina-OAI/{dataset}.tsv', 'r', encoding='utf-8') as file:
        data = file.readlines()

    sentences, labels = extract_sentences(data)
    sentences = [format_list_as_string(sentence) for sentence in sentences]
    labels = [format_labels_as_list_of_integers(map_labels_to_indices(label)) for label in labels]
    df = pd.DataFrame({'words': sentences, 'labels': labels})
    # Rewrite tag ids 5 -> 3 and 6 -> 4 directly in the label string. With
    # LABEL_LIST this folds B-/I-Predicate into B-/I-Aspect; a plain text
    # replace is only safe because every tag id is a single digit.
    df['labels'] = df['labels'].apply(lambda x: x.replace('5', '3').replace('6', '4'))
    df.to_csv(f'{dataset}.csv', index=False)

# Rename dev.csv to val.csv. os.replace overwrites an existing val.csv on
# every platform, so re-running this cell does not fail where os.rename
# would raise FileExistsError (Windows).
os.replace('dev.csv', 'val.csv')

In [4]:
import pandas as pd

# Read the three CSV splits into DataFrames.
train, test, val = map(pd.read_csv, ('train.csv', 'test.csv', 'val.csv'))

# Stack the splits into a single frame with a fresh 0..n-1 index.
splits = [train, test, val]
data = pd.concat(splits, ignore_index=True)

# Report the combined (rows, columns) shape.
print(data.shape)

(3004, 2)
