In [18]:
import pandas as pd
import ast  # For safely evaluating string representations of lists
import numpy as np
import re  # For regular expressions

LABEL_LIST = ["O", "B-OBJ", "I-OBJ", "B-ASPECT", "I-ASPECT"]

def transform_to_iob2_format(labels):
    """Convert a sequence of plain tags into IOB2-prefixed tags.

    ``"O"`` tags pass through unchanged. Any other tag receives a ``"B-"``
    prefix when it opens a run of identical consecutive tags and an ``"I-"``
    prefix when it continues such a run.

    Args:
        labels: Indexable sequence of tag strings, e.g. ``["O", "OBJ", "OBJ"]``.

    Returns:
        A new list with one prefixed tag per input tag. An empty input or a
        lone ``["O"]`` is handed back as-is (the same object, not a copy).
    """
    # Nothing to prefix: give the caller's object back untouched.
    if len(labels) == 0 or (len(labels) == 1 and labels[0] == "O"):
        return labels

    def tag_at(pos):
        """Return the IOB2 form of the tag at position ``pos``."""
        tag = labels[pos]
        if tag == "O":
            return tag
        # A tag opens a run when it is the first token or differs from its
        # left neighbour; otherwise it continues the neighbour's run.
        starts_run = pos == 0 or labels[pos - 1] != tag
        return ("B-" if starts_run else "I-") + tag

    return [tag_at(pos) for pos in range(len(labels))]


def map_and_apply(df):
    """Return a re-indexed copy of ``df`` with 'labels' encoded as integer ids.

    The 'labels' column is first converted to IOB2 tags with
    ``transform_to_iob2_format``; each tag is then replaced by its position in
    ``LABEL_LIST``.

    Args:
        df: DataFrame with a 'labels' column holding lists of plain tags.

    Returns:
        A DataFrame with a fresh 0..n-1 index whose 'labels' column holds lists
        of integer label ids.

    Raises:
        ValueError: If a converted tag is not present in ``LABEL_LIST``.
    """
    def to_ids(tags):
        """Look up every tag's index in LABEL_LIST."""
        return [LABEL_LIST.index(tag) for tag in tags]

    # reset_index returns a new frame, so the caller's frame is left untouched.
    frame = df.reset_index(drop=True)
    frame["labels"] = frame["labels"].map(transform_to_iob2_format)
    frame["labels"] = frame["labels"].apply(to_ids)
    return frame

# Load the train and test splits from disk.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# The 'labels' cells are stored as the text form of a Python list. Missing
# cells are swapped for the literal '[]' so they parse to an empty list, then
# every cell is parsed with ast.literal_eval (literals only, no code execution).
for split in (train, test):
    split['labels'] = split['labels'].replace(np.nan, '[]').apply(ast.literal_eval)

# Function to remove numbers after 'OBJ' and 'ASPECT' labels
def remove_numbers_from_labels(label_list):
    """Drop the ``-<digits>`` suffix that follows ``OBJ`` or ``ASPECT`` in each label.

    For example ``"OBJ-1"`` becomes ``"OBJ"`` and ``"ASPECT-12"`` becomes
    ``"ASPECT"``; labels without such a suffix are returned unchanged.

    Args:
        label_list: Iterable of label strings.

    Returns:
        A new list with the cleaned labels, in the same order.
    """
    numbered_tag = re.compile(r'(OBJ|ASPECT)-\d+')
    cleaned = []
    for label in label_list:
        # Keep only the captured label name; the "-<digits>" part is discarded.
        cleaned.append(numbered_tag.sub(r'\1', label))
    return cleaned

# Strip the numeric suffixes from the OBJ / ASPECT labels in both splits.
# NOTE(review): the numbers are removed before the IOB2 conversion, so two
# adjacent entities that differ only by number (e.g. OBJ-1 directly followed
# by OBJ-2) would end up as a single B-/I- span -- confirm whether that can
# occur in the data.
for split in (train, test):
    split['labels'] = split['labels'].apply(remove_numbers_from_labels)

# Keep only the rows whose label list has no element equal to "SHARED".
train_has_shared = train['labels'].apply(lambda tags: 'SHARED' in tags)
test_has_shared = test['labels'].apply(lambda tags: 'SHARED' in tags)
train = train[~train_has_shared]
test = test[~test_has_shared]

# Convert the tags to IOB2 and encode each one as its index in LABEL_LIST.
train = map_and_apply(train)
test = map_and_apply(test)

# Persist the processed splits.
# NOTE(review): these are the same paths this cell reads its input from, so
# the original files are overwritten; a second run of the cell would parse the
# already-encoded integer labels and is expected to fail in the regex step.
# Consider writing to separate output files.
for split, path in ((train, 'train.csv'), (test, 'test.csv')):
    split.to_csv(path, index=False)

In [20]:
import pandas as pd

# Read the three CSV files (train / test / val) into DataFrames.
train, test, val = (
    pd.read_csv(path) for path in ('train.csv', 'test.csv', 'val.csv')
)

labels
33     164
27     137
30     131
39     127
36     125
42     111
45      88
48      71
51      70
24      61
54      52
60      45
57      44
21      32
63      29
66      26
81      22
75      19
69      19
72      17
78      16
18      15
87      13
84      12
99       8
105      6
102      6
93       4
108      3
90       3
144      3
96       3
111      2
123      1
15       1
150      1
132      1
Name: count, dtype: int64