In [None]:
import pandas as pd
import ast  # For safely evaluating string representations of lists
import numpy as np
import re  # For regular expressions

# Load the raw tab-separated token/tag training file into a DataFrame.
# NOTE(review): only `sep` is overridden, so pandas keeps its defaults: the
# first line is used as the header row and blank lines are skipped. If the file
# really matches the sample shown in the string below (no header line, blank
# line between sentences), the first token/tag pair becomes the column names
# and the sentence boundaries are lost -- confirm whether header=None and
# skip_blank_lines=False (and possibly quoting=csv.QUOTE_NONE) are needed.
data = pd.read_csv('../../Raw Datasets/Beloucif/Labeling Task/SeqDataSetAllData_TestDataExcluded_Flair_train.tsv', sep='\t')

'''
the data has this format:
How	O
can	O
you	O
tell	O
the	O
difference	O
between	O
literal	OBJ-1
and	O
symbolic	OBJ-2
dreams	SHARED
?	O

should	O
i	O
get	O
the	O
Intel	OBJ-1
PRO	OBJ-1
/	OBJ-1
Wireless	OBJ-1
3945ABG	OBJ-1
Mini-PCI	OBJ-1
Express	OBJ-1
Adapter	OBJ-1
or	O
ThinkPad	OBJ-2
11a	OBJ-2
/	OBJ-2
b	OBJ-2
/	OBJ-2
g	OBJ-2
Wireless	OBJ-2
LAN	OBJ-2
Mini	OBJ-2
?	O
'''



In [18]:
import pandas as pd
import ast  # For safely evaluating string representations of lists
import numpy as np
import re  # For regular expressions

# IOB2 tag vocabulary. The position of a tag in this list is the integer id
# that map_and_apply() stores in the 'labels' column, so the order matters.
LABEL_LIST = ["O", "B-OBJ", "I-OBJ", "B-ASPECT", "I-ASPECT"]

def transform_to_iob2_format(labels):
    """Rewrite a sequence of plain span labels as IOB2 tags.

    "O" entries are kept as they are. Any other label becomes "B-<label>"
    when it opens a run (it is the first element, or the element before it
    is different) and "I-<label>" when it repeats the element before it.

    Args:
        labels: Sequence of label strings, e.g. ["O", "OBJ", "OBJ"].

    Returns:
        A new list of IOB2 tags with the same length as ``labels``. An empty
        sequence, or a sequence holding a single "O", is returned unchanged
        (the very same object).
    """
    # Nothing to rewrite: hand the input straight back.
    if len(labels) == 0 or (len(labels) == 1 and labels[0] == "O"):
        return labels

    tagged = []
    for position, current in enumerate(labels):
        if current == "O":
            tagged.append(current)
        elif position > 0 and labels[position - 1] == current:
            # Same label as the previous token: we are inside a span.
            tagged.append("I-" + current)
        else:
            # First token overall, or the label just changed: a span begins.
            tagged.append("B-" + current)
    return tagged


def map_and_apply(df):
    """Return a re-indexed copy of ``df`` with 'labels' encoded as integer ids.

    Each row's label list is first converted to IOB2 tags with
    transform_to_iob2_format(), then every tag is replaced by its position
    in LABEL_LIST.

    Args:
        df: DataFrame with a 'labels' column holding lists of label strings.

    Returns:
        A new DataFrame with a fresh 0..n-1 index and the encoded 'labels'
        column.

    Raises:
        ValueError: If a converted tag is not present in LABEL_LIST
            (raised by ``list.index``).
    """
    # reset_index() hands back a new frame, so the caller's frame is not modified.
    reindexed = df.reset_index(drop=True)
    iob2_tags = reindexed["labels"].map(transform_to_iob2_format)
    reindexed["labels"] = iob2_tags.apply(
        lambda tags: [LABEL_LIST.index(tag) for tag in tags]
    )
    return reindexed

# Load the three splits from the working directory.
train, test, val = (
    pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv', 'val.csv')
)

# Missing 'labels' cells become the text '[]' so they parse to an empty list below.
for split_df in (train, test, val):
    split_df['labels'] = split_df['labels'].replace(np.nan, '[]')

# Parse the textual list in each 'labels' cell into a real Python list.
# ast.literal_eval only accepts literals, so no arbitrary code is executed.
for split_df in (train, test, val):
    split_df['labels'] = split_df['labels'].apply(ast.literal_eval)

def remove_numbers_from_labels(label_list):
    """Strip the numeric suffix from 'OBJ-<n>' and 'ASPECT-<n>' labels.

    Every occurrence of ``OBJ-<digits>`` or ``ASPECT-<digits>`` inside a label
    is replaced by the bare ``OBJ`` / ``ASPECT``; all other text is left as is.

    Args:
        label_list: Iterable of label strings.

    Returns:
        A new list with the cleaned labels, in the same order.
    """
    numbered_tag = re.compile(r'(OBJ|ASPECT)-\d+')
    cleaned = []
    for tag in label_list:
        cleaned.append(numbered_tag.sub(r'\1', tag))
    return cleaned

# Collapse numbered tags ('OBJ-1', 'ASPECT-2', ...) to their bare names in every split.
# NOTE(review): because the numbers are removed before the IOB2 conversion,
# directly adjacent tokens tagged OBJ-1 and OBJ-2 would end up as one
# B-OBJ/I-OBJ span -- confirm that this is intended.
for split_df in (train, test, val):
    split_df['labels'] = split_df['labels'].apply(remove_numbers_from_labels)

# Keep only the rows whose label list contains no 'SHARED' entry.
train, test, val = [
    split_df[~split_df['labels'].apply(lambda tags: 'SHARED' in tags)]
    for split_df in (train, test, val)
]

# Convert the remaining rows to IOB2 tags and then to LABEL_LIST indices.
train, test, val = [map_and_apply(split_df) for split_df in (train, test, val)]

# Write the processed splits back to disk, only after all three were processed.
# NOTE(review): these are the same paths the script reads its input from, so
# the original files are overwritten; a second run would then see integer
# labels instead of strings -- confirm whether separate output paths are wanted.
for split_df, csv_path in zip((train, test, val), ('train.csv', 'test.csv', 'val.csv')):
    split_df.to_csv(csv_path, index=False)