In [None]:
import pandas as pd
import os
import random

In [None]:
# Directory that holds merged.csv and receives the train/val/test CSVs written below.
# NOTE(review): "path" looks like a placeholder — presumably set to the real data
# directory before running; confirm.
PATH = "path"

In [None]:
# Load the merged data set from the PATH directory into a DataFrame.
data = pd.read_csv(
    filepath_or_buffer=os.path.join(PATH, "merged.csv"),
)

In [None]:
# Encode the string labels as integers: SUBJ -> 1, OBJ -> 0. Values that are
# neither 'SUBJ' nor 'OBJ' are left untouched by replace().
# infer_objects() keeps the column integer-typed on pandas versions where
# replace() no longer downcasts an object column automatically.
data['label'] = data['label'].replace({'SUBJ': 1, 'OBJ': 0}).infer_objects()

# Drop the columns that are not carried into the split files. errors='ignore'
# makes this cell safe to re-run: on a second run the columns are already gone
# and a plain drop() would raise KeyError.
data = data.drop(columns=['solved_conflict', 'sentence_id'], errors='ignore')

In [None]:
# Split configuration: fraction of each class assigned to train and to
# validation; whatever remains of the class goes to test.
TRAIN_FRAC = 0.8
VAL_FRAC = 0.1
SEED = 42

num_rows = len(data)
# Seeds Python's built-in RNG only. pandas does not draw from it; the shuffles
# below are reproducible because of their explicit random_state.
random.seed(SEED)


def split_by_position(frame, train_frac, val_frac):
    """Cut ``frame`` into consecutive train / validation / test slices.

    Rows are taken in their current order; nothing is shuffled here.

    Args:
        frame: DataFrame to split.
        train_frac: Fraction of rows for the first (train) slice.
        val_frac: Fraction of rows for the second (validation) slice.

    Returns:
        Tuple ``(train, val, test)``. Slice sizes are truncated to whole rows,
        and the test slice receives every row left after train and validation.
    """
    train_end = int(train_frac * len(frame))
    val_end = train_end + int(val_frac * len(frame))
    return (
        frame.iloc[:train_end, :],
        frame.iloc[train_end:val_end, :],
        frame.iloc[val_end:, :],
    )


# Split each class separately so that train, val and test all contain OBJ (0)
# and SUBJ (1) rows in roughly the same proportion as the full data set.
# NOTE(review): rows are assigned to a split by their position in `data`,
# before any shuffling. If merged.csv is ordered (e.g. by source or date) the
# splits will differ systematically — confirm this is intended.
obj = data.loc[data['label'] == 0]
subj = data.loc[data['label'] == 1]

train_obj, val_obj, test_obj = split_by_position(obj, TRAIN_FRAC, VAL_FRAC)
train_subj, val_subj, test_subj = split_by_position(subj, TRAIN_FRAC, VAL_FRAC)

# Recombine the two classes, then shuffle within each split so the rows are
# not grouped by label.
train = pd.concat([train_obj, train_subj]).sample(frac=1, random_state=SEED)
val = pd.concat([val_obj, val_subj]).sample(frac=1, random_state=SEED)
test = pd.concat([test_obj, test_subj]).sample(frac=1, random_state=SEED)

# Rows whose label is neither 0 nor 1 match neither class filter and end up in
# no split; report that instead of losing them silently.
unassigned_rows = num_rows - (len(train) + len(val) + len(test))
if unassigned_rows:
    print(f"Warning: {unassigned_rows} rows have a label other than 0/1 and are in no split")

# Save each split into separate CSV files
train.to_csv(os.path.join(PATH, 'train-unprocessed.csv'), index=False)
val.to_csv(os.path.join(PATH, 'val-unprocessed.csv'), index=False)
test.to_csv(os.path.join(PATH, 'test-unprocessed.csv'), index=False)

In [None]:
# Report how many rows ended up in each split, one line per split.
print(
    f"Number of lines in the training set: {len(train)}",
    f"Number of lines in the validation set: {len(val)}",
    f"Number of lines in the testing set: {len(test)}",
    sep="\n",
)

Number of lines in the training set: 750
Number of lines in the validation set: 93
Number of lines in the testing set: 96


In [None]:
# Preview the first five rows of the shuffled training split (head() defaults to 5).
train.head()

Unnamed: 0,sentence,label
120,The reason for the lack of any actual ‘levelli...,1
572,For the same reason they protest lightly or no...,0
207,But the New England Journal of Medicine study ...,0
393,The Biden administration predicted nearly 100 ...,0
475,So now there is a movement—a movement within g...,0


In [None]:
# Preview the first five rows of the shuffled validation split (head() defaults to 5).
val.head()

Unnamed: 0,sentence,label
784,In it I argued that the reason corporate law c...,0
756,"“Cities have assumed new obligations,” writes ...",0
809,The agreement still needs to be ratified by me...,0
835,We may have faith that human inventiveness wil...,1
729,That you will be hearing.,0


In [None]:
# Preview the first five rows of the shuffled test split (head() defaults to 5).
test.head()

Unnamed: 0,sentence,label
908,The GOP must do much more to show it’s the par...,1
903,"Instead, it is distracted by fantasies of raci...",1
898,The single issue that overcame all others: Mr....,1
936,Funny thing: the MSM is at least partly right.,1
878,With over half of social housing residents rel...,0
