In [1]:
from pathlib import Path

import pandas as pd
from sklearn.model_selection import GroupShuffleSplit

# Split train-dev-test
The split is grouped by `sentence_id`, i.e. all rows of a sentence always end up together in the same subset (train, dev or test).

In [2]:
# Load the full dataset from the pickle file; the cells below rely on its
# `words`, `labels` and `sentence_id` columns.
# NOTE(review): unpickling can execute arbitrary code - only load this file
# if it comes from a trusted source.
data = pd.read_pickle('../data/szeged_fixed.pkl')

In [3]:
# Hold out 20% of the sentence groups; the remaining 80% become the train df.
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so the generator yields exactly one pair of positional index arrays.
train_idx, other_idx = next(
    gss.split(data.words, data.labels, groups=data.sentence_id)
)
train, other = data.iloc[train_idx], data.iloc[other_idx]

# The held-out part is halved (again grouped by sentence) into the
# development df and the test df.
gss = GroupShuffleSplit(n_splits=1, test_size=0.5, random_state=19)
dev_idx, test_idx = next(
    gss.split(other.words, other.labels, groups=other.sentence_id)
)
dev, test = other.iloc[dev_idx], other.iloc[test_idx]

## Check that sentences were split correctly

In [4]:
# For every pair of subsets, report whether any sentence_id of the first also
# occurs in the second; each line should read False if the split kept
# sentences intact.
for subset, reference in ((test, train), (dev, train), (dev, test)):
    print(subset.sentence_id.isin(reference.sentence_id).any())

False
False
False


In [5]:
# Number of distinct sentences overall and in each subset, one per line.
print(
    f"{data.sentence_id.nunique()=}",
    f"{train.sentence_id.nunique()=}",
    f"{dev.sentence_id.nunique()=}",
    f"{test.sentence_id.nunique()=}",
    sep="\n",
)

data.sentence_id.nunique()=42739
train.sentence_id.nunique()=34191
dev.sentence_id.nunique()=4274
test.sentence_id.nunique()=4274


## Check that label proportions are similar across the splits

In [6]:
def make_table(df):
    """Count the `words` entries for each value of `labels`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least a `labels` and a `words` column.

    Returns
    -------
    pandas.DataFrame
        Indexed by label, with a single `words` column holding the counts
        and an extra margin row named 'total'.
    """
    pivot_options = {
        'index': 'labels',
        'values': 'words',
        'aggfunc': 'count',
        # margins add the overall count as a final row called 'total'
        'margins': True,
        'margins_name': 'total',
    }
    return df.pivot_table(**pivot_options)

def _counts_with_share(df, count_name, share_name):
    """Build the label-count table for `df` plus a percentage column.

    The percentage is each count relative to the 'total' row, times 100 and
    rounded to two decimals. The count column is renamed from 'words' to
    `count_name`; the percentage column is called `share_name`.
    """
    counts = make_table(df)
    share = (counts.words / counts.words.loc['total']).mul(100).round(2)
    return counts.assign(**{share_name: share}).rename(columns={'words': count_name})


table_all = _counts_with_share(data, 'all', 'prc_total')
table_train = _counts_with_share(train, 'train', 'prc_train')
table_dev = _counts_with_share(dev, 'dev', 'prc_dev')
table_test = _counts_with_share(test, 'test', 'prc_test')

# Side-by-side comparison: full data first, then each subset, aligned on label.
table = table_all.join([table_train, table_dev, table_test])
table

Unnamed: 0_level_0,all,prc_total,train,prc_train,dev,prc_dev,test,prc_test
labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
C,1055973,99.02,843372,98.99,107664,99.13,104937,99.09
D,1546,0.14,1262,0.15,142,0.13,142,0.13
E,6525,0.61,5328,0.63,573,0.53,624,0.59
I,1496,0.14,1245,0.15,140,0.13,111,0.1
N,931,0.09,756,0.09,89,0.08,86,0.08
total,1066471,100.0,851963,100.0,108608,100.0,105900,100.0


## Save

In [7]:
# Persist the three subsets as pickles.
# The output directory is created first (no-op if it already exists) so that
# the notebook also runs on a fresh checkout where it is missing; without it
# the first to_pickle call fails because the parent directory does not exist.
output_dir = Path('../data/train_dev_test')
output_dir.mkdir(parents=True, exist_ok=True)

train.to_pickle(output_dir / 'train.pkl')
dev.to_pickle(output_dir / 'dev.pkl')
test.to_pickle(output_dir / 'test.pkl')