In [2]:
import pandas as pd

# ---- Load the prepared CSV splits for every corpus ----
# Only a test file is read for Beloucif.
beloucif_test = pd.read_csv('../qi_beloucif/test.csv')

mintaka_train = pd.read_csv('../qi_mintaka/train.csv')
mintaka_val = pd.read_csv('../qi_mintaka/val.csv')
mintaka_test = pd.read_csv('../qi_mintaka/test.csv')

# Webis 2020 is read as train/test only; its validation split is carved out below.
webis_2020_train = pd.read_csv('../qi_webis_2020/train.csv')
webis_2020_test = pd.read_csv('../qi_webis_2020/test.csv')

webis_2022_train = pd.read_csv('../qi_webis_2022/train.csv')
webis_2022_val = pd.read_csv('../qi_webis_2022/val.csv')
webis_2022_test = pd.read_csv('../qi_webis_2022/test.csv')

# ---- Hold out 20% of the Webis 2020 training rows as validation ----
# The fixed random_state makes the sampled rows the same on every run.
webis_2020_val = webis_2020_train.sample(frac=0.2, random_state=42)
# Drop the rows whose index labels were sampled, leaving the remainder as train.
webis_2020_train = webis_2020_train.drop(index=webis_2020_val.index)

# ---- Stack each corpus' train/val/test rows into one frame ----
mintaka = pd.concat([mintaka_train, mintaka_val, mintaka_test])
webis_2020 = pd.concat([webis_2020_train, webis_2020_val, webis_2020_test])
webis_2022 = pd.concat([webis_2022_train, webis_2022_val, webis_2022_test])

# ---- Report how many rows carry each 'label' value, per corpus ----
# Beloucif is reported from its test split, the only one loaded.
for heading, frame in (
    ('Mintaka:', mintaka),
    ('Webis 2020:', webis_2020),
    ('Webis 2022:', webis_2022),
    ('Beloucif:', beloucif_test),
):
    print(heading)
    print(frame['label'].value_counts())


Mintaka:
label
0    18000
1     2000
Name: count, dtype: int64
Webis 2020:
label
0    13569
1     1431
Name: count, dtype: int64
Webis 2022:
label
0    4938
1    4938
Name: count, dtype: int64
Beloucif:
label
0    409
1    387
Name: count, dtype: int64


In [6]:
# Pool every corpus into a single train / val / test frame and shuffle the rows.
# Each shuffle is a full-size sample (frac=1) with a fixed seed, so the row order
# is reproducible across runs.
train = pd.concat(
    [mintaka_train, webis_2020_train, webis_2022_train]
).sample(frac=1, random_state=42)
val = pd.concat(
    [mintaka_val, webis_2020_val, webis_2022_val]
).sample(frac=1, random_state=42)
# First test variant: includes the Beloucif test rows.
test = pd.concat(
    [beloucif_test, mintaka_test, webis_2020_test, webis_2022_test]
).sample(frac=1, random_state=42)

# Write the three frames to the working directory without the index column.
train.to_csv('train.csv', index=False)
val.to_csv('val.csv', index=False)
test.to_csv('test_wBeloucif.csv', index=False)

# Second test variant: same corpora minus Beloucif. This rebinds `test`.
test = pd.concat(
    [mintaka_test, webis_2020_test, webis_2022_test]
).sample(frac=1, random_state=42)
test.to_csv('test.csv', index=False)

In [4]:
# For every pair of corpora, merge their train/val/test splits, shuffle them, and
# save the result under '../qi_{split1}_{split2}_merged/'.
from os import path, makedirs

# Per-corpus frames keyed by corpus name. The 'beloucif' entry is present in the
# test mapping only and is not used by the pair loop below.
datasets_test = {
    'beloucif': beloucif_test,
    'mintaka': mintaka_test,
    'webis_2020': webis_2020_test,
    'webis_2022': webis_2022_test
}

datasets_train = {
    'mintaka': mintaka_train,
    'webis_2020': webis_2020_train,
    'webis_2022': webis_2022_train
}

datasets_val = {
    'mintaka': mintaka_val,
    'webis_2020': webis_2020_val,
    'webis_2022': webis_2022_val
}

for split1, split2 in [('mintaka', 'webis_2020'), ('mintaka', 'webis_2022'), ('webis_2020', 'webis_2022')]:
    # Loop-local names are used on purpose: binding `train`/`val`/`test` here would
    # overwrite the notebook-level combined frames of the same names.
    pair_train = pd.concat([datasets_train[split1], datasets_train[split2]])
    pair_val = pd.concat([datasets_val[split1], datasets_val[split2]])
    pair_test = pd.concat([datasets_test[split1], datasets_test[split2]])

    # Shuffle all rows (frac=1) with a fixed seed so the saved order is reproducible.
    pair_train = pair_train.sample(frac=1, random_state=42)
    pair_val = pair_val.sample(frac=1, random_state=42)
    pair_test = pair_test.sample(frac=1, random_state=42)

    folder = f'../qi_{split1}_{split2}_merged'
    # exist_ok=True creates the folder if missing and is a no-op otherwise, which
    # avoids the separate check-then-create step.
    makedirs(folder, exist_ok=True)

    pair_train.to_csv(f'{folder}/train.csv', index=False)
    pair_val.to_csv(f'{folder}/val.csv', index=False)
    pair_test.to_csv(f'{folder}/test.csv', index=False)