In [13]:
# here I'm analysing similarities between the datasets
import pandas as pd


def read_dataset(folder, split, splits_root='../../../Splits'):
    """Load one split of a dataset, or return None when that split has no file.

    Args:
        folder: Name of the dataset directory inside ``splits_root``.
        split: Split name without the ``.csv`` extension (this notebook uses
            'train', 'test' and 'val').
        splits_root: Directory that contains the per-dataset folders. The
            default is the location this notebook reads from.

    Returns:
        The CSV parsed into a ``pandas.DataFrame``, or ``None`` if
        ``<splits_root>/<folder>/<split>.csv`` does not exist.

    Raises:
        Any error from ``pd.read_csv`` other than a missing file (for example
        an empty or malformed CSV) is propagated, so a corrupt split is not
        silently treated as an absent one.
    """
    try:
        return pd.read_csv(f'{splits_root}/{folder}/{split}.csv')
    except FileNotFoundError:
        # Only a missing file maps to None; everything else should surface.
        return None


# Load the dataset registry; only its 'datasets' column is used.
metadata = pd.read_json('../../../datasets-metadata.json')['datasets']

# One single-key dict per "Question Identification" dataset:
#   {folder: {"train": ..., "test": ..., "val": ...}}
# A split is None when read_dataset could not load its CSV.
datasets = [
    {
        str(entry["folder"]): {
            "train": read_dataset(entry["folder"], 'train'),
            "test": read_dataset(entry["folder"], 'test'),
            "val": read_dataset(entry["folder"], 'val'),
        }
    }
    for entry in metadata
    if entry["task"] == "Question Identification"
]

# Flatten the list of single-key dicts into one folder -> splits mapping.
datasets_dict = {}
for single_entry in datasets:
    datasets_dict.update(single_entry)

# Stack the train, test and val splits of qi_webis_2020_webis_2022_merged
# into a single frame with a fresh 0..n-1 index.
merged_splits = datasets_dict['qi_webis_2020_webis_2022_merged']
qi_webis_2020_webis_2022_merged = pd.concat(
    [merged_splits['train'], merged_splits['test'], merged_splits['val']],
    ignore_index=True,
)

# Rows whose 'question' value already appeared in an earlier row.
repeated_question = qi_webis_2020_webis_2022_merged.duplicated(subset=['question'])
duplicates = qi_webis_2020_webis_2022_merged.loc[repeated_question]

print(f"Number of duplicates in qi_webis_2020_webis_2022_merged: {len(duplicates)}")
print(f"Number of unique questions in qi_webis_2020_webis_2022_merged: {len(qi_webis_2020_webis_2022_merged) - len(duplicates)}")

# Rebuild qi_webis_2020 and qi_webis_2022 by stacking their train, test and
# val splits. ignore_index=True gives every row a unique label; without it
# each split keeps its own 0..k index and the stacked frame has repeated
# labels, which makes label-based lookups ambiguous.
# Per the pandas.concat documentation, None entries in the list are dropped
# silently unless all of them are None.
qi_webis_2020 = pd.concat(
    [datasets_dict['qi_webis_2020']['train'],
     datasets_dict['qi_webis_2020']['test'],
     datasets_dict['qi_webis_2020']['val']],
    ignore_index=True
)
qi_webis_2022 = pd.concat(
    [datasets_dict['qi_webis_2022']['train'],
     datasets_dict['qi_webis_2022']['test'],
     datasets_dict['qi_webis_2022']['val']],
    ignore_index=True
)
print(f"Size of qi_webis_2020: {len(qi_webis_2020)}")
print(f"Size of qi_webis_2022: {len(qi_webis_2022)}")

# Count rows whose 'question' repeats an earlier row within the same corpus.
# `duplicates` is reused, so after this block it holds the qi_webis_2022 rows.
duplicates = qi_webis_2020[qi_webis_2020.duplicated(['question'])]
print(f"Number of duplicates in qi_webis_2020: {len(duplicates)}")
duplicates = qi_webis_2022[qi_webis_2022.duplicated(['question'])]
print(f"Number of duplicates in qi_webis_2022: {len(duplicates)}")

# Exact-match overlap: distinct 'question' values present in both corpora.
qi_webis_2020_questions = set(qi_webis_2020['question'])
qi_webis_2022_questions = set(qi_webis_2022['question'])
identical_questions = qi_webis_2020_questions & qi_webis_2022_questions
print(f"Number of identical questions in qi_webis_2020 and qi_webis_2022: {len(identical_questions)}")


Number of duplicates in qi_webis_2020_webis_2022_merged: 2776
Number of unique questions in qi_webis_2020_webis_2022_merged: 22100
Size of qi_webis_2020: 15000
Size of qi_webis_2022: 9876
Number of duplicates in qi_webis_2020: 0
Number of duplicates in qi_webis_2022: 0
Number of identical questions in qi_webis_2020 and qi_webis_2022: 2776


In [16]:
import pandas as pd

# Every model directory provides a metrics.csv with the header:
#   training on,tested on,model,accuracy,precision,recall,f1
per_model_metrics = []
for model_dir in ['distilbert', 'distilbert-base-uncased', 'microsoft', 'roberta-base']:
    per_model_metrics.append(pd.read_csv(f'{model_dir}/metrics.csv'))

# Stack all models and discard the model column, then average the remaining
# columns for each (training on, tested on) pair.
stacked = pd.concat(per_model_metrics).drop(columns=['model'])
averaged = stacked.groupby(['training on', 'tested on']).mean().reset_index()

# Best average f1 first.
metrics = averaged.sort_values(by='f1', ascending=False)

# Persist the averaged table without the index column.
metrics.to_csv('metrics.csv', index=False)

In [32]:
# Webis datasets
import pandas as pd

metrics = pd.read_csv('metrics.csv')

# Strip 'qi_' from the dataset names in both name columns.
for name_column in ('training on', 'tested on'):
    metrics[name_column] = metrics[name_column].str.replace('qi_', '')

# Drop precision and recall (accuracy and f1 stay), then order the rows
# alphabetically by training set and test set.
metrics = (
    metrics
    .drop(columns=['precision', 'recall'])
    .sort_values(by=['training on', 'tested on'])
)

# Keep rows where either column contains "webis", or where 'training on'
# contains 'all'.
trained_on = metrics['training on']
tested_on = metrics['tested on']
webis_or_all = (
    trained_on.str.contains('webis')
    | tested_on.str.contains('webis')
    | trained_on.str.contains('all')
)
metrics = metrics.loc[webis_or_all]

# Discard every row that contains 'beloucif' in either column.
mentions_beloucif = (
    metrics['training on'].str.contains('beloucif')
    | metrics['tested on'].str.contains('beloucif')
)
metrics = metrics.loc[~mentions_beloucif]

metrics

metrics

Unnamed: 0,training on,tested on,accuracy,f1
4,all,all,0.980971,0.981202
3,all,mintaka,0.992625,0.99267
14,all,webis_2020,0.968,0.969433
7,all,webis_2022,0.973178,0.973176
25,mintaka,webis_2020,0.894167,0.859335
33,mintaka,webis_2022,0.680162,0.646776
6,mintaka_webis_2020_merged,all,0.976277,0.976004
2,mintaka_webis_2020_merged,mintaka,0.993375,0.99338
16,mintaka_webis_2020_merged,webis_2020,0.968,0.967712
21,mintaka_webis_2020_merged,webis_2022,0.932186,0.931972


In [28]:
# add mintaka to the mix
import pandas as pd

metrics = pd.read_csv('metrics.csv')

# Strip 'qi_' from the dataset names in both name columns.
metrics = metrics.assign(**{
    'training on': metrics['training on'].str.replace('qi_', ''),
    'tested on': metrics['tested on'].str.replace('qi_', ''),
})

# Only f1 is compared here: drop accuracy, precision and recall.
metrics = metrics.drop(columns=['accuracy', 'precision', 'recall'])

# Discard every row that contains 'beloucif' in either column.
mentions_beloucif = (
    metrics['training on'].str.contains('beloucif')
    | metrics['tested on'].str.contains('beloucif')
)
metrics = metrics[~mentions_beloucif]

# Keep rows where either column contains "mintaka" or 'all'.
train_col = metrics['training on']
test_col = metrics['tested on']
about_mintaka = train_col.str.contains('mintaka') | test_col.str.contains('mintaka')
about_all = train_col.str.contains('all') | test_col.str.contains('all')
metrics = metrics[about_mintaka | about_all]

# Highest f1 first.
metrics = metrics.sort_values(by='f1', ascending=False)

metrics

Unnamed: 0,training on,tested on,f1
0,mintaka,mintaka,0.994805
1,mintaka_webis_2022_merged,mintaka,0.993409
2,mintaka_webis_2020_merged,mintaka,0.99338
3,all,mintaka,0.99267
4,all,all,0.981202
6,mintaka_webis_2020_merged,all,0.976004
7,all,webis_2022,0.973176
8,mintaka_webis_2022_merged,all,0.97288
9,mintaka_webis_2022_merged,webis_2022,0.971153
11,webis_2020_webis_2022_merged,mintaka,0.970704


In [None]:
#