In [13]:
# here I'm analysing similarities between the datasets
import pandas as pd


def read_dataset(folder: str, split: str):
    """Load one split of a dataset from the shared ``Splits`` directory.

    Args:
        folder: Name of the dataset folder under ``../../../Splits``.
        split: Name of the split file without its ``.csv`` extension
            (the notebook passes 'train', 'test' and 'val').

    Returns:
        The split as a ``pandas.DataFrame``, or ``None`` when the CSV file
        does not exist.

    Raises:
        Any error other than a missing file (for example a malformed or
        empty CSV) is propagated instead of being silently turned into
        ``None``.
    """
    try:
        return pd.read_csv(f'../../../Splits/{folder}/{split}.csv')
    except FileNotFoundError:
        # Only a missing split is an expected situation; everything else
        # should surface so that broken data is not mistaken for "no split".
        return None


# Load the dataset registry: the 'datasets' column of datasets-metadata.json
# holds one entry per dataset, each exposing a "task" and a "folder" field.
metadata = pd.read_json('../../../datasets-metadata.json')['datasets']

# Collect one single-key mapping {folder: {split: frame-or-None}} for every
# "Question Identification" dataset.
datasets = []
for dataset in metadata:
    if dataset["task"] != "Question Identification":
        continue
    folder = dataset["folder"]
    # Try each split in turn; read_dataset yields None when no file is found.
    train, test, val = (read_dataset(folder, split) for split in ('train', 'test', 'val'))
    datasets.append({f"{folder}": {"train": train, "test": test, "val": val}})

# Flatten the list of single-key mappings into one folder -> splits lookup.
datasets_dict = {name: splits for entry in datasets for name, splits in entry.items()}

# Stack the train, test and val splits of qi_webis_2020_webis_2022_merged
# into a single frame with a fresh 0..n-1 index.
merged_splits = datasets_dict['qi_webis_2020_webis_2022_merged']
qi_webis_2020_webis_2022_merged = pd.concat(
    [merged_splits[split] for split in ('train', 'test', 'val')],
    ignore_index=True,
)

# Rows whose 'question' value already occurred earlier in the merged frame
# (the first occurrence of each question is not flagged).
is_repeated_question = qi_webis_2020_webis_2022_merged.duplicated(['question'])
duplicates = qi_webis_2020_webis_2022_merged[is_repeated_question]

print(f"Number of duplicates in qi_webis_2020_webis_2022_merged: {len(duplicates)}")
print(f"Number of unique questions in qi_webis_2020_webis_2022_merged: {len(qi_webis_2020_webis_2022_merged) - len(duplicates)}")

# Rebuild the two source datasets from their splits and report their sizes.
qi_webis_2020 = pd.concat(
    [datasets_dict['qi_webis_2020'][split] for split in ('train', 'test', 'val')]
)
qi_webis_2022 = pd.concat(
    [datasets_dict['qi_webis_2022'][split] for split in ('train', 'test', 'val')]
)
print(f"Size of qi_webis_2020: {len(qi_webis_2020)}")
print(f"Size of qi_webis_2022: {len(qi_webis_2022)}")

# Look for repeated 'question' values inside each dataset on its own.
repeated_in_2020 = qi_webis_2020.duplicated(['question'])
duplicates = qi_webis_2020[repeated_in_2020]
print(f"Number of duplicates in qi_webis_2020: {len(duplicates)}")

repeated_in_2022 = qi_webis_2022.duplicated(['question'])
duplicates = qi_webis_2022[repeated_in_2022]
print(f"Number of duplicates in qi_webis_2022: {len(duplicates)}")

# Count the questions that appear in both qi_webis_2020 and qi_webis_2022,
# separately for each label. The comparison is case-insensitive: question
# text is lowercased before the two sets are intersected.
# Label 1 is processed first, then label 0, so after the loop the three
# variables below hold the label-0 values.
for label in (1, 0):
    qi_webis_2020_questions = set(
        qi_webis_2020.loc[qi_webis_2020['label'] == label, 'question'].str.lower()
    )
    qi_webis_2022_questions = set(
        qi_webis_2022.loc[qi_webis_2022['label'] == label, 'question'].str.lower()
    )
    identical_questions = qi_webis_2020_questions.intersection(qi_webis_2022_questions)
    # Name the label in the message so the two result lines can be told apart.
    print(
        f"Number of identical questions in qi_webis_2020 and qi_webis_2022 "
        f"(label {label}): {len(identical_questions)}"
    )


Number of duplicates in qi_webis_2020_webis_2022_merged: 2776
Number of unique questions in qi_webis_2020_webis_2022_merged: 22100
Size of qi_webis_2020: 15000
Size of qi_webis_2022: 9876
Number of duplicates in qi_webis_2020: 0
Number of duplicates in qi_webis_2022: 0
Number of identical questions in qi_webis_2020 and qi_webis_2022: 1358
Number of identical questions in qi_webis_2020 and qi_webis_2022: 1418


In [14]:
import pandas as pd

# Every listed folder contains a metrics.csv with the header:
# training on,tested on,model,accuracy,precision,recall,f1
metric_folders = ['distilbert', 'distilbert-base-uncased', 'microsoft', 'roberta-base']
per_folder_metrics = [pd.read_csv(f'{folder}/metrics.csv') for folder in metric_folders]

# Average accuracy, precision, recall and f1 over the models for each
# (training on, tested on) pair, then order the pairs by mean f1, best first.
metrics = (
    pd.concat(per_folder_metrics)
    .drop(columns=['model'])
    .groupby(['training on', 'tested on'])
    .mean()
    .reset_index()
    .sort_values(by='f1', ascending=False)
)

# Persist the averaged table without the row index.
metrics.to_csv('metrics.csv', index=False)

In [19]:
# add mintaka to the mix
import pandas as pd

metrics = pd.read_csv('metrics.csv')

# Strip the 'qi_' marker from the dataset names in both name columns.
for column in ('training on', 'tested on'):
    metrics[column] = metrics[column].str.replace('qi_', '')

metrics = (
    metrics
    # keep only the f1 score next to the two dataset-name columns
    .drop(columns=['accuracy', 'precision', 'recall'])
    # best f1 first
    .sort_values(by='f1', ascending=False)
    # restrict to rows whose 'tested on' value contains 'beloucif'
    .loc[lambda frame: frame['tested on'].str.contains('beloucif')]
)

metrics

Unnamed: 0,training on,tested on,f1
27,all,beloucif,0.793891
28,mintaka_webis_2022_merged,beloucif,0.728907
29,webis_2020_webis_2022_merged,beloucif,0.725035
30,webis_2020,beloucif,0.706855
31,webis_2022,beloucif,0.696184
32,mintaka_webis_2020_merged,beloucif,0.684501
34,mintaka,beloucif,0.602871


In [17]:
import pandas as pd

predictions1 = pd.read_csv('microsoft/qi_all_qi_beloucif_microsoft_test_results.csv')
# NOTE(review): predictions2 is loaded but not used in the visible cells.
predictions2 = pd.read_csv('microsoft/qi_webis_2020_webis_2022_merged_qi_all_microsoft_test_results.csv')

# File columns: question,label,predictions
# Keep only the rows where the prediction disagrees with the label.
is_misclassified = predictions1['label'].ne(predictions1['predictions'])
questions1 = predictions1.loc[is_misclassified]

questions1


Unnamed: 0,question,label,predictions
15,"What is the difference between a king , an emp...",0,1
22,How are humans able to tell the difference bet...,0,1
25,Why are some FM radio stations clearer on some...,0,1
31,why doesn't China just conquer Mongolia when i...,0,1
41,Why do I get more likes on a tweet than the re...,0,1
...,...,...,...
770,Why do businesses use that thinner than paper ...,0,1
772,"What are some books that are , in your opinion...",0,1
777,Why can states pass laws that are more restric...,0,1
791,Who is the motivational speaker in the motivat...,0,1


In [26]:
import pandas as pd

metrics = pd.read_csv('metrics.csv')

# Flag every row whose 'tested on' value contains the word 'all'.
tested_on_all = metrics['tested on'].str.contains('all')

# Drop the flagged rows, then build an accuracy table with one row per
# 'training on' value and one column per 'tested on' value, rounded to
# 2 decimal places.
metrics = (
    metrics.loc[~tested_on_all]
    .pivot(index='training on', columns='tested on', values='accuracy')
    .round(2)
)

metrics

tested on,qi_beloucif,qi_mintaka,qi_webis_2020,qi_webis_2022
training on,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
qi_all,0.8,0.99,0.97,0.97
qi_mintaka,0.63,0.99,0.89,0.68
qi_mintaka_webis_2020_merged,0.7,0.99,0.97,0.93
qi_mintaka_webis_2022_merged,0.74,0.99,0.94,0.97
qi_webis_2020,0.72,0.75,0.97,0.88
qi_webis_2020_webis_2022_merged,0.74,0.97,0.97,0.98
qi_webis_2022,0.72,0.96,0.94,0.97


In [31]:
import pandas as pd

# Every listed folder contains a metrics.csv with the header:
# training on,tested on,model,accuracy,precision,recall,f1
metric_folders = ['distilbert', 'distilbert-base-uncased', 'microsoft', 'roberta-base']
per_folder_metrics = [pd.read_csv(f'{folder}/metrics.csv') for folder in metric_folders]

# Compare the models on the beloucif test set: one row per 'training on'
# value, one column per model, cells holding the f1 score rounded to
# 2 decimal places.
metrics = (
    pd.concat(per_folder_metrics)
    .sort_values(by='f1', ascending=False)
    .loc[lambda frame: frame['tested on'].str.contains('beloucif')]
    .pivot(index='training on', columns='model', values='f1')
    .round(2)
)

metrics

model,distilbert,distilbert-base-uncased,microsoft,roberta-base
training on,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
qi_all,0.79,0.77,0.78,0.83
qi_mintaka,0.58,0.64,0.62,0.57
qi_mintaka_webis_2020_merged,0.66,0.71,0.67,0.69
qi_mintaka_webis_2022_merged,0.68,0.76,0.8,0.68
qi_webis_2020,0.69,0.7,0.68,0.77
qi_webis_2020_webis_2022_merged,0.66,0.79,0.78,0.66
qi_webis_2022,0.66,0.7,0.7,0.73
