## Imports

In [1]:
from pathlib import Path
import json
import csv
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

## Dictionaries

In [2]:
# policy_dict = {'Credit' : 'Credit', 'Direct' : 'Direct payment', 'Fine' : 'Fine', 'General' : 'Unknown', 
#                'Guarantee' : 'Credit', 'Supplies' : 'Supplies', 'Tax' : 'Tax deduction', 
#                'Technical' : 'Technical assistance', 'Unknown' : 'Unknown', 'Other' : 'Unknown', 'Nan' : 'Unknown' }
# policy_counter = {"Credit" : 0, "Direct payment" : 0, "Fine" : 0, "Supplies" : 0, "Tax deduction" : 0, "Technical assistance" : 0}
# incentive_counter = {"Incentive" : 0, "not_Incentive" : 0}

policy_dict = {'Loan' : 'Loan', 'Direct' : 'Direct payment', 'Fine' : 'Fine', 'General' : 'Unknown', 
               'Guarantee' : 'Credit', 'Supplies' : 'Supplies', 'Tax' : 'Tax benefit', 
               'Technical' : 'Technical assistance', 'Unknown' : 'Unknown', 'Other' : 'Unknown', 'Nan' : 'Unknown' }
policy_counter = {"Credit" : 0, "Direct payment" : 0, "Fine" : 0, "Supplies" : 0, "Tax benefit" : 0, "Technical assistance" : 0}
incentive_counter = {"Incentive" : 0, "not_Incentive" : 0}

incentive_dict = {'Incentive' : 'Incentive', 'Disincentive' : 'Incentive', 'Unknown' : 'not_Incentive', 'Nan' : 'not_Incentive'}

## Functions

In [10]:

def clean_df(df, reference_column_to_drop_na):
    df.dropna(subset = [reference_column_to_drop_na], inplace = True)
    df.dropna(axis=1, how='all', inplace = True)
    df.replace(np.nan, 'Nan', regex=True, inplace = True)
    return df

def process_new_labels(filename, reference_column_to_drop_na, policy):
    df_temp = pd.concat(pd.read_excel(filename, engine='openpyxl', sheet_name = None, skiprows=[0]), ignore_index = True)
    df_temp = clean_df(df_temp, reference_column_to_drop_na)
    df_temp.insert(0, "Document", df_temp.apply(lambda row: row.Sentence_Id.split("_")[0], axis = 1))
    df_temp.loc[df_temp['Is_policy'] == 0, 'Is_policy'] = "Nan"
    df_temp.loc[df_temp['Is_policy'] == 1, 'Is_policy'] = policy
    if "Other_instrument"in df_temp.columns:
        df_temp['Is_policy'] = np.where(df_temp['Is_policy'] == "Nan", df_temp['Other_instrument'], df_temp['Is_policy'])
    df_temp.loc[df_temp['Is_incentive'] == 0, 'Is_incentive'] = "Unknown"
    df_temp.loc[df_temp['Is_incentive'] == 1, 'Is_incentive'] = "Incentive"
    return df_temp

def label_cleaning(dictionary, label):
    flag = 1
    
    for key in dictionary:
#         print(key, "----", label)
        if key in label:
            return dictionary[key]
            flag = 0
            break
    if flag == 1:
#         print(label)
        return label

def merge_excel_to_list(filename):
    df = process_new_labels(filename, "Is_incentive", policy_instrument)
    List = df[["Document", "Sentence_Id", "Sentence", "Is_policy", "Is_incentive"]].values.tolist()
    
    return List

def merge_excel_to_list_new(filename):
    flag = False
    
    for policy in policy_dict:
        if policy in filename:
            policy_instrument = policy
            flag = True
    if flag:
#         print(filename)
        df = process_new_labels(filename, "Is_incentive", policy_instrument)
        List = df[["Document", "Sentence_Id", "Sentence", "Is_policy", "Is_incentive"]].values.tolist()
    
    return List

def merge_excel_to_list_old(filename):
    df = pd.concat(pd.read_excel(filename, engine='openpyxl', sheet_name = None), ignore_index = True)
    df = clean_df(df, "Document")
    df = df[["Document", "Sentence", "Primary_Instrument", "Category"]]
    data = df.values.tolist()
    
    return data


def list_to_dict(List, dictionary, dataset):
    Dictionary = {}
    i = 0
    for item in List:
        i += 1
        if dataset == "Old":
            Dictionary[str(i)] = {"text" : item[1], "labels" : label_cleaning(policy_dict, item [2]), "incentive": label_cleaning(incentive_dict, item [3])}
        elif dataset == "New":
            dictionary[item[1]] = {"text" : item[2], "labels" : label_cleaning(policy_dict, item [3]), "incentive": label_cleaning(incentive_dict, item [4])}
    if dataset == "Old":
        return Dictionary
    elif dataset == "New":
        return dictionary
        
    

def dictionary_to_final_lists(dictionary, classifier):
    all_sents = []
    all_labels = []
        
    for item in dictionary:
        if classifier == "Binary":
            all_sents.append(dictionary[item]["text"])
            all_labels.append(dictionary[item]["incentive"])
        elif classifier == "Multiclass":
            if dictionary[item]["labels"] != "Unknown":
                all_sents.append(dictionary[item]["text"])
                all_labels.append(dictionary[item]["labels"])
    
    return all_sents, all_labels
        

def save_data(X_train, y_train, X_test, y_test, experiment, classifier):
    base_path = "C:/Users/user/Google Drive/Els_meus_documents/projectes/CompetitiveIntelligence/WRI/Notebooks/Data/Final_input_data/"
    
    path = base_path + classifier + "/"
    
    filename = "{}_train_sentences.csv".format(experiment)
    file = path + filename
    with open(file, 'w', newline='', encoding='utf-8') as myfile:
        wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
        wr.writerows(zip(X_train))

    filename = "{}_train_labels.csv".format(experiment)
    file = path + filename
    with open(file, 'w', newline='', encoding='utf-8') as myfile:
        wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
        wr.writerows(zip(y_train))

    filename = "{}_test_sentences.csv".format(experiment)
    file = path + filename
    with open(file, 'w', newline='', encoding='utf-8') as myfile:
        wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
        wr.writerows(zip(X_test))

    filename = "{}_test_labels.csv".format(experiment)
    file = path + filename
    with open(file, 'w', newline='', encoding='utf-8') as myfile:
        wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
        wr.writerows(zip(y_test))
        
def load_dictionary(file):
    with open(file, 'r') as f:
        dictionary = json.load(f)
    return dictionary

In [11]:
path = "/home/propietari/Documents/GitHub/policy-data-analyzer/tasks/data_augmentation/input/"
filename = "pre_labeled_English_ready_short.xlsx"
file = path + filename

incentive_list = merge_excel_to_list(file)

NameError: name 'policy_instrument' is not defined

## Raters multiclass

In [None]:

test_perc = 0.2
rater = "Rater1"
classifier = "Multiclass"
base_path = Path("C:/Users/user/Google Drive/Els_meus_documents/projectes/CompetitiveIntelligence/WRI/Notebooks/Data/Final_input_data/{}/".format(rater))
data_path = base_path.glob('**/')

for path in data_path:
#         results_list = []    
    path_in_str = str(path)
#         print(path_in_str)
    if rater in path_in_str:
        dictionary = {}
#         print(path_in_str)
        for file_obj in Path(path_in_str).glob('*.xlsx'):
            file = str(file_obj)
            if "Unique" in file:
                print(file)
                dictionary_new_labels = list_to_dict(merge_excel_to_list_new(file), dictionary, "New")
            else:
#                 print(file)
                dictionary_old_labels = list_to_dict(merge_excel_to_list_old(file), dictionary, "Old")
                all_sents_old, all_labels_old = dictionary_to_final_lists(dictionary_old_labels, classifier)
                
        all_sents_new, all_labels_new = dictionary_to_final_lists(dictionary_new_labels, classifier)
print(len(all_sents_old), " -- ", len(all_labels_old))
print(len(all_sents_new), " -- ", len(all_labels_new))

In [None]:
experiments = ["EXP3", "EXP9", "EXP15"]
if rater == "Rater3" and classifier == "Multiclass":
    for experiment in experiments:
        if experiment == "EXP3":
            X_train, X_test, y_train, y_test = train_test_split(all_sents_new, all_labels_new, test_size=test_perc, stratify=all_labels_new, random_state=69420)
            save_data(X_train, y_train, X_test, y_test, experiment, classifier)
        if experiment == "EXP9":
            save_data(all_sents_new, all_labels_new, all_sents_old, all_labels_old, experiment, classifier)
        if experiment == "EXP15":
            all_sents = all_sents_new + all_sents_old
            all_labels = all_labels_new + all_labels_old
            X_train, X_test, y_train, y_test = train_test_split(all_sents, all_labels, test_size=test_perc, stratify=all_labels, random_state=69420)
            save_data(X_train, y_train, X_test, y_test, experiment, classifier)

In [None]:
experiments = ["EXP1", "EXP7", "EXP13"]
if rater == "Rater1" and classifier == "Multiclass":
    for experiment in experiments:
        if experiment == "EXP1":
            X_train, X_test, y_train, y_test = train_test_split(all_sents_new, all_labels_new, test_size=test_perc, stratify=all_labels_new, random_state=69420)
            save_data(X_train, y_train, X_test, y_test, experiment, classifier)
        if experiment == "EXP7":
            save_data(all_sents_new, all_labels_new, all_sents_old, all_labels_old, experiment, classifier)
        if experiment == "EXP13":
            all_sents = all_sents_new + all_sents_old
            all_labels = all_labels_new + all_labels_old
            X_train, X_test, y_train, y_test = train_test_split(all_sents, all_labels, test_size=test_perc, stratify=all_labels, random_state=69420)
            save_data(X_train, y_train, X_test, y_test, experiment, classifier)

In [None]:
experiments = ["EXP2", "EXP8", "EXP14"]
if rater == "Rater2" and classifier == "Multiclass":
    for experiment in experiments:
        if experiment == "EXP2":
            X_train, X_test, y_train, y_test = train_test_split(all_sents_new, all_labels_new, test_size=test_perc, stratify=all_labels_new, random_state=69420)
            save_data(X_train, y_train, X_test, y_test, experiment, classifier)
        if experiment == "EXP8":
            save_data(all_sents_new, all_labels_new, all_sents_old, all_labels_old, experiment, classifier)
        if experiment == "EXP14":
            all_sents = all_sents_new + all_sents_old
            all_labels = all_labels_new + all_labels_old
            X_train, X_test, y_train, y_test = train_test_split(all_sents, all_labels, test_size=test_perc, stratify=all_labels, random_state=69420)
            save_data(X_train, y_train, X_test, y_test, experiment, classifier)

In [None]:
raters = ["Rater1", "Rater2", "Rater3"]
only_raters = [True, False]
# base_path = Path("C:/Users/jordi/Google Drive/Els_meus_documents/projectes/CompetitiveIntelligence/WRI/Notebooks/Data/Final_input_data/")
base_path = Path("C:/Users/user/Google Drive/Els_meus_documents/projectes/CompetitiveIntelligence/WRI/Notebooks/Data/Final_input_data/")

for rater in raters:
    for only_rater in only_raters:
        wraping_up(base_path, rater, only_rater)

## Merges multiclass

In [None]:
raters = {"Rater1" : rater1, "Rater2" : rater2, "Rater3" : rater3}
test_perc = 0.2
classifier = "Multiclass"

all = {}
rater1 = {}
rater2 = {}
rater3 = {}

for rater in raters:
    base_path = Path("C:/Users/user/Google Drive/Els_meus_documents/projectes/CompetitiveIntelligence/WRI/Notebooks/Data/Final_input_data/{}/".format(rater))
    data_path = base_path.glob('**/')

    for path in data_path:
    #         results_list = []    
        path_in_str = str(path)
    #         print(path_in_str)
        if rater in path_in_str:
            dictionary = {}
    #         print(path_in_str)
            for file_obj in Path(path_in_str).glob('*.xlsx'):
                file = str(file_obj)
                if "Unique" in file:
                    dictionary_new_labels = list_to_dict(merge_excel_to_list_new(file), dictionary, "New")
                else:
                    dictionary_old_labels = list_to_dict(merge_excel_to_list_old(file), dictionary, "Old")
                    all_old, all_labels_old = dictionary_to_final_lists(dictionary_old_labels, classifier)
                    
            if "Rater1" in path_in_str:
                rater1 = dictionary_new_labels
            elif "Rater2" in path_in_str:
                rater2 = dictionary_new_labels          
            elif "Rater3" in path_in_str:
                rater3 = dictionary_new_labels
all = {**rater1, **rater2, **rater3}

print(len(rater1), len(rater2), len(rater3), len(all))

# Next we loop for the dictionary to find the elements that meet the criteria of the different merges.

incentive = "labels" #write "labels" if you want to work with policy instruments, write "incentive" to work with is_incentive
merge1 = {}
merge2 = {}
merge3 = {}
# classifier = "labels" #If you want to classify by is_incentive

i = 0
j = 0
k = 0
l = 0
m = 0
n = 0
# First we look for sentences that all three raters have labeled the same
for sent in all:
    i += 1
    if sent in rater1 and sent in rater2 and sent in rater3:
        j += 1
        if rater1[sent]["labels"] == rater2[sent]["labels"] and rater2[sent]["labels"] == rater3[sent]["labels"]:
            k += 1
            if rater1[sent]["labels"] != 'Unknown':
                merge3[sent] = rater1[sent]

#Now we look for the sentences that at least two raters have labeled the same
for sent in all:
    if sent in rater1 and sent in rater2:
        l += 1
        if rater1[sent]["labels"] == rater2[sent]["labels"]:
            if rater1[sent]["labels"] != 'Unknown':
                m += 1
                merge2[sent] = rater1[sent]
    elif sent in rater1 and sent in rater3:
        l += 1
        if rater1[sent]["labels"] == rater3[sent]["labels"]:
            if rater1[sent]["labels"] != 'Unknown':
                m += 1
                merge2[sent] = rater1[sent]
    elif sent in rater2 and sent in rater3:
        l += 1
        if rater2[sent]["labels"] == rater3[sent]["labels"]:
            if rater2[sent]["labels"] != 'Unknown':
                m += 1
                merge2[sent] = rater2[sent]
            
# Finally we build a dataset containing the sentences that at least one of the labelers have labeled with a label different from "Unknown"
for sent in all:
    label = {}
    if sent in rater3 and rater3[sent]["labels"] != 'Unknown':
#         label[rater3[sent]["labels"]] = "rater3"
        merge1[sent] = rater3[sent]
    elif sent in rater2 and rater2[sent]["labels"] != 'Unknown':
#         label[rater2[sent]["labels"]] = "rater2"
        merge1[sent] = rater2[sent]
    elif sent in rater1 and rater1[sent]["labels"] != 'Unknown':
#         label[rater1[sent]["labels"]] = "rater1"
        merge1[sent] = rater1[sent]
#     else:
#         merge1[sent] = all[sent]

print(f"In the all-sentences dict there are {i} sentences")
print(f"In the three raters lists there are {j} common sentences")
print(f"In the three raters lists there are {k} common sentences which are rated identically")
print(f"There are {l} sentences which are comon to at lest two rater's datasets")
print(f"There are {m} sentences which are labeled identical in at least two rater's datasets")
print(len(merge3))
print(len(merge2))
print(len(merge1))

merges = {"Merge1" : merge1, "Merge2" : merge2, "Merge3" : merge3}
for merge in merges:
    all_new = []
    all_labels_new = []
    for sent in merges[merge]:
        all_new.append(merges[merge][sent]["text"])
        all_labels_new.append(merges[merge][sent]["labels"])
    print(merge)
    if merge == "Merge1" and classifier == "Multiclass":
        experiments = ["EXP4", "EXP10", "EXP16"]
        for experiment in experiments:
            if experiment == "EXP4":
                X_train, X_test, y_train, y_test = train_test_split(all_new, all_labels_new, test_size=test_perc, stratify=all_labels_new, random_state=69420)
                save_data(X_train, y_train, X_test, y_test, experiment, classifier)
            if experiment == "EXP10":
                save_data(all_new, all_labels_new, all_old, all_labels_old, experiment, classifier)
            if experiment == "EXP16":
                all = all_new + all_old
                all_labels = all_labels_new + all_labels_old
                X_train, X_test, y_train, y_test = train_test_split(all, all_labels, test_size=test_perc, stratify=all_labels, random_state=69420)
                save_data(X_train, y_train, X_test, y_test, experiment, classifier)
    
    if merge == "Merge2" and classifier == "Multiclass":
        experiments = ["EXP5", "EXP11", "EXP17"]
        for experiment in experiments:
            if experiment == "EXP5":
                X_train, X_test, y_train, y_test = train_test_split(all_new, all_labels_new, test_size=test_perc, stratify=all_labels_new, random_state=69420)
                save_data(X_train, y_train, X_test, y_test, experiment, classifier)
            if experiment == "EXP11":
                save_data(all_new, all_labels_new, all_old, all_labels_old, experiment, classifier)
            if experiment == "EXP17":
                all = all_new + all_old
                all_labels = all_labels_new + all_labels_old
                X_train, X_test, y_train, y_test = train_test_split(all, all_labels, test_size=test_perc, stratify=all_labels, random_state=69420)
                save_data(X_train, y_train, X_test, y_test, experiment, classifier)
                
    if merge == "Merge3" and classifier == "Multiclass":
        experiments = ["EXP6", "EXP12", "EXP18"]
        for experiment in experiments:
            if experiment == "EXP6":
                X_train, X_test, y_train, y_test = train_test_split(all_new, all_labels_new, test_size=test_perc, stratify=all_labels_new, random_state=69420)
                save_data(X_train, y_train, X_test, y_test, experiment, classifier)
            if experiment == "EXP12":
                save_data(all_new, all_labels_new, all_old, all_labels_old, experiment, classifier)
            if experiment == "EXP18":
                all = all_new + all_old
                all_labels = all_labels_new + all_labels_old
                X_train, X_test, y_train, y_test = train_test_split(all, all_labels, test_size=test_perc, stratify=all_labels, random_state=69420)
                save_data(X_train, y_train, X_test, y_test, experiment, classifier)


In [None]:
merge2

## Raters binary

In [None]:
test_perc = 0.2
# rater = "Rater1"
classifier = "Binary"
all = {}
rater1 = {}
rater2 = {}
rater3 = {}
raters = {"Rater1" : rater1, "Rater2" : rater2, "Rater3" : rater3}

for rater in raters:
    base_path = Path("C:/Users/user/Google Drive/Els_meus_documents/projectes/CompetitiveIntelligence/WRI/Notebooks/Data/Final_input_data/{}/".format(rater))
    data_path = base_path.glob('**/')

    for path in data_path:
    #         results_list = []    
        path_in_str = str(path)
    #         print(path_in_str)
        if rater in path_in_str:
            dictionary = {}
    #         print(path_in_str)
            for file_obj in Path(path_in_str).glob('*.xlsx'):
                file = str(file_obj)
                if "Unique" in file:
#                     print(file)
                    dictionary_new_labels = list_to_dict(merge_excel_to_list_new(file), dictionary, "New")
                else:
    #                 print(file)
                    dictionary_old_labels = list_to_dict(merge_excel_to_list_old(file), dictionary, "Old")
                    all_sents_old, all_labels_old = dictionary_to_final_lists(dictionary_old_labels, classifier)
            if "Rater1" in path_in_str:
                raters[rater] = dictionary_new_labels
            elif "Rater2" in path_in_str:
                raters[rater] = dictionary_new_labels          
            elif "Rater3" in path_in_str:
                raters[rater] = dictionary_new_labels
# all = {**rater1, **rater2, **rater3}

    unknown = 0
    known = 0
    bugs = 0
    all_sents_new = []
    all_labels_new = []
    for sent in raters[rater]:
        all_sents_new.append(raters[rater][sent]["text"])
        all_labels_new.append(raters[rater][sent]["incentive"])
        if raters[rater][sent]["incentive"] == "not_Incentive":
            unknown += 1
        elif raters[rater][sent]["incentive"] == "Incentive":
            known += 1
        else:
            bugs += 1
    print(rater)
    print(len(raters[rater]))
    print("Incentive", known, " -- not_Incentive", unknown, " -- Others", bugs)

    if rater == "Rater1" and classifier == "Binary":
        experiments = ["EXP19", "EXP25"]
        for experiment in experiments:
            if experiment == "EXP19":
                X_train, X_test, y_train, y_test = train_test_split(all_sents_new, all_labels_new, test_size=test_perc, stratify=all_labels_new, random_state=69420)
                save_data(X_train, y_train, X_test, y_test, experiment, classifier)
            if experiment == "EXP25":
                all_sents = all_sents_new + all_sents_old
                all_labels = all_labels_new + all_labels_old
                X_train, X_test, y_train, y_test = train_test_split(all_sents, all_labels, test_size=test_perc, stratify=all_labels, random_state=69420)
                save_data(X_train, y_train, X_test, y_test, experiment, classifier)
                
    if rater == "Rater2" and classifier == "Binary":
        experiments = ["EXP20", "EXP26"]
        for experiment in experiments:
            if experiment == "EXP20":
                X_train, X_test, y_train, y_test = train_test_split(all_sents_new, all_labels_new, test_size=test_perc, stratify=all_labels_new, random_state=69420)
                save_data(X_train, y_train, X_test, y_test, experiment, classifier)
            if experiment == "EXP26":
                all_sents = all_sents_new + all_sents_old
                all_labels = all_labels_new + all_labels_old
                X_train, X_test, y_train, y_test = train_test_split(all_sents, all_labels, test_size=test_perc, stratify=all_labels, random_state=69420)
                save_data(X_train, y_train, X_test, y_test, experiment, classifier)
                
    if rater == "Rater3" and classifier == "Binary":
        experiments = ["EXP21", "EXP27"]
        for experiment in experiments:
            if experiment == "EXP21":
                X_train, X_test, y_train, y_test = train_test_split(all_sents_new, all_labels_new, test_size=test_perc, stratify=all_labels_new, random_state=69420)
                save_data(X_train, y_train, X_test, y_test, experiment, classifier)
            if experiment == "EXP27":
                all_sents = all_sents_new + all_sents_old
                all_labels = all_labels_new + all_labels_old
                X_train, X_test, y_train, y_test = train_test_split(all_sents, all_labels, test_size=test_perc, stratify=all_labels, random_state=69420)
                save_data(X_train, y_train, X_test, y_test, experiment, classifier)

## Merges binary

In [None]:
test_perc = 0.2
# rater = "Rater1"
classifier = "Binary"
all = {}
rater1 = {}
rater2 = {}
rater3 = {}
raters = {"Rater1" : rater1, "Rater2" : rater2, "Rater3" : rater3}

for rater in raters:
    base_path = Path("C:/Users/user/Google Drive/Els_meus_documents/projectes/CompetitiveIntelligence/WRI/Notebooks/Data/Final_input_data/{}/".format(rater))
    data_path = base_path.glob('**/')

    for path in data_path:
    #         results_list = []    
        path_in_str = str(path)
    #         print(path_in_str)
        if rater in path_in_str:
            dictionary = {}
    #         print(path_in_str)
            for file_obj in Path(path_in_str).glob('*.xlsx'):
                file = str(file_obj)
                if "Unique" in file:
#                     print(file)
                    dictionary_new_labels = list_to_dict(merge_excel_to_list_new(file), dictionary, "New")
                else:
    #                 print(file)
                    dictionary_old_labels = list_to_dict(merge_excel_to_list_old(file), dictionary, "Old")
                    all_sents_old, all_labels_old = dictionary_to_final_lists(dictionary_old_labels, classifier)
            if "Rater1" in path_in_str:
                rater1 = dictionary_new_labels
            elif "Rater2" in path_in_str:
                rater2 = dictionary_new_labels          
            elif "Rater3" in path_in_str:
                rater3 = dictionary_new_labels

all = {**rater1, **rater2, **rater3}


incentive = "incentive" #write incentive if you want to work with policy instruments, write "incentive" to work with is_incentive
merge1 = {}
merge2 = {}
merge3 = {}
# classifier = incentive #If you want to classify by is_incentive

i = 0
j = 0
k = 0
l = 0
m = 0
n = 0
# First we look for sentences that all three raters have labeled the same
for sent in all:
    i += 1
    if sent in rater1 and sent in rater2 and sent in rater3:
        j += 1
        if rater1[sent][incentive] == rater2[sent][incentive] and rater2[sent][incentive] == rater3[sent][incentive]:
            k += 1
            merge3[sent] = rater1[sent]

#Now we look for the sentences that at least two raters have labeled the same
for sent in all:
    if sent in rater1 and sent in rater2:
        l += 1
        if rater1[sent][incentive] == rater2[sent][incentive]:
            m += 1
            merge2[sent] = rater1[sent]
    elif sent in rater1 and sent in rater3:
        l += 1
        if rater1[sent][incentive] == rater3[sent][incentive]:
            m += 1
            merge2[sent] = rater1[sent]
    elif sent in rater2 and sent in rater3:
        l += 1
        if rater2[sent][incentive] == rater3[sent][incentive]:
            m += 1
            merge2[sent] = rater2[sent]
            
# Finally we build a dataset containing the sentences that at least one of the labelers have labeled
for sent in all:
    label = {}
    if sent in rater3:
#         label[rater3[sent][incentive]] = "rater3"
        merge1[sent] = rater3[sent]
    elif sent in rater2:
#         label[rater2[sent][incentive]] = "rater2"
        merge1[sent] = rater2[sent]
    elif sent in rater1:
#         label[rater1[sent][incentive]] = "rater1"
        merge1[sent] = rater1[sent]
#     else:
#         merge1[sent] = all[sent]

print(f"In the all-sentences dict there are {i} sentences")
print(f"In the three raters lists there are {j} common sentences")
print(f"In the three raters lists there are {k} common sentences which are rated identically")
print(f"There are {l} sentences which are comon to at lest two rater's datasets")
print(f"There are {m} sentences which are labeled identical in at least two rater's datasets")
print(len(merge3))
print(len(merge2))
print(len(merge1))

merges = {"Merge1" : merge1, "Merge2" : merge2, "Merge3" : merge3}
for merge in merges:
    all_sents_new = []
    all_labels_new = []
    for sent in merges[merge]:
        all_sents_new.append(merges[merge][sent]["text"])
        all_labels_new.append(merges[merge][sent][incentive])
    print(merge)
    
    if merge == "Merge1" and classifier == "Binary":
        experiments = ["EXP22", "EXP28"]
        for experiment in experiments:
            if experiment == "EXP22":
                X_train, X_test, y_train, y_test = train_test_split(all_sents_new, all_labels_new, test_size=test_perc, stratify=all_labels_new, random_state=69420)
                save_data(X_train, y_train, X_test, y_test, experiment, classifier)
            if experiment == "EXP28":
                all_sents = all_sents_new + all_sents_old
                all_labels = all_labels_new + all_labels_old
                X_train, X_test, y_train, y_test = train_test_split(all_sents, all_labels, test_size=test_perc, stratify=all_labels, random_state=69420)
                save_data(X_train, y_train, X_test, y_test, experiment, classifier)
                
    if merge == "Merge2" and classifier == "Binary":
        experiments = ["EXP23", "EXP29"]
        for experiment in experiments:
            if experiment == "EXP23":
                X_train, X_test, y_train, y_test = train_test_split(all_sents_new, all_labels_new, test_size=test_perc, stratify=all_labels_new, random_state=69420)
                save_data(X_train, y_train, X_test, y_test, experiment, classifier)
            if experiment == "EXP29":
                all_sents = all_sents_new + all_sents_old
                all_labels = all_labels_new + all_labels_old
                X_train, X_test, y_train, y_test = train_test_split(all_sents, all_labels, test_size=test_perc, stratify=all_labels, random_state=69420)
                save_data(X_train, y_train, X_test, y_test, experiment, classifier)
                
    if merge == "Merge3" and classifier == "Binary":
        experiments = ["EXP24", "EXP30"]
        for experiment in experiments:
            if experiment == "EXP24":
                X_train, X_test, y_train, y_test = train_test_split(all_sents_new, all_labels_new, test_size=test_perc, stratify=all_labels_new, random_state=69420)
                save_data(X_train, y_train, X_test, y_test, experiment, classifier)
            if experiment == "EXP30":
                all_sents = all_sents_new + all_sents_old
                all_labels = all_labels_new + all_labels_old
                X_train, X_test, y_train, y_test = train_test_split(all_sents, all_labels, test_size=test_perc, stratify=all_labels, random_state=69420)
                save_data(X_train, y_train, X_test, y_test, experiment, classifier)

In [None]:
all