## Imports

In [1]:
from pathlib import Path
import json
import csv
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

## Dictionaries

In [2]:
policy_dict = {'Credit' : 'Credit', 'Direct' : 'Direct payment', 'Fine' : 'Fine', 'General' : 'Unknown', 
               'Guarantee' : 'Credit', 'Supplies' : 'Supplies', 'Tax' : 'Tax deduction', 
               'Technical' : 'Technical assistance', 'Unknown' : 'Unknown', 'Other' : 'Unknown', 'Nan' : 'Unknown' }

incentive_dict = {'Incentive' : 'Incentive', 'Disincentive' : 'Incentive', 'Unknown' : 'not_Incentive', 'Nan' : 'not_Incentive'}

## Functions

In [3]:

def clean_df(df, reference_column_to_drop_na):
    df.dropna(subset = [reference_column_to_drop_na], inplace = True)
    df.dropna(axis=1, how='all', inplace = True)
    df.replace(np.nan, 'Nan', regex=True, inplace = True)
    return df

def process_new_labels(filename, reference_column_to_drop_na, policy):
    df_temp = pd.concat(pd.read_excel(filename, engine='openpyxl', sheet_name = None, skiprows=[0]), ignore_index = True)
    df_temp = clean_df(df_temp, reference_column_to_drop_na)
    df_temp.insert(0, "Document", df_temp.apply(lambda row: row.Sentence_Id.split("_")[0], axis = 1))
    df_temp.loc[df_temp['Is_policy'] == 0, 'Is_policy'] = "Nan"
    df_temp.loc[df_temp['Is_policy'] == 1, 'Is_policy'] = policy
    if "Other_instrument"in df_temp.columns:
        df_temp['Is_policy'] = np.where(df_temp['Is_policy'] == "Nan", df_temp['Other_instrument'], df_temp['Is_policy'])
    df_temp.loc[df_temp['Is_incentive'] == 0, 'Is_incentive'] = "Unknown"
    df_temp.loc[df_temp['Is_incentive'] == 1, 'Is_incentive'] = "Incentive"
    return df_temp

def label_cleaning(dictionaryionary, label):
    flag = 1
    
    for key in dictionaryionary:
#         print(key, "----", label)
        if key in label:
            return dictionaryionary[key]
            flag = 0
            break
    if flag == 1:
#         print(label)
        return label

def merge_excel_to_list_new(filename):
    flag = False
    
    for policy in policy_dict:
        if policy in filename:
            policy_instrument = policy
            flag = True
    if flag:
#         print(filename)
        df = process_new_labels(filename, "Is_incentive", policy_instrument)
        List = df[["Document", "Sentence_Id", "Sentence", "Is_policy", "Is_incentive"]].values.tolist()
    
    return List

def merge_excel_to_list_old(filename):
    df = pd.concat(pd.read_excel(filename, engine='openpyxl', sheet_name = None), ignore_index = True)
    df = clean_df(df, "Document")
    df = df[["Document", "Sentence", "Primary_Instrument", "Category"]]
    data = df.values.tolist()
    
    return data
    

def list_new_labels_to_dict(List, Dictionary, rater):
    i = 0
    for item in List:
        if item[0] in Dictionary:
            if item[1] in Dictionary[item[0]]["Seccion unica"]["sentences"]:
                if Dictionary[item[0]]["Seccion unica"]["sentences"][item[1]]["labels"] == "Unknown":
                    if label_cleaning(policy_dict, item [3]) != "Unknown":
                        i += 1
#                             print(rater, "--", item[1], Dictionary[item[0]]["Seccion unica"]["sentences"][item[1]]["text"], label_cleaning(policy_dict, item [3]))
                        Dictionary[item[0]]["Seccion unica"]["sentences"][item[1]]["labels"] = label_cleaning(policy_dict, item [3])

            else:
                i += 1
                Dictionary[item[0]]["Seccion unica"]["sentences"][item[1]] = {"text" : item[2], "labels" : label_cleaning(policy_dict, item [3]), "incentive": label_cleaning(incentive_dict, item [4])}
        else:
            i += 1
            Dictionary[item[0]] = {"Seccion unica" : {"tags" : [], "sentences" : {}}}
            Dictionary[item[0]]["Seccion unica"]["sentences"][item[1]] = {"text" : item[2], "labels" : label_cleaning(policy_dict, item [3]), "incentive": label_cleaning(incentive_dict, item [4])}
#     print(rater, " -- ", i)
    return Dictionary

def list_old_labels_to_dict(List):
    Dictionary = {}
    i = 0
    for item in List:
        if item[0] in Dictionary:
            i += 1
            Dictionary[item[0]]["Seccion unica"]["sentences"][str(i)] = {"text" : item[1], "labels" : label_cleaning(policy_dict, item [2]), "incentive": label_cleaning(incentive_dict, item [3])}
        else:
            i += 1
            Dictionary[item[0]] = {"Seccion unica" : {"tags" : [], "sentences" : {}}}
            Dictionary[item[0]]["Seccion unica"]["sentences"][str(i)] = {"text" : item[1], "labels" : label_cleaning(policy_dict, item [2]), "incentive": label_cleaning(incentive_dict, item [3])}
            
    return Dictionary

def save_dictionary(path, dictionary, rater, old_merge = False, test = False):
    if old_merge:
        filename = "{}/{}_combined_labeled.json".format(rater, rater)
    elif old_merge == False and test == False:
        filename = "{}/{}_single_labeled.json".format(rater, rater)
    elif old_merge == False and test == True:
        filename = "{}/{}_single_labeled_test.json".format(rater, rater)
        
    file = path / filename
    with open(file, 'w') as fp:
        json.dump(dictionary, fp)
        
def load_dictionary(file):
    with open(file, 'r') as f:
        dictionary = json.load(f)
    return dictionary
        
def wraping_up(base_path, rater, only_rater):
    data_path = base_path.glob('**/')
    for path in data_path:
#         results_list = []
        dictionary = {}    
        path_in_str = str(path)
#         print(path_in_str)
        if rater in path_in_str:
#             print(path_in_str)
            for file_obj in Path(path_in_str).glob('*.xlsx'):
                file = str(file_obj)
                if "Unique" in file:
#                     print(file)
                    results_list = merge_excel_to_list_new(file)
                    dictionary_new_labels = list_new_labels_to_dict(results_list, dictionary, rater)
                else:
                    dictionary_old_labels = list_old_labels_to_dict(merge_excel_to_list_old(file))
            j = 0        
            for key1 in dictionary_new_labels:
                for key2 in dictionary_new_labels[key1]["Seccion unica"]["sentences"]:
                    j += 1
#             print(rater, " - all", j)
            if only_rater:
                save_dictionary(base_path, dictionary_new_labels, rater, False, False)
                save_dictionary(base_path, dictionary_old_labels, rater, False, True)
            else:
                merged_dict = {**dictionary_old_labels, **dictionary_new_labels}
                save_dictionary(base_path, merged_dict, rater, True)

In [None]:
raters = ["Rater1", "Rater2", "Rater3"]
only_raters = [True, False]
# base_path = Path("C:/Users/jordi/Google Drive/Els_meus_documents/projectes/CompetitiveIntelligence/WRI/Notebooks/Data/Final_input_data/")
base_path = Path("C:/Users/user/Google Drive/Els_meus_documents/projectes/CompetitiveIntelligence/WRI/Notebooks/Data/Final_input_data/")

for rater in raters:
    for only_rater in only_raters:
        wraping_up(base_path, rater, only_rater)

In [None]:
data_path = base_path.glob('**/')

all_sents = {}
rater1 = {}
rater2 = {}
rater3 = {}

# First we build a dictionary with all the sentences that are in the three databases.
# We also build a dictionary for each rater
i = 0
for path in data_path:
    for file_obj in path.glob('*.json'):
        file = str(file_obj)
        if "Rater" in file and "single" in file and "test" not in file:
            print(file)
#             print(i)
            dictionary = load_dictionary(file)
            new_dict = {}
            for value in dictionary.values():
                for sentences in value.values():
                    for sent in sentences['sentences']:
#                         i += 1
                        all_sents[sent] = sentences['sentences'][sent]
                        new_dict[sent] = sentences['sentences'][sent]
        if "Rater1" in file:
            rater1 = new_dict
        elif "Rater2" in file:
            rater2 = new_dict          
        elif "Rater3" in file:
            rater3 = new_dict
            
print(len(rater1), " -- ", len(rater2), " -- ", len(rater3))

# Next we loop for the dictionary to find the elements that meet the criteria of the different merges.

incentive = "labels" #write "labels" if you want to work with policy instruments, write "incentive" to work with is_incentive
merge1 = {}
merge2 = {}
merge3 = {}
classifier = "labels" #If you want to classify by is_incentive

i = 0
j = 0
k = 0
l = 0
m = 0
n = 0
# First we look for sentences that all three raters have labeled the same
for sent in all_sents:
    i += 1
    if sent in rater1 and sent in rater2 and sent in rater3:
        j += 1
        if rater1[sent]["labels"] == rater2[sent]["labels"] and rater2[sent]["labels"] == rater3[sent]["labels"]:
            k += 1
            if rater1[sent]["labels"] != 'Unknown':
                merge3[sent] = rater1[sent]

#Now we look for the sentences that at least two raters have labeled the same
for sent in all_sents:
    if sent in rater1 and sent in rater2:
        l += 1
        if rater1[sent]["labels"] == rater2[sent]["labels"]:
            if rater1[sent]["labels"] != 'Unknown':
                m += 1
                merge2[sent] = rater1[sent]
    elif sent in rater1 and sent in rater3:
        l += 1
        if rater1[sent]["labels"] == rater3[sent]["labels"]:
            if rater1[sent]["labels"] != 'Unknown':
                m += 1
                merge2[sent] = rater1[sent]
    elif sent in rater2 and sent in rater3:
        l += 1
        if rater2[sent]["labels"] == rater3[sent]["labels"]:
            if rater2[sent]["labels"] != 'Unknown':
                m += 1
                merge2[sent] = rater2[sent]
            
# Finally we build a dataset containing the sentences that at least one of the labelers have labeled with a label different from "Unknown"
for sent in all_sents:
    label = {}
    if sent in rater3 and rater3[sent]["labels"] != 'Unknown':
#         label[rater3[sent]["labels"]] = "rater3"
        merge1[sent] = rater3[sent]
    elif sent in rater2 and rater2[sent]["labels"] != 'Unknown':
#         label[rater2[sent]["labels"]] = "rater2"
        merge1[sent] = rater2[sent]
    elif sent in rater1 and rater1[sent]["labels"] != 'Unknown':
#         label[rater1[sent]["labels"]] = "rater1"
        merge1[sent] = rater1[sent]
#     else:
#         merge1[sent] = all_sents[sent]

print(f"In the all-sentences dict there are {i} sentences")
print(f"In the three raters lists there are {j} common sentences")
print(f"In the three raters lists there are {k} common sentences which are rated identically")
print(f"There are {l} sentences which are comon to at lest two rater's datasets")
print(f"There are {m} sentences which are labeled identical in at least two rater's datasets")
print(len(merge3))
print(len(merge2))
print(len(merge1))

merges = {"Merge1" : merge1, "Merge2" : merge2, "Merge3" : merge3}
for merge in merges:
    all_sents = []
    all_labels = []
    for sent in merges[merge]:
        all_sents.append(merges[merge][sent]["text"])
        all_labels.append(merges[merge][sent]["labels"])

    print(merge, len(all_sents))
    
    test_perc = 0.2

    X_train, X_test, y_train, y_test = train_test_split(all_sents, all_labels, test_size=test_perc, stratify=all_labels, random_state=69420)

    print(len(X_train), " - ", len(X_test))

    path = "C:/Users/user/Google Drive/Els_meus_documents/projectes/CompetitiveIntelligence/WRI/Notebooks/Data/Final_input_data/Merges/"

    filename = "dataset_{}_policy_sentences.csv".format(merge)
    file = path + filename
    with open(file, 'w', newline='', encoding='utf-8') as myfile:
        wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
        wr.writerows(zip(X_train))

    filename = "dataset_{}_policy_labels.csv".format(merge)
    file = path + filename
    with open(file, 'w', newline='', encoding='utf-8') as myfile:
        wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
        wr.writerows(zip(y_train))

    filename = "testset_{}_policy_sentences.csv".format(merge)
    file = path + filename
    with open(file, 'w', newline='', encoding='utf-8') as myfile:
        wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
        wr.writerows(zip(X_test))

    filename = "testset_{}_policy_labels.csv".format(merge)
    file = path + filename
    with open(file, 'w', newline='', encoding='utf-8') as myfile:
        wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
        wr.writerows(zip(y_test))


In [None]:
merge2

### Data for binary classifier

In [4]:
raters = ["Rater1", "Rater2", "Rater3"]
# base_path = Path("C:/Users/jordi/Google Drive/Els_meus_documents/projectes/CompetitiveIntelligence/WRI/Notebooks/Data/Final_input_data/")
base_path = Path("C:/Users/user/Google Drive/Els_meus_documents/projectes/CompetitiveIntelligence/WRI/Notebooks/Data/Final_input_data/")

data_path = base_path.glob('**/')

all_sent = {}
rater1 = {}
rater2 = {}
rater3 = {}

# First we build a dictionary with all the sentences that are in the three databases.
# We also build a dictionary for each rater
i = 0
for path in data_path:
    for file_obj in path.glob('*.json'):
        file = str(file_obj)
#         print(file)
        if "Rater" in file and "single" in file and "test" not in file: #TODO: Adjust when needed
#             print(file)
#             print(i)
            dictionary = load_dictionary(file)
            new_dict = {}
            for value in dictionary.values():
                for sentences in value.values():
                    for sent in sentences['sentences']:
#                         i += 1
                        all_sent[sent] = sentences['sentences'][sent]
                        new_dict[sent] = sentences['sentences'][sent]
            if "Rater1" in file:
                rater1 = new_dict
            elif "Rater2" in file:
                rater2 = new_dict          
            elif "Rater3" in file:
                rater3 = new_dict
print("all_sents elements", len(all_sent))
print(len(rater1), " -- ", len(rater2), " -- ", len(rater3))
raters = {"Rater1" : rater1, "Rater2" : rater2, "Rater3": rater3}

for rater in raters:
    unknown = 0
    known = 0
    bugs = 0
    all_sents = []
    all_labels = []
    for sent in raters[rater]:
        all_sents.append(raters[rater][sent]["text"])
        all_labels.append(raters[rater][sent]["incentive"])
        if raters[rater][sent]["incentive"] == "not_Incentive":
            unknown += 1
        elif raters[rater][sent]["incentive"] == "Incentive":
            known += 1
        else:
            bugs += 1

    test_perc = 0.2

    X_train, X_test, y_train, y_test = train_test_split(all_sents, all_labels, test_size=test_perc, stratify=all_labels, random_state=69420)
    
    print(rater)
    print("Incentive", unknown, " -- not_Incentive", known, " -- Others", bugs)
    print(len(X_train), " - ", len(X_test))       

    path = "C:/Users/user/Google Drive/Els_meus_documents/projectes/CompetitiveIntelligence/WRI/Notebooks/Data/Final_input_data/Binary/"

    filename = "dataset_{}_incentive_sentences.csv".format(rater)
    file = path + filename
    with open(file, 'w', newline='', encoding='utf-8') as myfile:
        wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
        wr.writerows(zip(X_train))

    filename = "dataset_{}_incentive_labels.csv".format(rater)
    file = path + filename
    with open(file, 'w', newline='', encoding='utf-8') as myfile:
        wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
        wr.writerows(zip(y_train))

    filename = "testset_{}_incentive_sentences.csv".format(rater)
    file = path + filename
    with open(file, 'w', newline='', encoding='utf-8') as myfile:
        wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
        wr.writerows(zip(X_test))

    filename = "testset_{}_incentive_labels.csv".format(rater)
    file = path + filename
    with open(file, 'w', newline='', encoding='utf-8') as myfile:
        wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
        wr.writerows(zip(y_test))

        
# # For the merged dataset
# rater = "Merged"
# all_sents = []
# all_labels = []
# for sent in all_sent:
#     all_sents.append(all_sent[sent]["text"])
#     all_labels.append(all_sent[sent]["incentive"])

# print(len(all_sents))
    
# test_perc = 0.2

# X_train, X_test, y_train, y_test = train_test_split(all_sents, all_labels, test_size=test_perc, stratify=all_labels, random_state=69420)

# print(len(X_train), " - ", len(X_test))

# path = "C:/Users/user/Google Drive/Els_meus_documents/projectes/CompetitiveIntelligence/WRI/Notebooks/Data/Final_input_data/Binary/"

# filename = "dataset_{}_incentive_sentences.csv".format(rater)
# file = path + filename
# with open(file, 'w', newline='', encoding='utf-8') as myfile:
#     wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
#     wr.writerows(zip(X_train))

# filename = "dataset_{}_incentive_labels.csv".format(rater)
# file = path + filename
# with open(file, 'w', newline='', encoding='utf-8') as myfile:
#     wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
#     wr.writerows(zip(y_train))

# filename = "testset_{}_incentive_sentences.csv".format(rater)
# file = path + filename
# with open(file, 'w', newline='', encoding='utf-8') as myfile:
#     wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
#     wr.writerows(zip(X_test))

# filename = "testset_{}_incentive_labels.csv".format(rater)
# file = path + filename
# with open(file, 'w', newline='', encoding='utf-8') as myfile:
#     wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
#     wr.writerows(zip(y_test))

all_sents elements 1326
565  --  1312  --  911
Rater1
Incentive 267  -- not_Incentive 298  -- Others 0
452  -  113
Rater2
Incentive 772  -- not_Incentive 540  -- Others 0
1049  -  263
Rater3
Incentive 440  -- not_Incentive 471  -- Others 0
728  -  183


In [None]:
all_sents