## Imports

In [3]:
from pathlib import Path
import json
import pandas as pd
import numpy as np

## Dictionaries

In [4]:
# Keyword -> canonical policy-instrument label.
# label_cleaning() in this file does a case-sensitive substring lookup over
# these keys in insertion order (first hit wins), and
# merge_excel_to_list_new() scans the same keys against workbook filenames
# (last hit wins), so the order of the entries is significant.
policy_dict = {
    'Credit': 'Credit',
    'Direct': 'Direct payment',
    'Fine': 'Fine',
    'General': 'Unknown',
    'Guarantee': 'Credit',
    'Supplies': 'Supplies',
    'Tax': 'Tax deduction',
    'Technical': 'Technical assistance',
    'Unknown': 'Unknown',
    'Other': 'Unknown',
    'Nan': 'Unknown',
}

# Keyword -> binary incentive label.
# NOTE(review): 'Disincentive' is mapped to 'Incentive' -- presumably the
# target is "incentive-related vs. not"; confirm this is intended.
incentive_dict = {
    'Incentive': 'Incentive',
    'Disincentive': 'Incentive',
    'Unknown': 'not_Incentive',
    'Nan': 'not_Incentive',
}

## Functions

In [147]:

def clean_df(df, reference_column_to_drop_na):
    """Strip empty rows/columns from *df* in place and mark remaining gaps.

    Args:
        df: DataFrame to clean. It is modified in place.
        reference_column_to_drop_na: Column name; every row with a missing
            value in this column is removed.

    Returns:
        The same ``df`` object that was passed in, after cleaning.
    """
    # 1) Rows without a value in the reference column carry no label.
    df.dropna(axis=0, subset=[reference_column_to_drop_na], inplace=True)
    # 2) Columns that are empty across the remaining rows are dropped.
    df.dropna(axis="columns", how="all", inplace=True)
    # 3) Whatever is still missing becomes the literal string 'Nan'.
    df.replace(to_replace=np.nan, value='Nan', regex=True, inplace=True)
    return df

def process_new_labels(filename, reference_column_to_drop_na, policy):
    """Load a labelled workbook and convert its 0/1 flags into label strings.

    Every sheet of the workbook is read (skipping the first row of each
    sheet), the sheets are stacked into one table and cleaned with
    ``clean_df``. A ``Document`` column is inserted at position 0, holding the
    part of ``Sentence_Id`` before the first underscore.

    Args:
        filename: Path of the ``.xlsx`` workbook to read.
        reference_column_to_drop_na: Column passed to ``clean_df``; rows with
            a missing value in it are discarded.
        policy: Label written to ``Is_policy`` wherever the flag equals 1.

    Returns:
        The cleaned DataFrame with ``Is_policy`` and ``Is_incentive`` holding
        label strings. Values other than 0 and 1 are left untouched.
    """
    # sheet_name=None yields one DataFrame per sheet; stack them all.
    sheets = pd.read_excel(filename, engine='openpyxl', sheet_name=None, skiprows=[0])
    df_temp = pd.concat(sheets, ignore_index=True)
    df_temp = clean_df(df_temp, reference_column_to_drop_na)

    # Column-wise map instead of a row-wise apply: same per-value operation,
    # but it does not build a Series per row and still returns a Series when
    # the frame is empty.
    document_ids = df_temp["Sentence_Id"].map(lambda sentence_id: sentence_id.split("_")[0])
    df_temp.insert(0, "Document", document_ids)

    # The flag columns can still be numeric here (e.g. no missing values, so
    # clean_df never wrote 'Nan' into them). Writing strings into a numeric
    # column relies on silent upcasting, which pandas has deprecated, so make
    # the columns object-typed explicitly before relabelling.
    for flag_column in ("Is_policy", "Is_incentive"):
        df_temp[flag_column] = df_temp[flag_column].astype(object)

    df_temp.loc[df_temp['Is_policy'] == 0, 'Is_policy'] = "Nan"
    df_temp.loc[df_temp['Is_policy'] == 1, 'Is_policy'] = policy
    if "Other_instrument" in df_temp.columns:
        # Sentences not flagged for this policy fall back to the rater's
        # free-form instrument column.
        df_temp['Is_policy'] = np.where(
            df_temp['Is_policy'] == "Nan",
            df_temp['Other_instrument'],
            df_temp['Is_policy'],
        )
    df_temp.loc[df_temp['Is_incentive'] == 0, 'Is_incentive'] = "Unknown"
    df_temp.loc[df_temp['Is_incentive'] == 1, 'Is_incentive'] = "Incentive"
    return df_temp

def label_cleaning(dictionaryionary, label):
    """Normalise *label* through a substring lookup table.

    The keys of ``dictionaryionary`` are tried in iteration order; the value
    of the first key that occurs inside ``label`` (case-sensitive substring
    test) is returned.

    Args:
        dictionaryionary: Mapping of keyword -> canonical label.
        label: Raw label to normalise.

    Returns:
        The canonical label for the first matching keyword, or ``label``
        itself when no keyword matches.
    """
    for keyword, canonical in dictionaryionary.items():
        if keyword in label:
            return canonical
    return label

def merge_excel_to_list_new(filename, List):
    """Append the rows of a newly labelled workbook to *List*.

    The policy instrument is taken from the filename: every key of
    ``policy_dict`` is tested as a substring of ``filename`` and the last key
    that matches is used. Files whose name contains no policy key are skipped.

    Args:
        filename: Path of the workbook, as a string.
        List: List to extend; it receives one entry (a list of rows of the
            form ``[Document, Sentence_Id, Sentence, Is_policy, Is_incentive]``)
            per processed workbook.

    Returns:
        The same ``List`` object, possibly with one more entry.
    """
    matching_policies = [policy for policy in policy_dict if policy in filename]
    if matching_policies:
        # When several keys occur in the filename the last one is used.
        labelled = process_new_labels(filename, "Is_incentive", matching_policies[-1])
        wanted_columns = ["Document", "Sentence_Id", "Sentence", "Is_policy", "Is_incentive"]
        List.append(labelled[wanted_columns].values.tolist())

    return List

def merge_excel_to_list_old(filename):
    """Read a workbook in the older labelling layout into a list of rows.

    All sheets are stacked into one table and cleaned with ``clean_df`` using
    ``"Document"`` as the reference column.

    Args:
        filename: Path of the ``.xlsx`` workbook to read.

    Returns:
        A list of ``[Document, Sentence, Primary_Instrument, Category]`` rows.
    """
    sheets = pd.read_excel(filename, engine='openpyxl', sheet_name=None)
    cleaned = clean_df(pd.concat(sheets, ignore_index=True), "Document")
    wanted_columns = ["Document", "Sentence", "Primary_Instrument", "Category"]
    return cleaned[wanted_columns].values.tolist()
    

def list_new_labels_to_dict(List, Dictionary, rater):
    """Fold newly labelled rows into the nested document dictionary.

    Each row is ``[document, sentence_id, text, policy_label, incentive_label]``.
    Sentences are stored under
    ``Dictionary[document]["Seccion unica"]["sentences"][sentence_id]``.

    * A sentence that is not stored yet is added with cleaned labels.
    * A stored sentence whose label is ``"Unknown"`` is upgraded when the new
      row carries a label that does not clean to ``"Unknown"``; the upgrade is
      printed, prefixed with ``rater``. Only ``"labels"`` is updated.
    * A stored sentence that already has another label is left alone.

    Args:
        List: List of row lists (one inner list per workbook).
        Dictionary: Nested dictionary to update in place.
        rater: Rater name, used only in the printed message.

    Returns:
        The same ``Dictionary`` object.
    """
    for workbook_rows in List:
        for row in workbook_rows:
            document = row[0]
            if document not in Dictionary:
                Dictionary[document] = {"Seccion unica" : {"tags" : [], "sentences" : {}}}
            stored_sentences = Dictionary[document]["Seccion unica"]["sentences"]
            sentence_id = row[1]

            if sentence_id not in stored_sentences:
                stored_sentences[sentence_id] = {
                    "text" : row[2],
                    "labels" : label_cleaning(policy_dict, row[3]),
                    "incentive": label_cleaning(incentive_dict, row[4]),
                }
                continue

            entry = stored_sentences[sentence_id]
            if entry["labels"] != "Unknown":
                # Already labelled: keep the first informative label.
                continue

            new_label = label_cleaning(policy_dict, row[3])
            if new_label != "Unknown":
                print(rater, "--", sentence_id, entry["text"], new_label)
                entry["labels"] = new_label

    return Dictionary

def list_old_labels_to_dict(List):
    """Build the nested document dictionary from old-layout rows.

    Each row is ``[document, text, primary_instrument, category]``. Sentences
    are keyed by a running 1-based counter (as a string) that keeps counting
    across documents, so sentence keys are unique over the whole result.

    Args:
        List: Rows in the old layout.

    Returns:
        A new dictionary of the form
        ``{document: {"Seccion unica": {"tags": [], "sentences": {key: {...}}}}}``.
    """
    documents = {}
    for position, row in enumerate(List, start=1):
        document_entry = documents.setdefault(
            row[0], {"Seccion unica" : {"tags" : [], "sentences" : {}}}
        )
        document_entry["Seccion unica"]["sentences"][str(position)] = {
            "text" : row[1],
            "labels" : label_cleaning(policy_dict, row[2]),
            "incentive": label_cleaning(incentive_dict, row[3]),
        }

    return documents

def save_dictionary(path, dictionary, rater, old_merge = False):
    """Write *dictionary* as JSON into the rater's folder under *path*.

    Args:
        path: Base directory (a ``pathlib.Path``); the file goes into the
            ``<rater>`` sub-directory, which must already exist.
        dictionary: JSON-serialisable object to store.
        rater: Rater name; used for both the sub-directory and the file prefix.
        old_merge: When truthy the file is named
            ``<rater>_combined_labeled.json``, otherwise
            ``<rater>_single_labeled.json``.
    """
    template = "{}/{}_combined_labeled.json" if old_merge else "{}/{}_single_labeled.json"
    target = path / template.format(rater, rater)

    with open(target, 'w') as handle:
        json.dump(dictionary, handle)
        
def load_dictionary(file):
    """Read a JSON file and return the decoded object.

    Args:
        file: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` decodes from the file.
    """
    with open(file, 'r') as handle:
        return json.load(handle)
        
def wraping_up(base_path, rater, only_rater):
    """Build the label dictionaries for one rater from workbooks under *base_path*.

    Every directory below ``base_path`` whose path contains ``rater`` is
    scanned for ``*.xlsx`` files. Each file is first offered to
    ``merge_excel_to_list_new`` (which only picks it up when the filename
    contains a policy keyword). Files with ``"Unique"`` in their path are then
    folded into the new-label dictionary; all other files are converted with
    the old-layout loader.

    The saving step is currently commented out, so the function returns
    nothing and persists nothing; its only visible effect is the output
    printed by ``list_new_labels_to_dict``.

    Args:
        base_path: Root directory (``pathlib.Path``) to search recursively.
        rater: Rater name that must occur in a directory path for it to be
            processed.
        only_rater: Only used by the disabled saving code below.
    """
    for path in base_path.glob('**/'):
        if rater not in str(path):
            continue

        # Fresh accumulators for each matching directory.
        results_list = []
        dictionary = {}
        for file_obj in path.glob('*.xlsx'):
            file = str(file_obj)
            # merge_excel_to_list_new is the loader in this file that takes
            # (filename, list); the bare name `merge_excel_to_list` used here
            # before is not defined anywhere in the file.
            results_list = merge_excel_to_list_new(file, results_list)
            if "Unique" in file:
                # results_list is cumulative, so earlier workbooks are
                # re-offered on every call; already stored sentences are only
                # upgraded from "Unknown".
                dictionary_new_labels = list_new_labels_to_dict(results_list, dictionary, rater)
            else:
                dictionary_old_labels = list_old_labels_to_dict(merge_excel_to_list_old(file))
#             if only_rater:
#                 save_dictionary(base_path, dictionary_new_labels, rater)
#             else:
#                 merged_dict = {**dictionary_old_labels, **dictionary_new_labels}
#                 save_dictionary(base_path, merged_dict, rater, True)

In [148]:
# Raters to process, and the two modes each rater is run in.
raters = ["Rater1", "Rater2", "Rater3"]
only_raters = [True, False]
# base_path = Path("C:/Users/jordi/Google Drive/Els_meus_documents/projectes/CompetitiveIntelligence/WRI/Notebooks/Data/Final_input_data/")
base_path = Path("C:/Users/user/Google Drive/Els_meus_documents/projectes/CompetitiveIntelligence/WRI/Notebooks/Data/Final_input_data/")

# Run every (rater, mode) combination, rater-major, in list order.
for rater, only_rater in [(name, mode) for name in raters for mode in only_raters]:
    wraping_up(base_path, rater, only_rater)

Rater1 -- fb9f67e_31 En caso de existir más de un régimen tributario, se considerará aquel régimen que resulte más conveniente económicamente para la empresa eficiente Tax deduction
Rater1 -- fb9f67e_31 En caso de existir más de un régimen tributario, se considerará aquel régimen que resulte más conveniente económicamente para la empresa eficiente Tax deduction
Rater2 -- dab533a_214 De las compensaciones para proyectos de interes social Direct payment
Rater2 -- 15df6c5_137 alto costo, el Servicio podrá otorgar una ayuda económica, destinada a Direct payment
Rater2 -- 01e1622_94 d) De auxilio: Se concederán con el objeto de satisfacer necesidades económicas imprevistas de los afiliados Technical assistance
Rater2 -- 742f481_35 El mecanismo de estabilización consistirá en la aplicación de un Componente de Sustentación o de un Componente de Recuperación, según corresponda, a la Tarifa de compra a los Beneficiarios Direct payment
Rater2 -- 1739c19_276 Esquema de incentivos de la Administra

In [141]:
# Collect the saved per-rater JSON dictionaries under base_path, flatten each
# to {sentence_id: sentence_record}, and count how many sentences all three
# raters labelled, printing the label wherever the three agree.
data_path = base_path.glob('**/')

all_sents = {}  # used as an insertion-ordered set of every sentence id seen
rater1 = {}
rater2 = {}
rater3 = {}


i = 0  # running count of sentence records read over all files
for path in data_path:
    for file_obj in path.glob('*.json'):
        file = str(file_obj)
        # Skip anything that is not a current rater file. The rater
        # assignment below must only run for files that were actually loaded;
        # otherwise it would reuse the previous file's new_dict (or fail with
        # NameError if the very first file is skipped).
        if "Rater" not in file or "old" in file:
            continue

        print(i)
        dictionary = load_dictionary(file)
        new_dict = {}
        for value in dictionary.values():
            for sentences in value.values():
                for sent, record in sentences['sentences'].items():
                    i += 1
                    all_sents[sent] = 0
                    new_dict[sent] = record

        # If a rater has several files, the one read last wins.
        if "Rater1" in file:
            rater1 = new_dict
        elif "Rater2" in file:
            rater2 = new_dict
        elif "Rater3" in file:
            rater3 = new_dict

print(len(rater1), " -- ", len(rater2), " -- ", len(rater3))

merge1 = {}
merge2 = {}
merge3 = {}
classifier = "labels" # Set to "incentive" to compare the incentive field instead

i = 0  # now: number of sentences labelled by all three raters
for sent in all_sents:
    if sent in rater1 and sent in rater2 and sent in rater3:
        # Full agreement between the three raters on the chosen field.
        if rater1[sent][classifier] == rater2[sent][classifier] == rater3[sent][classifier]:
            print(rater1[sent][classifier])
#         print(rater1[sent])
        i +=1
print(i)
        

0
1252
1817
3816
5128
6726
565  --  1312  --  911
Technical assistance
Unknown
Unknown
Unknown
Unknown
Unknown
Unknown
Credit
Credit
Direct payment
Supplies
Unknown
Credit
Credit
Unknown
Unknown
Credit
Unknown
Unknown
Unknown
Direct payment
Credit
Credit
Credit
Credit
Unknown
Unknown
Technical assistance
Unknown
Unknown
Technical assistance
Unknown
Unknown
Credit
Unknown
Unknown
Unknown
Credit
Credit
Credit
Credit
Credit
Credit
Unknown
Unknown
Credit
Credit
Credit
Credit
Credit
Unknown
Credit
Credit
Unknown
Unknown
Unknown
Unknown
Technical assistance
Technical assistance
Unknown
Unknown
Unknown
Unknown
Unknown
Unknown
Technical assistance
Unknown
Unknown
Unknown
Unknown
Credit
Technical assistance
Technical assistance
Unknown
Unknown
Unknown
Unknown
Unknown
Credit
Tax deduction
Unknown
Credit
Credit
Unknown
Credit
Credit
Credit
Credit
Credit
Credit
Fine
Unknown
Technical assistance
Unknown
Unknown
Unknown
Credit
Unknown
Unknown
Unknown
Unknown
Credit
Credit
Credit
Credit
Credit
Unknow