## Imports

In [1]:
import json
import csv
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

## Functions

In [2]:

def _write_single_column_csv(file, values):
    """Write ``values`` to ``file`` as a one-column, fully quoted UTF-8 CSV (one value per row)."""
    with open(file, 'w', newline='', encoding='utf-8') as myfile:
        wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
        # zip() over a single iterable wraps each value in a 1-tuple, i.e. one CSV row per value.
        wr.writerows(zip(values))


def save_data(X_train, y_train, X_test, y_test, experiment, system, base_path=None):
    """Save the train/test sentences and labels of one experiment as four CSV files.

    The files are named ``{experiment}_train_sentences.csv``,
    ``{experiment}_train_labels.csv``, ``{experiment}_test_sentences.csv`` and
    ``{experiment}_test_labels.csv``. Existing files with those names are overwritten.

    Args:
        X_train: Iterable of training sentences.
        y_train: Iterable of training labels.
        X_test: Iterable of test sentences.
        y_test: Iterable of test labels.
        experiment: Experiment name, used as the file-name prefix.
        system: Selects the built-in output directory when ``base_path`` is not given:
            any string containing "win" (case-insensitive) selects the Windows
            directory, otherwise any string containing "lin" selects the Linux one.
        base_path: Optional output directory. When provided it takes precedence
            over ``system``.

    Raises:
        ValueError: If ``base_path`` is not given and ``system`` matches neither
            "win" nor "lin" (previously this surfaced as an UnboundLocalError).
    """
    import os

    if base_path is None:
        system_key = system.lower()
        if "win" in system_key:
            base_path = "C:/Users/user/Google Drive/Els_meus_documents/projectes/CompetitiveIntelligence/WRI/Notebooks/Data/Final_input_data/" #Windows
        elif "lin" in system_key:
            base_path = "/home/propietari/Documents/GitHub/policy-data-analyzer/tasks/data_augmentation/output/HSSC/" #linux
        else:
            raise ValueError(
                f"Unsupported system {system!r}: expected a name containing 'win' or 'lin', "
                "or pass base_path explicitly"
            )

    # File-name suffix -> values written to that file.
    splits = {
        "train_sentences": X_train,
        "train_labels": y_train,
        "test_sentences": X_test,
        "test_labels": y_test,
    }
    for suffix, values in splits.items():
        _write_single_column_csv(os.path.join(base_path, f"{experiment}_{suffix}.csv"), values)
        
def load_dictionary(file):
    """Load a JSON file and return the decoded object.

    Args:
        file: Path of the JSON file to read.

    Returns:
        The object produced by ``json.load``.
    """
    # Read as UTF-8 explicitly: the platform default encoding (e.g. cp1252 on
    # Windows) would corrupt or reject non-ASCII text, and the CSV writers in
    # this file already use UTF-8.
    with open(file, 'r', encoding='utf-8') as f:
        dictionary = json.load(f)
    return dictionary

def check_labels(data_frame, column_name):
    """Print the distinct values found in column ``column_name`` of ``data_frame``."""
    distinct_labels = data_frame[column_name].unique()
    print(distinct_labels)
    

## Data retrieval from Excel sheets

In [3]:
#To process the Excel file with assisted labeling results

language = "english"

path = "/home/propietari/Documents/GitHub/policy-data-analyzer/tasks/data_augmentation/input/" #Linux
# path = "C:/Users/jordi/Documents/GitHub/policy-data-analyzer/tasks/data_augmentation/input/" #Windows
filename = f"pre_labeled_{language}_ready_short.xlsx"
file = path + filename

# Only the keys are used in this cell: each key is a sheet name of the workbook and also
# the label given to the rows of that sheet whose Is_policy flag is 1.0.
policy_counter = {"Loan" : 0, "Direct payment" : 0, "Fine" : 0, "Supplies" : 0, "Tax benefit" : 0, "Technical assistance" : 0}

# Non-canonical instrument labels -> canonical class (np.nan marks rows to be dropped later).
# np.nan is used instead of np.NaN because the latter alias was removed in NumPy 2.0.
assisted_label_fixes = {
    "Unknown": np.nan,
    "Instrument unknown": np.nan,
    "Tax": "Tax benefit",
    "Tax deduction": "Tax benefit",
    "Technical": "Technical assistance",
    "Direct": "Direct payment",
    "Credit": "Loan",
}

# Collect the per-sheet frames and concatenate once after the loop. This replaces the
# previous try/bare-except around pd.concat, which relied on a NameError to detect the
# first iteration and silently accumulated duplicates when the cell was re-run after a failure.
binary_frames = []
multiclass_frames = []

for policy in policy_counter:
    df_temp = pd.read_excel(file, engine='openpyxl', sheet_name = policy, skiprows=[0])
    # Cast to object so string labels can be written into columns that may have been read
    # as float (newer pandas versions reject writing strings into a float column).
    df_temp['Is_policy'] = df_temp['Is_policy'].astype(object)
    df_temp['Is_incentive'] = df_temp['Is_incentive'].astype(object)

    df_temp.loc[df_temp['Is_policy'] == 1.0, 'Is_policy'] = policy
    if "Other_instrument" in df_temp.columns:
        df_temp['Is_policy'] = np.where(df_temp['Is_policy'] == 0.0, df_temp['Other_instrument'], df_temp['Is_policy'])
    # Strip BEFORE normalising so whitespace-padded variants (e.g. "Tax ") are remapped too.
    # .str.strip() also turns the remaining non-string values (e.g. 0.0 flags) into NaN.
    df_temp['Is_policy'] = df_temp['Is_policy'].str.strip()
    for raw_label, canonical_label in assisted_label_fixes.items():
        df_temp.loc[df_temp['Is_policy'] == raw_label, 'Is_policy'] = canonical_label

    df_temp.loc[df_temp['Is_incentive'] == 0.0, 'Is_incentive'] = "not_Incentive"
    df_temp.loc[df_temp['Is_incentive'] == 1.0, 'Is_incentive'] = "Incentive"

    binary_frames.append(df_temp[['Sentence', 'Is_incentive']])
    multiclass_frames.append(df_temp[['Sentence', 'Is_policy']])

df_binary = pd.concat(binary_frames, ignore_index = True)
df_multiclass = pd.concat(multiclass_frames, ignore_index = True)

# Rows with a missing sentence or label are dropped; each remaining row becomes [sentence, label].
binary_assisted_english = df_binary.dropna().values.tolist()
multiclass_assisted_english = df_multiclass.dropna().values.tolist()
check_labels(df_multiclass, "Is_policy")
# Inspect the labels of ALL sheets (df_temp only holds the last sheet that was read).
check_labels(df_binary, "Is_incentive")
del df_binary
del df_multiclass
print(len(binary_assisted_english))
print(len(multiclass_assisted_english))

['Loan' nan 'Direct payment' 'Technical assistance' 'Fine' 'Tax benefit'
 'Supplies']
['not_Incentive' 'Incentive']
3001
752


In [None]:
# Merge the Chile + El Salvador assisted-labeling datasets with the Mexico ones.
# NOTE(review): none of the *_spanish / *_mexico names used here are defined in the visible
# cells of this notebook, so on a fresh kernel this cell presumably raises NameError — confirm
# where they are meant to come from.
# NOTE(review): the cell reassigns its own inputs; assuming these are lists, re-running it
# appends the Mexico rows a second time.
binary_assisted_spanish = binary_assisted_spanish + binary_assisted_mexico
multiclass_assisted_spanish = multiclass_assisted_spanish + multiclass_assisted_mexico

In [None]:
# Report the number of entries in the Spanish binary and multiclass assisted-labeling datasets.
# NOTE(review): these names are never created from scratch in the visible cells — confirm their origin.
print(len(binary_assisted_spanish))
print(len(multiclass_assisted_spanish))

In [4]:
#To process the Excel file with hand-picked labeling results for English training data

path = "/home/propietari/Documents/GitHub/policy-data-analyzer/tasks/data_augmentation/input/" #Linux
# path = "C:/Users/jordi/Documents/GitHub/policy-data-analyzer/tasks/data_augmentation/input/" #Windows
filename = "WRI_Policy_Tags_English.xlsx"
file = path + filename

# Non-canonical instrument labels -> canonical class (np.nan marks rows to be dropped later).
# np.nan is used instead of np.NaN because the latter alias was removed in NumPy 2.0.
handpicked_label_fixes = {
    "Unknown": np.nan,
    "Other": np.nan,
    "Tax deduction": "Tax benefit",
    "Interest subvention": "Loan",
    "Credit": "Loan",
    "Guarantee": "Loan",
}

# sheet_name=None makes read_excel return one frame per sheet; concat stacks them into one frame.
df_temp = pd.concat(pd.read_excel(file, engine='openpyxl', sheet_name = None, usecols = "B:D"), ignore_index = True)

# Cast to object so the string labels can be written into a column that may have been read
# as float (newer pandas versions reject writing strings into a float column).
df_temp['Incentive'] = df_temp['Incentive'].astype(object)
df_temp.loc[df_temp['Incentive'] == 1.0, 'Incentive'] = "Incentive"
df_temp.loc[df_temp['Incentive'] == 0.0, 'Incentive'] = "not_Incentive"

# Strip first so whitespace-padded variants are normalised by the mapping below.
df_temp['Primary Instrument'] = df_temp['Primary Instrument'].str.strip()
for raw_label, canonical_label in handpicked_label_fixes.items():
    df_temp.loc[df_temp['Primary Instrument'] == raw_label, 'Primary Instrument'] = canonical_label

# Rows with a missing sentence or label are dropped; each remaining row becomes [sentence, label].
binary_handpicked_english = df_temp[["Sentence", "Incentive"]].dropna().values.tolist()
multiclass_handpicked_english = df_temp[["Sentence", "Primary Instrument"]].dropna().values.tolist()
check_labels(df_temp, "Primary Instrument")
check_labels(df_temp, "Incentive")
print(len(binary_handpicked_english))
print(len(multiclass_handpicked_english))

['Direct payment' 'Technical assistance' 'Supplies' nan 'Tax benefit'
 'Loan' 'Fine']
['Incentive' 'not_Incentive' nan]
1381
723


In [None]:
# One zero-initialised entry per policy instrument class.
policy_counter = dict.fromkeys(
    ["Loan", "Direct payment", "Fine", "Supplies", "Tax benefit", "Technical assistance"],
    0,
)

In [5]:
# Build the merged datasets: assisted-labeling rows followed by the hand-picked rows.
binary_merged_english = [*binary_assisted_english, *binary_handpicked_english]
multiclass_merged_english = [*multiclass_assisted_english, *multiclass_handpicked_english]

# Size of each merged dataset, one per line.
print(len(binary_merged_english), len(multiclass_merged_english), sep="\n")

4382
1475


## Definition of the experiments


In [8]:
# Experiment name -> dataset (list of [sentence, label] rows).
# The name is later used as the prefix of the saved CSV files.
experiments = dict(
    binary_assisted_english=binary_assisted_english,
    binary_handpicked_english=binary_handpicked_english,
    binary_merged_english=binary_merged_english,
    multiclass_assisted_english=multiclass_assisted_english,
    multiclass_handpicked_english=multiclass_handpicked_english,
    multiclass_merged_english=multiclass_merged_english,
)
# Spanish counterpart, kept for reference:
# experiments = dict(
#     binary_assisted_spanish=binary_assisted_spanish,
#     binary_handpicked_spanish=binary_handpicked_spanish,
#     binary_merged_spanish=binary_merged_spanish,
#     multiclass_assisted_spanish=multiclass_assisted_spanish,
#     multiclass_handpicked_spanish=multiclass_handpicked_spanish,
#     multiclass_merged_spanish=multiclass_merged_spanish,
# )

## Data saving

In [9]:
test_perc = 0.2
system = "linux"

# For every experiment: separate sentences from labels, make a stratified
# train/test split with a fixed seed, and write the four resulting CSV files.
for experiment_name, rows in experiments.items():
    sentences, labels = [], []
    for row in rows:
        sentences.append(row[0])
        labels.append(row[1])
    X_train, X_test, y_train, y_test = train_test_split(
        sentences,
        labels,
        test_size=test_perc,
        stratify=labels,
        random_state=69420,
    )
    save_data(X_train, y_train, X_test, y_test, experiment=experiment_name, system=system)