# Zero-Shot Learning Experiments: Topic classification

Using https://huggingface.co/zero-shot/ 

In [1]:
import pandas as pd
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
import sys

sys.path.append("../../")
sys.path.append("../../../")
from tasks.data_loader.src.utils import *
from tasks.evaluate_model.src.model_evaluator import *
from tasks.augment_data.src.zero_shot_classification.nli_topic_classifier import *

### Load labeled sentences from Excel - One label

In [19]:
all_sents_excel = pd.read_excel("../input/WRI_Policy_Tags.xlsx", engine="openpyxl")

In [20]:
all_sents_excel.head()

Unnamed: 0,Document,Text,Incentive Instrument,Land Use Type,Category,Unique Policy #,Key words,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25
0,2019 ACUERDO por el que se emiten los Lineamie...,Generar empleo y garantizara la población camp...,Direct payment,"Forest, Agriculture (Crop)",Incentive,1 (Sembrando Vida),"insumo, crédito, capacitación, asistencia técnica",,,,...,,,,,,,,,,
1,2019 ACUERDO por el que se emiten los Lineamie...,Generar empleo y garantizara la población camp...,Technical assistance,"Forest, Agriculture (Crop)",Incentive,1 (Sembrando Vida),,,,,...,,,,,,,,,,
2,2019 ACUERDO por el que se emiten los Lineamie...,Generar empleo y garantizara la población camp...,Credit,"Forest, Agriculture (Crop)",Incentive,1 (Sembrando Vida),,,,,...,,,,,,,,,,
3,2019 ACUERDO por el que se emiten los Lineamie...,El Programa incentivará a los sujetos agrarios...,Direct payment,"Forest, Agriculture (Crop)",Incentive,1 (Sembrando Vida),incentivar,,,,...,,,,,,,,,,
4,2019 ACUERDO por el que se emiten los Lineamie...,El Programa incentivará a los sujetos agrarios...,Technical assistance,"Forest, Agriculture (Crop)",Incentive,1 (Sembrando Vida),,,,,...,,,,,,,,,,


### Try out which hypothesis template gives the "best" results

In [41]:
# Sample sentence to probe the zero-shot classifier with; its text matches the
# first rows of the labelled spreadsheet shown above.
test_sent = 'Generar empleo y garantizara la población campesina el bienestar y su participación e incorporación en el desarrollo nacional, y fomentará la actividad agropecuaria y forestal para el óptimo uso de la tierra, con obras de infraestructura, insumos, créditos, servicios de capacitación y asistencia técnica'
# Instruments this sentence is tagged with (kept for reference; not passed to the classifier below).
test_labels = ['direct payment', 'technical assistance', 'credit']
# Candidate labels handed to the classifier.
all_labels = ["direct payment", "tax deduction", "credit", "guarantee", "technical assistance", "supplies", "fine", "unknown"]
# Hypothesis templates to compare; "{}" is the slot for a candidate label.
hyp_template_list = ['This text is about {}.', 'This text contains incentives about {}s', 
                     "This text contains information about {}s", "This text contains {}"]

# NOTE(review): `classifier` is not defined in any cell visible in this notebook;
# presumably it comes from the star imports at the top — confirm.
results = []
separator = "-----------------------------------------------------------\n"
for template in hyp_template_list:
    ranked = classify_sentence(
        test_sent,
        all_labels,
        template,
        classifier,
        allow_multi_class=False,
        all_probs=True,
    )

    # Show every entry returned for this template, one per line.
    print(f"Hypothesis: {template}")
    print("Labels and scores:")
    for label_score in ranked:
        print(label_score)
    print(separator)
    results.append(ranked)
    

Hypothesis: This text is about {}.
Labels and scores:
('fine', 0.35791587829589844)
('guarantee', 0.2378205507993698)
('direct payment', 0.09230978041887283)
('unknown', 0.07248454540967941)
('supplies', 0.06786298006772995)
('tax deduction', 0.06379210948944092)
('credit', 0.06338527053594589)
('technical assistance', 0.04442889615893364)
-----------------------------------------------------------

Hypothesis: This text contains incentives about {}s
Labels and scores:
('fine', 0.6989411115646362)
('guarantee', 0.11369836330413818)
('supplies', 0.054176971316337585)
('technical assistance', 0.04837031662464142)
('direct payment', 0.03908627852797508)
('credit', 0.028751058503985405)
('unknown', 0.014763382263481617)
('tax deduction', 0.002212527673691511)
-----------------------------------------------------------

Hypothesis: This text contains information about {}s
Labels and scores:
('fine', 0.4065122604370117)
('guarantee', 0.3340264856815338)
('supplies', 0.09895601868629456)
('di

### Load labeled sentences from 5 countries (Excel file)

In [17]:
data_excel = pd.read_excel("../input/allcountries_policytags.xlsx", engine="openpyxl", sheet_name=None)

In [18]:
data_excel['Mexico '].head()

Unnamed: 0,Document,Original Text,"Relevant Sentences (for Environment, Incentives, Land Type)","Relevant Phrases (for Environment, Incentives, Land Type)",Incentive Instrument,Land Use Type,Category,Unique Policy #,Key words
0,2019 ACUERDO por el que se emiten los Lineamie...,Generar empleo y garantizara la población camp...,Generar empleo y garantizara la población camp...,garantizara la población campesina el bienesta...,"Direct payment (PES), Credit, Technical assist...","Forest, Agriculture (Crop)",Incentive,1 (Sembrando Vida),"insumo, crédito, capacitación, asistencia técnica"
1,2019 ACUERDO por el que se emiten los Lineamie...,\nEl Programa incentivará a los sujetos agrari...,\nEl Programa incentivará a los sujetos agrari...,incentivará a los sujetos agrarios a establece...,"Direct payment (PES), Credit, Technical assist...","Forest, Agriculture (Crop)",Incentive,1 (Sembrando Vida),incentivar
2,2019 ACUERDO por el que se emiten los Lineamie...,Los sujetos agrarios beneficiados por el progr...,Los sujetos agrarios beneficiados por el progr...,Los sujetos agrarios beneficiados por el progr...,"Supplies, Technical assistance","Forest, Agriculture (Crop)",Incentive,1 (Sembrando Vida),"apoyo económico, apoyos en especie, insumos, h..."
3,2019 ACUERDO por el que se emiten los Lineamie...,"El sujeto de derecho, recibirá un apoyo económ...","El sujeto de derecho, recibirá un apoyo económ...","recibirá un apoyo económico de $5,000.00 (Cin...",Direct payment (PES),"Forest, Agriculture (Crop)",Incentive,1 (Sembrando Vida),pesos
4,2019 ACUERDO por el que se emiten los Lineamie...,"El sujeto de derecho, recibirá en especie las ...","El sujeto de derecho, recibirá en especie las ...",recibirá en especie las plantas necesarias par...,Supplies,"Forest, Agriculture (Crop)",Incentive,1 (Sembrando Vida),recibir


In [None]:
countries_sents_map = {}
mexico_df = data_excel['Mexico ']
mexico_df["Relevant Sentences (for Environment, Incentives, Land Type) "]

In [None]:
# Sentences
mexico_df["Relevant Sentences (for Environment, Incentives, Land Type) "] = mexico_df["Relevant Sentences (for Environment, Incentives, Land Type) "].apply(lambda x: x.replace("\n", "").strip())
mexico_sents = list(mexico_df["Relevant Sentences (for Environment, Incentives, Land Type) "])

# Labels
mexico_df['Incentive Instrument'] = mexico_df['Incentive Instrument'].apply(lambda x: x.replace("(PES)", "").replace("(Bond)", "").strip())
mexico_labels = [[string.strip() for string in label.split(", ")][0] for label in mexico_df['Incentive Instrument']]

In [None]:
mexico_sents[0], set(mexico_labels)

In [None]:
def country_labeled_sentences(excel_map):
    """Build a per-country map of cleaned sentences and their first incentive label.

    Args:
        excel_map: Mapping of country name -> DataFrame. Each frame must hold the
            columns "Relevant Sentences (for Environment, Incentives, Land Type) "
            (note the trailing space) and "Incentive Instrument", both with
            string values or NaN.

    Returns:
        dict: ``{country: {n: {"text": str, "labels": [str]}}}`` where ``n``
        numbers the kept rows from 0. Sentences have newlines removed and are
        trimmed; the label is the first comma-separated instrument with the
        "(PES)" / "(Bond)" qualifiers removed.

    Raises:
        KeyError: If a frame lacks one of the two required columns.
    """
    sents_col = "Relevant Sentences (for Environment, Incentives, Land Type) "
    labels_col = "Incentive Instrument"

    result = {}
    for country, dataframe in excel_map.items():
        # Drop incomplete rows jointly. Dropping NaNs from each column on its own
        # shifts one column against the other, so zip() would pair sentences
        # with labels that belong to different rows.
        complete_rows = dataframe[[sents_col, labels_col]].dropna()

        sentences = [text.replace("\n", "").strip() for text in complete_rows[sents_col]]
        labels = [
            raw.replace("(PES)", "").replace("(Bond)", "").strip().split(", ")[0].strip()
            for raw in complete_rows[labels_col]
        ]

        result[country] = {
            i: {"text": sent, "labels": [label]}
            for i, (sent, label) in enumerate(zip(sentences, labels))
        }

    return result

In [None]:
excel_sents_map = country_labeled_sentences(data_excel)

## 2.1 Sentence analysis by country

### 2.1.1 Mexico

In [None]:
excel_sents_map["Mexico "][0]

In [None]:
label_names = ["Direct payment", "Tax deduction", "Credit", "Guarantee", "Technical assistance", "Supplies", "Fine", "Unknown"]

#### We want to have the labels both as numbers and as text (for plotting/evaluating purposes)

In [None]:
mexico_labels = labels_from_model_output(excel_sents_map["Mexico "])
num_mexico_labels = labels_to_numeric(mexico_labels, label_names)

In [None]:
plot_data_distribution(num_mexico_labels, label_names)

In [None]:
preds, scores = classify_sentences_topic(excel_sents_map["Mexico "], label_names)

In [None]:
num_preds = labels_to_numeric(preds, label_names)
preds[:10], scores[:10], num_preds[:10]

In [None]:
evaluator = ModelEvaluator(label_names)

In [None]:
# Evaluate the Mexico predictions against the numeric gold labels.
# NOTE(review): exp_name="multi_class_test" is also used by the all-countries
# multi-class evaluation later in this notebook; if stored artifacts are keyed
# by exp_name, one run would overwrite the other — confirm.
evaluator.evaluate(num_mexico_labels, num_preds, 
                   plot_cm=True, normalize=True, 
                   store=True, exp_name="multi_class_test")

In [None]:
# NOTE(review): this passes the numeric class predictions (`num_preds`), whereas the
# binary section passes model scores (`bin_scores`) to the same method — confirm
# which input plot_precision_recall_curve expects when all_classes=True.
# NOTE(review): only this call gives exp_name an "../output/" prefix; the other
# evaluator calls in this notebook pass a bare experiment name.
evaluator.plot_precision_recall_curve(num_mexico_labels, num_preds, bin_class=False, all_classes=True, store=True, exp_name="../output/mexico_multi_class_test")

### Load labeled sentences from 5 countries (JSON file)

In [None]:
fname = "../input/allcountries_tagged_sents.json"
data = load_file(fname)

In [None]:
data

### Filter out badly parsed sentences - with 1 character or empty

In [None]:
# Collect the ids of sentences whose text is empty or a single character,
# echoing each offending text as it is found.
missing_ids = []
for content in data.values():
    for section in content.values():
        for sid, sentence in section['sentences'].items():
            text = sentence['text']
            if len(text) > 1:
                continue
            print(text)
            missing_ids.append(sid)

# Number of flagged sentences.
print(len(missing_ids))

In [None]:
def remove_sents_from_dataset(ids, dataset):
    """Return a copy of ``dataset`` without the sentences whose id is in ``ids``.

    Args:
        ids: Iterable of (hashable) sentence ids to drop.
        dataset: Nested mapping ``{doc_id: {section_id: {"tags": ...,
            "sentences": {sentence_id: sentence}}}}``.

    Returns:
        dict: A new mapping with the same document/section structure. The
        document, section and "sentences" dicts are new objects; the ``tags``
        values and the kept sentence objects are shared with ``dataset``
        (shallow copy). ``dataset`` itself is not modified.

    Raises:
        KeyError: If a section lacks the "tags" or "sentences" key.
    """
    # Set membership keeps the per-sentence lookup cheap for long id lists.
    excluded = set(ids)

    filtered_dataset = {}
    for docid, document in dataset.items():
        filtered_dataset[docid] = {}
        for secid, section in document.items():
            # Each section must be a dict (the previous `{3}` literal created a
            # set, which raised TypeError on the item assignments that followed).
            filtered_dataset[docid][secid] = {
                'tags': section['tags'],
                'sentences': {
                    sentid: sentence
                    for sentid, sentence in section['sentences'].items()
                    if sentid not in excluded
                },
            }

    return filtered_dataset

In [None]:
# Drop the sentences flagged in `missing_ids` from `data`.
# NOTE(review): the cells visible below keep passing `data` (not this filtered
# `dataset`) to labeled_sentences_from_dataset / labels_from_dataset — confirm
# whether the filtered copy was meant to be used there.
dataset = remove_sents_from_dataset(missing_ids, data)

In [None]:
dataset_map = labeled_sentences_from_dataset(data)

In [None]:
dataset_map['10']

### 3. Binary classification

In [None]:
labels = labels_from_dataset(data)

In [None]:
# Collapse the labels into two classes: anything other than "Unknown" counts
# as "incentives".
label_names = ["not incentives", "incentives"]
binary_labels = [
    "incentives" if label != "Unknown" else "not incentives"
    for label in labels
]

In [None]:
binary_labels.count("incentives"), binary_labels.count("not incentives")

In [None]:
num_bin_labels = labels_to_numeric(binary_labels, label_names)

In [None]:
num_bin_labels[180:190]

In [None]:
plot_data_distribution(num_bin_labels, label_names)

In [None]:
bin_model_preds, bin_scores = classify_sentences_topic(dataset_map,label_names)

In [None]:
bin_preds = labels_to_numeric(bin_model_preds, label_names)
bin_preds[:10]

In [None]:
evaluator = ModelEvaluator(label_names)

In [None]:
evaluator.update(num_bin_labels, bin_preds)
print("Recall per class:", evaluator.recall)
print("Average weighted precision:", evaluator.avg_precision[1])

In [None]:
evaluator.evaluate(num_bin_labels, bin_preds, 
                   plot_cm=True, normalize=True, 
                   store=True, exp_name="binary_class_test")

In [None]:
evaluator.plot_precision_recall_curve(num_bin_labels, bin_scores, bin_class=True, store=True, exp_name="binary_class_test")

### 4. Multi-class classification

In [None]:
label_names = ["Direct payment", "Tax deduction", "Credit", "Guarantee", "Technical assistance", "Supplies", "Fine", "Unknown"]

In [None]:
multi_model_preds, multi_scores = classify_sentences_topic(dataset_map, label_names)

In [None]:
evaluator = ModelEvaluator(label_names)

In [None]:
multi_model_preds[:10], labels[:10]

In [None]:
# Make label names from dataset match label names from prediction (a.k.a make all Direct payments be the same)
updated_labels = []
for label in labels:
    if "(" in label:
        updated_labels.append("Direct payment")
    else:
        updated_labels.append(label)

In [None]:
num_multi_labels = labels_to_numeric(updated_labels, label_names)
num_multi_preds = labels_to_numeric(multi_model_preds, label_names)

In [None]:
evaluator.evaluate(num_multi_labels, num_multi_preds, 
                   plot_cm=True, normalize=True, 
                   store=True, exp_name="multi_class_test")

In [None]:
# NOTE(review): this passes the numeric class predictions (`num_multi_preds`), whereas
# the binary section passes model scores (`bin_scores`) to the same method; `multi_scores`
# is available from classify_sentences_topic — confirm which input is expected here.
evaluator.plot_precision_recall_curve(num_multi_labels, num_multi_preds, bin_class=False, all_classes=True, store=True, exp_name="multi_class_test")

### Tangent: Separate multiple labels into a list from JSON file

In [None]:
import json

In [None]:
with open("../input/tagged_sentences_all.json", "r") as fjson: 
    jsents = json.load(fjson)

In [None]:
len(jsents), jsents.keys()

In [None]:
# Rebuild the tagged-sentences structure so that every sentence carries its
# labels as a list: the comma-separated label string is split and trimmed, and
# any label containing "(" is replaced by "Direct payment".
sentence_tags_dict = {}  # not used in this cell
new_json = {}

for keydoc, document in jsents.items():
    new_json[keydoc] = {}
    for keysec, section in document.items():
        section_tags = section['tags']
        relabeled_sentences = {}
        for sentid, sentence in section['sentences'].items():
            updated_labels = [
                "Direct payment" if "(" in tag else tag
                for tag in (part.strip() for part in sentence['labels'].split(", "))
            ]
            relabeled_sentences[sentid] = {'text': sentence['text'], 'labels': updated_labels}
        new_json[keydoc][keysec] = {'tags': section_tags, 'sentences': relabeled_sentences}

In [None]:
len(new_json), new_json.keys()

In [None]:
# Write the relabelled structure as JSON to a path relative to the current
# working directory.
# NOTE(review): the earlier load cell reads "../input/allcountries_tagged_sents.json",
# while this cell writes (and the next cell reloads) a file of the same name without
# the "../input/" prefix — confirm which location is intended.
with open("allcountries_tagged_sents.json", "w") as wjson:
    json.dump(new_json, wjson)

In [None]:
data = load_file("allcountries_tagged_sents.json")

In [None]:
dataset_map = labeled_sentences_from_dataset(data)

In [None]:
dataset_map['1']

In [None]:
dataset_labels = labels_from_dataset(data)
dataset_labels[:10], set(dataset_labels)

In [None]:
numeric_labels = numeric_labels_from_dataset(data)
numeric_labels[:10]