# Note
This notebook can be run on google colab for improved performance. The code changes necessary for running on this system are commented over the code.

## Data preprocessing

In [12]:
! pip install \
  scprep\
  spacy==2.3.2 \
  sentence_transformers==0.4.0 \
  phate==1.0.4 && \
  python -m spacy download es_core_news_lg

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('es_core_news_lg')


<span style="color:red"><strong>WARNING!</strong></span> Once you installed the packages in the previous cell you must restart your runtime and then import the library and load the model

In [1]:
import spacy
if spacy.prefer_gpu():
    print("Using the GPU")
else:
    print("Using the CPU")
es_nlp = spacy.load('es_core_news_lg')

Using the GPU


For development work, in case you want to update the files in your GitHub branch by rerunning the clone, you first have to empty the folder.

In [2]:
!rm -rf policy-data-analyzer/

In [3]:
# Define branch to clone
! branch_name='#50_dfq_sbert_fine_tuning' && \
  git clone --branch $branch_name https://github.com/wri-dssg/policy-data-analyzer.git

Cloning into 'policy-data-analyzer'...
remote: Enumerating objects: 481, done.[K
remote: Counting objects: 100% (481/481), done.[K
remote: Compressing objects: 100% (309/309), done.[K
remote: Total 2883 (delta 305), reused 326 (delta 171), pack-reused 2402[K
Receiving objects: 100% (2883/2883), 126.70 MiB | 27.15 MiB/s, done.
Resolving deltas: 100% (1463/1463), done.
Checking out files: 100% (843/843), done.


In [32]:
import pandas as pd
import sys
import os
import csv
from sklearn.model_selection import train_test_split
from sentence_transformers import SentencesDataset, SentenceTransformer, InputExample, losses
from sentence_transformers.evaluation import LabelAccuracyEvaluator
from torch import nn, Tensor
from typing import Iterable, Dict
from torch.utils.data import DataLoader
import math
import time
import cupy as cp

# os.chdir("policy-data-analyzer") #If you run this cell more than once, comment out this line because you are ready in this folder and you will get an error
from tasks.data_loader.src.utils import *
from tasks.data_augmentation.src.zero_shot_classification.latent_embeddings_classifier import *
from tasks.evaluate_model.src.model_evaluator import *
from tasks.data_visualization.src.plotting import *

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Fine-tuning the embedding model on the labeled data

### Something we can try out:
https://www.sbert.net/examples/training/data_augmentation/README.html#extend-to-your-own-datasets

### Links:
https://github.com/UKPLab/sentence-transformers/issues/350

https://omoindrot.github.io/triplet-loss

### Possible tasks for fine-tuning:
1) Given a pair of sentence embeddings, do they belong to the same category (binary)?

2) Given a sentence and a category embedding, does the sentence belong to the category (binary)?

3) Given a sentence embedding, use a classifier to predict its category (multiclass) [https://github.com/UKPLab/sentence-transformers/blob/master/examples/training/nli/training_nli.py](https://github.com/UKPLab/sentence-transformers/blob/master/examples/training/nli/training_nli.py)

4) Use a triplet loss approach such that sentences (texts) that have the same labels will become close in vector space, while sentences with a different label will be further away [https://github.com/UKPLab/sentencetransformers/blob/master/examples/training/other/training_batch_hard_trec_continue_training.py](https://github.com/UKPLab/sentencetransformers/blob/master/examples/training/other/training_batch_hard_trec_continue_training.py)
   
#### In this notebook **task number 3** is used to fine-tune the model.

## Run fine tuning experiments

In [5]:
class SoftmaxClassifier(nn.Module):
    """
    This loss adds a softmax classifier on top of the output of the transformer network. 
    It takes a sentence embedding and learns a mapping between it and the corresponding category.
    :param model: SentenceTransformer model
    :param sentence_embedding_dimension: Dimension of your sentence embeddings
    :param num_labels: Number of different labels
    """
    def __init__(self,
                 model: SentenceTransformer,
                 sentence_embedding_dimension: int,
                 num_labels: int):
        super(SoftmaxClassifier, self).__init__()
        self.model = model
        self.num_labels = num_labels
        self.classifier = nn.Linear(sentence_embedding_dimension, num_labels)

    def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor):
        # Get batch sentence embeddings
        features = self.model(sentence_features[0])['sentence_embedding']
        
        # Get batch loss
        output = self.classifier(features)
        loss_fct = nn.CrossEntropyLoss()

        if labels is not None:
            loss = loss_fct(output, labels.view(-1))
            return loss
        else:
            return features, output

In [6]:
# This is the dictionary used to transform policy instrument labels into numeric codes. BEWARE that we have a new category which is 0. This new category might represent unknown incentive or no incentive!!!
policy_dict = {'Credit' : 'Credit',
 'Direct' : 'Direct payment',
 'Fine' : 'Fine',
 'General' : 'Unknown', 
 'Guarantee' : 'Credit', 
 'Supplies' : 'Supplies', 
 'Tax' : 'Tax deduction', 
 'Technical' : 'Technical assistance', 
 'Unknown' : 'Unknown', 
 'Other' : 'Unknown', 
 'Nan' : 'Unknown' }
 
Three_most_common = ['Credit', 'Direct payment', 'Fine']
All_but_unknown = ['Credit', 'Direct payment', 'Fine', 'Supplies', 'Tax deduction', 'Technical assistance']
All = ['Credit', 'Direct payment', 'Fine', 'Supplies', 'Tax deduction', 'Technical assistance', 'Unknown']

# This is the dictionary used to transform is_incentive labels into numeric codes.
incentive_dict = {'Incentive' : 'Incentive', 
'Disincentive' : 'Incentive', 
'Unknown' : 'not_Incentive', 
'Nan' : 'not_Incentive'}

### Experiment codes
<strong>EXP0 -</strong> Multiclass Rater2 with all the labels

<strong>EXP1 -</strong> Multiclass Cristina's old dataset all labels but unknown

<strong>EXP2 -</strong> Multiclass Rater3 all labels but unknown

<strong>EXP3 -</strong> Multiclass Rater2 all labels but unknown

<strong>EXP4 -</strong> Multiclass Rater1 all labels but unknown

<strong>EXP5 -</strong> Multiclass Rater3 all Three_most_common labels

<strong>EXP6 -</strong> Multiclass Rater2 all Three_most_common labels

<strong>EXP7 -</strong> Multiclass Rater1 Three_most_common labels

<strong>EXP8 -</strong>



### Set up variables
Here you set up the main variables of the experiments:
* <strong><i>rater</i></strong> . This is the database you want to work with. The values are:
  * "Rater1" for using the labeled sentences from Daniel
  * "Rater2" for using the labeled sentences from Cristina
  * "Rater3" for using the labeled sentences from Jordi
* <strong><i>Experiment</i></strong> . This is to rename folders and files to be saved in the right place. Please follow the above list to number your experients as "EXPi"
* <strong><i>set_of_labels</i></strong> . In the multiclass classification you can choose to filter the dataset by label. There are three predefined categories which are defined in two cells above:
  * <i>Three_most_common</i> for using Credit Direct Payment and Fine
  * <i>All_but_unknown</i> 
  * <i>All</i>
* <strong><i>set_of_labels_string</i></strong> . Just write "All" or "All_but_unknown" in agreement with the previus variable setting
* <strong><i>dataset_fname</i></strong> . To adjust the path of the data to your particular system setting
* <strong><i>model_names</i></strong> . You put in the list the names of the models to be used.

In [139]:
rater = "Rater1" # TODO: Change accordingly to what is the dataset you want to analyze

Experiment = "EXP7"

set_of_labels = All_but_unknown
set_of_labels_string = "All_but_unknown"

# This first one is the one used by David and Daniel
# dataset_fname = "/content/drive/MyDrive/WRI-LatinAmerica-Talent/Cristina_Policy_Files/Tagged_sentence_lists/{}_labeled.json".format(rater)
# This one is the one used by Jordi
path = "/content/drive/MyDrive/Official Folder of WRI Latin America Project/WRI-LatinAmerica-Talent/Cristina_Policy_Files/Tagged_sentence_lists/datasets/"

model_names = ['stsb-xlm-r-multilingual', 'paraphrase-xlm-r-multilingual-v1']#, 'quora-distilbert-multilingual''distiluse-base-multilingual-cased-v2', 

In [142]:
filename = "dataset_" + rater + "_" + set_of_labels_string + "_sentences.csv"
file = path + filename
with open(file, newline='') as f:
    reader = csv.reader(f)
    all_sents = list(reader)[0]

filename = "dataset_" + rater + "_" + set_of_labels_string + "_labels.csv"
file = path + filename
with open(file, newline='') as f:
    reader = csv.reader(f)
    all_labels = list(reader)[0]

filename = "testset_" + rater + "_" + set_of_labels_string + "_sentences.csv"
file = path + filename
with open(file, newline='') as f:
    reader = csv.reader(f)
    test_sents = list(reader)[0]

filename = "testset_" + rater + "_" + set_of_labels_string + "_labels.csv"
file = path + filename
with open(file, newline='') as f:
    reader = csv.reader(f)
    test_labels = list(reader)[0]

In [143]:
len(all_sents)
all_sents

['El acompañamiento social para el desarrollo comunitario y el acompañamiento técnico para el fortalecimiento de las/los sujetos agrarios y la implementación de los sistemas agroforestales, será realizado por las/los técnicos(as) sociales y productivos(as), mediante el diálogo de saberes basado en el intercambio de conocimientos y experiencias, aprendiendo de la sabiduría de las gentes que han convivido con la naturaleza, especialmente con su territorio específico, por muchas generaciones, y propiciando el diálogo intergeneracional.',
 '25% de descuento en el pago del derecho de aprovechamiento, si el titular de la concesión reporta anualmente a la ARFFS y al SERFOR los resultados de las parcelas permanentes de muestreo que establezca en el área concesionada.',
 'Asegurar disponibilidad de semillas y plantas de calidad.',
 '25% de descuento en el pago del derecho de aprovechamiento, por establecer dos o más especies nativas.',
 'Documentacion Juridica pertinente para llevar a cabo la d

In [None]:
# The rest:
label_names = unique_labels(all_labels)
numeric_labels = labels2numeric(all_labels, label_names)
label_names

In [10]:
# Train test split stratified
all_test_perc = [0.15, 0.2, 0.25, 0.3]

# Output setup
output = {}

for test_perc in all_test_perc:
  output[f"test_perc={test_perc}"] = {}
  X_train, X_test, y_train, y_test = train_test_split(all_sents, all_labels, test_size=test_perc, stratify=all_labels, random_state=69420)

  # Load data samples into batches
  train_batch_size = 16
  label2int = dict(zip(label_names, range(len(label_names))))
  train_samples = []
  for sent, label in zip(X_train, y_train):
      label_id = label2int[label]
      train_samples.append(InputExample(texts=[sent], label=label_id))

  # Configure the dev set evaluator - still need to test whether this works
  dev_samples = []
  for sent, label in zip(X_test, y_test):
      label_id = label2int[label]
      dev_samples.append(InputExample(texts=[sent], label=label_id))
  
  for model_name in model_names:
    # Setup
    model_preds = []
    model_scores = []
    output[f"test_perc={test_perc}"][model_name] = []
    
    # Train set config
    model = SentenceTransformer(model_name)
    train_dataset = SentencesDataset(train_samples, model=model)
    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
    
    # Define the way the loss is computed
    classifier = SoftmaxClassifier(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=len(label2int))
    
    # Dev set config
    dev_dataset = SentencesDataset(dev_samples, model=model)
    dev_dataloader = DataLoader(dev_dataset, shuffle=True, batch_size=train_batch_size)
    dev_evaluator = LabelAccuracyEvaluator(dataloader=dev_dataloader, softmax_model=classifier, name='lae-dev')

    # Configure the training
    max_num_epochs = 10
        
    for num_epochs in range(4, max_num_epochs + 2, 2):
        print("Num epochs:", num_epochs)
        
        warmup_steps = math.ceil(len(train_dataset) * num_epochs / train_batch_size * 0.1)  # 10% of train data for warm-up
        model_deets = f"model={model_name}_test-perc={test_perc}_n-epoch={num_epochs}"
        model_save_path = f"/content/drive/MyDrive/Official Folder of WRI Latin America Project/WRI-LatinAmerica-Talent/Modeling/FineTuningExperiments/{Experiment}/FineTuning_{model_deets}"
        

        # Train the model
        start = time.time()
        model.fit(train_objectives=[(train_dataloader, classifier)],
                  evaluator=dev_evaluator,
                  epochs=2, # We always tune on an extra epoch to see the performance gain
                  evaluation_steps=1000,
                  warmup_steps=warmup_steps
                  )
        
        end = time.time()
        hours, rem = divmod(end-start, 3600)
        minutes, seconds = divmod(rem, 60)
        print("Time taken for fine-tuning:", "{:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds))
        
        ### Classify sentences
        # Projection matrix Z low-dim projection
        print("Classifying sentences...")
        proj_matrix = cp.asnumpy(calc_proj_matrix(test_sents, 50, es_nlp, model, 0.01))
        all_sent_embs = encode_all_sents(test_sents, model, proj_matrix)
        all_label_embs = encode_labels(label_names, model, proj_matrix)
        visualize_embeddings_2D(np.vstack(all_sent_embs), test_labels, tsne_perplexity=50, store_name=f"{model_save_path}/{model_deets}")
        model_preds, model_scores = calc_all_cos_similarity(all_sent_embs, all_label_embs, label_names)
        
        ### Evaluate the model
        numeric_preds = labels2numeric(model_preds, label_names)
        evaluator = ModelEvaluator(label_names, y_true=numeric_labels, y_pred=numeric_preds)
        
        output[f"test_perc={test_perc}"][model_name].append({"num_epochs": num_epochs, "avg_f1": evaluator.avg_f1.tolist()})
        
        evaluator.plot_confusion_matrix(color_map='Blues', exp_name=f"{model_save_path}/{model_deets}")

Num epochs: 4


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=2.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=17.0, style=ProgressStyle(description_wid…

Evaluating:  33%|███▎      | 1/3 [00:00<00:00,  9.02it/s]




Evaluating: 100%|██████████| 3/3 [00:00<00:00,  7.63it/s]


HBox(children=(FloatProgress(value=0.0, description='Iteration', max=17.0, style=ProgressStyle(description_wid…

Evaluating:  33%|███▎      | 1/3 [00:00<00:00,  7.12it/s]




Evaluating: 100%|██████████| 3/3 [00:00<00:00,  7.75it/s]



Time taken for fine-tuning: 00:00:18.83
Classifying sentences...


OutOfMemoryError: ignored

In [None]:
output.keys()

dict_keys(['test_perc=0.15', 'test_perc=0.2', 'test_perc=0.25', 'test_perc=0.3'])

In [11]:
! nvidia-smi

Sat Jan 16 22:16:12 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.27.04    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   62C    P0    25W /  75W |   7609MiB /  7611MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
new_json = {}

for key in output.keys():
  new_json[key] = {}
  for subkey in output[key].keys():
    new_json[key][subkey] = []
    for element in output[key][subkey]:
      el_copy = {"avg_f1": element["avg_f1"], "num_epochs": element["num_epochs"]}
      new_json[key][subkey].append(el_copy)

In [None]:
import json
with open(f"/content/drive/MyDrive/Official Folder of WRI Latin America Project/WRI-LatinAmerica-Talent/Modeling/FineTuningExperiments/{Experiment}/{Experiment}_FineTuningResults.json", "w") as f:
  json.dump(new_json, f)

### Building datasets

This piece of code below has been used to build the test splits and the datasets for the fine tuning. Don't execute it unless you kow what you want :)

In [160]:
rater = "Rater1"
set_of_labels = All
set_of_labels_string = "All"

dataset_fname = "/content/drive/MyDrive/Official Folder of WRI Latin America Project/WRI-LatinAmerica-Talent/Cristina_Policy_Files/Tagged_sentence_lists/{}_labeled.json".format(rater)
dataset = load_file(dataset_fname)
dataset_map = labeled_sentences_from_dataset(dataset) # Labels AND sentences
dataset_map_target_labels =  select_labels(dataset_map, set_of_labels) # Adjust to the labels you want to use for your analysis. In the cell above you have three pre-defined sets.
all_sents = sentences_from_dataset(dataset_map_target_labels) # Just sentences
all_labels = labels_from_dataset(dataset_map_target_labels) # Just labels. 

In [161]:
# # The rest:
label_names = unique_labels(all_labels)
numeric_labels = labels2numeric(all_labels, label_names)
label_names

['Credit',
 'Fine',
 'Direct payment',
 'Supplies',
 'Technical assistance',
 'Unknown',
 'Tax deduction']

In [162]:
X_train, X_test, y_train, y_test = train_test_split(all_sents, all_labels, test_size=0.2, stratify=all_labels, random_state=69420)

In [163]:
# import csv
path = "/content/drive/MyDrive/Official Folder of WRI Latin America Project/WRI-LatinAmerica-Talent/Cristina_Policy_Files/Tagged_sentence_lists/datasets/"

filename = "dataset_{}_{}_sentences.csv".format(rater, set_of_labels_string)
file = path + filename
with open(file, 'w') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    wr.writerow(X_train)

filename = "dataset_{}_{}_labels.csv".format(rater, set_of_labels_string)
file = path + filename
with open(file, 'w') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    wr.writerow(y_train)

filename = "testset_{}_{}_sentences.csv".format(rater, set_of_labels_string)
file = path + filename
with open(file, 'w') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    wr.writerow(X_test)

filename = "testset_{}_{}_labels.csv".format(rater, set_of_labels_string)
file = path + filename
with open(file, 'w') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    wr.writerow(y_test)
