# Note
This notebook can be run on Google Colab for improved performance. The code changes needed to run it there are included as comments above the relevant code.

## Data preprocessing

In [None]:
!pip install scprep

In [None]:
!pip install spacy==2.3.2

In [None]:
# Download the spaCy pipeline `es_core_news_lg`.
# NOTE(review): presumably this is the `spacy_model` handed to the helper functions below — confirm.
! python -m spacy download es_core_news_lg

In [None]:
! pip install sentence_transformers==0.4.0

In [None]:
import pandas as pd
import sys
from sklearn.model_selection import train_test_split
from sentence_transformers import SentencesDataset, SentenceTransformer, InputExample, losses
from sentence_transformers.evaluation import LabelAccuracyEvaluator
from torch import nn, Tensor
from typing import Iterable, Dict
from torch.utils.data import DataLoader
import math

# Mount Google Drive only when the notebook actually runs on Google Colab.
# Outside Colab the `google.colab` package does not exist, so the import is guarded
# to keep this cell runnable locally; `drive` is then left as None.
try:
    from google.colab import drive
except ImportError:
    drive = None
else:
    drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import json
import numpy as np
import matplotlib.pyplot as plt


def load_file(file_name):
    """
    Read a JSON file and return its parsed content.

    Parameters:
        - file_name: path of the JSON file to read

    Returns:
        Whatever `json.load` produces for the file content.
    """
    # Read explicitly as UTF-8 so accented characters do not depend on the platform's default encoding
    with open(file_name, "r", encoding="utf-8") as f:
        return json.load(f)


def labeled_sentences_from_dataset(dataset):
    """
    Collapse the document -> section -> sentences nesting into one flat dictionary.

    Every section's 'sentences' mapping is merged into the result in iteration order,
    so a sentence id that occurs more than once keeps the entry seen last.
    """
    all_sections = [section for document in dataset.values() for section in document.values()]

    flat = {}
    for section in all_sections:
        flat.update(section['sentences'])
    return flat


def numeric_labels_from_dataset(dataset):
    """
    Return the dataset's labels encoded as integers.

    The ids are the positions of the label names in the sorted list of distinct labels.

    TEMPORARY: We're getting the set of unique labels from the data, but in reality we should standardize this and define it in a separate env file.
    """
    raw_labels = labels_from_dataset(dataset)
    label_to_id = {name: idx for idx, name in enumerate(sorted(set(raw_labels)))}
    return [label_to_id[name] for name in raw_labels]


def labels2numeric(labels, label_names):
    """
    Encode each label as the position of its name in `label_names`.

    Raises KeyError for a label that is not listed in `label_names`.
    """
    name_to_id = {name: position for position, name in enumerate(label_names)}

    encoded = []
    for label in labels:
        encoded.append(name_to_id[label])
    return encoded


def unique_labels(all_labels):
    """
    Return the distinct labels as a list, in order of first appearance.

    A plain `set` has no stable order for strings across interpreter runs, which would make
    any label -> id mapping derived from this list change between runs. `dict.fromkeys`
    removes duplicates while keeping a deterministic order.
    """
    return list(dict.fromkeys(all_labels))


def filter_out_labeled_sents(sents, labels_to_filter):
    """
    Return the sentence entries whose first label is NOT in `labels_to_filter`.

    `sents` is a mapping whose values carry a 'labels' list; only the values are returned.
    """
    kept = []
    for sent in sents.values():
        if sent['labels'][0] in labels_to_filter:
            continue
        kept.append(sent)
    return kept


def sort_model_preds(dataset, model_preds):
    """
    Re-key the model predictions so they follow the iteration order of `dataset`.

    Only ids present in `dataset` are kept; an id missing from `model_preds` raises KeyError.
    """
    # Relies on dictionaries preserving insertion order
    return {sentence_id: model_preds[sentence_id] for sentence_id in dataset}


def sentences_from_dataset(dataset):
    """
    Collect the 'text' of every sentence in the dataset, walking documents,
    then sections, then sentences in iteration order.
    """
    return [
        sentence['text']
        for document in dataset.values()
        for section in document.values()
        for sentence in section['sentences'].values()
    ]


def labels_from_dataset(dataset):
    """
    Collect the first label of every sentence in the dataset, walking documents,
    then sections, then sentences in iteration order.
    """
    return [
        sentence['labels'][0]
        for document in dataset.values()
        for section in document.values()
        for sentence in section['sentences'].values()
    ]


def country_labeled_sentences(excel_map):
    """
    Build a {country: {sentence_number: {"text": ..., "labels": [...]}}} dictionary from
    per-country dataframes.

    Parameters:
        - excel_map: mapping of country name -> dataframe with the columns
          "Sentence" and "Primary Instrument"

    Returns:
        A dictionary keyed by country. Sentence numbers are assigned from one running
        counter shared by all countries, so they are unique across the whole result.
        Each sentence keeps only the first of its comma-separated labels, with the
        "(PES)" and "(Bond)" markers removed.
    """
    result = {}
    sent_num = 0

    for country, dataframe in excel_map.items():
        # Drop incomplete rows jointly. Dropping NaNs from each column on its own and zipping
        # afterwards would pair sentences with the labels of other rows whenever a row is
        # missing only one of the two values.
        rows = dataframe[["Sentence", "Primary Instrument"]].dropna()

        sentences = [text.replace("\n", "").strip() for text in rows["Sentence"]]
        labels = [
            raw.replace("(PES)", "").replace("(Bond)", "").strip().split(", ")[0].strip()
            for raw in rows["Primary Instrument"]
        ]

        country_sents = {}
        for sent, label in zip(sentences, labels):
            country_sents[sent_num] = {"text": sent, "labels": [label]}
            sent_num += 1
        result[country] = country_sents

    return result


def labeled_sentences_from_excel(excel_map):
    """
    Return all labeled sentences of all countries as one flat {sentence_number: sentence} dictionary.

    The per-country grouping produced by `country_labeled_sentences` is discarded.
    """
    flat = {}
    for country_sents in country_labeled_sentences(excel_map).values():
        flat.update(country_sents)
    return flat


def sentences_from_model_output(model_preds):
    """Return the "text" field of every prediction entry, in the mapping's iteration order."""
    texts = []
    for entry in model_preds.values():
        texts.append(entry["text"])
    return texts


def labels_from_model_output(model_preds):
    """Return the first label of every prediction entry, in the mapping's iteration order."""
    first_labels = []
    for entry in model_preds.values():
        first_labels.append(entry["labels"][0])
    return first_labels


def get_counts_per_label(y_true, n_classes):
    """
    Count how many data points carry each label.

    Parameters:
        - y_true: a list of integer labels, used directly as list indices
        - n_classes: (integer) the number of classes

    Returns:
        A list of length `n_classes` whose i-th element is the number of occurrences of label i.
    """
    counts = [0 for _ in range(n_classes)]
    for class_id in y_true:
        counts[class_id] = counts[class_id] + 1
    return counts


def merge_labels(all_labels, labels_to_merge):
    """
    Replace every label listed in `labels_to_merge` by one combined label.

    The combined label is the names in `labels_to_merge` joined with " & "
    (e.g. ["A", "B"] -> "A & B"). Any number of labels can be merged; with an empty
    `labels_to_merge` the input labels are returned unchanged.

    Parameters:
        - all_labels: iterable of labels
        - labels_to_merge: the labels that should be collapsed into one

    Returns:
        A new list with the same length and order as `all_labels`.
    """
    # Built once, outside the comprehension, and from all names rather than only the first two
    merged_name = " & ".join(str(label) for label in labels_to_merge)
    return [merged_name if label in labels_to_merge else label for label in all_labels]


def plot_data_distribution(data, label_names, normalize=True):
    """
    Draw a bar chart of how the data points are distributed over the labels and print the values.

    Parameters:
        - data: a list of integer labels
        - label_names: the names of the classes, used as bar positions and tick labels
        - normalize: (boolean) show each label's share of the data instead of its raw count
    """
    weights = np.array(get_counts_per_label(data, len(label_names)))
    if normalize:
        weights = weights / sum(weights)

    plt.bar(label_names, weights)
    plt.xticks(label_names, rotation=90)
    plt.title("Data Distribution")
    plt.xlabel("Label")
    # The bars only hold shares of the data when normalized; raw counts get their own axis label
    plt.ylabel("Percentage of label in data" if normalize else "Number of data points per label")
    plt.show()

    print("Label counts:")
    print(dict(zip(label_names, weights)))


In [None]:
import pandas as pd
from sklearn.metrics import confusion_matrix, precision_recall_curve, f1_score, accuracy_score, precision_score, \
    recall_score, average_precision_score
from sklearn.preprocessing import label_binarize
import numpy as np
import matplotlib.pyplot as plt
import itertools
from itertools import cycle
import matplotlib.colors as mcolors

import sys

# Add the directory two levels up to the module search path.
# NOTE(review): nothing in the visible cells imports from that location — confirm it is still needed.
sys.path.append("../../")

# Column names of the metrics table built in ModelEvaluator.evaluate
METRICS = ["Precision", "Recall (Sensitivity)", "True negative rate (Specificity)", "F1-score"]


class ModelEvaluator:
    def __init__(self, label_names, output_path="../output/", y_true=None, y_pred=None):
        self.label_names = label_names
        self.df_indices = label_names + ["Macro avg", "Weighted avg"]
        self.output_path = output_path
        self.n_classes = len(label_names)

        if y_true is not None and y_pred is not None:
            self.update(y_true, y_pred)

    def update(self, y_true, y_pred):
        """
        Given a set of true labels and model predictions, calculate and store the following metrics:
            - Confusion matrix
            - FP: Number of false positives
            - FN: Number of false negatives
            - TP: Number of true positives
            - TN: Number of true negatives
            - Recall
            - Specificity
            - Precision
            - F1-score
            - Accuracy
            - FDR: False discovery rate
            - NPV: Negative predictive value
            - FPR: False positive rate
            - False negative rate
        """
        # Ignore division by 0 errors
        settings = np.seterr(divide='ignore', invalid='ignore')

        # ---- Set up raw components ----
        self.cm = confusion_matrix(y_true, y_pred)
        self.FP = self.cm.sum(axis=0) - np.diag(self.cm)
        self.FN = self.cm.sum(axis=1) - np.diag(self.cm)
        self.TP = np.diag(self.cm)
        self.TN = self.cm.sum() - (self.FP + self.FN + self.TP)

        # ---- Useful metrics at the class level ----
        self.recall = self.TP / (self.TP + self.FN)
        self.specificity = self.TN / (self.TN + self.FP)
        self.precision = self.TP / (self.TP + self.FP)
        self.f1 = (2 * self.precision * self.recall) / (self.precision + self.recall)

        # ---- Useful metrics across all classes ----
        self.avg_precision = np.array(
            [precision_score(y_true, y_pred, average='macro'), precision_score(y_true, y_pred, average='weighted')])
        self.avg_recall = np.array(
            [recall_score(y_true, y_pred, average='macro'), recall_score(y_true, y_pred, average='weighted')])
        self.avg_specificity = np.array([np.mean(self.specificity), self.weighted_avg(self.specificity, y_true, )])
        self.avg_f1 = np.array(
            [f1_score(y_true, y_pred, average='macro'), f1_score(y_true, y_pred, average='weighted')])
        self.accuracy = accuracy_score(y_true, y_pred)
        self.acc = np.array(["-----", "-----", "-----", self.accuracy])

        # ---- Extra metrics at the class level ----
        self.FDR = self.FP / (self.TP + self.FP)  # False discovery rate
        self.NPV = self.TN / (self.TN + self.FN)  # Negative predictive value
        self.FPR = self.FP / (self.FP + self.TN)  # Fall out or false positive rate
        self.FNR = self.FN / (self.TP + self.FN)  # False negative rate


    def evaluate(self, y_true, y_pred, plot_cm=False, normalize=False, exp_name=None):
        """
        Given a set of true labels and model predictions, runs a series of selected evaluation metrics:
            - Precision
            - Recall (Sensitivity)
            - Accuracy
            - Specificity
            - Confusion matrix
            - Precision-Recall curve

        Parameters:
            `plot_cm`: (boolean) Plot confusion matrix
            `plot_prc`: (boolean) Plot precision-recall curve (averaged for all classes)
            `plot_prc_multi`: (boolean) Plot the multi-class version of the precision-recall curve (`plot_prc` MUST be `True` if this is set to `True`)
            `normalize`: (boolean) Normalize the confusion matrix content
            `store`: (boolean) Store the plots and the results dataframe. If this is set to `True`, then `exp_name` MUST have a value and it can't be None. The files will be stored in the `evaluate_model/output/` folder.
            `exp_name`: (str) The name of the model or the experiment, useful if we will want to store files (e.g `test_BETO_1`).
        """

        self.update(y_true, y_pred)

        data = np.stack((self.precision, self.recall, self.specificity, self.f1)).T
        avgs = np.array([self.avg_precision, self.avg_recall, self.avg_specificity, self.avg_f1]).T
        data_with_avgs = np.concatenate((data, avgs))

        metrics_df = pd.DataFrame(data_with_avgs, index=self.df_indices, columns=METRICS)
        metrics_df = metrics_df.applymap(lambda x: round(x, 2))
        metrics_df.loc['Accuracy'] = self.acc

        line = pd.DataFrame(dict(zip(METRICS, ["-----"] * len(METRICS))), index=["-----"])
        metrics_df = pd.concat([metrics_df.iloc[:self.n_classes], line, metrics_df.iloc[self.n_classes:]])
        self.metrics_df = metrics_df.fillna(0)

        if plot_cm:
            self.plot_confusion_matrix(color_map="Blues",
                                       normalize=normalize,
                                       exp_name=f"{self.output_path}{exp_name}")

        if exp_name:
            fname = f"{self.output_path + exp_name}_results.csv"
            self.metrics_df.to_csv(fname)
            print(f"Stored results: {fname}")

        return self.metrics_df

    def get_counts_per_label(self, y_true):
        """
        Return a map of {label: number of data points with that label} for the given list of labels

        Parameters:
            - y_true: a list of labels (integers)
        """
        label_counts = [0] * self.n_classes
        for label in y_true:
            label_counts[label] += 1
        return label_counts

    def weighted_avg(self, metric_array, y_true):
        """
        Given a numpy array of a particular metric for all classes (i.e precision for all classes),
        return a weighted average of the metric, where the weights are the number of data points that
        have a given label.

        Parameters:
            - metric array: a 1D-numpy array of floats representing metrics
            - y_true: a list of labels (integers)
        """
        weights = self.get_counts_per_label(y_true)
        weighted_metrics = sum(metric_array * weights)
        return weighted_metrics / len(y_true)


    def plot_confusion_matrix(self, title='Confusion matrix',
                              color_map=None,
                              normalize=True,
                              exp_name=None):
        """
        Adapted from: https://stackoverflow.com/questions/19233771/sklearn-plot-confusion-matrix-with-labels
        """
        if color_map is None:
            color_map = plt.get_cmap('Blues')

        plt.figure(figsize=(8, 6))
        plt.imshow(self.cm, interpolation='nearest', cmap=color_map)

        plt.title(title)
        plt.colorbar()
        plt.style.use('seaborn-white')

        if self.label_names:
            tick_marks = np.arange(len(self.label_names))
            plt.xticks(tick_marks, self.label_names, rotation=45)
            plt.yticks(tick_marks, self.label_names)

        if normalize:
            self.cm = self.cm.astype('float') / self.cm.sum(axis=1)[:, np.newaxis]

        thresh = self.cm.max() / 1.5 if normalize else self.cm.max() / 2
        for i, j in itertools.product(range(self.cm.shape[0]), range(self.cm.shape[1])):
            if normalize:
                plt.text(j, i, "{:0.2f}".format(self.cm[i, j]),
                         horizontalalignment="center",
                         color="white" if self.cm[i, j] > thresh else "black")
            else:
                plt.text(j, i, "{:,}".format(self.cm[i, j]),
                         horizontalalignment="center",
                         color="white" if self.cm[i, j] > thresh else "black")

        plt.tight_layout()
        plt.xlabel('Predicted label')
        plt.ylabel('True label')

        if exp_name:
            fname = f"{exp_name}_cm.png"
            plt.savefig(fname)
            print(f"Stored confusion matrix: {fname}")

        plt.show()

    def plot_precision_recall_curve(self, y_true, y_pred, bin_class=False, all_classes=False,
                                    exp_name=None):
        """
        Plots the precision-recall curve for either a binary or multi-class classification model.

        Parameters:
            y_true: (np.array[int]) The true labels for the dataset
            y_pred: If binary_class is True, y_pred should be a numpy array of floats holding the prediction probabilities,
                else y_pred should be a numpy array of ints holding the predictions themselves
            bin_class: (boolean) Whether we should plot the curve for a binary classification setting
            all_classes: (boolean) In a multi-class classification problem, whether we should plot the curve for all classes or just the average
            store: (boolean) Whether we want to store this plot or not
            exp_name: (str) The name of the experiment, used to name the file to store the plot. Requires store=True
        """

        if bin_class:
            if not isinstance(y_pred.flat[0], np.floating):
                print("Error: Array of predictions should contain probabilities [0.3, 0.75] instead of labels [0, 1] for binary classification problems.")
                return

            random_pred_precision = y_true.mean()

            precision, recall, _ = precision_recall_curve(y_true, y_pred)
            average_precision = average_precision_score(y_true, y_pred)

            plt.figure()
            plt.plot([0, 1], [random_pred_precision, random_pred_precision], linestyle='--', label='Random Prediction')
            plt.step(recall, precision, where='post')

            plt.xlabel('Recall')
            plt.ylabel('Precision')

            plt.ylim([0.0, 1.05])
            plt.xlim([0.0, 1.0])
            plt.title('Precision-Recall Curve. Avg Precision=' + str(round(average_precision, 2)))

            if exp_name:
                fname = f"{self.output_path}{exp_name}_prc.png"
                plt.savefig(fname)
                print(f"Stored Precision-Recall Curve: {fname}")

            plt.show()

        else:
            y_true_bin = label_binarize(y_true, classes=range(self.n_classes))
            y_pred_bin = label_binarize(y_pred, classes=range(self.n_classes))

            precision = dict()
            recall = dict()
            average_precision = dict()

            for i in range(self.n_classes):
                precision[i], recall[i], _ = precision_recall_curve(y_true_bin[:, i],
                                                                    y_pred_bin[:, i])
                average_precision[i] = average_precision_score(y_true_bin[:, i], y_pred_bin[:, i])

            # A "micro-average": quantifying score on all classes jointly
            precision["micro"], recall["micro"], _ = precision_recall_curve(y_true_bin.ravel(),
                                                                            y_pred_bin.ravel())
            average_precision["micro"] = average_precision_score(y_true_bin, y_pred_bin,
                                                                 average="micro")

            random_pred_precision = y_true_bin.mean()

            if all_classes:

                # Setup plot details
                colors = cycle(list(mcolors.TABLEAU_COLORS.keys()))
                plt.figure(figsize=(7, 8))
                plt.style.use('seaborn-white')

                # Plot f1 score lines
                f_scores = np.linspace(0.2, 0.8, num=4)
                lines = []
                labels = []
                for f_score in f_scores:
                    x = np.linspace(0.01, 1)
                    y = f_score * x / (2 * x - f_score)
                    l, = plt.plot(x[y >= 0], y[y >= 0], color='gray', alpha=0.2)
                    plt.annotate('f1={0:0.1f}'.format(f_score), xy=(0.9, y[45] + 0.02))

                # Plot precision-recall lines
                lines.append(l)
                labels.append('iso-f1 curves')
                l, = plt.plot(recall["micro"], precision["micro"], color='gold', lw=2)
                lines.append(l)
                labels.append('Micro-average Precision-Recall (area = {0:0.2f})'
                              ''.format(average_precision["micro"]))

                for i, color in zip(range(self.n_classes), colors):
                    l, = plt.plot(recall[i], precision[i], color=color, lw=2)
                    lines.append(l)
                    labels.append('Precision-Recall for class {0} (area = {1:0.2f})'
                                  ''.format(i, average_precision[i]))

                rand_l, = plt.plot([0, 1], [random_pred_precision, random_pred_precision], linestyle='--')
                lines.append(rand_l)
                labels.append("Precision-Recall for Random Classifier")

                # Final touches on plot
                fig = plt.gcf()
                fig.subplots_adjust(bottom=0.25)
                plt.xlim([0.0, 1.0])
                plt.ylim([0.0, 1.05])
                plt.xlabel('Recall')
                plt.ylabel('Precision')
                plt.title('Multiclass Precision-Recall Curve')
                plt.legend(lines, labels, loc=(0, -.68), prop=dict(size=14))

                if exp_name:
                    fname = f"{self.output_path}{exp_name}_prc.png"
                    plt.savefig(fname)
                    print(f"Stored Precision-Recall Curve: {fname}")
                plt.show()

            else:
                plt.figure()
                plt.plot([0, 1], [random_pred_precision, random_pred_precision], linestyle='--', label='Random Prediction')
                plt.step(recall["micro"], precision["micro"], where='post')

                plt.xlabel('Recall')
                plt.ylabel('Precision')

                plt.ylim([0.0, 1.05])
                plt.xlim([0.0, 1.0])
                plt.title('Averaged Precision-Recall Curve')

                if exp_name:
                    fname = f"{self.output_path}{exp_name}_prc.png"
                    plt.savefig(fname)
                    print(f"Stored Precision-Recall Curve: {fname}")

                plt.show()

    def plot_data_distribution(self, data, normalize=True):
        weights = np.array(self.get_counts_per_label(data))
        if normalize:
            weights = weights / sum(weights)

        plt.bar(self.label_names, weights)
        plt.xticks(self.label_names, rotation=90)
        plt.title("Data Distribution")
        plt.xlabel("Label")
        plt.ylabel("Percentage of label in data")
        plt.show()

        print("Label counts:")
        print(dict(zip(self.label_names, weights)))


In [None]:
def country_labeled_sentences(excel_map):
    """
    Build a {country: {sentence_number: {"text": ..., "labels": [...]}}} dictionary from
    per-country dataframes.

    NOTE(review): a function with this name is also defined in an earlier cell; this
    definition shadows it once the cell is run.

    Parameters:
        - excel_map: mapping of country name -> dataframe with the columns
          "Sentence" and "Primary Instrument"

    Returns:
        A dictionary keyed by country. Sentence numbers are assigned from one running
        counter shared by all countries, so they are unique across the whole result.
        Each sentence keeps only the first of its comma-separated labels, with the
        "(PES)" and "(Bond)" markers removed.
    """
    result = {}
    sent_num = 0

    for country, dataframe in excel_map.items():
        # Drop incomplete rows jointly. Dropping NaNs from each column on its own and zipping
        # afterwards would pair sentences with the labels of other rows whenever a row is
        # missing only one of the two values.
        rows = dataframe[["Sentence", "Primary Instrument"]].dropna()

        sentences = [text.replace("\n", "").strip() for text in rows["Sentence"]]
        labels = [
            raw.replace("(PES)", "").replace("(Bond)", "").strip().split(", ")[0].strip()
            for raw in rows["Primary Instrument"]
        ]

        country_sents = {}
        for sent, label in zip(sentences, labels):
            country_sents[sent_num] = {"text": sent, "labels": [label]}
            sent_num += 1
        result[country] = country_sents

    return result

def merge_labels(all_labels, labels_to_merge):
    """
    Replace every label listed in `labels_to_merge` by one combined label.

    The combined label is the names in `labels_to_merge` joined with " & "
    (e.g. ["A", "B"] -> "A & B"). Any number of labels can be merged; with an empty
    `labels_to_merge` the input labels are returned unchanged.

    NOTE(review): a function with this name is also defined in an earlier cell; this
    definition shadows it once the cell is run.

    Parameters:
        - all_labels: iterable of labels
        - labels_to_merge: the labels that should be collapsed into one

    Returns:
        A new list with the same length and order as `all_labels`.
    """
    # Built once, outside the comprehension, and from all names rather than only the first two
    merged_name = " & ".join(str(label) for label in labels_to_merge)
    return [merged_name if label in labels_to_merge else label for label in all_labels]


In [None]:
from collections import Counter
import numpy as np
import torch
from torch.nn import functional as F
from tqdm import tqdm


def top_k_words(k, document, spacy_model, include_labels=None):
    """
    Return the k most frequent content words of a document.

    Parameters:
        - k: number of words to return
        - document: (str) the text to analyse
        - spacy_model: callable that turns the text into an iterable of tokens exposing
          `text`, `is_stop` and `is_punct`
        - include_labels: optional list of extra entries appended after the frequent words

    Returns:
        A list with the (lower-cased) most frequent words, most frequent first, followed by
        `include_labels` if given. The list is empty when no token qualifies.
    """
    doc = spacy_model(document)

    # all tokens that arent stop words or punctuations and are longer than 3 letters
    words = [token.text.lower() for token in doc if not token.is_stop and not token.is_punct and len(token.text) > 3]

    # k most common tokens; unpack the (word, count) pairs directly so that a document
    # without any qualifying token yields an empty list instead of an IndexError
    result = [word for word, _count in Counter(words).most_common(k)]

    if include_labels:
        result.extend(include_labels)

    return result


def top_k_word_embeddings(top_k_words, spacy_model):
    """
    Embed every word with the spaCy model.

    Each word is run through `spacy_model` and its `.vector` is reshaped into a single-row
    2-D array, so the results can be stacked vertically.
    """
    vectors = (spacy_model(word).vector for word in top_k_words)
    return [vec.reshape(1, vec.shape[0]) for vec in vectors]


def top_k_sbert_embeddings(top_k_words, sbert_model):
    """
    Embed every word with the sentence-embedding model.

    Each word is encoded on its own (as a one-element list) and the numpy result of each
    call is collected in a list.
    """
    return [sbert_model.encode([word], convert_to_numpy=True) for word in top_k_words]


def least_squares_with_reg(X, y, lamda=0.01):
    """
    Multiple linear regression with OLS parameter estimation and an L2 regularization term.

    Computes W = (X^T X + lamda * I)^-1 X^T y. lamda = 0 is equivalent to OLS estimation
    without regularization.

    Help from: https://stackoverflow.com/questions/27476933/numpy-linear-regression-with-regularization and https://www.kdnuggets.com/2016/11/linear-regression-least-squares-matrix-multiplication-concise-technical-overview.html

    Parameters:
        - X: 2D numpy array (samples x input features)
        - y: numpy array of targets with one row per sample
        - lamda: regularization strength

    Returns:
        The weights as a numpy array with one row per input feature.
    """
    # The previous version wrapped the result in `cupy.array`, but cupy is never imported
    # in this notebook; plain numpy is used throughout instead.
    gram = X.T.dot(X) + lamda * np.eye(X.shape[1])

    # Solving the linear system gives the same result as multiplying by the explicit
    # inverse, without forming the inverse.
    return np.linalg.solve(gram, X.T.dot(y))


def calc_proj_matrix(sentences, k, spacy_model, sbert_model, lamda=0.01, include_labels=None):
    """
    Fit the matrix that maps sentence-model embeddings onto spaCy word embeddings.

    The sentences are joined into one text, its k most frequent words (plus `include_labels`)
    are embedded with both models, and a regularized least-squares fit from the
    sentence-model embeddings to the spaCy embeddings is returned.
    """
    corpus = ". ".join(sentences)
    vocabulary = top_k_words(k, corpus, spacy_model, include_labels)

    targets = np.vstack(top_k_word_embeddings(vocabulary, spacy_model))
    inputs = np.vstack(top_k_sbert_embeddings(vocabulary, sbert_model))

    return least_squares_with_reg(inputs, targets, lamda)


def encode_sentence(sentence, model, Z):
    """
    Encode a sentence with `model`, project it with the matrix `Z` and return the
    result as a single-row torch tensor.
    """
    projected = np.matmul(model.encode(sentence), Z)
    as_tensor = torch.from_numpy(projected)
    return as_tensor.reshape((1, as_tensor.shape[0]))


def encode_labels(labels, model, Z):
    """
    Encode all labels with `model`, project them with the matrix `Z` and return the
    result as a torch tensor.
    """
    label_embeddings = model.encode(labels)
    projected = np.matmul(label_embeddings, Z)
    return torch.from_numpy(projected)


def classify_sentence(sentence, label_names, model, Z):
    """
    Classify one sentence: return the label whose projected embedding has the highest
    cosine similarity with the projected sentence embedding, together with that similarity.
    """
    return calc_cos_similarity(
        encode_sentence(sentence, model, Z),
        encode_labels(label_names, model, Z),
        label_names,
    )


def calc_cos_similarity(sentence_rep, label_reps, label_names):
    """
    Compare a sentence representation with all label representations.

    Returns:
        A tuple (name of the most similar label, its cosine similarity).
    """
    scores = F.cosine_similarity(sentence_rep, label_reps)
    # Position 0 of the descending ranking is the best-matching label
    best = scores.argsort(descending=True)[0]
    return label_names[best], scores[best]


def classify_sentence_given_label_reps(sentence, label_names, label_reps, model, Z):
    """
    Classify one sentence against label representations that were computed beforehand.

    Returns the most similar label name and its cosine similarity.
    """
    projected_sentence = encode_sentence(sentence, model, Z)
    return calc_cos_similarity(projected_sentence, label_reps, label_names)


def calc_all_cos_similarity(all_sents_reps, label_reps, label_names):
    """
    Classify every sentence representation against the label representations.

    Returns:
        Two parallel lists: the predicted label names and the corresponding similarities.
    """
    outcomes = [calc_cos_similarity(rep, label_reps, label_names) for rep in tqdm(all_sents_reps)]

    model_preds = [pred for pred, _ in outcomes]
    model_scores = [score for _, score in outcomes]
    return model_preds, model_scores


def classify_all_sentences(all_sents, label_names, sbert_model, proj_matrix):
    """
    Classify every sentence, encoding the labels only once up front.

    Returns:
        Two parallel lists: the predicted label names and the corresponding similarities.
    """
    label_reps = encode_labels(label_names, sbert_model, proj_matrix)

    outcomes = [
        classify_sentence_given_label_reps(sent, label_names, label_reps, sbert_model, proj_matrix)
        for sent in tqdm(all_sents)
    ]

    model_preds = [pred for pred, _ in outcomes]
    model_scores = [score for _, score in outcomes]
    return model_preds, model_scores


def encode_all_sents(all_sents, sbert_model, proj_matrix=None):
    """
    Encode every sentence, optionally projecting it with `proj_matrix`.

    Returns:
        A list with one single-row torch tensor per sentence.
    """
    if proj_matrix is not None:
        rows = [encode_sentence(sent, sbert_model, proj_matrix) for sent in tqdm(all_sents)]
    else:
        rows = [sbert_model.encode(sent) for sent in tqdm(all_sents)]

    stacked = np.vstack(rows)
    return [torch.from_numpy(row).reshape((1, row.shape[0])) for row in stacked]


In [None]:
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scprep


def visualize_embeddings_2D(embs, numeric_labels, tsne_perplexity, pca_k_n_comps=None, seed=69420, store_name=None):
    """
    Plot 2-D projections of the embeddings side by side, coloured by label.

    Up to three panels are drawn: a 2-component PCA, a 2-component t-SNE and, when
    `pca_k_n_comps` is given, a t-SNE computed on a PCA reduction to that many components.

    Parameters:
        - embs: the embeddings to project, passed as-is to `fit_transform`
        - numeric_labels: one label per embedding, used to colour the points
        - tsne_perplexity: perplexity handed to t-SNE
        - pca_k_n_comps: number of PCA components for the optional third panel
        - seed: random state used for PCA and t-SNE
        - store_name: when given, the figure is saved as `<store_name>_viz.png`
    """
    df = pd.DataFrame()
    df["y"] = np.array(numeric_labels)

    # Data for plot 1
    pca = PCA(n_components=2, random_state=seed)
    pca_result = pca.fit_transform(embs)
    df['pca-1'] = pca_result[:, 0]
    df['pca-2'] = pca_result[:, 1]

    # Data for plot 2
    tsne = TSNE(n_components=2, verbose=1, perplexity=tsne_perplexity, n_iter=1000, random_state=seed)
    tsne_results = tsne.fit_transform(embs)
    df["tsne-1"] = tsne_results[:, 0]
    df["tsne-2"] = tsne_results[:, 1]

    # Actual plotting
    plt.figure(figsize=(24, 4))
    ax1 = plt.subplot(1, 3, 1)
    sns.scatterplot(
        x="pca-1", y="pca-2",
        hue=df.y.tolist(),
        palette="bright",
        data=df,
        legend=False,
        ax=ax1
    ).set(title="PCA projection")

    ax2 = plt.subplot(1, 3, 2)
    # The legend is only drawn on this panel when there is no third panel to carry it
    sns.scatterplot(
        x="tsne-1", y="tsne-2",
        hue=df.y.tolist(),
        palette="bright",
        data=df,
        legend=False if pca_k_n_comps else "auto",
        ax=ax2
    ).set(title="t-SNE projection")

    if pca_k_n_comps:
        # Data for plot 3
        pca_k = PCA(n_components=pca_k_n_comps, random_state=seed)
        pca_k_result = pca_k.fit_transform(embs)
        tsne = TSNE(n_components=2, verbose=1, perplexity=tsne_perplexity, n_iter=1000, random_state=seed)
        tsne_pca_results = tsne.fit_transform(pca_k_result)
        df[f"tsne-pca-{pca_k_n_comps}-1"] = tsne_pca_results[:, 0]
        df[f"tsne-pca-{pca_k_n_comps}-2"] = tsne_pca_results[:, 1]

        # Actual plotting
        ax3 = plt.subplot(1, 3, 3)
        sns.scatterplot(
            x=f"tsne-pca-{pca_k_n_comps}-1", y=f"tsne-pca-{pca_k_n_comps}-2",
            hue=df.y.tolist(),
            palette="bright",
            data=df,
            ax=ax3
        ).set(title="t-SNE on PCA projection")

    # Anchor the legend just outside the top-right corner of the current (last drawn) panel
    plt.legend(bbox_to_anchor=(1.01, 1), borderaxespad=0)
    
    if store_name:
        plt.savefig(store_name + "_viz.png")


def visualize_PCA_embeddings_3D(embs, labels, fname=None, seed=69420):
    """
    Reduce the embeddings to 3 PCA components and show them as a rotating 3-D scatter plot,
    coloured by label. `fname` is forwarded to scprep as the output file name.
    """
    reduced = PCA(n_components=3, random_state=seed).fit_transform(embs)
    points = np.vstack([reduced[:, axis] for axis in range(3)]).T

    return scprep.plot.rotate_scatter3d(
        points,
        c=np.array(labels),
        figsize=(10, 8),
        title="PCA 3 components",
        legend_anchor=(1.01, 1),
        filename=fname,
    )


def visualize_tSNE_embeddings_3D(embs, labels, perplexity=50, fname=None, seed=69420):
    """
    Reduce the embeddings to 3 dimensions with t-SNE and show them as a rotating 3-D scatter
    plot, coloured by label. `fname` is forwarded to scprep as the output file name.
    """
    reducer = TSNE(n_components=3, verbose=1, perplexity=perplexity, n_iter=1000, random_state=seed)
    reduced = reducer.fit_transform(embs)
    points = np.vstack([reduced[:, axis] for axis in range(3)]).T

    return scprep.plot.rotate_scatter3d(
        points,
        c=np.array(labels),
        figsize=(10, 8),
        title=f"t-SNE {perplexity} perplexity",
        legend_anchor=(1.01, 1),
        filename=fname,
    )


## Fine-tuning the embedding model on the labeled data

### Something we can try out:
https://www.sbert.net/examples/training/data_augmentation/README.html#extend-to-your-own-datasets

### Links:
https://github.com/UKPLab/sentence-transformers/issues/350

https://omoindrot.github.io/triplet-loss

### Possible tasks for fine-tuning:
1) Given a pair of sentence embeddings, do they belong to the same category (binary)?

2) Given a sentence and a category embedding, does the sentence belong to the category (binary)?

3) Given a sentence embedding, use a classifier to predict its category (multiclass) [https://github.com/UKPLab/sentence-transformers/blob/master/examples/training/nli/training_nli.py](https://github.com/UKPLab/sentence-transformers/blob/master/examples/training/nli/training_nli.py)

4) Use a triplet loss approach such that sentences (texts) that have the same labels will become close in vector space, while sentences with a different label will be further away [https://github.com/UKPLab/sentence-transformers/blob/master/examples/training/other/training_batch_hard_trec_continue_training.py](https://github.com/UKPLab/sentence-transformers/blob/master/examples/training/other/training_batch_hard_trec_continue_training.py)
   
#### In this notebook **task number 3** is used to fine-tune the model.

In [None]:
# Stratified train/test split: 15% of the data is held out and the label proportions are
# preserved in both parts; the fixed random_state makes the split reproducible.
# NOTE(review): `all_sents` and `all_labels` are not defined in any cell visible above —
# confirm where they are loaded so the notebook survives "Restart & Run All".
X_train, X_test, y_train, y_test = train_test_split(all_sents, all_labels, test_size=0.15, stratify=all_labels, random_state=42)

In [None]:
# Define model to fine-tune: load the sentence-embedding model by name
model = SentenceTransformer('stsb-xlm-r-multilingual')
# Alternative model name, kept for reference:
# model = SentenceTransformer('xlm-r-100langs-bert-base-nli-stsb-mean-tokens')

In [None]:
class SoftmaxClassifier(nn.Module):
    """
    Training objective that stacks a linear softmax classifier on top of the
    sentence embedding produced by the wrapped transformer network, so that the
    network learns to map a sentence to its category.

    :param model: SentenceTransformer model
    :param sentence_embedding_dimension: Dimension of your sentence embeddings
    :param num_labels: Number of different labels
    """
    def __init__(self,
                 model: SentenceTransformer,
                 sentence_embedding_dimension: int,
                 num_labels: int):
        super().__init__()
        self.model = model
        self.num_labels = num_labels
        # Single linear layer mapping an embedding to one logit per label
        self.classifier = nn.Linear(sentence_embedding_dimension, num_labels)

    def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor):
        """
        Embed the first (and only used) text of each example and classify it.

        :param sentence_features: tokenized inputs; only element 0 is read
        :param labels: target class ids, or None to get predictions instead of a loss
        :return: the cross-entropy loss when labels are given, otherwise the
                 tuple (sentence embeddings, classifier logits)
        """
        embeddings = self.model(sentence_features[0])['sentence_embedding']
        logits = self.classifier(embeddings)

        # Inference mode: hand back the raw embeddings and logits
        if labels is None:
            return embeddings, logits

        # Training mode: labels are flattened to 1-D before computing the loss
        return nn.CrossEntropyLoss()(logits, labels.view(-1))

In [None]:
# Batch size shared by the train and dev loaders
train_batch_size = 16

# Map every label name to an integer class id
label2int = {name: idx for idx, name in enumerate(label_names)}

# Wrap each training sentence as a single-text example and batch them
train_samples = [
    InputExample(texts=[text], label=label2int[tag])
    for text, tag in zip(X_train, y_train)
]
train_dataset = SentencesDataset(train_samples, model=model)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)

# Loss: softmax classifier on top of the sentence embeddings
classifier = SoftmaxClassifier(
    model=model,
    sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
    num_labels=len(label2int),
)

# Dev set evaluator built from the held-out split - still need to test whether this works
dev_samples = [
    InputExample(texts=[text], label=label2int[tag])
    for text, tag in zip(X_test, y_test)
]
dev_dataset = SentencesDataset(dev_samples, model=model)
dev_dataloader = DataLoader(dev_dataset, shuffle=True, batch_size=train_batch_size)
dev_evaluator = LabelAccuracyEvaluator(
    dataloader=dev_dataloader,
    softmax_model=classifier,
    name='lae-dev',
)

In [None]:
# Training configuration
num_epochs = 1

# Warm up the learning rate over roughly the first 10% of the training steps
warmup_steps = math.ceil(
    len(train_dataset) * num_epochs / train_batch_size * 0.1
)

# Where the fine-tuned model is written (local path; Colab alternative below)
model_save_path = "../../output/FineTuning"
# model_save_path = "/content/drive/MyDrive/WRI-LatinAmerica-Talent/Modeling/FineTuning"

In [None]:
# Fine-tune the encoder with the softmax objective; the dev evaluator is run
# every 1000 steps and the model is written to model_save_path.
model.fit(
    train_objectives=[(train_dataloader, classifier)],
    evaluator=dev_evaluator,
    epochs=num_epochs,
    evaluation_steps=1000,
    warmup_steps=warmup_steps,
    output_path=model_save_path,
)

In [None]:
# Reload the fine-tuned model from model_save_path and, as a quick sanity check,
# compute the embedding of the first sentence in all_sents (displayed as the cell output).
load_model = SentenceTransformer(model_save_path)
load_model.encode(all_sents[0])

## Run fine tuning experiments

In [None]:
# NOTE(review): this class is defined a second time here; an identical definition
# already exists in the earlier fine-tuning cell. Presumably it is repeated so this
# section can be run on its own - consider keeping a single definition.
class SoftmaxClassifier(nn.Module):
    """
    Training objective that stacks a linear softmax classifier on top of the
    sentence embedding produced by the wrapped transformer network, so that the
    network learns to map a sentence to its category.

    :param model: SentenceTransformer model
    :param sentence_embedding_dimension: Dimension of your sentence embeddings
    :param num_labels: Number of different labels
    """
    def __init__(self,
                 model: SentenceTransformer,
                 sentence_embedding_dimension: int,
                 num_labels: int):
        super().__init__()
        self.model = model
        self.num_labels = num_labels
        # Single linear layer mapping an embedding to one logit per label
        self.classifier = nn.Linear(sentence_embedding_dimension, num_labels)

    def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor):
        """
        Embed the first (and only used) text of each example and classify it.

        :param sentence_features: tokenized inputs; only element 0 is read
        :param labels: target class ids, or None to get predictions instead of a loss
        :return: the cross-entropy loss when labels are given, otherwise the
                 tuple (sentence embeddings, classifier logits)
        """
        embeddings = self.model(sentence_features[0])['sentence_embedding']
        logits = self.classifier(embeddings)

        # Inference mode: hand back the raw embeddings and logits
        if labels is None:
            return embeddings, logits

        # Training mode: labels are flattened to 1-D before computing the loss
        return nn.CrossEntropyLoss()(logits, labels.view(-1))

In [None]:
# Read the tagged-policies workbook from Google Drive.
# sheet_name=None makes pandas load every sheet (one DataFrame per sheet name).
data_excel = pd.read_excel(
    "/content/drive/MyDrive/WRI-LatinAmerica-Talent/Cristina_Policy_Files/WRI_Policy_Tags.xlsx",
    engine="openpyxl",
    sheet_name=None,
)

In [None]:
# Formatting the data: merge the dictionaries returned by
# country_labeled_sentences into one flat dict of labeled sentences.
all_labeled_sentences = country_labeled_sentences(data_excel)
labeled_sents = dict()
for sents in all_labeled_sentences.values():
    labeled_sents.update(sents)

# Filtering out sentences whose first label is "General incentive", "Unknown" or "Other"
filtered_sents_maps = [sent for sent in labeled_sents.values() if sent['labels'][0] not in ["General incentive", "Unknown", "Other"]]

# Only the first label of every sentence is kept as its category
all_sents = [sent['text'] for sent in filtered_sents_maps]
all_labels = [sent['labels'][0] for sent in filtered_sents_maps]
all_labels = merge_labels(all_labels, ["Credit", "Guarantee"])

# Sort the unique labels: iterating a set of strings yields an order that can
# change from one kernel session to the next, which would silently change the
# label -> id mapping (and the confusion-matrix axes) between runs.
label_names = sorted(set(all_labels))
numeric_labels = labels2numeric(all_labels, label_names)
label_names

['Credit & Guarantee',
 'Fine',
 'Direct payment',
 'Technical assistance',
 'Supplies',
 'Tax deduction']

Reading files from JSON

In [None]:
# Path of the JSON dataset to load.
# NOTE(review): left empty here, so load_file() cannot open anything until a real path is filled in.
dataset_fname = ""

# NOTE(review): the two sections below read like mutually exclusive alternatives, but as
# written both run in sequence and the second one overwrites dataset, dataset_map,
# all_sents and all_labels set by the first - presumably only the one matching the
# file format should be kept; confirm before running.

# If the json is in the format that includes headers and other titles:
dataset = load_file(dataset_fname)
dataset_map = labeled_sentences_from_dataset(dataset) # Labels AND sentences
all_sents = sentences_from_dataset(dataset) # Just sentences
all_labels = labels_from_dataset(dataset) # Just labels

# If the json is in the format where it only contains sentences and labels
dataset = load_file(dataset_fname) 
# NOTE(review): this calls the same helper as all_labels below although the comment
# says "Labels AND sentences" - looks like a copy-paste slip; confirm the intended helper.
dataset_map = labels_from_model_output(dataset) # Labels AND sentences
all_sents = sentences_from_model_output(dataset) # Just sentences
all_labels = labels_from_model_output(dataset) # Just labels

# The rest: derive the label vocabulary and the numeric label ids, then display the label names
label_names = unique_labels(all_labels)
numeric_labels = labels2numeric(all_labels, label_names)
label_names

In [None]:
import time
import cupy

In [None]:
import spacy

# Ask spaCy to use the GPU when one is available; this has to happen before the
# pipeline is loaded.
spacy.prefer_gpu()

# Large Spanish pipeline (downloaded at the top of the notebook)
es_nlp = spacy.load("es_core_news_lg")

In [None]:
# Pre-trained checkpoints to compare
model_names = ['distiluse-base-multilingual-cased-v2', 'stsb-xlm-r-multilingual', 'paraphrase-xlm-r-multilingual-v1', 'quora-distilbert-multilingual']

# Held-out fractions used for the stratified train/test split
all_test_perc = [0.15, 0.2, 0.25, 0.3, 0.4]

# Training schedule: every model is tuned in rounds of `epochs_per_round` epochs
# and evaluated after each round, until `max_num_epochs` cumulative epochs.
train_batch_size = 16
max_num_epochs = 10
epochs_per_round = 2

# Label name -> integer class id; independent of the split, so it is built once
label2int = dict(zip(label_names, range(len(label_names))))

# Output setup: output["test_perc=<p>"][<model name>] -> list of
# {"num_epochs": ..., "avg_f1": ...} records, one per training round
output = {}

for test_perc in all_test_perc:
    split_key = f"test_perc={test_perc}"
    output[split_key] = {}
    X_train, X_test, y_train, y_test = train_test_split(all_sents, all_labels, test_size=test_perc, stratify=all_labels, random_state=69420)

    # Wrap the sentences of both partitions as single-text examples
    train_samples = [InputExample(texts=[sent], label=label2int[label]) for sent, label in zip(X_train, y_train)]
    # Dev set samples for the evaluator - still need to test whether this works
    dev_samples = [InputExample(texts=[sent], label=label2int[label]) for sent, label in zip(X_test, y_test)]

    for model_name in model_names:
        output[split_key][model_name] = []

        # Train set config
        model = SentenceTransformer(model_name)
        train_dataset = SentencesDataset(train_samples, model=model)
        train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)

        # Define the way the loss is computed
        classifier = SoftmaxClassifier(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=len(label2int))

        # Dev set config
        dev_dataset = SentencesDataset(dev_samples, model=model)
        dev_dataloader = DataLoader(dev_dataset, shuffle=True, batch_size=train_batch_size)
        dev_evaluator = LabelAccuracyEvaluator(dataloader=dev_dataloader, softmax_model=classifier, name='lae-dev')

        # 10% of the steps of one training round are used for warm-up. It is sized
        # from epochs_per_round rather than the cumulative epoch count, because every
        # fit() call below only runs epochs_per_round epochs; using the cumulative
        # count would make the warm-up swallow a growing share of each round.
        warmup_steps = math.ceil(len(train_dataset) * epochs_per_round / train_batch_size * 0.1)

        # num_epochs is the cumulative number of epochs the model has seen after this round
        for num_epochs in range(epochs_per_round, max_num_epochs + 1, epochs_per_round):
            print("Num epochs:", num_epochs)

            model_deets = f"model={model_name}_test-perc={test_perc}_n-epoch={num_epochs}"
            model_save_path = f"/content/drive/MyDrive/WRI-LatinAmerica-Talent/Modeling/FineTuning_{model_deets}"

            # Train the model: the same model object is tuned further on every
            # round, so we can see the performance gain of the extra epochs
            start = time.time()
            model.fit(train_objectives=[(train_dataloader, classifier)],
                      evaluator=dev_evaluator,
                      epochs=epochs_per_round,
                      evaluation_steps=1000,
                      warmup_steps=warmup_steps,
                      output_path=model_save_path
                      )

            end = time.time()
            hours, rem = divmod(end - start, 3600)
            minutes, seconds = divmod(rem, 60)
            print("Time taken for fine-tuning:", "{:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))

            ### Classify sentences
            # Projection matrix Z low-dim projection
            print("Classifying sentences...")
            proj_matrix = cupy.asnumpy(calc_proj_matrix(all_sents, 50, es_nlp, model, 0.01))
            all_sent_embs = encode_all_sents(all_sents, model, proj_matrix)
            all_label_embs = encode_labels(label_names, model, proj_matrix)
            visualize_embeddings_2D(np.vstack(all_sent_embs), all_labels, tsne_perplexity=50, store_name=f"{model_save_path}/{model_deets}")
            model_preds, model_scores = calc_all_cos_similarity(all_sent_embs, all_label_embs, label_names)

            ### Evaluate the model
            # NOTE(review): predictions are scored against numeric_labels for *all*
            # sentences, i.e. including the ones the model was just trained on -
            # confirm this is intended rather than scoring the held-out split only.
            numeric_preds = labels2numeric(model_preds, label_names)
            evaluator = ModelEvaluator(label_names, y_true=numeric_labels, y_pred=numeric_preds)

            # avg_f1 is stored as a plain list so that `output` stays JSON-serializable
            output[split_key][model_name].append({"num_epochs": num_epochs, "avg_f1": evaluator.avg_f1.tolist()})

            evaluator.plot_confusion_matrix(color_map='Blues', exp_name=f"{model_save_path}/{model_deets}")

In [None]:
# Display the top-level keys of the `output` results dict.
# NOTE(review): the experiment loop keys `output` by "test_perc=<value>", while the
# output recorded below this cell lists model names - it looks like it was produced
# by an earlier version of the loop; re-run to refresh.
output.keys()

dict_keys(['distiluse-base-multilingual-cased-v2', 'stsb-xlm-r-multilingual', 'paraphrase-xlm-r-multilingual-v1', 'quora-distilbert-multilingual'])

In [None]:
# Copy `output` into plain Python containers so it can be serialized with json.
new_json = {}

for key, per_model in output.items():
    new_json[key] = {}
    for subkey, records in per_model.items():
        new_json[key][subkey] = [
            {
                # The experiment loop already stores avg_f1 as a list, so calling
                # .tolist() on it directly raises AttributeError. np.asarray(...)
                # accepts ndarrays, lists and scalars alike, so this works whether
                # or not the value was converted beforehand.
                "avg_f1": np.asarray(element["avg_f1"]).tolist(),
                "num_epochs": element["num_epochs"],
            }
            for element in records
        ]

In [None]:
import json

# Persist the JSON-serializable results next to the fine-tuned models on Google Drive
with open("/content/drive/MyDrive/WRI-LatinAmerica-Talent/Modeling/FineTuningResults.json", "w") as results_file:
    json.dump(new_json, results_file)