# Zero-Shot Learning Experiments: Latent Embeddings

Using https://joeddav.github.io/blog/2020/05/29/ZSL.html#A-latent-embedding-approach

### 1. Load 5 countries' sentences

In [None]:
import pandas as pd
import sys

sys.path.append("../../")
sys.path.append("../../../")
from tasks.data_loader.src.utils import *

In [None]:
def country_labeled_sentences(excel_map):
    """Build a per-country map of labeled sentences from the policy-tag sheets.

    Args:
        excel_map: Mapping of country name to a DataFrame that has a
            "Sentence" and a "Primary Instrument" column.

    Returns:
        ``{country: {sent_num: {"text": str, "labels": [str]}}}`` where
        ``sent_num`` is a running counter shared by all countries, so the
        ids stay unique when the per-country dicts are merged.
    """
    result = {}
    sent_num = 0

    for country, dataframe in excel_map.items():
        # Drop incomplete rows *jointly*. Dropping NaNs from each column on
        # its own and zipping the leftovers pairs a sentence with the label
        # of a different row whenever a row misses only one of the two values.
        rows = dataframe[["Sentence", "Primary Instrument"]].dropna()

        result[country] = {}
        for raw_sent, raw_label in zip(rows["Sentence"], rows["Primary Instrument"]):
            text = raw_sent.replace("\n", "").strip()
            cleaned = raw_label.replace("(PES)", "").replace("(Bond)", "").strip()
            # Only the first instrument of a comma-separated list is kept.
            label = cleaned.split(", ")[0].strip()

            result[country][sent_num] = {"text": text, "labels": [label]}
            sent_num += 1

    return result

def sentences_from_model_output(model_preds):
    """Collect the "text" field of every entry in ``model_preds``, in dict order."""
    texts = []
    for entry in model_preds.values():
        texts.append(entry["text"])
    return texts

In [None]:
data_excel = pd.read_excel("../input/WRI_Policy_Tags.xlsx", engine="openpyxl", sheet_name=None)

In [None]:
all_labeled_sentences = country_labeled_sentences(data_excel)
label_names = ['Credit',
 'Direct payment',
 'Fine',
 'General incentive',
 'Guarantee',
 'Supplies',
 'Tax deduction',
 'Technical assistance',
 'Unknown']

In [None]:
mexico_sents = sentences_from_model_output(all_labeled_sentences['Mexico'])

In [None]:
mexico_labels = labels_from_model_output(all_labeled_sentences['Mexico'])

In [None]:
# Flatten the per-country maps into one {sent_num: {...}} dict. The keys do not
# collide because country_labeled_sentences numbers sentences with a single
# counter that keeps running across countries.
labeled_sents = dict()

for sents in all_labeled_sentences.values():
    labeled_sents.update(sents)

In [None]:
all_sents = sentences_from_model_output(labeled_sents)
all_labels = labels_from_model_output(labeled_sents)
label_names = list(set(all_labels))

In [None]:
all_sents[:2], all_labels[:2]

### 2. Write out latent embedding algorithm

#### Step 1. Take the top K most frequent words V in the vocabulary of a word2vec model

In [None]:
import spacy
from collections import Counter

In [None]:
es_nlp = spacy.load('es_core_news_md')

In [None]:
sents_as_str = ". ".join(mexico_sents)

In [None]:
doc = es_nlp(sents_as_str)

In [None]:
# lowercased tokens that are neither stop words nor punctuation and are longer than 3 characters
words = [token.text.lower() for token in doc if not token.is_stop and not token.is_punct and len(token.text) > 3]

In [None]:
# 20 most common tokens
word_freq = Counter(words)
common_words = word_freq.most_common(20)

In [None]:
top_20_words = list(list(zip(*common_words))[0])

In [None]:
top_20_words

#### Step 2. Obtain embeddings for each word using word2vec, $\Phi_{word}(V)$

In [None]:
# One (1, dim) row per word so the rows can be stacked into a matrix later.
# reshape(1, -1) takes the width from the vector itself instead of hard-coding
# 300, so the cell keeps working with a spaCy model of a different vector size.
word2vec_embeddings = [es_nlp(word).vector.reshape(1, -1) for word in top_20_words]

In [None]:
word2vec_embeddings[5].shape

#### Step 3. Obtain embeddings for each word using S-BERT, $\Phi_{sent}(V)$

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
model = SentenceTransformer('xlm-r-100langs-bert-base-nli-stsb-mean-tokens')

In [None]:
sbert_embeddings = []

for word in top_20_words:
    vector = model.encode([word], convert_to_numpy=True)
    sbert_embeddings.append(vector)

In [None]:
sbert_embeddings[5].shape

#### Step 4. Learn a least-squares linear projection matrix Z with L2 regularization from $\Phi_{sent}(V)$ to $\Phi_{word}(V)$

In [None]:
import numpy as np

In [None]:
stacked_sbert = np.vstack(sbert_embeddings)
stacked_word2vec = np.vstack(word2vec_embeddings)

In [None]:
stacked_sbert.shape, stacked_word2vec.shape

In [None]:
# Help from: https://stackoverflow.com/questions/27476933/numpy-linear-regression-with-regularization and https://www.kdnuggets.com/2016/11/linear-regression-least-squares-matrix-multiplication-concise-technical-overview.html
# Multiple Linear Regression with OLS parameter estimation with L2 regularization term
# Closed form computed below: Z = (X^T X + lamda * I)^-1 X^T Y, with X = stacked_sbert and Y = stacked_word2vec
lamda = 0.01  # lambda = 0 is equivalent to OLS estimation without regularization
Z = np.linalg.inv(stacked_sbert.T.dot(stacked_sbert) + lamda*np.eye(stacked_sbert.shape[1])).dot(stacked_sbert.T).dot(stacked_word2vec)

In [None]:
Z, Z.shape

#### Step 5. Use $Z$ in our classification as an additional transformation to S-BERT embeddings

$$ \hat{c} = \arg\max_{c \in C} \cos(\Phi_{sent}(x)Z, \Phi_{sent}(c)Z)$$

In [None]:
import torch
from torch.nn import functional as F

In [None]:
sentence = 'Quien sera el presidente en 2020?'
labels = ['negocios', 'cultura', 'politica']

In [None]:
# reshape(1, -1) derives the width from Z's output dimension instead of hard-coding 300.
sentence_rep = torch.from_numpy(np.matmul(model.encode(sentence), Z)).reshape(1, -1)
label_reps = torch.from_numpy(np.matmul(model.encode(labels), Z))

In [None]:
sentence_rep.shape, label_reps.shape

In [None]:
similarities = F.cosine_similarity(sentence_rep, label_reps)
closest = similarities.argsort(descending=True)
for ind in closest:
    print(f'label: {labels[ind]} \t similarity: {similarities[ind]}')

In [None]:
print(closest)

#### Step 6. Build functions for the process!

In [None]:
def top_k_words(k, document, spacy_model, include_labels=None):
    """Return the k most frequent content words of a document.

    Args:
        k: Number of most frequent words to keep.
        document: Text to analyse.
        spacy_model: Callable that turns ``document`` into an iterable of
            tokens exposing ``text``, ``is_stop`` and ``is_punct``.
        include_labels: Optional extra words appended after the top-k words.

    Returns:
        A list with the (lowercased) top-k words, most frequent first,
        followed by ``include_labels`` if given. Empty when no token passes
        the filter and no labels are supplied.
    """
    doc = spacy_model(document)

    # all tokens that aren't stop words or punctuation and are longer than 3 letters
    words = [
        token.text.lower()
        for token in doc
        if not token.is_stop and not token.is_punct and len(token.text) > 3
    ]

    # Unpacking the (word, count) pairs directly also works when there are no
    # words at all, where indexing zip(*pairs) would raise IndexError.
    result = [word for word, _ in Counter(words).most_common(k)]

    if include_labels:
        result.extend(include_labels)

    return result


def top_k_word_embeddings(top_k_words, spacy_model):
    """Embed every word with ``spacy_model`` and return one (1, dim) array per word."""

    def as_row(word):
        vec = spacy_model(word).vector
        return vec.reshape(1, vec.shape[0])

    return [as_row(word) for word in top_k_words]


def top_k_sbert_embeddings(top_k_words, sbert_model):
    """Encode each word on its own with ``sbert_model`` and return the results as a list."""
    return [
        sbert_model.encode([word], convert_to_numpy=True)
        for word in top_k_words
    ]


def least_squares_with_reg(X, y, lamda=0.01):
    """Solve an L2-regularized least-squares problem in closed form.

    Computes ``(X^T X + lamda * I)^-1 X^T y``. ``lamda = 0`` is equivalent to
    OLS estimation without regularization.

    Args:
        X: Array-like with one sample per row.
        y: Array-like of targets with as many rows as ``X``.
        lamda: Regularization strength.

    Returns:
        The projection matrix as a numpy array.
    """
    # Help from: https://stackoverflow.com/questions/27476933/numpy-linear-regression-with-regularization and https://www.kdnuggets.com/2016/11/linear-regression-least-squares-matrix-multiplication-concise-technical-overview.html
    X = np.asarray(X)
    y = np.asarray(y)
    gram = X.T.dot(X) + lamda * np.eye(X.shape[1])
    # Solving the linear system avoids forming the explicit inverse, which is
    # slower and numerically less accurate for the same result.
    return np.linalg.solve(gram, X.T.dot(y))


def calc_proj_matrix(sentences, k, spacy_model, sbert_model, lamda=0.01, include_labels=None):
    """Fit the projection matrix from S-BERT space to word-vector space.

    The sentences are joined into one document, its top-k words (plus
    ``include_labels``, if given) are embedded with both models, and the
    regularized least-squares map from the S-BERT embeddings to the spaCy
    embeddings is returned.
    """
    corpus = ". ".join(sentences)
    vocab = top_k_words(k, corpus, spacy_model, include_labels)

    # Stack the per-word embeddings row-wise, one matrix per embedding space.
    word_matrix = np.vstack(top_k_word_embeddings(vocab, spacy_model))
    sbert_matrix = np.vstack(top_k_sbert_embeddings(vocab, sbert_model))

    return least_squares_with_reg(sbert_matrix, word_matrix, lamda)

def encode_sentence(sentence, model, Z):
    """Encode one sentence, project it with ``Z`` and return it as a (1, dim) tensor."""
    projected = np.matmul(model.encode(sentence), Z)
    as_tensor = torch.from_numpy(projected)
    return as_tensor.reshape(1, as_tensor.shape[0])

def encode_labels(labels, model, Z):
    """Encode all labels with ``model``, project them with ``Z`` and return a tensor."""
    label_embeddings = model.encode(labels)
    projected = np.matmul(label_embeddings, Z)
    return torch.from_numpy(projected)

def classify_sentence(sentence, labels, model, Z):
    """Return the label most similar to ``sentence`` and its cosine similarity.

    Both the sentence and the labels are encoded with ``model`` and projected
    with ``Z`` before they are compared.
    """
    scores = F.cosine_similarity(
        encode_sentence(sentence, model, Z),
        encode_labels(labels, model, Z),
    )
    # Indices ordered from most to least similar; the first one is the winner.
    best = scores.argsort(descending=True)[0]
    return labels[best], scores[best]

def classify_sentence_given_label_reps(sentence, label_names, label_reps, model, Z):
    """Classify ``sentence`` against label representations that are already computed.

    Returns the name of the label with the highest cosine similarity together
    with that similarity.
    """
    scores = F.cosine_similarity(encode_sentence(sentence, model, Z), label_reps)
    # Indices ordered from most to least similar; the first one is the winner.
    best = scores.argsort(descending=True)[0]
    return label_names[best], scores[best]

def classify_all_sentences(all_sents, label_names, sbert_model, proj_matrix):
    """Classify every sentence and return ``(predicted labels, similarity scores)``.

    The label representations are computed once and reused for all sentences.
    """
    label_reps = encode_labels(label_names, sbert_model, proj_matrix)

    outcomes = [
        classify_sentence_given_label_reps(sent, label_names, label_reps, sbert_model, proj_matrix)
        for sent in tqdm(all_sents)
    ]

    model_preds = [pred for pred, _ in outcomes]
    model_scores = [score for _, score in outcomes]
    return model_preds, model_scores

#### Step 7. Time to play on our data!

In [None]:
from tqdm import tqdm
from tasks.evaluate_model.src.model_evaluator import *

In [None]:
sbert_model = SentenceTransformer('xlm-r-100langs-bert-base-nli-stsb-mean-tokens')

In [None]:
# classify_all_sentences encodes the label names once and reuses them, whereas
# calling classify_sentence per sentence re-encodes the same labels every time.
model_preds, model_scores = classify_all_sentences(mexico_sents, label_names, sbert_model, Z)

In [None]:
evaluator = ModelEvaluator(label_names)

In [None]:
num_multi_labels = labels_to_numeric(mexico_labels, label_names)
num_multi_preds = labels_to_numeric(model_preds, label_names)

In [None]:
num_multi_labels[:10], num_multi_preds[:10]

In [None]:
plot_data_distribution(num_multi_labels, label_names)

In [None]:
evaluator.evaluate(num_multi_labels, num_multi_preds, 
                   plot_cm=True, normalize=True, 
                   store=True, exp_name="multi_class_test")

In [None]:
evaluator.plot_precision_recall_curve(num_multi_labels, num_multi_preds, bin_class=False, all_classes=True, store=True, exp_name="multi_class_test")

## Further Experiments

### Things to play around with
- Labels
    - [x] Mix credit and guarantee together 
    - [x] Take out general incentive and unknown 
    - [ ] Replace unknown with something else?
- [x] Visualize embeddings 
- Embeddings
    - [x] Include the labels in the Z matrix process
    - [ ] Different values of lamda for projection matrix (for regularization)
    - [ ] Learn an additional least-squares projection matrix to the embeddings of any available labels from their corresponding data embeddings (as described in the *When some annotated data is available* section of the base article)
    - Top k words 
        - [ ] Different values of k for top k words
        - [ ] Use more words (a larger k), and use the stems of the words
- [ ] Model type for sentence embeddings
- [ ] Model type for word embeddings?
- [ ] Fine tuning sentence model

### Initial setup

In [None]:
# Setup - Get projection matrix and define model
sbert_model = SentenceTransformer('xlm-r-100langs-bert-base-nli-stsb-mean-tokens')
# Rebuild the full data set first, so the projection matrix is always fitted on
# all sentences even when this cell is re-run after a later experiment has
# replaced all_sents with a filtered subset.
all_sents = sentences_from_model_output(labeled_sents)
all_labels = labels_from_model_output(labeled_sents)
label_names = list(set(all_labels))
proj_matrix = calc_proj_matrix(all_sents, 50, es_nlp, sbert_model, 0.1)

### Experiment 1. Merge Credit and Guarantee

In [None]:
def merge_labels(all_labels, labels_to_merge):
    """Replace every label listed in ``labels_to_merge`` by one combined label.

    Args:
        all_labels: Iterable of label names.
        labels_to_merge: Label names to fuse; any number of labels is
            supported. The combined name joins them with " & ".

    Returns:
        A new list in which each label from ``labels_to_merge`` is replaced
        by the combined name and all other labels are unchanged.
    """
    # Joining all names (instead of only the first two) keeps the combined
    # label accurate when more than two labels are merged.
    merged_name = " & ".join(labels_to_merge)
    return [merged_name if label in labels_to_merge else label for label in all_labels]

In [None]:
all_labels = merge_labels(all_labels, ["Credit", "Guarantee"]) 
label_names = list(set(all_labels))
num_labels = labels_to_numeric(all_labels, label_names)
plot_data_distribution(num_labels, label_names)

In [None]:
model_preds, model_scores = classify_all_sentences(all_sents, label_names, sbert_model, proj_matrix)
num_preds = labels_to_numeric(model_preds, label_names)

In [None]:
evaluator = ModelEvaluator(label_names)

In [None]:
evaluator.evaluate(num_labels, num_preds, 
                   plot_cm=True, normalize=True, 
                   store=True, exp_name="latent_emb_exp1")

In [None]:
# NOTE(review): this passes the string predictions (model_preds), while the first
# evaluation in this notebook passes the numeric ones (num_multi_preds) — confirm
# which form plot_precision_recall_curve expects.
evaluator.plot_precision_recall_curve(num_labels, model_preds, bin_class=False, all_classes=True, store=True, exp_name="latent_emb_exp1")

### Experiment 2. Get rid of Unknown and General incentive, and merge Credit and Guarantee

In [None]:
filtered_sents_maps = [sent for sent in labeled_sents.values() if sent['labels'][0] not in ["General incentive", "Unknown"]]
all_sents = [sent['text'] for sent in filtered_sents_maps]
all_labels = [sent['labels'][0] for sent in filtered_sents_maps]
all_labels = merge_labels(all_labels, ["Credit", "Guarantee"]) 
label_names = list(set(all_labels))
label_names

In [None]:
num_labels = labels_to_numeric(all_labels, label_names)
plot_data_distribution(num_labels, label_names)

In [None]:
model_preds, model_scores = classify_all_sentences(all_sents, label_names, sbert_model, proj_matrix)
num_preds = labels_to_numeric(model_preds, label_names)

In [None]:
evaluator = ModelEvaluator(label_names)

In [None]:
evaluator.evaluate(num_labels, num_preds, 
                   plot_cm=True, normalize=True, 
                   store=True, exp_name="latent_emb_exp2")

In [None]:
# NOTE(review): this passes the string predictions (model_preds), while the first
# evaluation in this notebook passes the numeric ones (num_multi_preds) — confirm
# which form plot_precision_recall_curve expects.
evaluator.plot_precision_recall_curve(num_labels, model_preds, bin_class=False, all_classes=True, store=True, exp_name="latent_emb_exp2")

### Experiment 3. Visualizing data (Setup from experiment 2.)

In [None]:
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import scprep

In [None]:
sbert_model = SentenceTransformer('xlm-r-100langs-bert-base-nli-stsb-mean-tokens')
proj_matrix = calc_proj_matrix(all_sents, 50, es_nlp, sbert_model, 0.01)
all_sent_embs = np.vstack([encode_sentence(sent, sbert_model, proj_matrix) for sent in tqdm(all_sents)])

In [None]:
numeric_labels = labels_to_numeric(all_labels, label_names)
df = pd.DataFrame()
df["y"] = np.array(numeric_labels)
all_sent_embs.shape, len(df["y"])

In [None]:
def visualize_embeddings_2D(embs, numeric_labels, tsne_perplexity, pca_k_n_comps=None, seed=69420):
    """Plot 2-D PCA and t-SNE projections of the embeddings side by side.

    Args:
        embs: Embeddings handed to ``fit_transform`` of PCA and t-SNE.
        numeric_labels: One label per embedding; used to colour the points.
        tsne_perplexity: Perplexity used for every t-SNE run.
        pca_k_n_comps: If given, a third panel shows t-SNE applied to the
            embeddings after reducing them to this many PCA components.
        seed: Random state shared by PCA and t-SNE.
    """
    df = pd.DataFrame()
    df["y"] = np.array(numeric_labels)

    # Data for plot 1
    pca = PCA(n_components=2, random_state=seed)
    pca_result = pca.fit_transform(embs)
    df['pca-1'] = pca_result[:,0]
    df['pca-2'] = pca_result[:,1]

    # Data for plot 2
    tsne = TSNE(n_components=2, verbose=1, perplexity=tsne_perplexity, n_iter=1000, random_state=seed)
    tsne_results = tsne.fit_transform(embs)
    df["tsne-1"] = tsne_results[:,0]
    df["tsne-2"] = tsne_results[:,1]

    # Actual plotting
    plt.figure(figsize=(24, 4))
    ax1 = plt.subplot(1, 3, 1)
    sns.scatterplot(
        x="pca-1", y="pca-2",
        hue=df.y.tolist(),
        palette="bright",
        data=df,
        legend=False,
        ax=ax1
    ).set(title="PCA projection")

    ax2 = plt.subplot(1, 3, 2)
    sns.scatterplot(
        x="tsne-1", y="tsne-2",
        hue=df.y.tolist(),
        palette="bright",
        data=df,
        # The legend is drawn on the last panel only.
        legend=False if pca_k_n_comps else "auto",
        ax=ax2
    ).set(title="t-SNE projection")

    if pca_k_n_comps:
        # Data for plot 3
        pca_k = PCA(n_components=pca_k_n_comps, random_state=seed)
        pca_k_result = pca_k.fit_transform(embs)
        # Two components, because only the first two t-SNE columns are plotted.
        # (The undefined name `tsne_n_comps` used here before raised NameError.)
        tsne = TSNE(n_components=2, verbose=1, perplexity=tsne_perplexity, n_iter=1000, random_state=seed)
        tsne_pca_results = tsne.fit_transform(pca_k_result)
        df[f"tsne-pca-{pca_k_n_comps}-1"] = tsne_pca_results[:,0]
        df[f"tsne-pca-{pca_k_n_comps}-2"] = tsne_pca_results[:,1]

        # Actual plotting
        ax3 = plt.subplot(1, 3, 3)
        sns.scatterplot(
            x=f"tsne-pca-{pca_k_n_comps}-1", y=f"tsne-pca-{pca_k_n_comps}-2",
            hue=df.y.tolist(),
            palette="bright",
            data=df,
            ax=ax3
        ).set(title="t-SNE on PCA projection")

    # Move the legend of the current (last drawn) axes outside the plot area.
    plt.legend(bbox_to_anchor=(1.01, 1),borderaxespad=0)

def visualize_PCA_embeddings_3D(embs, labels, n_comps, fname=None, seed=69420):
    """Show a rotating 3-D scatter plot of the first three PCA components.

    Args:
        embs: Embeddings handed to ``PCA.fit_transform``.
        labels: One label per embedding; used to colour the points.
        n_comps: Number of PCA components to fit; must be at least 3.
        fname: Passed to ``scprep.plot.rotate_scatter3d`` as ``filename``.
        seed: Random state for PCA.

    Returns:
        Whatever ``scprep.plot.rotate_scatter3d`` returns, or ``None`` when
        ``n_comps`` is smaller than 3.
    """
    if n_comps < 3:
        print("The number of PCA components has to be at least 3!")
        return None

    pca_result = PCA(n_components=n_comps, random_state=seed).fit_transform(embs)
    # Only the first three components can be drawn.
    data = pca_result[:, :3]
    colors = np.array(labels)

    return scprep.plot.rotate_scatter3d(data, c=colors, figsize=(10,8), title=f"PCA {n_comps} components", legend_anchor=(1.01, 1), filename=fname)

def visualize_tSNE_embeddings_3D(embs, labels, n_comps=3, tsne_perplexity=50, fname=None, seed=69420):
    """Show a rotating 3-D scatter plot of a t-SNE projection.

    Args:
        embs: Embeddings handed to ``TSNE.fit_transform``.
        labels: One label per embedding; used to colour the points.
        n_comps: Number of t-SNE components; must be at least 3.
        tsne_perplexity: Perplexity for t-SNE.
        fname: Passed to ``scprep.plot.rotate_scatter3d`` as ``filename``.
        seed: Random state for t-SNE.

    Returns:
        Whatever ``scprep.plot.rotate_scatter3d`` returns, or ``None`` when
        ``n_comps`` is smaller than 3.
    """
    # Same guard as the PCA variant: with fewer than three components the
    # column access below would fail, and only after the t-SNE run has finished.
    if n_comps < 3:
        print("The number of t-SNE components has to be at least 3!")
        return None

    tsne = TSNE(n_components=n_comps, verbose=1, perplexity=tsne_perplexity, n_iter=1000, random_state=seed)
    tsne_result = tsne.fit_transform(embs)
    # Only the first three components can be drawn.
    data = tsne_result[:, :3]
    colors = np.array(labels)

    return scprep.plot.rotate_scatter3d(data, c=colors, figsize=(10,8), title=f"t-SNE {tsne_perplexity} perplexity", legend_anchor=(1.01, 1), filename=fname)

In [None]:
visualize_embeddings_2D(all_sent_embs, all_labels, pca_k_n_comps=50, tsne_perplexity=40)

In [None]:
visualize_PCA_embeddings_3D(all_sent_embs, all_labels, 50, "PCA_50_components.gif")

In [None]:
visualize_tSNE_embeddings_3D(all_sent_embs, all_labels, 3, tsne_perplexity=50, fname="tSNE_50_perplexity.gif")

In [None]:
import phate

In [None]:
phate_operator = phate.PHATE(knn=4, decay=15, t=12)#(k=2, t=5000, n_pca=50, random_state=69420, knn_dist='cosine')
tree_phate = phate_operator.fit_transform(all_sent_embs)
phate.plot.scatter2d(phate_operator, c=all_labels, legend_anchor=(1.01, 1))
phate.plot.rotate_scatter3d(phate_operator, c=all_labels, legend_anchor=(1.01, 1), filename="phate_knn=4_decay=15_t=12.gif")

### Experiment 4. Include the labels in the Z matrix process (Setup from experiment 2.)

In [None]:
proj_matrix = calc_proj_matrix(all_sents, 50, es_nlp, sbert_model, 0.1, include_labels=label_names)

In [None]:
model_preds, model_scores = classify_all_sentences(all_sents, label_names, sbert_model, proj_matrix)
num_preds = labels_to_numeric(model_preds, label_names)

In [None]:
evaluator = ModelEvaluator(label_names)

In [None]:
evaluator.evaluate(num_labels, num_preds, 
                   plot_cm=True, normalize=True, 
                   store=True, exp_name="latent_emb_exp3")

In [None]:
# NOTE(review): this passes the string predictions (model_preds), while the first
# evaluation in this notebook passes the numeric ones (num_multi_preds) — confirm
# which form plot_precision_recall_curve expects.
evaluator.plot_precision_recall_curve(num_labels, model_preds, bin_class=False, all_classes=True, store=True, exp_name="latent_emb_exp3")

### Experiment 5.  Fine tune sentence embedding model (Setup from experiment 2.)
The FineTuning folder that contains the fine-tuned model is located on Google Drive under the folder WRI-LatinAmerica-Talent/Modeling/FineTuning. To execute the following code you should download this folder to tasks/augment_data/output/FineTuning.

In [None]:
# Load the saved model and obtain random sentence embedding
model_save_path = "../output/FineTuning"
load_model = SentenceTransformer(model_save_path)

In [None]:
# Simple low-dim projection
all_sent_embs = np.vstack([load_model.encode(sent) for sent in all_sents])
# The 2-D plotting helper defined in this notebook is visualize_embeddings_2D;
# no function named visualize_embeddings is defined here.
visualize_embeddings_2D(all_sent_embs, all_labels, tsne_perplexity=40)

In [None]:
# Projection matrix Z low-dim projection
proj_matrix = calc_proj_matrix(all_sents, 50, es_nlp, load_model, 0.01)
all_sent_embs = np.vstack([encode_sentence(sent, load_model, proj_matrix) for sent in all_sents])
# The 2-D plotting helper defined in this notebook is visualize_embeddings_2D;
# no function named visualize_embeddings is defined here.
visualize_embeddings_2D(all_sent_embs, all_labels, tsne_perplexity=40)

In [None]:
num_labels = labels_to_numeric(all_labels, label_names)
plot_data_distribution(num_labels, label_names)

In [None]:
# Classify all sentences
model_preds, model_scores = classify_all_sentences(all_sents, label_names, load_model, proj_matrix)
num_preds = labels_to_numeric(model_preds, label_names)

In [None]:
evaluator = ModelEvaluator(label_names)

In [None]:
evaluator.evaluate(num_labels, num_preds, 
                   plot_cm=True, normalize=True, 
                   store=True, exp_name="latent_emb_exp4")

In [None]:
# NOTE(review): this passes the string predictions (model_preds), while the first
# evaluation in this notebook passes the numeric ones (num_multi_preds) — confirm
# which form plot_precision_recall_curve expects.
evaluator.plot_precision_recall_curve(num_labels, model_preds, bin_class=False, all_classes=True, store=True, exp_name="latent_emb_exp4")