In [None]:
# Here, we generate idiom embeddings using the trained PIER+ model.

In [1]:
import os
import json
from datetime import datetime
from tqdm import trange
import torch.nn as nn
from sklearn.metrics.pairwise import cosine_distances


from src.utils.data_util import  DataHandlerCLS
from src.train_valid_test_step import *
from config import Config as config
from src.model.bart_adapters import BartAdapterCombined
from src.model.bert_adapters import  BertAdapter
from src.utils.model_util import *


## Load PIER+ model

In [3]:
# Build the data handler; its `config` attribute is passed to the model constructor below.
data_handler = DataHandlerCLS()

# Instantiate the combined BART adapter model, move it to the configured device,
# and switch it to evaluation mode.
# NOTE(review): presumably the constructor restores the trained adapter and fusion
# checkpoints (the cell output reports them being loaded) — confirm the paths in config.
model = BartAdapterCombined(data_handler.config)
model.to(config.DEVICE)
model.eval()



Number of idioms: 1521
Training dataset size: 32693
Validation dataset size: 4102
Testing dataset size: 4080
Load base model from facebook/bart-base
=> Initializing Adapters with Fusion Module...

[COMPOSITIONAL MODULE]: 
==> Using BART output as compositional embedding!

[NON-COMPOSITIONAL MODULE]: 
==> Loading Adapter from /home/zzeng/workspace/UIUC_research/RepresentationLearning/models/PIER/checkpoints/bart-adapters_non-compositional_magpie_random-GIEA/best/
==> Non-compositional adapter loaded!

[COMBINED MODULE]: 
==> Adding fusion module!
Fuse[compositional, non-compositional]
==> Loading Adapter Fusion from /home/zzeng/workspace/UIUC_research/RepresentationLearning/models/PIER/checkpoints/bart-adapters_fusion_magpie_random-PIER/best/
==>  Adapter Fusion Loaded!


BartAdapterCombined(
  (model): BartForConditionalGeneration(
    (model): BartModel(
      (shared): Embedding(50265, 768, padding_idx=1)
      (encoder): BartEncoder(
        (invertible_adapters): ModuleDict()
        (embed_tokens): Embedding(50265, 768, padding_idx=1)
        (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
        (layers): ModuleList(
          (0): BartEncoderLayer(
            (self_attn): BartAttention(
              (k_proj): Linear(in_features=768, out_features=768, bias=True)
              (v_proj): Linear(in_features=768, out_features=768, bias=True)
              (q_proj): Linear(in_features=768, out_features=768, bias=True)
              (out_proj): Linear(in_features=768, out_features=768, bias=True)
            )
            (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=

## Generate IE embeddings from the PIER+ model

In [4]:
# Define a masked mean-pooling helper and the idiom -> embeddings accumulator
def mean_pooling(token_embeddings, attention_mask):
    """Average token embeddings over the positions selected by a mask.

    Args:
        token_embeddings: Tensor of shape [batch_size, max_seq_len, hidden_dim].
        attention_mask: Tensor whose trailing-unsqueezed form can be expanded to
            the shape of ``token_embeddings``; its values act as per-token
            weights (0 drops a token).

    Returns:
        Tensor with the sequence dimension reduced: the mask-weighted sum of
        the token embeddings divided by the mask total.
    """
    weights = attention_mask.unsqueeze(-1).expand_as(token_embeddings).float()
    weighted_sum = (token_embeddings * weights).sum(1)
    # Clamp the denominator so an all-zero mask yields zeros instead of NaN.
    mask_total = torch.clamp(weights.sum(1), min=1e-9)
    return weighted_sum / mask_total

# Accumulator mapping each idiom to the list of embeddings collected for it.
idioms2embed = {}


In [5]:
# Put the model in evaluation mode before extracting embeddings.
model.eval()

# Number of instances with label 1 whose embedding was collected.
count = 0
progress_bar = tqdm(
    enumerate(data_handler.validset_generator),
    total=data_handler.config.num_batch_valid,
    ncols=100,
    leave=False,
)

# Iterates over the validation generator of the data handler.
for idx, data in progress_bar:
    # Forward pass without gradient tracking; only the hidden states are used.
    with torch.no_grad():
        outputs = model.model(**data['inputs'])

    # Pool the last hidden states with the phrase masks using the model's own
    # mean_pooling, then move the result to a NumPy array on the CPU.
    embeds = model.mean_pooling(outputs.last_hidden_state, data['phrase_masks'])
    embeds = embeds.detach().cpu().numpy()

    idioms = data['idioms']
    labels = data['labels']
    for i, idiom in enumerate(idioms):
        # Keep only instances labelled 1.
        # NOTE(review): presumably 1 marks idiomatic usage — confirm against the data handler.
        if not (labels[i] == 1):
            continue
        count += 1
        idioms2embed.setdefault(idiom, []).append(embeds[i])

                                                                                                    

In [6]:
# Average the per-occurrence embeddings of each idiom into a single vector.
# np.ndim(...) > 1 holds only for a stack of vectors, so re-running this cell
# leaves already-averaged entries untouched instead of reducing them to a scalar.
idioms2embed = {
    idiom: np.mean(vectors, 0) if np.ndim(vectors) > 1 else vectors
    for idiom, vectors in idioms2embed.items()
}
idioms = list(idioms2embed)
print("Number of idioms with embeddings:", len(idioms))

Number of idioms with embeddings: 927


In [7]:
# Build the pairwise cosine-distance matrix over all idiom embeddings and, for
# every idiom, record its k closest idioms as [idiom, distance] pairs in
# ascending order of distance.
embed_matrix = np.array([idioms2embed[name] for name in idioms])
embed_distances = cosine_distances(embed_matrix, embed_matrix)
# Each row lists the column indices of that row's distances, nearest first.
embed_distances_argsort = np.argsort(embed_distances, axis=-1)
k = 20
idioms_to_k_similar_idioms = {
    idiom: [
        [idioms[j], embed_distances[row][j]]
        for j in embed_distances_argsort[row][:k].tolist()
    ]
    for row, idiom in enumerate(idioms)
}


## Show Example Similar IEs in the embedding space

In [8]:
# Peek at the first five idioms that received an embedding.
idioms[:5]

['come to think of it',
 'in light of',
 'get stuck in',
 'keep a straight face',
 'work to rule']

In [9]:
# Nearest neighbours of "streets ahead", sorted by increasing cosine distance.
idioms_to_k_similar_idioms["streets ahead"]

[['streets ahead', 0.0],
 ['within an ace of', 0.2657256],
 ['thin as a rake', 0.30747408],
 ['by a long chalk', 0.3227167],
 ['writ large', 0.3315162],
 ['out of this world', 0.3358181],
 ['hands down', 0.33906674],
 ['see eye to eye', 0.34751314],
 ['sure as eggs is eggs', 0.35784304],
 ['plain as a pikestaff', 0.35981655],
 ['for my money', 0.36552393],
 ['to die for', 0.36816168],
 ['to a T', 0.3721143],
 ['safe as houses', 0.37966752],
 ['like a bat out of hell', 0.38828826],
 ['piping hot', 0.38970816],
 ['tough as old boots', 0.3901841],
 ['far and away', 0.39553428],
 ['to all intents and purposes', 0.39599103],
 ['par for the course', 0.40517056]]

In [10]:
# Nearest neighbours of "in the final analysis", sorted by increasing cosine distance.
idioms_to_k_similar_idioms["in the final analysis"]

[['in the final analysis', 0.0],
 ['at the end of the day', 0.28763485],
 ['in light of', 0.3319497],
 ['all things being equal', 0.3830461],
 ['moment of truth', 0.41219515],
 ['the die is cast', 0.42674035],
 ['fall into place', 0.43464047],
 ['at a pinch', 0.4614349],
 ['true to form', 0.46339655],
 ['make the grade', 0.47711295],
 ['all over bar the shouting', 0.47885817],
 ['in a nutshell', 0.48183256],
 ['come out in the wash', 0.483594],
 ['without fail', 0.48694438],
 ['for my money', 0.49782443],
 ['up to scratch', 0.50782543],
 ['cut and dried', 0.508388],
 ['run the gamut', 0.50872135],
 ['bar none', 0.5088779],
 ['when all is said and done', 0.51531637]]

In [11]:
# Nearest neighbours of "see red", sorted by increasing cosine distance.
idioms_to_k_similar_idioms["see red"]

[['see red', 0.0],
 ['go spare', 0.11840683],
 ['in stitches', 0.19273853],
 ['pissed off', 0.20207787],
 ['hot and bothered', 0.21367598],
 ['drive someone up the wall', 0.24889445],
 ['on the warpath', 0.2571019],
 ['high as a kite', 0.2678098],
 ["knock someone's block off", 0.27082056],
 ["do someone's head in", 0.27665257],
 ["get under someone's skin", 0.28626728],
 ["get someone's goat", 0.2875234],
 ['eat your heart out', 0.30015606],
 ['down in the dumps', 0.3022977],
 ['laugh like a drain', 0.30811393],
 ['touch a nerve', 0.3123538],
 ['scream blue murder', 0.31640375],
 ['spit the dummy', 0.3178118],
 ["get up someone's nose", 0.31822938],
 ['bang to rights', 0.32358658]]

## Save Idiom Embeddings for implicit evaluation

In [12]:

# Convert each embedding to a plain Python list so the mapping is JSON-serializable.
# Values without a `tolist` method (e.g. already converted on a previous run of
# this cell) are kept as they are instead of raising AttributeError.
idioms2embed = {
    idiom: vector.tolist() if hasattr(vector, "tolist") else vector
    for idiom, vector in idioms2embed.items()
}

In [13]:

def write_json_file(path, data):
    """Serialize ``data`` as JSON to ``path``.

    Missing parent directories are created first, so writing into a fresh
    output folder does not fail with ``FileNotFoundError``.

    Args:
        path: Destination file path; an existing file is overwritten.
        data: Any object that ``json.dump`` can serialize.

    Returns:
        None.
    """
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(data, f)

In [14]:
# Persist the idiom -> embedding mapping as JSON.
# NOTE(review): the file name says "PIER" while this notebook describes the model as PIER+ — confirm the intended name.
write_json_file('./generated_embeddings/idiom2embed-PIER.json', idioms2embed)