# IEEE Coreference Resolution Task
## SetSimilaritySearch + BERT-base embedding semantic similarity

##Mount Google Drive

In [None]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


##Get Entities from NER task

In [None]:
import numpy as np
import json
import pandas as pd

In [None]:
def get_json_list(multiclass_file, software = 'doccano'):
    """Load a JSON Lines annotation export into a list of records.

    Args:
        multiclass_file: path to a file holding one JSON object per line.
        software: ``'doccano'`` (default) returns the records as parsed;
            ``'prodigy'`` first passes them through ``prodigy_to_doccano``.

    Returns:
        The parsed records, in file order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    annotations_json = []
    with open(multiclass_file, 'r') as json_file:
        for line in json_file:
            # A blank line (typically a trailing newline at end of file) is
            # not valid JSON and would make json.loads raise, so skip it.
            if line.strip():
                annotations_json.append(json.loads(line))

    if software == 'prodigy':
        # NOTE(review): prodigy_to_doccano is not defined in the visible part
        # of this notebook; it must be defined before using this branch.
        annotations_json = prodigy_to_doccano(annotations_json)

    return annotations_json

In [None]:
def multiclass_to_multilabel(abstract, num_classes=4):
    """Attach a one-hot encoding of the class label to every annotation.

    Args:
        abstract: record with an ``'annotations'`` list; every annotation
            carries an integer ``'label'``.
        num_classes: length of the one-hot vector (defaults to 4).

    Returns:
        The same ``abstract`` object. It is modified in place: each annotation
        gains a ``'label_one_hot'`` list with a single 1 at index ``label``.

    Raises:
        IndexError: if a label does not fit in ``num_classes`` slots.
    """
    for annotation in abstract['annotations']:
        # Build a fresh list per annotation so the vectors are never shared.
        one_hot = [0] * num_classes
        one_hot[annotation['label']] = 1
        annotation['label_one_hot'] = one_hot

    return abstract

In [None]:
multiclass_file = '/content/drive/MyDrive/ieee_ner_coref/assets/data/annotations/yx_converted_abs_combine_114.jsonl' # Input from Doccano/Prodigy

In [None]:
# Set software to 'doccano' or 'prodigy'
annotations_json = get_json_list(multiclass_file, software = 'doccano')

In [None]:
# Enrich every record in place: add one-hot labels, copy the paper id to the
# top level, and trim the registered-trademark sign from the text.
for abstract in annotations_json:
    abstract = multiclass_to_multilabel(abstract)
    abstract['id'] = abstract['meta']['paperid']
    # str.strip only removes '®' at the very start/end of the text; any
    # occurrence inside the text is left untouched.
    # NOTE(review): removing a leading '®' would shift the text relative to
    # the annotation start/end offsets used later -- confirm this is intended.
    abstract['text'] = abstract['text'].strip('®')

##Define related classes

In [None]:
from enum import Enum
class EntityClass(Enum):
    '''Represents the class of an entity, e.g. "Method"'''
    # The values mirror the integer `label` of an annotation: the
    # entity-building loop in this notebook maps labels 1/2/3 to
    # ORG/METHOD/PRODUCT.
    NONE = 0  # presumably "not an entity"; never assigned in the visible cells
    ORG = 1
    METHOD = 2
    PRODUCT = 3

class Entity:
    '''An entity that has been identified as part of a document

    Attributes:
        parent_doc_id: an identifier for the document this entity exists within
        start: offset of the first character of this entity
          in the parent doc
        text: text from the parent document with this entity
        klass: class of this entity, e.g. "Method"
    '''
    def __init__(self, id: int, start: int, text: str, klass: "EntityClass"):
        # `id` shadows the builtin, but it is kept because it is part of the
        # existing constructor signature.
        self.parent_doc_id = id
        self.start = start
        self.text = text
        self.klass = klass

    def __str__(self):
        return self.text

    def __repr__(self):
        # Makes printed clusters (lists of entities) readable instead of
        # showing the default `<Entity object at 0x...>`.
        return (
            f"Entity(parent_doc_id={self.parent_doc_id!r}, start={self.start!r}, "
            f"text={self.text!r}, klass={self.klass!r})"
        )

    @property
    def end(self):
        '''Exclusive end offset: one past the last character of this entity
        in the parent doc, so that ``parent_text[start:end] == text``'''
        return self.start + len(self.text)

    @property
    def location(self):
        '''The ``(start, end)`` span of this entity; ``end`` is exclusive'''
        return (self.start, self.end)

##Construct dataframe

In [None]:
# Print the surface text of every annotated span
for abstract in annotations_json:
  for annotation in abstract['annotations']:
    if not (sum(np.asarray(annotation['label_one_hot'])) > 0):
      continue
    text = abstract['text']
    start, end = annotation['start_offset'], annotation['end_offset']
    print(text[start:end])

simultaneous backward geocoding
airborne InSAR
multi-aspect SAR
Voting Logic Fusion
coarse to fine
Haar-like
HOG
CAVIAR
full-body detection (FBD)
head-shoulder detection (HSD)
FBD
HSD
MIT, INRIA
bsp-tree
bsp-tree
BP neural network
BP neural network
BP neural networks
hold-out
10-fold cross validation
SIFT
surveillance context (scale invariant image transform (SIFT) keypoints
 geometric primitive features
Latent Semantic Indexing
Expectation Maximization
Normalized Score (ENS)
latent semantic indexing
hyperspectral band grouping
hyperspectral analysis
spatial adaptivity
bilateral filtering
SIFT (scale invariant feature transform)
SIFT
SIFT
K-means
K-means
K-means
K-means
phase correlation
Bayesian learning
sparse Bayesian learning (SBL)
SBL
Bayesian-MCMC
Bayesian theorem
reversible jump MCMC
full orthostereoscopic image capture and projection
conceptual graph formalism (CGF)
OpenNLP
CGF
conceptual graph (CG)
CGs
CG
CG
CG
VerbNet
WordNet
Signal Detection Theory
Sensor fusion
Transferable

In [None]:
entities = []

In [None]:
# Integer annotation label -> EntityClass. Label 0 (NONE) is deliberately
# absent so that such annotations are skipped.
LABEL_TO_CLASS = {
    1: EntityClass.ORG,
    2: EntityClass.METHOD,
    3: EntityClass.PRODUCT,
}

for abstract in annotations_json:
  text = abstract['text']
  doc_id = abstract['id']
  for annotation in abstract['annotations']:
    if not (sum(np.asarray(annotation['label_one_hot'])) > 0):
      continue
    # The one-hot vector always contains a 1 (even for label 0), so the check
    # above does not filter anything. Without this lookup a label outside 1-3
    # would either raise NameError or silently re-append the previous entity.
    klass = LABEL_TO_CLASS.get(annotation['label'])
    if klass is None:
      continue
    start = annotation['start_offset']
    end = annotation['end_offset']
    item = Entity(doc_id, start, text[start:end], klass)
    print(item)
    entities.append(item)

In [None]:
np.save("/content/entities.npy", entities)

##SetSimilaritySearch Clustering

In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.9.2-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 4.1 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 54.6 MB/s 
[?25hCollecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 58.2 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 19.2 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninsta

In [None]:
from transformers import (
    BertTokenizer, BertTokenizerFast, BatchEncoding,
    DataCollatorForTokenClassification, BertForTokenClassification,
    Trainer, TrainingArguments
)

In [None]:
###NER model 
class BertForTokenClassificationML(BertForTokenClassification):
    """Multi-label variant of ``BertForTokenClassification``.

    ``forward`` scores every token against every label independently and
    computes the loss with ``BCEWithLogitsLoss``.
    """

    def set_label_weights(self, weights):
        """Store ``weights`` on the model as ``self.label_weights``.

        NOTE(review): ``forward`` never reads ``label_weights``, so the weights
        currently have no effect on the loss -- confirm whether they were meant
        to be passed to the loss function.
        """
        self.label_weights = weights

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length, num_labels)`, `optional`):
            Multi-hot targets for the token classification loss. They are cast
            to float before being passed to ``BCEWithLogitsLoss``.

        Returns a ``TokenClassifierOutput`` (or a plain tuple when
        ``return_dict`` is false) holding the loss, if labels were given, and
        the per-token logits.
        """
        # Imported locally so the method works regardless of which notebook
        # cells have already run. TokenClassifierOutput was previously never
        # imported, which made the default return path raise NameError.
        import torch
        from transformers.modeling_outputs import TokenClassifierOutput

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = self.dropout(outputs[0])
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = torch.nn.BCEWithLogitsLoss()
            # One row per token, one column per label, for logits and targets.
            flat_logits = logits.view(-1, self.num_labels)
            flat_labels = labels.reshape(-1, self.num_labels).float()

            if attention_mask is not None:
                # Only keep active (non-padding) tokens in the loss
                active = (attention_mask == 1).view(-1)
                loss = loss_fct(flat_logits[active], flat_labels[active])
            else:
                # Targets must have the same (tokens, num_labels) shape and a
                # float dtype, exactly as in the masked branch.
                loss = loss_fct(flat_logits, flat_labels)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

In [None]:
import torch
##Load fine-tuned model from NER task
# The checkpoint is used below as a full model object (`model.bert...`), so
# torch.load has to unpickle it; presumably that requires the class it was
# saved from to be defined already -- confirm.
# SECURITY: unpickling can execute arbitrary code; only load checkpoints that
# come from a trusted source.
PATH = "/content/drive/MyDrive/model/bert_finetune.pt"
# map_location keeps all tensors on the CPU, whatever device they were saved from.
model = torch.load(PATH,map_location=torch.device('cpu'))

In [None]:
from src.ieee_ner_coref import EntityClusterer
from src.models import Entity
from src.models import EntityClass

# entities.npy is the binary file written with np.save earlier in this
# notebook, so it is read with np.load only; reading it line by line in text
# mode can fail to decode and contributes nothing to `data`.
file = "/content/entities.npy"
# SECURITY: allow_pickle=True unpickles Python objects; only load trusted files.
data = np.load(file,allow_pickle=True)
data = data.tolist()
###Get the results from SetSimilaritySearch Clustering
entity_groups = EntityClusterer(method='best').cluster(data)

### Find unclustered entities

In [None]:
# An entity counts as clustered when its text is contained in (substring match,
# not equality) the text of any member of any cluster.
un_clustered_l = []
for candidate in data:
  is_clustered = any(
      candidate.text in member.text
      for group in entity_groups
      for member in group
  )
  if not is_clustered:
    un_clustered_l.append(candidate)

###Save text for unclustered entities

In [None]:
# Keep only the surface text of each unclustered entity
un_cluster = [entity.text for entity in un_clustered_l]

### Get all the entities which are clustered

In [None]:
# Flatten all clusters into one list of member texts (cluster order preserved)
cluster_entities = [member.text for group in entity_groups for member in group]

In [None]:
##Directly load saving lists
# Shortcut: restore previously saved lists. Running this cell overwrites the
# `cluster_entities` and `un_cluster` values computed in the cells above.
# NOTE(review): no visible cell writes cl_entities.npy or un_entities.npy, so
# these files must already exist in /content -- confirm how they are produced.
file1 = '/content/cl_entities.npy'
cluster_entities = np.load(file1,allow_pickle=True)
cluster_entities = cluster_entities.tolist()
file2 = '/content/un_entities.npy'
un_cluster = np.load(file2,allow_pickle=True)
un_cluster = un_cluster.tolist()

##BERT-based embedding measurement

In [None]:
### Get the dictionary for the Bert tokenizer
!wget "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt"

--2021-08-16 05:11:47--  https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.131.173
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.131.173|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 213450 (208K) [text/plain]
Saving to: ‘bert-base-cased-vocab.txt’


2021-08-16 05:11:48 (283 KB/s) - ‘bert-base-cased-vocab.txt’ saved [213450/213450]



In [None]:
vocab = []
file = './bert-base-cased-vocab.txt'

In [None]:
# Append one vocabulary token per line of the file, without its newline
with open(file, "r") as vocab_file:
    vocab.extend(raw_line.strip('\n') for raw_line in vocab_file.readlines())
len(vocab)

28996

In [None]:
crf_w2i = {w : i for i, w in enumerate(vocab)}

In [None]:
def get_similarity(embeddings,
                   w2i,
                   term1,
                   term2):
    """Cosine distance between the mean token embeddings of two terms.

    Args:
        embeddings: embedding module called with a ``LongTensor`` of token ids.
            It is moved to the CPU by this function.
        w2i: mapping from token string to its integer id.
        term1: sequence of token strings for the first term.
        term2: sequence of token strings for the second term.

    Returns:
        The cosine distance as a Python float. A smaller value means the two
        terms are more similar.

    Raises:
        KeyError: if any token is missing from ``w2i``.
        ValueError: if either term contains no tokens.
    """
    ## Check Terms
    for term in (*term1, *term2):
        if term not in w2i:
            raise KeyError(f"Term `{term}` not found")
    if len(term1) == 0 or len(term2) == 0:
        # Averaging zero embeddings is undefined; fail with a clear message
        # instead of an opaque division-by-zero error.
        raise ValueError("term1 and term2 must each contain at least one token")

    embeddings = embeddings.cpu()

    def _mean_embedding(tokens):
        # Look up all tokens in one call and average them into a single row.
        ids = torch.LongTensor([w2i[token] for token in tokens])
        return embeddings(ids).mean(dim=0, keepdim=True).detach().numpy()

    ## Retrieve Embeddings and Compute Cosine Similarity
    distance = cosine_distances(_mean_embedding(term1), _mean_embedding(term2))
    # cosine_distances returns a 1x1 matrix here; index it explicitly because
    # float() on an array with ndim > 0 is deprecated in recent NumPy versions.
    return float(distance[0, 0])

In [None]:
### Retrieve embedding layer and the tokenizer
embed = model.bert.embeddings.word_embeddings
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
### Using cosine distance to measure similarity
from sklearn.metrics.pairwise import cosine_distances

In [None]:
entities = []
file1 = './entities.txt'

In [None]:
# Append one entity string per line of the file to `entities`, dropping the
# trailing newline of each line.
# NOTE(review): ./entities.txt is not written by any cell visible in this
# notebook -- confirm where it comes from. From here on `entities` holds plain
# strings rather than Entity objects.
with open(file1, "r") as f:
    for line in f.readlines():
        line = line.strip('\n') 
        entities.append(line)
len(entities)

318

In [None]:
### This function is for showing the most similar entities
def show_similarity_top(term,data,num)->None:
  """Print the ``num`` entries of ``data`` that are closest to ``term``.

  Args:
      term: the query, already tokenized (a list of token strings).
      data: iterable of candidate strings; each one is tokenized and compared
          by ``show_similarity``.
      num: maximum number of lines to print.

  Prints one ``text:distance`` line per entry, nearest first. Returns nothing.
  """
  # Reuse show_similarity for the ranking instead of duplicating it here.
  ranked = show_similarity(term, data)
  for rank, (key, value) in enumerate(ranked.items(), start=1):
      if rank > num:
          break
      print("{}:{}".format(key, value))

In [None]:
def show_similarity(term,data)->dict:
  """Rank every entry of ``data`` by its cosine distance to ``term``.

  ``term`` is an already tokenized query. Each entry of ``data`` is tokenized
  with the notebook-level ``tokenizer`` and compared through ``get_similarity``
  using the notebook-level ``embed`` and ``crf_w2i``.

  Returns a dict mapping entry -> distance, ordered by ascending distance and
  then by entry text.
  """
  distances = {
      candidate: get_similarity(embed, crf_w2i, term, tokenizer.tokenize(candidate))
      for candidate in data
  }
  ordered_pairs = sorted(distances.items(), key=lambda pair: (pair[1], pair[0]))
  return dict(ordered_pairs)

###Test the functions

In [None]:
term = tokenizer.tokenize("CGs")
show_similarity_top(term,entities,10)

CGs:0.0
CG:0.11934900283813477
CGF:0.2058941125869751
AbDG:0.3184809684753418
IEKFs:0.3396766781806946
FG-NET:0.35870277881622314
HOG:0.37542617321014404
conceptual graph (CG):0.3831406235694885
C4.5:0.4044969081878662
conceptual graph formalism (CGF):0.4098879098892212


In [None]:
term1 = tokenizer.tokenize('two layer hierarchical classification')
term2 = tokenizer.tokenize('hierarchical clustering')
get_similarity(embed,crf_w2i,term1,term2)

0.17247456312179565

Load the clustered entities and unclustered entities

In [None]:
file1 = '/content/cl_entities.npy'
cluster = np.load(file1,allow_pickle=True)
cluster = cluster.tolist()
file2 = '/content/un_entities.npy'
un_cluster = np.load(file2,allow_pickle=True)
un_cluster = un_cluster.tolist()

In [None]:
### Load the clustered entities (only contain text)
file3 = '/content/cluster_item.npy'
cluster_item = np.load(file3,allow_pickle=True)

In [None]:
### Number of clusters
len(cluster_item)

80

In [None]:
### This function is to get the average tensor for each clusters
def cluster_avg_embed(cluster_item,embeddings,tokenizer,w2i):
  """Compute one averaged embedding per cluster.

  Each member text is tokenized and its token embeddings are averaged; the
  member averages are then averaged over the cluster.

  Args:
      cluster_item: sequence of clusters, each a sequence of member strings.
      embeddings: embedding module called with a one-element ``LongTensor``.
      tokenizer: object with a ``tokenize(text)`` method returning tokens.
      w2i: mapping from token string to integer id.

  Returns:
      A list with one tensor per cluster, in the order of ``cluster_item``.
  """
  cluster_embed = []
  for members in cluster_item:
    members_total = 0
    for member_text in members:
      token_ids = [torch.LongTensor([w2i[piece]]) for piece in tokenizer.tokenize(member_text)]
      tokens_total = 0
      for token_id in token_ids:
        tokens_total += embeddings(token_id)
      members_total += tokens_total / len(token_ids)
    cluster_embed.append(members_total / len(members))
  return cluster_embed

In [None]:
## Get the embedding representations for each cluster
cluster_embed = cluster_avg_embed(cluster_item,embed,tokenizer,crf_w2i)

In [None]:
## Save embedding tensors
# Stack the per-cluster tensors into a single array; the first axis indexes
# the clusters.
val = np.stack([item.cpu().detach().numpy() for item in cluster_embed])
np.save('/content/cluster_embed.npy',val)
## Load embedding tensors
# Rebuild a list of tensors, one per cluster. embed_cos_distance calls
# .detach() on every entry, which the nested Python lists produced by
# .tolist() do not support.
cluster_embed = [torch.from_numpy(row) for row in np.load('/content/cluster_embed.npy')]

In [None]:
###This function is to get the embedding representation of a single term
def get_embedding(term,tokenizer,w2i,embeddings):
  """Return the average of the token embeddings of ``term``.

  ``term`` is tokenized with ``tokenizer``, every token is mapped to its id
  through ``w2i`` and looked up in ``embeddings``; the sum of the lookups is
  divided by the number of tokens.
  """
  token_ids = [torch.LongTensor([w2i[piece]]) for piece in tokenizer.tokenize(term)]
  total = 0
  for token_id in token_ids:
    total += embeddings(token_id)
  return total / len(token_ids)

In [None]:
### This function is to calculate distance between embeddings
def embed_cos_distance(cluster_embed_,term):
  """Cosine distance from ``term`` to every cluster embedding.

  Args:
      cluster_embed_: sequence of tensors, one per cluster.
      term: tensor holding the embedding of the query term.

  Returns:
      dict mapping cluster position (0-based) -> cosine distance as a float.
  """
  return {
      # .item() extracts the single value of the 1x1 result; float() on an
      # array with ndim > 0 is deprecated in recent NumPy versions.
      index: cosine_distances(centroid.detach(), term.detach()).item()
      for index, centroid in enumerate(cluster_embed_)
  }

TEST

In [None]:
term1 = tokenizer.tokenize('multi-coil MRI')
term2 = tokenizer.tokenize('multiple coil MRI(Magnetic Resonance Imaging) ')
get_similarity(embed,crf_w2i,term1,term2)

0.13428616523742676

###Find entities which should be included into the clusters

In [None]:
mod_list_ = []
### For every unclustered entity, find the nearest cluster by cosine distance and
### keep the pair when the distance is below a length-dependent threshold:
### 0.28 for texts of at most 8 characters, 0.35 for longer ones.
for w in un_cluster:
  w_embed = get_embedding(w,tokenizer,crf_w2i,embed)
  res = embed_cos_distance(cluster_embed,w_embed)
  nearest_cluster = min(res, key=res.get)
  threshold = 0.28 if len(w)<=8 else 0.35
  if res[nearest_cluster]<threshold:
    mod_list_.append({w: nearest_cluster})

In [None]:
### The entities which should be included (of course a small part of this would be misplaced)
### The keys in the dict are entities text and values are clusters index
mod_list_

[{'multi-aspect SAR': 10},
 {'SIFT (scale invariant feature transform)': 42},
 {'phase correlation': 63},
 {'CGs': 5},
 {'Predictive Diagnostic Optimisation': 66},
 {'Naive Bayes Classifier': 53},
 {'hierarchical-like particle filter': 73},
 {'fuzzy CMAC': 21},
 {'semi-fragile watermark': 67},
 {'SIFT-based face recognition': 42},
 {'Scale Invariant Feature Transform (or SIFT)': 42},
 {'SIFT-based': 42},
 {'two layer hierarchical classification': 73},
 {'non-local means': 47},
 {'Sugeno Measures': 59},
 {'interval type-2 fuzzy logic': 77},
 {'modular neural network': 60},
 {'Non-hierarchical k-means': 73},
 {'FPGA-based Naive Bayes': 53},
 {'IEKFs': 17}]

In [None]:
# Every element of mod_list_ is a one-entry dict; unpacking it as a 1-tuple
# yields its only key (and fails if a dict does not hold exactly one entry).
add_list = [key for (key,) in mod_list_]

In [None]:
term1 = tokenizer.tokenize('Bayesian theorem')
term2 = tokenizer.tokenize('Bayesian-MCMC')
get_similarity(embed,crf_w2i,term1,term2)

0.2753172516822815

In [None]:
term1 = tokenizer.tokenize('hyperspectral band grouping')
term2 = tokenizer.tokenize('hyperspectral analysis')
get_similarity(embed,crf_w2i,term1,term2)

0.14471805095672607

In [None]:
## remained entities of un_clustered list after Bert-embedding process
# Same elements as set(un_cluster) - set(add_list), but in first-seen order.
# Set iteration order for strings changes between kernel restarts, which would
# make the DBSCAN point indices below non-reproducible.
_already_assigned = set(add_list)
remain_list = [w for w in dict.fromkeys(un_cluster) if w not in _already_assigned]

In [None]:
remain_list

['Bayesian theorem',
 '(RPat)',
 'ANASTASIL',
 'multi-band wavelet',
 '10-fold cross validation',
 'Xbox 360',
 'Vision-based microassembly',
 'simultaneous backward geocoding',
 'memory learning strategy',
 'tactile information processing',
 'YOHO',
 'EER',
 'Intelligent Reduction Algorithm based on Expert Knowledge',
 'power spectral density estimation',
 'pictorial-structures',
 'Voting Logic Fusion',
 'mixed-mode multiresolution motion estimation',
 'artificial potential field',
 'least-squares',
 'fuzzy inference',
 'spatial adaptivity',
 'WordNet',
 'super-resolution mapping',
 'PSNR',
 'EMT',
 'DCT',
 'coarse-to-fine optical flow',
 'Multi-Interval ID3',
 'knowledge-based document-analysis',
 'Road segment Partitioning',
 'hyperspectral analysis',
 'dynamic time warping',
 'bigram models',
 'CAS-PEAL',
 'super-resolution mapping analyses',
 'stereoscopic rendering',
 'Middlebury1',
 'stochastic cloning',
 'best-first search',
 'PKI (Public Key Infrastructure) based semi-fragile 

###Add entities which should be included to the original clusters

In [None]:
###In order to keep the consistency of entities, reload the Entity objects
###saved to entities.npy (`entities` was rebound to plain strings earlier).
file2 = '/content/entities.npy'
# The file holds pickled Python objects, hence allow_pickle=True; only load
# files from a trusted source.
entities = np.load(file2,allow_pickle=True)

In [None]:
def get_Entity(text,entities):
  """Return the first element of ``entities`` whose ``.text`` equals ``text``.

  Returns the integer 0 when no element matches.
  """
  return next((candidate for candidate in entities if text == candidate.text), 0)

In [None]:
###Using following operations to add mishandled entities into clusters
# Every element of mod_list_ is a one-entry dict {entity_text: cluster_index}.
# Comparing that dict with a member's text can never match, so the target
# cluster is taken from the stored index instead.
# NOTE(review): the index refers to a position in cluster_embed/cluster_item;
# this assumes entity_groups is in the same order -- confirm.
for assignment in mod_list_:
  for entity_text, cluster_idx in assignment.items():
    matched = get_Entity(entity_text, entities)
    if isinstance(matched, int):
      # get_Entity returns 0 when no entity carries this text
      continue
    if not 0 <= cluster_idx < len(entity_groups):
      continue
    entity_groups[cluster_idx].append(matched)

###DBSCAN(find new clusters)

In [None]:
# from sklearn import datasets
X = remain_list.copy()
import numpy as np
import random
import time
def findNeighbor(j,X,eps,distance,embed,crf_w2i,tokenizer):
    """Return the indices of all points within ``eps`` of point ``j``.

    Args:
        j: index of the query point in ``X``.
        X: sequence of texts.
        eps: neighbourhood radius; a point is a neighbour when its distance
            to ``X[j]`` is ``<= eps``.
        distance: callable ``distance(embed, crf_w2i, tokens_a, tokens_b)``.
        embed, crf_w2i: forwarded unchanged to ``distance``.
        tokenizer: object with a ``tokenize(text)`` method.

    Returns:
        List of indices into ``X`` in ascending order. ``j`` itself is
        included whenever its distance to itself is ``<= eps``.
    """
    if len(X) == 0:
        return []
    # The query tokens do not change inside the loop: tokenize them once.
    query_tokens = tokenizer.tokenize(X[j])
    N=[]
    for p in range(len(X)):   #Find all objects in a neighbourhood
        candidate_tokens = tokenizer.tokenize(X[p])
        if distance(embed,crf_w2i,query_tokens,candidate_tokens) <= eps:
            N.append(p)
    return N


def dbscan(X,eps,min_Pts,distance,embed,crf_w2i,tokenizer,seed=None):
    """Cluster the texts in ``X`` with DBSCAN.

    Args:
        X: sequence of texts.
        eps: neighbourhood radius passed to ``findNeighbor``.
        min_Pts: minimum neighbourhood size (the point itself is counted when
            its self-distance is ``<= eps``) for a point to be a core point.
        distance, embed, crf_w2i, tokenizer: forwarded to ``findNeighbor``.
        seed: optional seed for the order in which points are visited. With
            the default ``None`` the global ``random`` module is used, as
            before.

    Returns:
        A list with one label per point: the 0-based cluster id, or -1 for
        noise.
    """
    rng = random if seed is None else random.Random(seed)
    labels = [-1] * len(X)            # every point starts as noise
    unvisited = list(range(len(X)))
    visited = set()
    cluster_id = -1
    while unvisited:
        j = rng.choice(unvisited)
        unvisited.remove(j)
        visited.add(j)
        neighbours = findNeighbor(j,X,eps,distance,embed,crf_w2i,tokenizer)
        if len(neighbours) < min_Pts:
            continue                  # stays -1 unless claimed as a border point later
        cluster_id += 1
        labels[j] = cluster_id
        queued = set(neighbours)
        # `neighbours` grows while it is being iterated: that is how the
        # cluster expands through density-reachable core points.
        for i in neighbours:
            if i not in visited:
                unvisited.remove(i)
                visited.add(i)
                reachable = findNeighbor(i,X,eps,distance,embed,crf_w2i,tokenizer)
                if len(reachable) >= min_Pts:
                    for a in reachable:
                        if a not in queued:
                            queued.add(a)
                            neighbours.append(a)
            # Claim the point even if it was visited before and marked as
            # noise: a noise point inside a core point's neighbourhood is a
            # border point of this cluster.
            if labels[i] == -1:
                labels[i] = cluster_id
    return labels


eps=0.27
min_Pts=2
begin=time.time()
C=dbscan(X,eps,min_Pts,get_similarity,embed,crf_w2i,tokenizer)
end=time.time()
print ("time using:",end-begin)


time using: 13.966970920562744


In [None]:
##Get index for each new clusters
from collections import defaultdict
# Map every cluster label (including -1 for noise) to the point indices it holds
d = defaultdict(list)
for point_index, cluster_label in enumerate(C):
  d[cluster_label].append(point_index)
print(d)

defaultdict(<class 'list'>, {-1: [0, 1, 2, 4, 5, 7, 8, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23, 24, 25, 27, 28, 29, 31, 32, 33, 35, 36, 37, 38, 39, 40, 41, 42, 43, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 86, 87, 88, 89, 91, 92, 93, 94, 95, 96, 97, 98, 99, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 118, 119], 3: [3, 101], 4: [6, 117], 1: [9, 85], 5: [22, 34], 2: [26, 100], 6: [30, 57], 0: [44, 90]})


In [None]:
# cluster_num = len(entity_groups)
cluster_num = 80
# One single-entry dict per new cluster: {global cluster index: member texts}
new_cluster_l = [
  {cluster_num + offset: [X[idx] for idx in d[offset]]}
  for offset in range(max(C)+1)
]
new_cluster_l

[{80: ['fully connected recurrent network', 'recurrent network']},
 {81: ['tactile information processing', 'tactile sensory suppression']},
 {82: ['coarse-to-fine optical flow', 'coarse to fine']},
 {83: ['multi-band wavelet', 'complex wavelet']},
 {84: ['Vision-based microassembly', 'automated microassembly']},
 {85: ['super-resolution mapping', 'super-resolution mapping analyses']},
 {86: ['hyperspectral analysis', 'hyperspectral band grouping']}]

In [None]:
### Add new clusters into original list
# Each element of new_cluster_l is {cluster_index: [entity_text, ...]}. Every
# new cluster is appended as its own list of Entity objects, because the other
# groups are lists whose members are read through `.text` (see find_crf).
# Appending new_cluster_l itself would add one list of dicts as a single group.
# NOTE(review): the appended position equals the dict key only if
# entity_groups holds exactly `cluster_num` groups beforehand -- confirm.
for new_cluster in new_cluster_l:
  for member_texts in new_cluster.values():
    members = [get_Entity(member_text, entities) for member_text in member_texts]
    # get_Entity returns 0 for texts it cannot find; leave those out
    entity_groups.append([member for member in members if not isinstance(member, int)])

##SpanBERT for embedding measurement

In [None]:
# Import generic wrappers
from transformers import AutoModel, AutoTokenizer 


# Define the model repo
model_name = "SpanBERT/spanbert-base-cased" 


# Download pytorch model
# NOTE: this rebinds the notebook-level `model` and `tokenizer`. The
# fine-tuned NER model is no longer reachable as `model`, and helpers that
# read the global `tokenizer` (show_similarity, show_similarity_top) use the
# SpanBERT tokenizer from here on.
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=413.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=215475882.0, style=ProgressStyle(descri…




Some weights of BertModel were not initialized from the model checkpoint at SpanBERT/spanbert-base-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




In [None]:
span_embed = model.embeddings.word_embeddings

In [None]:
###Test the feasibility of tokenizer and embedding
term1 = tokenizer.tokenize('two layer hierarchical classification')
term2 = tokenizer.tokenize('hierarchical clustering')
get_similarity(span_embed,crf_w2i,term1,term2)

0.11089026927947998

In [None]:
def show_similarity_top1(term,data,num)->None:
  """Print the ``num`` entries of ``data`` closest to ``term`` (SpanBERT).

  Same procedure as ``show_similarity_top`` but distances are computed with
  the notebook-level ``span_embed`` embedding layer.

  Args:
      term: the query, already tokenized (a list of token strings).
      data: iterable of candidate strings, tokenized with the notebook-level
          ``tokenizer``.
      num: maximum number of lines to print.

  Prints one ``text:distance`` line per entry, nearest first. Returns nothing.
  """
  distances = {
      item: get_similarity(span_embed, crf_w2i, term, tokenizer.tokenize(item))
      for item in data
  }
  ranked = sorted(distances.items(), key = lambda kv:(kv[1], kv[0]))
  for rank, (key, value) in enumerate(ranked, start=1):
      if rank > num:
          break
      print("{}:{}".format(key, value))

In [None]:
term = tokenizer.tokenize("CGs")
show_similarity_top1(term,entities,10)

CGs:4.76837158203125e-07
CG:0.18933910131454468
CGF:0.2702394723892212
HME-CPS:0.2835298180580139
CP:0.30165767669677734
CF:0.3144184350967407
CIM:0.32503771781921387
human-machine-environment cyber-physical system (HME-CPS):0.34812843799591064
conceptual graph (CG):0.360026478767395
CAS-PEAL:0.36104893684387207


In [None]:
cluster_embed_span = cluster_avg_embed(cluster_item,span_embed,tokenizer,crf_w2i)

In [None]:
# Same nearest-cluster assignment as for BERT, using the SpanBERT embeddings
# and a single distance threshold of 0.20.
mod_list_span = []
for w in un_cluster:
  w_embed = get_embedding(w,tokenizer,crf_w2i,span_embed)
  res = embed_cos_distance(cluster_embed_span,w_embed)
  nearest_cluster = min(res, key=res.get)
  if res[nearest_cluster]<0.20:
    mod_list_span.append({w: nearest_cluster})

Note: SpanBERT embeddings do not perform well in this case.

In [None]:
mod_list_span

[{'multi-aspect SAR': 10},
 {'Haar-like': 29},
 {'phase correlation': 63},
 {'Bayesian-MCMC': 21},
 {'reversible jump MCMC': 21},
 {'OpenNLP': 16},
 {'low-level features vector quantization': 36},
 {'multi-coil MRI': 16},
 {'Predictive Diagnostic Optimisation': 66},
 {'Temporal Decision Tree': 57},
 {'Naive Bayes Classifier': 53},
 {'hierarchical-like particle filter': 73},
 {'semi-fragile watermark': 67},
 {'PKI (Public Key Infrastructure) based semi-fragile watermarking': 29},
 {'CAS-PEAL': 29},
 {'FG-NET': 0},
 {'SIFT-based': 42},
 {'two layer hierarchical classification': 73},
 {'coarse-to-fine optical flow': 61},
 {'non-local means': 47},
 {'Sugeno Measures': 59},
 {'interval type-2 fuzzy logic': 77},
 {'Non-hierarchical k-means': 73},
 {'FPGA-based Naive Bayes': 0},
 {'TNO MARS/Prescan2': 16},
 {'stochastic cloning': 7},
 {'SD-Scicon UK Ltd': 39}]

## Function for searching related entities

note: you can get the corresponding cluster information by this function

In [None]:
def find_crf(word,new_cluster):
  """Print every group of ``new_cluster`` that contains ``word``.

  A group matches when at least one of its members has a ``.text`` equal to
  ``word``. Matching groups are printed in order; nothing is returned.
  """
  for group in new_cluster:
    hits = [word == member.text for member in group]
    if any(hits):
      print(group)

### Example
find_crf('FCMAC',entity_groups)