<a href="https://colab.research.google.com/github/xahram/Sci-Bert/blob/main/Sci_Bert_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# First step is to load the NIPS data that is uploaded in the Google Drive

In [1]:
# Mount the google drive folder into the directory to access files

from google.colab import drive
drive.mount('/gdrive')


Mounted at /gdrive


In [2]:
# Load all necessary libraries

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
import time 
nltk.download("punkt")
nltk.download('stopwords')




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
# Load the NIPS dataset from the drive

nips_papers_df = pd.read_csv('/gdrive/My Drive/Master_dataset/papers.csv')  
nips_papers_df.head()

nips_papers = nips_papers_df.infer_objects()

nips_papers.dtypes

nips_papers["year"] = pd.to_datetime(nips_papers["year"], format="%Y")
# nips_papers['year'] = nips_papers['year'].dt.year
nips_papers.sort_values(by='year')

print(nips_papers.dtypes)

max(nips_papers["year"])
min(nips_papers["year"])

nips_papers = nips_papers.sort_values(by = "year")


id                     int64
year          datetime64[ns]
title                 object
event_type            object
pdf_name              object
abstract              object
paper_text            object
dtype: object


In [4]:
from os import close
# Slice Data Frame by 3 year interval


# print(len(nips_papers))

# Partition/Group Papers into df by the interval/freq of 3 years, closed = left to start combinbing from the 1987
nips_papers_3y_grouped = nips_papers.groupby(pd.Grouper(key='year', freq='3Y', sort=True, closed="left"))



# Save partitions in the Dictionary format with 10 intervals
nips_papers_partitions = {}
initial_partition_id = 0
for i, g  in nips_papers_3y_grouped:
    nips_papers_partitions[initial_partition_id] = g
    initial_partition_id = initial_partition_id + 1


print(nips_papers_partitions)
# nips_papers_three_year_partition[0].tail()


#for i, g in nips_papers.groupby(pd.Grouper(key=nips_papers["year"], freq='A')):
#     print(g)







{0:        id       year                                              title  \
0       1 1987-01-01  Self-Organization of Associative Database and ...   
328    13 1987-01-01   Temporal Patterns of Activity in Neural Networks   
6853   72 1987-01-01  Ensemble' Boltzmann Units have Collective Comp...   
6743   71 1987-01-01  Centric Models of the Orientation Map in Prima...   
6632   70 1987-01-01  On the Power of Neural Networks for Solving Ha...   
...   ...        ...                                                ...   
1650  250 1989-01-01                               Optimal Brain Damage   
1661  251 1989-01-01  A Self-organizing Associative Memory System fo...   
1672  252 1989-01-01  Can Simple Cells Learn Curves? A Hebbian Model...   
1683  253 1989-01-01  Subgrouping Reduces Complexity and Speeds Up L...   
1638  249 1989-01-01  Neural Network Analysis of Distributed Represe...   

     event_type                                           pdf_name  \
0           NaN  1-self-o

In [5]:
# IMPORT word2phrase to create bigrams and unigrams
!git clone https://github.com/travisbrady/word2phrase.git

Cloning into 'word2phrase'...
remote: Enumerating objects: 93, done.[K
remote: Total 93 (delta 0), reused 0 (delta 0), pack-reused 93[K
Unpacking objects: 100% (93/93), done.


# ALL TIME WINDOWS SCI-BERT

In [6]:
# Convert List of Time Slice DF paper_text content to lists

start = time.time()

# Loop through every DF and convert paper_text to list and concatenate all the papers of one time slice 
## this will be a list like  ["All paper content string of first slice", "all paper content string of 2nd slice", ...] 

papers_contents_list = [" ".join(time_slice_df["paper_text"].tolist()) for time_slice_df in nips_papers_partitions.values()]

#### MEASURE THE EXECUTION TIME FOR RUNNING THE CONCATENATION CODE

end = time.time()
print(end - start)

# papers_contents_list
# len(papers_contents_list[0])

0.10576963424682617


In [7]:
#  Join Paper titles for bigram and unigram extraction


papers_titles_list = [" ".join(time_slice_df["title"].tolist()) for time_slice_df in nips_papers_partitions.values()]



## Step 1 -  Pre Processing 

# Remove Stopwords 

In [8]:
import re


# function to rmeove digits and numbers from papers 
def regex_remove_digits(papers_contents_list):      
    # Remove any digits for the corpus
    all_time_window_papers_content_list = [re.sub("^\d+\s|\s\d+\s|\s\d+$", " ", time_slice_paper) 
                                                    for time_slice_paper in papers_contents_list] 
    # Remove words with length less than 3 

    # https://stackoverflow.com/questions/24332025/remove-words-of-length-less-than-4-from-string
    all_time_window_papers_content_list = [re.sub(r'\b\w{1,2}\b', '', time_slice_paper) 
                                          for time_slice_paper in all_time_window_papers_content_list]

    return all_time_window_papers_content_list






In [9]:

# Custom Stopwords List for Scientific Literature 
from sklearn.feature_extraction import text

path_to_stop_words = '/gdrive/My Drive/Master_dataset/stopwords_10000_most_frequent_filtered.txt'

with open(path_to_stop_words, "r") as file1:
    FileasList = file1.readlines()


stopwords = [s.strip('\n') for s in FileasList]
print(len(stopwords))


scientific_literature_stopwords = text.ENGLISH_STOP_WORDS.union(stopwords)

len(scientific_literature_stopwords)


9954


9958

In [10]:
# Get all paper content and titles for bigram and unigram generation
all_time_window_papers_content_list = regex_remove_digits(papers_contents_list)
all_time_window_papers_title_list = regex_remove_digits(papers_titles_list)


## Get Bag Of Candidate Keywords For All Time Windows

In [11]:

all_time_window_papers_titles = " ".join(all_time_window_papers_title_list)

In [12]:
import nltk
from nltk import word_tokenize
from nltk.util import ngrams
from collections import Counter
from nltk.corpus import stopwords

#About 900 stopwords
stop_words = list(stopwords.words('english')) #About 150 stopwords
stop_words.extend(scientific_literature_stopwords)



token = nltk.word_tokenize(all_time_window_papers_titles)
output = [w for w in token if not w in stop_words]
bigrams = ngrams(output,2)


# candidate_keywords = [ for n in ngrams]
ngrams = Counter(bigrams).most_common()




In [13]:
candidate_keywords = [( " ".join(n[0]) , n[1] )for n in ngrams]
candidate_keywords = candidate_keywords[:300]



###################################################

In [14]:
candidate_keywords

[('Neural Networks', 247),
 ('Reinforcement Learning', 148),
 ('Neural Network', 135),
 ('Gaussian Process', 67),
 ('Graphical Models', 59),
 ('Support Vector', 57),
 ('Gaussian Processes', 49),
 ('Active Learning', 46),
 ('Variational Inference', 45),
 ('Monte Carlo', 44),
 ('Online Learning', 43),
 ('Speech Recognition', 42),
 ('Recurrent Neural', 37),
 ('Component Analysis', 36),
 ('Gradient Descent', 34),
 ('Hidden Markov', 34),
 (': The', 32),
 ('Deep Learning', 32),
 ('Learning :', 30),
 ('Markov Models', 30),
 ('Vector Machines', 30),
 ('Analog VLSI', 29),
 ('Stochastic Gradient', 29),
 ('Markov Decision', 28),
 ('Feature Selection', 28),
 (': Learning', 27),
 ('Networks Learning', 27),
 ('Random Fields', 27),
 ('Machine Learning', 27),
 ('Networks :', 26),
 ('Belief Propagation', 26),
 ('Kernel Learning', 26),
 ('Unsupervised Learning', 25),
 ('neural networks', 25),
 ('Model Selection', 25),
 ('Matrix Completion', 25),
 ('Dynamic Programming', 24),
 ('Function Approximation', 

### Step 3 - BERT EMBEDDING GENERATE

In [17]:
title_ngram_candidate_keywords_time_slices_sorted_ = [ngram[0] for ngram in candidate_keywords]
t = "\n".join(title_ngram_candidate_keywords_time_slices_sorted_)
t

'Neural Networks\nReinforcement Learning\nNeural Network\nGaussian Process\nGraphical Models\nSupport Vector\nGaussian Processes\nActive Learning\nVariational Inference\nMonte Carlo\nOnline Learning\nSpeech Recognition\nRecurrent Neural\nComponent Analysis\nGradient Descent\nHidden Markov\n: The\nDeep Learning\nLearning :\nMarkov Models\nVector Machines\nAnalog VLSI\nStochastic Gradient\nMarkov Decision\nFeature Selection\n: Learning\nNetworks Learning\nRandom Fields\nMachine Learning\nNetworks :\nBelief Propagation\nKernel Learning\nUnsupervised Learning\nneural networks\nModel Selection\nMatrix Completion\nDynamic Programming\nFunction Approximation\nDecision Processes\nObject Recognition\nTime Series\nMixture Models\nLatent Variable\nMetric Learning\nDeep Neural\nSpiking Neurons\nBayesian Inference\nDensity Estimation\nApproximate Inference\nConvex Optimization\nSupervised Learning\nDynamical Systems\nConvolutional Neural\nGenerative Models\nLarge Scale\nMatrix Factorization\n: Appl

In [18]:
!pip install pytorch-pretrained-bert
!pip install Unidecode

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytorch-pretrained-bert
  Downloading pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123 kB)
[K     |████████████████████████████████| 123 kB 28.3 MB/s 
[?25hCollecting boto3
  Downloading boto3-1.24.51-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 24.8 MB/s 
Collecting botocore<1.28.0,>=1.27.51
  Downloading botocore-1.27.51-py3-none-any.whl (9.0 MB)
[K     |████████████████████████████████| 9.0 MB 59.6 MB/s 
[?25hCollecting s3transfer<0.7.0,>=0.6.0
  Downloading s3transfer-0.6.0-py3-none-any.whl (79 kB)
[K     |████████████████████████████████| 79 kB 7.6 MB/s 
[?25hCollecting jmespath<2.0.0,>=0.7.1
  Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Collecting urllib3<1.27,>=1.25.4
  Downloading urllib3-1.26.11-py2.py3-none-any.whl (139 kB)
[K     |████████████████████████████████| 139 kB 75.3 MB/s 
  Downloading urllib3-1.25.11-py2.py3

In [19]:
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
from collections import OrderedDict
import unidecode
import numpy as np
# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
logging.basicConfig(level=logging.INFO)
import os
import matplotlib.pyplot as plt
# % matplotlib inline

In [20]:
import tensorflow as tf

device_name = tf.test.gpu_device_name()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))
import torch
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)

Found GPU at: /device:GPU:0


'Tesla T4'

In [21]:
# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

#Use the pre-trained Base BERT model 
model = BertModel.from_pretrained('bert-base-uncased')
model.cuda()
# Put the model in "evaluation" mode, meaning feed-forward operation.
model.eval()

100%|██████████| 231508/231508 [00:03<00:00, 66385.06B/s] 
100%|██████████| 407873900/407873900 [00:33<00:00, 12171191.62B/s]


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): BertLayerNorm()
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): BertLayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Lin

In [22]:
from collections import OrderedDict
class Data():

    def __getitem__(self, content=None):
         if content!=None:
             self.doc = "".join(content)
         return self.doc
     
    def _preprocess(self,targets,corpus):
        self.index=[]
        self.t_index=OrderedDict()
        for target in targets:
            
            for _,item in enumerate(corpus):
                # if target in item:
                  if item.lower().find(target) != -1:
                # if bool(re.search(target, item)):

                      count_target=item.count(target)
                  #   Avoiding the sentences with multiple occurrences of the target term for the time being###
                      if count_target==1:
                        if target not in self.t_index.keys():
                            self.t_index[target]=[_]
                        else:
                            self.t_index[target].append(_)
                        self.index.append(_)
        return self.index,self.t_index

In [23]:
'''
LOAD & EXTRACT DATA
'''
import os


# OUTPUT_DIR = root_dir+'Colab Notebooks/Challenge_Semeval/CLUSTERING/English_test/' # the path thatcontains the (Corpus1_text, Corpus2_text, Targets)
# p1 = os.path.join(OUTPUT_DIR, 'ccoha1.txt')
# p2 = os.path.join(OUTPUT_DIR, 'ccoha2.txt')
# t = os.path.join(OUTPUT_DIR, 'targets.txt')

# INPUT_DIR = '.\\evaluation\\semeval2020_ulscd_eng\\'
# p1 = os.path.join(INPUT_DIR, 'corpus1\\ccoha1.txt')
# p2 = os.path.join(INPUT_DIR, 'corpus2\\ccoha2.txt')
# #TARGET_DIR = '.\\targets\\'
# t = os.path.join(INPUT_DIR, 'targets.txt')
# p1='ccoha1.txt'
# p2='ccoha2.txt'
# t='targets.txt'

p1 = nips_papers_partitions[0]["paper_text"].tolist()
p2 = nips_papers_partitions[1]["paper_text"].tolist()
p3 = nips_papers_partitions[2]["paper_text"].tolist()
p4 = nips_papers_partitions[3]["paper_text"].tolist()
p5 = nips_papers_partitions[4]["paper_text"].tolist()
p6 = nips_papers_partitions[5]["paper_text"].tolist()
p7 = nips_papers_partitions[6]["paper_text"].tolist()
p8 = nips_papers_partitions[7]["paper_text"].tolist()
p9 = nips_papers_partitions[8]["paper_text"].tolist()
p10 = nips_papers_partitions[9]["paper_text"].tolist()


t = t
datasets = Data() 

# doc1 =  ["Sentence1", "Sentence2".....]
doc1=datasets.__getitem__(p1).split('\n')   
doc2=datasets.__getitem__(p2).split('\n')
doc3=datasets.__getitem__(p3).split('\n')
doc4=datasets.__getitem__(p4).split('\n')
doc5=datasets.__getitem__(p5).split('\n')
doc6=datasets.__getitem__(p6).split('\n')
doc7=datasets.__getitem__(p7).split('\n')
doc8=datasets.__getitem__(p8).split('\n')
doc9=datasets.__getitem__(p9).split('\n')
doc10=datasets.__getitem__(p10).split('\n')


t1=datasets.__getitem__(t).split('\n')
target_act=[x for x in t1 if len(x)>1]
t1=[x.lower() for x in t1 if len(x)>1]

index1=datasets._preprocess(t1,doc1)
index2=datasets._preprocess(t1,doc2)
index3=datasets._preprocess(t1,doc3)
index4=datasets._preprocess(t1,doc4)
index5=datasets._preprocess(t1,doc5)
index6=datasets._preprocess(t1,doc6)
index7=datasets._preprocess(t1,doc7)
index8=datasets._preprocess(t1,doc8)
index9=datasets._preprocess(t1,doc9)
index10=datasets._preprocess(t1,doc10)


index_t1=index1[1]
index_t2=index2[1]
index_t3=index3[1]
index_t4=index4[1]
index_t5=index5[1]
index_t6=index6[1]
index_t7=index7[1]
index_t8=index8[1]
index_t9=index9[1]
index_t10=index10[1]

print('The target words are:',t1)
target_words=t1

print('The index_t1 are ', index_t1)
print('The index_t2 are ', index_t2)


#conversions
target_uni=[unidecode.unidecode(m) for m in t1]
target_toks=[]
# print(target_uni)
for k in t1:
  target_toks.append(tokenizer.tokenize(k))
print('converted target toks',target_toks)

The target words are: ['neural networks', 'reinforcement learning', 'neural network', 'gaussian process', 'graphical models', 'support vector', 'gaussian processes', 'active learning', 'variational inference', 'monte carlo', 'online learning', 'speech recognition', 'recurrent neural', 'component analysis', 'gradient descent', 'hidden markov', ': the', 'deep learning', 'learning :', 'markov models', 'vector machines', 'analog vlsi', 'stochastic gradient', 'markov decision', 'feature selection', ': learning', 'networks learning', 'random fields', 'machine learning', 'networks :', 'belief propagation', 'kernel learning', 'unsupervised learning', 'neural networks', 'model selection', 'matrix completion', 'dynamic programming', 'function approximation', 'decision processes', 'object recognition', 'time series', 'mixture models', 'latent variable', 'metric learning', 'deep neural', 'spiking neurons', 'bayesian inference', 'density estimation', 'approximate inference', 'convex optimization', 

In [24]:
len(t1)
print(len(index_t1))
print(len(index_t2))
# target_toks

len(list(index_t1.values())[1])


129
157


27

In [25]:
def _pre_bert(doc,index,t):
  
  # index =  index_t1 -> { target_w1: index, target_w2: index2, target_w1 : index5 } -  index = Sentence index in which target word appears
  s=["Not Found"]  
  
  if t in index.keys():
      s=[doc[ind] for ind in index[t]]

  print('len of sentences',len(s))
  l=len(s)
  marked_text = ["[CLS] " + text + " [SEP]" for text in s]
  tokenized_text = [tokenizer.tokenize(m) for m in marked_text]
  
  tokenized_text=[x[:512] if len(x)>512 else x for x in tokenized_text]
  indexed_tokens = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_text]
  segments_ids = [[1] * len(x) for x in tokenized_text]
  return s,marked_text,tokenized_text,indexed_tokens,segments_ids,l


def _bert_features(tokens_tensor, segments_tensors,tokenized_text):
  # print(len(tokens_tensor[0]))
  with torch.no_grad():
      encoded_layers, _ = model(tokens_tensor.to(device), segments_tensors.to(device))
  # print ("Number of layers:", len(encoded_layers))
  layer_i = 0

  # # print ("Number of batches:", len(encoded_layers[layer_i]))
  batch_i = 0

  # print ("Number of tokens:", len(encoded_layers[layer_i][batch_i]))
  token_i = 0

  # print ("Number of hidden units:", len(encoded_layers[layer_i][batch_i][token_i]))
  # Convert the hidden state embeddings into single token vectors

  # Holds the list of 12 layer embeddings for each token
  # Will have the shape: [# tokens, # layers, # features]
  token_embeddings = [] 

  # For each token in the sentence...
  # tokenized_text=[x for x in tokenized_text if x not in ['_', 'n', '##n','v', '##b']]
  for token_i in range(len(tokenized_text)):
    
    # Holds 12 layers of hidden states for each token 
    hidden_layers = [] 
    
    # For each of the 12 layers...
    for layer_i in range(len(encoded_layers)):
      
      # Lookup the vector for `token_i` in `layer_i`
      vec = encoded_layers[layer_i][batch_i][token_i]
      
      hidden_layers.append(vec)
      
    token_embeddings.append(hidden_layers)

  # Sanity check the dimensions:
  # print ("Number of tokens in sequence:", len(token_embeddings))
  # print ("Number of layers per token:", len(token_embeddings[0]))
  return token_embeddings
# s,marked_text,tokenized_text,indexed_tokens,segments_ids
def _get_embeddings(pre,tg):
  m_embed_full=[]
  # print('len(pre[0])',len(pre[0]))
  # print(tg)
  for _,item in enumerate(pre[0]):
    # Convert inputs to PyTorch tensors
    # print(item)
    token_list=pre[2][_]
    
    tokens_tensor = torch.tensor([pre[3][_]])
    segments_tensors = torch.tensor([pre[4][_]])
    # Predict hidden states features for each layer
    token_embeddings=_bert_features(tokens_tensor, segments_tensors,pre[2][_])
    concatenated_last_4_layers = [torch.cat((layer[-1], layer[-2], layer[-3], layer[-4]), 0) for layer in token_embeddings] # [number_of_tokens, 3072]

    summed_last_4_layers = [torch.sum(torch.stack(layer)[-4:], 0) for layer in token_embeddings] # [number_of_tokens, 768]
    
    #consider the tokenized target
  
    indxs=[]
    # print(token_list)
    for tok in tg:
      '''
      remove -1,-2,-3
      '''
      if tok in token_list:
        if tok not in ['_', 'n', '##n','v', '##b']:
          indxs.append(token_list.index(tok))

    # print('indxs',indxs)
    if len(indxs)==1:
      #bert_embed=concatenated_last_4_layers [indxs[0]]
      bert_embed=summed_last_4_layers [indxs[0]]

      m_embed_full.append(bert_embed)
    elif len(indxs)>1:
      b_emb=[]
      for ind in indxs:
        #b_emb.append(concatenated_last_4_layers[ind])
        b_emb.append(summed_last_4_layers[ind])
        
      bert_embed= torch.sum(torch.stack(b_emb), 0)
      m_embed_full.append(bert_embed)
    # indx=token_list.index(tg.lower())
    # indx = [i for (i, elem) in enumerate(pre[2][_]) if t in elem]
    # print('indx',indx)
    # print(pre[1][_],indx)

    # if len(indx)>0:
    # bert_embed=concatenated_last_4_layers[indx[0]]
    
    # cosine_similarity(summed_last_4_layers[10].reshape(1,-1), summed_last_4_layers[19].reshape(1,-1))[0][0]
    
    
  return  m_embed_full, summed_last_4_layers
# For a particular target word,do clustering and find if there is a sense change


In [26]:
# Embedding for all time windows

sents_all=[]

# Holds all bert embeddings 
# [] -> [[embedding1], [embedding2],...]
X=[]

def embeddings_extract(target_words,target_toks,doc1,index_t1):
  t=target_words
  X_C1=[]
  lens1=[]
  for k,t in enumerate(target_words) :
    berts=[]
    sents=[]
    print('The target word is',t)    
    
    #get the sentences from corpus c1 and c2 for the specific target word 't'
    
    # This will generate tokenized sentences, tokens for the specific word. Or sentences containing specific word
    pre1=_pre_bert(doc1,index_t1,t)

    # lens1.append(pre1[-1])
    # lens2.append(pre2[-1])
    # print(pre1)
    
    sents.extend(pre1[0])
    #aggregate all the embeddings
    # s,marked_text,tokenized_text,indexed_tokens,segments_ids

    '''
    Get the embeddings of the targets from corpus 1 and 2
    '''
    _ , b1=_get_embeddings(pre1,target_toks[k])
    print('len of t1',len(b1))
    
    '''
    store the lenghts of no. of sentences extracted for each target word for each corpus
    '''
    lens1.append(len(b1))
    
    berts.extend(b1)
    print('len of each target word extractions is',len(berts))
    X.append(berts)

    # ______________ Placeholder to flatten the tensors into 1-D tensor for the 
    #           respective sentence tensors of specific keyword _______________ (b1)

    X_C1.append(b1)# the embeddings for C1
    sents_all.append(sents)
  return X,X_C1,lens1,sents_all



In [27]:
# import time
# start_time = time.time()

# embed_full,embed_C1,len_c1,sents=embeddings_extract(target_words,target_toks,doc1,index_t1)

# lens=[len_c1]
# # lens.append(len_c2)
# print('saved')
# print("--- %s seconds ---" % (time.time() - start_time))

In [28]:
# BERT Embeddings

embed_full,embed_C1,len_c1,sents=embeddings_extract(target_words,target_toks,doc1,index_t1)
embed_full,embed_C2,len_c2,sents=embeddings_extract(target_words,target_toks,doc2,index_t2)
embed_full,embed_C3,len_c3,sents=embeddings_extract(target_words,target_toks,doc3,index_t3)
embed_full,embed_C4,len_c4,sents=embeddings_extract(target_words,target_toks,doc4,index_t4)
embed_full,embed_C5,len_c5,sents=embeddings_extract(target_words,target_toks,doc5,index_t5)
embed_full,embed_C6,len_c6,sents=embeddings_extract(target_words,target_toks,doc6,index_t6)
embed_full,embed_C7,len_c7,sents=embeddings_extract(target_words,target_toks,doc7,index_t7)
embed_full,embed_C8,len_c8,sents=embeddings_extract(target_words,target_toks,doc8,index_t8)
embed_full,embed_C9,len_c9,sents=embeddings_extract(target_words,target_toks,doc9,index_t9)
embed_full,embed_C10,len_c10,sents=embeddings_extract(target_words,target_toks,doc10,index_t10)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
The target word is random projections
len of sentences 2
len of t1 35
len of each target word extractions is 35
The target word is graphical models
len of sentences 330
len of t1 22
len of each target word extractions is 22
The target word is subspace clustering
len of sentences 4
len of t1 34
len of each target word extractions is 34
The target word is generalized linear
len of sentences 25
len of t1 40
len of each target word extractions is 40
The target word is : deep
len of sentences 1
len of t1 4
len of each target word extractions is 4
The target word is sparse inverse
len of sentences 1
len of t1 4
len of each target word extractions is 4
The target word is inverse covariance
len of sentences 13
len of t1 20
len of each target word extractions is 20
The target word is robust pca
len of sentences 1
len of t1 4
len of each target word extractions is 4
The target word is adversarial networks
len of sentences 1
len of 

In [29]:
saved_map3 = {

    "embed_C1":embed_C1,
    "embed_C2":embed_C2,
    "embed_C3":embed_C3,
    "embed_C4":embed_C4,
    "embed_C5":embed_C5,
    "embed_C6":embed_C6,
    "embed_C7":embed_C7,
    "embed_C8":embed_C8,
    "embed_C9":embed_C9,
    "embed_C10":embed_C10,
    "sents":sents,
    "embed_full":embed_full
}

import pickle
with open('/gdrive/My Drive/Master_dataset/bert_embeddings_2.pickle', 'wb+') as f:
     pickle.dump(saved_map3, f)

## Apply K-NN with Cosine Similarity 

In [30]:
import pickle
import numpy as np
import pandas as pd
import nltk
import torch


In [31]:
with open('/gdrive/My Drive/Master_dataset/bert_embeddings_2.pickle', 'rb+') as f:
  saved_map = pickle.load(f)

In [32]:
type(saved_map["embed_C7"][0][0])

torch.Tensor

In [33]:
embed_C1 = saved_map["embed_C1"]
embed_C2 = saved_map["embed_C2"]
embed_C3 = saved_map["embed_C3"]
embed_C4 = saved_map["embed_C4"]
embed_C5 = saved_map["embed_C5"]
embed_C6 = saved_map["embed_C6"]
embed_C7 = saved_map["embed_C7"]
embed_C8 = saved_map["embed_C8"]
embed_C9 = saved_map["embed_C9"]
embed_C10 = saved_map["embed_C10"]

In [34]:
# Apply K-NN to calculate the Nearest neighbor of keywords

def convert_tensors_tolist(mapping):
  for i, word_sentences in enumerate(mapping):
        # Use below line whe converting tensors to numpy array
        X1=np.array([np.array(x.to('cpu')) for x in word_sentences])

        # X1=np.array([np.array(x) for x in word_sentences])
        X1=X1.sum(axis=0).tolist()
        mapping[i] = X1
  return mapping

In [35]:
##### FLATTEN tensors of sentences of respective words to 1-D

# embed_C1_ = convert_tensors_tolist(embed_C1)
embed_C2_ = convert_tensors_tolist(embed_C2)
embed_C3_ = convert_tensors_tolist(embed_C3)
embed_C4_ = convert_tensors_tolist(embed_C4)
embed_C5_ = convert_tensors_tolist(embed_C5)
embed_C6_ = convert_tensors_tolist(embed_C6)
embed_C7_ = convert_tensors_tolist(embed_C7)
embed_C8_ = convert_tensors_tolist(embed_C8)
embed_C9_ = convert_tensors_tolist(embed_C9)
embed_C10_ = convert_tensors_tolist(embed_C10)



In [36]:
len(embed_C2_[0])

768

In [37]:
from sklearn.neighbors import NearestNeighbors

def nn_(X):
  model = NearestNeighbors(n_neighbors=10,
                          metric='cosine',
                          algorithm='brute',
                          n_jobs=-1)

  n_n = model.fit(X)  
  distance, indeces = model.kneighbors(X)


  return indeces


In [38]:
# indeces_1 = nn_(embed_C1_)
indeces_2 = nn_(embed_C2_)
indeces_3 = nn_(embed_C3_)
indeces_4 = nn_(embed_C4_)
indeces_5 = nn_(embed_C5_)
indeces_6 = nn_(embed_C6_)
indeces_7 = nn_(embed_C7_)
indeces_8 = nn_(embed_C8_)
indeces_9 = nn_(embed_C9_)
indeces_10 = nn_(embed_C10_)



In [39]:
vec2dynamics_keywords = ['Neural Network', 'Reinforcement Learning', 'Active Learning', 'Monte Carlo', 'Learning Deep',
                          'Machine Learning', 'Supervised Learning', 'Time Series', 'Artificial Neural',
                         'Gaussian Process', 'Active Learning', 'Gradient Descent', 'Hidden Markov',
                         'Nearest Neighbor', 'Dynamical Systems', 'Dimensionality Reduction',
                         'Unsupervised Learning', 'Graphical Models', 'Dynamic Programming', 'Component Analysis']

In [40]:
# keywords_np = np.array(candidate_keywords)

candidate_keywords_ = np.array([keyword[0] for keyword in candidate_keywords])

candidate_keywords_

array(['Neural Networks', 'Reinforcement Learning', 'Neural Network',
       'Gaussian Process', 'Graphical Models', 'Support Vector',
       'Gaussian Processes', 'Active Learning', 'Variational Inference',
       'Monte Carlo', 'Online Learning', 'Speech Recognition',
       'Recurrent Neural', 'Component Analysis', 'Gradient Descent',
       'Hidden Markov', ': The', 'Deep Learning', 'Learning :',
       'Markov Models', 'Vector Machines', 'Analog VLSI',
       'Stochastic Gradient', 'Markov Decision', 'Feature Selection',
       ': Learning', 'Networks Learning', 'Random Fields',
       'Machine Learning', 'Networks :', 'Belief Propagation',
       'Kernel Learning', 'Unsupervised Learning', 'neural networks',
       'Model Selection', 'Matrix Completion', 'Dynamic Programming',
       'Function Approximation', 'Decision Processes',
       'Object Recognition', 'Time Series', 'Mixture Models',
       'Latent Variable', 'Metric Learning', 'Deep Neural',
       'Spiking Neurons', 'Ba

In [41]:
list(set(vec2dynamics_keywords) - set(candidate_keywords_))

[]

In [42]:
monitering_keywords = list(set(vec2dynamics_keywords).intersection(set(candidate_keywords_)))

In [43]:
len(monitering_keywords)

19

In [44]:


def get_nearest_keywords(indeces, keywords_np=candidate_keywords_):
  tup_nearest_neighbor = []
  for index, candidate_keyword in enumerate(keywords_np):
      # Take the current index of the keyword and get the list of 10 nearest index from KNN algorithm
      nearest_neighbors_indeces_of_current_keyword = indeces[index]

      # Filter the keyword list using the list of indeces obtained in previous step
      nearest_keywords = keywords_np[nearest_neighbors_indeces_of_current_keyword]

      # Create tuple with first element as the keyword for current iteration and 2nd element as list of its nearest neighbors
      tup_nearest_neighbor.append({candidate_keyword : set(nearest_keywords)})


  return tup_nearest_neighbor

In [45]:
# nn_1 = get_nearest_keywords(indeces_1)
nn_2 = get_nearest_keywords(indeces_2)
nn_3 = get_nearest_keywords(indeces_3)
nn_4 = get_nearest_keywords(indeces_4)
nn_5 = get_nearest_keywords(indeces_5)
nn_6 = get_nearest_keywords(indeces_6)
nn_7 = get_nearest_keywords(indeces_7)
nn_8 = get_nearest_keywords(indeces_8)
nn_9 = get_nearest_keywords(indeces_9)
nn_10 = get_nearest_keywords(indeces_10)


In [46]:
def get_target_nn(nn, monitering_keywords = monitering_keywords):
    return [n for n in nn for key in list(n.keys()) if key in monitering_keywords]
    

In [47]:
nn_2_ = get_target_nn(nn_2)
nn_3_ = get_target_nn(nn_3)
nn_4_ = get_target_nn(nn_4)
nn_5_ = get_target_nn(nn_5)
nn_6_ = get_target_nn(nn_6)
nn_7_ = get_target_nn(nn_7)
nn_8_ = get_target_nn(nn_8)
nn_9_ = get_target_nn(nn_9)
nn_10_ = get_target_nn(nn_10)





In [48]:
nn_10_

[{'Reinforcement Learning': {'Conditional Random',
   'Decision Trees',
   'Learning Approach',
   'Learning Efficient',
   'Model Selection',
   'Natural Language',
   'Random Fields',
   'Reinforcement Learning',
   'Structure Learning',
   'Support Vector'}},
 {'Neural Network': {'Active Learning',
   'Convolutional Networks',
   'Convolutional Neural',
   'Data Learning',
   'Distance Metric',
   'Gaussian Graphical',
   'Neural Net',
   'Neural Network',
   'Stochastic Gradient',
   'Stochastic Learning'}},
 {'Gaussian Process': {': Probabilistic',
   'Distance Metric',
   'Gaussian Process',
   'Large Margin',
   'Least Squares',
   'Nearest Neighbor',
   'Neighbor Classification',
   'Regret Bounds',
   'Stochastic Gradient',
   'The Infinite'}},
 {'Graphical Models': {'Belief Networks',
   'Density Estimation',
   'Generative Model',
   'Graphical Models',
   'Importance Sampling',
   'Large Scale',
   'Mutual Information',
   'Neural Model',
   'Spectral Methods',
   'graphica

In [49]:
type(nn_10_[0]["Reinforcement Learning"])

set

In [50]:
all_slice_nn = [nn_2_, nn_3_, nn_4_ , nn_5_ , nn_6_, nn_7_ , nn_8_ , nn_9_ , nn_10_]

In [51]:
import math

def log_stability(A, B):
    a = len(A.intersection(B))
    b = len(A-B)

    if a != 0 and b!=0:
      return ( math.log( len(A.intersection(B)) , 10) / math.log ( 0.5 * len((A - B) ) , 10)  )
    else:
      return 0

def calc_stability(nn):
    i = 0
    # print(nn)
    stability = []
    for _,n in enumerate(nn):
       i = _ + 1
      #  print(len(nn))
       if i < len(nn):
          # print(i)
          stability.append( log_stability(nn[_], nn[i]) )

    # print(stability)
    return stability

def extract_keyword_nns(target_nn_10tw, keyword):
    keyword_neighbors_all_windows = []
    for _ , target_nn in enumerate(target_nn_10tw):
      # print(list(target_nn[_].keys()))

      keyword_neighbors_all_windows.extend([target[keyword] for index, target in enumerate(target_nn) if list(target.keys())[0] == keyword ])
    

    # print(keyword_neighbors_all_windows)
    s = calc_stability(keyword_neighbors_all_windows)
    
    return s

def extract_stability(target_nn_10tw):
    s_n = []
    for _ , keyword in enumerate(monitering_keywords):
      #  print(extract_keyword_nns(target_nn_10tw, keyword) )
       s_n.append( { keyword: extract_keyword_nns(target_nn_10tw, keyword) } )
        # s_n.extend( { keyword : extract_keyword_nns(target_nn_10tw, keyword) } )

    return s_n

In [52]:
keyword_stab = extract_stability(all_slice_nn)
keyword_stab

[{'Neural Network': [0.5,
   0.5,
   0.8769514395748774,
   1.7564707973660298,
   1.2618595071429146,
   0.0,
   0.0,
   0.5]},
 {'Dynamical Systems': [0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.0, 0.0]},
 {'Reinforcement Learning': [0.5,
   0.0,
   1.2618595071429146,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0]},
 {'Active Learning': [0.0, 0.0, 0.0, 0.0, 0.5, 0.0, 0.5, 0.0]},
 {'Nearest Neighbor': [0.0,
   0.0,
   0.8769514395748774,
   0.8769514395748774,
   0.0,
   0.0,
   0.0,
   0.5]},
 {'Gaussian Process': [2.584962500721156,
   2.584962500721156,
   0,
   0.5,
   0.5,
   0.0,
   0.5,
   0.0]},
 {'Gradient Descent': [0.0, 0.0, 0.0, 0.5, 0.0, 0.0, 0.0, 0.0]},
 {'Time Series': [0.0, 0.5, 0.0, 0.5, 0.0, 0.0, 0.0, 0.0]},
 {'Supervised Learning': [0.0,
   0.5,
   0.0,
   0.0,
   0.5,
   0.0,
   0.0,
   0.8769514395748774]},
 {'Unsupervised Learning': [0.0,
   0.5,
   0.0,
   0.0,
   0.5,
   0.8769514395748774,
   1.2618595071429146,
   1.2618595071429146]},
 {'Artificial Neural': [0.0,
   0.8769514

In [53]:
log_stability(nn_8_[0]["Reinforcement Learning"], nn_7_[0]["Reinforcement Learning"])


0.0

In [54]:
v = [n for n in nn_2_] 



list(nn_2_[0].keys())[0]

'Reinforcement Learning'