In [1]:
import tensorflow as tf
import numpy as np 
import nltk
import itertools
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
from transformers import BertTokenizer,TFBertModel,AutoTokenizer,AutoConfig

In [3]:
%config Completer.use_jedi = False


In [4]:
# Tokens to exclude from the candidate vocabulary; handed to CountVectorizer
# (via bert_embeder) as its stop_words argument.
stop_words = [
    ".",
    "از",
    "برای",
    "در",
    "با",
    "ما",
    "همه",
    "[SEP]",
    "[CLS]",
    "است",
    "کنند",
]

In [5]:
# Sample Persian input document (an encyclopedia-style paragraph about Tehran)
# from which keywords are extracted in the cells below.
doc="""تهران (دربارهٔ این پرونده آوا راهنما·اطلاعات) پرجمعیت‌ترین شهر و پایتخت ایران، مرکز استان تهران و شهرستان تهران است. این شهر با ۸٬۶۹۳٬۷۰۶ تن جمعیت، بیست و چهارمین شهر پرجمعیت جهان و پرجمعیت‌ترین شهر باختر آسیا به‌شمار می‌رود. کلان‌شهر تهران نیز دومین کلان‌شهر پرجمعیت خاورمیانه است.. """

In [6]:
# Extraction settings used by the cells below.
top_n = 10  # number of keywords to return
nr_candidate = 30  # candidate pool size for max_sum_sim (not used by the driver cell shown here)
model_name_or_path = "HooshvareLab/bert-fa-zwnj-base"  # checkpoint loaded by bert_embeder


Get word and text embeddings using <a href="https://github.com/hooshvare/parsbert">ParsBERT</a>.

text embedding = mean(word_embedding)

In [20]:

def bert_embeder(model_name_or_path, doc, stop_words):
    """Embed a document and its candidate keywords with a pretrained BERT model.

    The candidate keywords are the unigram vocabulary that ``CountVectorizer``
    extracts from ``doc`` (minus ``stop_words``). The whole vocabulary is fed
    through the model as a single sequence; each word's hidden state is its
    embedding and the mean of all hidden states is the document embedding.

    Args:
        model_name_or_path: Name or path passed to ``from_pretrained`` for
            both the tokenizer and the model.
        doc: The text to extract candidate words from.
        stop_words: Stop-word list forwarded to ``CountVectorizer``.

    Returns:
        A tuple ``(doc_embedding, candidate_embedding, candidate)`` where
        ``doc_embedding`` is the mean over the sequence axis of the last
        hidden states (one row, since the batch size is 1),
        ``candidate_embedding`` holds one row per candidate word, and
        ``candidate`` is the list of candidate tokens in the same order as
        the rows of ``candidate_embedding``.
    """
    tokenizer = BertTokenizer.from_pretrained(model_name_or_path)
    model = TFBertModel.from_pretrained(model_name_or_path)

    vectorizer = CountVectorizer(ngram_range=(1, 1), stop_words=stop_words).fit([doc])
    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and fall back for older installations.
    if hasattr(vectorizer, "get_feature_names_out"):
        # .tolist() because the tokenizer expects a plain list, not an ndarray.
        vocabulary = vectorizer.get_feature_names_out().tolist()
    else:
        vocabulary = vectorizer.get_feature_names()

    # NOTE(review): every vocabulary word is handed to the tokenizer as one
    # ready-made token, so words missing from the model vocabulary presumably
    # come back as [UNK] — consider sub-word pooling if that matters.
    encoded_input = tokenizer.encode(vocabulary)
    input_ids = tf.constant(encoded_input)[None, :]  # Batch size 1
    outputs = model(input_ids)
    last_hidden_states = outputs[0]  # assumed (1, seq_len, hidden) — TODO confirm
    token_embeddings = last_hidden_states[0].numpy()
    doc_embedding = np.mean(last_hidden_states.numpy(), axis=1)

    # The [CLS]/[SEP] markers added by encode() are not words of the document,
    # so they must not be offered as keyword candidates.
    boundary_ids = {tokenizer.cls_token_id, tokenizer.sep_token_id}
    keep = [
        position
        for position, token_id in enumerate(encoded_input)
        if token_id not in boundary_ids
    ]
    candidate_embedding = token_embeddings[keep]
    # Map ids straight back to tokens so that candidate[i] is guaranteed to
    # describe candidate_embedding[i] (a decode -> tokenize round trip is not).
    candidate = tokenizer.convert_ids_to_tokens([encoded_input[position] for position in keep])

    return doc_embedding, candidate_embedding, candidate


In [21]:

def max_sum_sim(doc_embedding, candidate_embeddings, candidate, top_n, nr_candidate):
    """Pick ``top_n`` keywords that are relevant to the document yet dissimilar to each other.

    The ``nr_candidate`` words most similar to the document form a pool; every
    ``top_n``-sized combination of that pool is scored by the sum of pairwise
    cosine similarities between its members, and the combination with the
    smallest sum is returned.

    Args:
        doc_embedding: 2-D array with the document embedding (one row).
        candidate_embeddings: 2-D array with one row per candidate word.
        candidate: Sequence of candidate words, aligned with
            ``candidate_embeddings``.
        top_n: Number of keywords to return. Clamped to the pool size when
            fewer candidates are available.
        nr_candidate: Size of the pool taken from the words most similar to
            the document.

    Returns:
        List of the selected words, ordered as they appear in the pool
        (ascending similarity to the document).

    Note:
        The search is exhaustive, so the number of combinations grows
        combinatorially with ``nr_candidate`` and ``top_n``.
    """
    # Similarity of every candidate to the document and to every other candidate
    distances = cosine_similarity(doc_embedding, candidate_embeddings)
    distances_candidate = cosine_similarity(candidate_embeddings,
                                            candidate_embeddings)

    # Keep the nr_candidate words closest to the document as the pool
    words_idx = list(distances.argsort()[0][-nr_candidate:])
    words_vals = [candidate[index] for index in words_idx]
    distances_candidate = distances_candidate[np.ix_(words_idx, words_idx)]

    # Asking for more keywords than the pool holds would yield no combination
    # at all; fall back to returning the whole pool instead of crashing.
    top_n = min(top_n, len(words_idx))

    # Find the combination of words that are the least similar to each other
    min_sim = np.inf
    best_combination = ()
    for combination in itertools.combinations(range(len(words_idx)), top_n):
        sim = sum(distances_candidate[i][j] for i in combination for j in combination if i != j)
        if sim < min_sim:
            best_combination = combination
            min_sim = sim

    return [words_vals[idx] for idx in best_combination]

In [22]:
def mmr(doc_embedding, word_embeddings, words, top_n, diversity):
    """Select keywords with Maximal Marginal Relevance.

    Starts from the word most similar to the document, then repeatedly adds
    the remaining word that maximises
    ``(1 - diversity) * similarity_to_document - diversity * max_similarity_to_selected``.

    Args:
        doc_embedding: 2-D array with the document embedding (one row).
        word_embeddings: 2-D array with one row per word.
        words: Sequence of words, aligned with ``word_embeddings``.
        top_n: Number of keywords to return; at most ``len(words)`` are
            returned.
        diversity: Weight of the redundancy penalty; larger values favour
            keywords that differ from the ones already chosen.

    Returns:
        List of selected words in the order they were picked. Empty when
        ``top_n`` is not positive or ``words`` is empty.
    """
    # Nothing to select: avoid calling argmax on an empty similarity matrix.
    if top_n <= 0 or len(words) == 0:
        return []

    # Extract similarity within words, and between words and the document
    word_doc_similarity = cosine_similarity(word_embeddings, doc_embedding)
    word_similarity = cosine_similarity(word_embeddings)

    # Initialize candidates and already choose the best keyword/keyphrase
    keywords_idx = [np.argmax(word_doc_similarity)]
    candidates_idx = [i for i in range(len(words)) if i != keywords_idx[0]]

    # Never iterate more often than there are candidates left to pick from.
    for _ in range(min(top_n - 1, len(words) - 1)):
        # Extract similarities within candidates and
        # between candidates and selected keywords/phrases
        candidate_similarities = word_doc_similarity[candidates_idx, :]
        target_similarities = np.max(word_similarity[candidates_idx][:, keywords_idx], axis=1)

        # Calculate MMR
        mmr_scores = (1 - diversity) * candidate_similarities - diversity * target_similarities.reshape(-1, 1)
        mmr_idx = candidates_idx[np.argmax(mmr_scores)]

        # Update keywords & candidates
        keywords_idx.append(mmr_idx)
        candidates_idx.remove(mmr_idx)

    return [words[idx] for idx in keywords_idx]

In [28]:
# Embed the document and every candidate word.
doc_embedding, candidate_embedding, candidate = bert_embeder(
    model_name_or_path, doc, stop_words
)

# Baseline ranking: the top_n candidates closest to the document
# (argsort is ascending, so the best match comes last).
distances = cosine_similarity(doc_embedding, candidate_embedding)
keywords = [candidate[position] for position in np.argsort(distances)[0][-top_n:]]

# MMR selection at a high and a low diversity setting for comparison.
high_mmr = mmr(doc_embedding, candidate_embedding, candidate, top_n, diversity=0.7)
low_mmr = mmr(doc_embedding, candidate_embedding, candidate, top_n, diversity=0.2)


Some layers from the model checkpoint at HooshvareLab/bert-fa-zwnj-base were not used when initializing TFBertModel: ['mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFBertModel were not initialized from the model checkpoint at HooshvareLab/bert-fa-zwnj-base and are newly initialized: ['bert/pooler/dense/kernel:0', 'bert/pooler/dense/bias:0']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  'stop_words.' % sorted(inconsistent))


In [32]:
# Keywords selected by MMR with diversity=0.2.
print(low_mmr)

['پرجمعیت', '[UNK]', 'تهران', 'خاورمیانه', 'شهر', '[UNK]', 'پرونده', 'راهنما', 'پایتخت', 'دومین']


In [30]:
# Keywords selected by MMR with diversity=0.7.
print(high_mmr)

['پرجمعیت', '[UNK]', 'نیز', 'اطلاعات', 'تن', 'این', 'مرکز', 'به', 'بیست', 'رود']
