In [None]:
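# Uncomment to install the dependencies (prefer %pip so packages land in the kernel's environment):
# %pip install sentence-transformers
# %pip install transformers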

In [None]:
from sentence_transformers import SentenceTransformer
from transformers import T5ForConditionalGeneration, T5Tokenizer, BertTokenizer, BertModel, AutoTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import torch
import spacy 

In [None]:
from transformers import BertTokenizer, BertModel
from warnings import filterwarnings as filt

# Silence all Python warnings for the rest of the session.
filt('ignore')
# Pretrained BERT tokenizer and encoder, both from the 'bert-base-uncased' checkpoint.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained("bert-base-uncased")
# Sentence-Transformers encoder; get_key_words uses it when module_type is not 't'.
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# spaCy pipeline shared by get_pos (POS tags) and get_sent (sentence splitting).
nlp = spacy.load("en_core_web_sm")

_QG_CHECKPOINT = 'ramsrigouthamg/t5_squad_v1'
_qg_cache = {}


def _load_question_generator():
  """Return the cached (model, tokenizer) pair, loading the checkpoint on first use."""
  if not _qg_cache:
    # Load both pieces before touching the cache so a failed load never
    # leaves it half-populated.
    mdl = T5ForConditionalGeneration.from_pretrained(_QG_CHECKPOINT)
    tknizer = AutoTokenizer.from_pretrained(_QG_CHECKPOINT)
    _qg_cache['model'] = mdl
    _qg_cache['tokenizer'] = tknizer
  return _qg_cache['model'], _qg_cache['tokenizer']


def get_question(sentence, answer):
  """Generate a question about ``sentence`` whose answer is ``answer``.

  Args:
    sentence: Context text the question should be asked about.
    answer: Answer span the generated question should target.

  Returns:
    The generated question as a stripped string, with any "question:"
    prefix removed.
  """
  # Previously the checkpoint was re-loaded on every call (once per keyword);
  # the loader keeps a single copy alive for the whole session.
  mdl, tknizer = _load_question_generator()

  text = "context: {} answer: {}".format(sentence, answer)
  max_len = 256
  # `padding=False` replaces the deprecated `pad_to_max_length=False`;
  # calling the tokenizer directly replaces the legacy `encode_plus`.
  encoding = tknizer(text, max_length=max_len, padding=False,
                     truncation=True, return_tensors="pt")

  outs = mdl.generate(input_ids=encoding["input_ids"],
                      attention_mask=encoding["attention_mask"],
                      early_stopping=True,
                      num_beams=5,
                      num_return_sequences=1,
                      no_repeat_ngram_size=2,
                      max_length=300)

  # Only one sequence is requested, so decode just the first result.
  decoded = tknizer.decode(outs[0], skip_special_tokens=True)
  return decoded.replace("question:", "").strip()

def get_embedding(doc):
  """Embed ``doc`` with the notebook's BERT model.

  Args:
    doc: Text to embed.

  Returns:
    NumPy array with the model's pooled output for ``doc``
    (presumably shape (1, hidden_size) — TODO confirm against callers).
  """
  # Embed the argument itself. The previous body tokenized an undefined local
  # `txt`, so it silently picked up the notebook-global `txt` (or raised
  # NameError on a fresh kernel) and ignored `doc` entirely.
  #
  # Calling the tokenizer adds the [CLS]/[SEP] special tokens and builds the
  # attention mask; truncation keeps long inputs within the model's limit.
  # The module-level `bert_tokenizer` / `bert_model` are reused instead of
  # re-loading the checkpoint on every call.
  encoded = bert_tokenizer(doc, return_tensors='pt', truncation=True)

  # Inference only: skip gradient bookkeeping.
  with torch.no_grad():
    output = bert_model(**encoded)

  return output.pooler_output.numpy()

def get_pos(context):
  """Return POS tags and whitespace-split words for ``context``.

  Returns:
    A ``(pos_tags, words)`` tuple: ``pos_tags`` has one ``pos_`` value per
    token produced by ``nlp``; ``words`` is ``context.split()``.
    NOTE(review): the two lists come from different tokenizations, so they
    are not guaranteed to have the same length — confirm callers do not
    zip them.
  """
  pos_tags = []
  for token in nlp(context):
    pos_tags.append(token.pos_)
  words = context.split()
  return pos_tags, words

def get_sent(context):
  """Split ``context`` into sentences with the ``nlp`` pipeline and return them as a list."""
  parsed = nlp(context)
  return [sentence for sentence in parsed.sents]

def get_vector(doc):
  """Return the unigram vocabulary of ``doc`` with English stop words removed.

  Args:
    doc: A single text document.

  Returns:
    List of vocabulary terms found by ``CountVectorizer``.

  Raises:
    ValueError: Propagated from ``CountVectorizer.fit`` when no term survives
      tokenization and stop-word removal (empty vocabulary).
  """
  stop_words = "english"
  n_gram_range = (1, 1)
  vectorizer = CountVectorizer(ngram_range=n_gram_range, stop_words=stop_words).fit([doc])

  # `get_feature_names()` was removed in scikit-learn 1.2; use its replacement
  # when present and fall back for older installations.
  if hasattr(vectorizer, 'get_feature_names_out'):
    return vectorizer.get_feature_names_out().tolist()
  return vectorizer.get_feature_names()


def get_key_words(context, module_type = 't', top_n = 5):
  """Pick the keywords most similar to each sentence of ``context``.

  Args:
    context: Text to analyse; it is split into sentences with ``get_sent``.
    module_type: ``'t'`` embeds with ``get_embedding``; any other value uses
      the module-level ``model.encode``.
    top_n: Maximum number of keywords kept per sentence.

  Returns:
    List of ``(keyword, sentence)`` tuples. Within a sentence the keywords
    are ordered by ascending cosine similarity to that sentence.
  """
  keywords = []
  for sent in get_sent(context):
    sentence = str(sent)
    try:
      keywd = get_vector(sentence)
    except ValueError:
      # CountVectorizer raises on an empty vocabulary (e.g. a sentence made
      # only of stop words); such a sentence has no candidates, so skip it.
      continue
    print(f'vectors : {keywd}')

    if module_type == 't':
      doc_embedding = get_embedding(sentence)
      # Embed every candidate on its own so cosine_similarity yields one score
      # per keyword. Embedding the joined string produced a single vector, a
      # 1x1 similarity matrix, and therefore only ever returned keywd[0].
      keywd_embedding = [get_embedding(word)[0] for word in keywd]
    else:
      doc_embedding = model.encode([sentence])
      keywd_embedding = model.encode(keywd)

    distances = cosine_similarity(doc_embedding, keywd_embedding)
    print(distances)
    keywords += [(keywd[index], sentence) for index in distances.argsort()[0][-top_n:]]

  return keywords

In [None]:
# Sample headline used as the input context by the cells below.
txt = 'Mauricio Pochettino open to leaving Paris St-Germain if Man Utd make approach'

In [None]:
# Generate questions from the `txt` headline defined above.

In [None]:
# Extract keywords with the default module_type ('t'), i.e. the BERT-based get_embedding path.
get_key_words(txt)

vectors : ['approach', 'germain', 'leaving', 'make', 'man', 'mauricio', 'open', 'paris', 'pochettino', 'st', 'utd']


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.seq_relationship.we

[[1.]]


[('approach',
  'Mauricio Pochettino open to leaving Paris St-Germain if Man Utd make approach')]

In [None]:
# Rank keywords with the Sentence-Transformers embeddings (module_type='st'),
# then generate one question per (keyword, sentence) pair.
txt = 'Mauricio Pochettino open to leaving Paris St-Germain if Man Utd make approach'
for ans, context in get_key_words(txt, 'st'):
  # `end='\n\n'` emits the blank line that followed each print.
  print('=======================================', end='\n\n')
  print(get_question(context, ans), end='\n\n')

vectors : ['approach', 'germain', 'leaving', 'make', 'man', 'mauricio', 'open', 'paris', 'pochettino', 'st', 'utd']
[[0.38606027 0.47576404 0.36602828 0.28078705 0.31993675 0.4191036
  0.47562945 0.5885888  0.40264377 0.30833414 0.33931935]]

Who is open to leaving Paris St-Germain if Man Utd make an approach?


Who is open to leaving Paris St-Germain if Man Utd make an approach?


Is Mauricio Pochettino open to leaving Paris St-Germain?


Pochettino is open to leaving Paris St-Germain if Man Utd make an approach?


What city is Mauricio Pochettino open to leaving if Man Utd make an approach?



In [None]:
# Scratch check: a set comprehension that yields 'cat' five times collapses to a single element.
{'cat' for _ in range(5)}

{'cat'}