In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
%cd '/content/gdrive/MyDrive/Question Generation'

/content/gdrive/MyDrive/Question Generation


In [3]:
import warnings
warnings.filterwarnings('ignore')

## Constituency parsing to find last verb or noun phrase

In [4]:
!pip install --quiet benepar

[K     |████████████████████████████████| 3.3 MB 31.3 MB/s 
[K     |████████████████████████████████| 2.6 MB 45.0 MB/s 
[K     |████████████████████████████████| 1.2 MB 44.0 MB/s 
[K     |████████████████████████████████| 636 kB 56.8 MB/s 
[K     |████████████████████████████████| 895 kB 78.5 MB/s 
[?25h  Building wheel for benepar (setup.py) ... [?25l[?25hdone


In [5]:
import benepar
benepar.download('benepar_en3')

[nltk_data] Downloading package benepar_en3 to /root/nltk_data...
[nltk_data]   Unzipping models/benepar_en3.zip.


True

In [6]:
from benepar.spacy_plugin import BeneparComponent
import spacy
# Loading spaCy’s en model and adding benepar model to its pipeline
nlp = spacy.load('en')
nlp.add_pipe(BeneparComponent('benepar_en3'))

text='The owner of the house was playing cricket.'
test_sent = text.rstrip('?:!.,;')

# Generating a parse tree for the text
tree_string = list(nlp(test_sent).sents)[0]._.parse_string
print(tree_string)

(S (NP (NP (DT The) (NN owner)) (PP (IN of) (NP (DT the) (NN house)))) (VP (VBD was) (VP (VBG playing) (NP (NN cricket)))))


In [7]:
from nltk import tokenize
from nltk.tree import Tree

tree = Tree.fromstring(tree_string)
print(tree)
print(tree.pretty_print())

(S
  (NP (NP (DT The) (NN owner)) (PP (IN of) (NP (DT the) (NN house))))
  (VP (VBD was) (VP (VBG playing) (NP (NN cricket)))))
                   S                                   
                ___|___________________                 
               NP                      VP              
      _________|___               _____|_____           
     |             PP            |           VP        
     |          ___|___          |      _____|_____     
     NP        |       NP        |     |           NP  
  ___|____     |    ___|____     |     |           |    
 DT       NN   IN  DT       NN  VBD   VBG          NN  
 |        |    |   |        |    |     |           |    
The     owner  of the     house was playing     cricket

None


In [8]:
# access left and right child
tree[0].pretty_print()
tree[1].pretty_print() 

               NP              
      _________|___             
     |             PP          
     |          ___|___         
     NP        |       NP      
  ___|____     |    ___|____    
 DT       NN   IN  DT       NN 
 |        |    |   |        |   
The     owner  of the     house

       VP              
  _____|_____           
 |           VP        
 |      _____|_____     
 |     |           NP  
 |     |           |    
VBD   VBG          NN  
 |     |           |    
was playing     cricket



## Extracting the last verb or noun phrase using tree string

In [9]:
tree_string

'(S (NP (NP (DT The) (NN owner)) (PP (IN of) (NP (DT the) (NN house)))) (VP (VBD was) (VP (VBG playing) (NP (NN cricket)))))'

In [10]:
# EXTRACT LAST VERB PHRASE FROM TREE STRING
def verbphrase(tree_string):
  split_text = tree_string.split('VP')[-1]
  string = ''
  for ch in split_text:
    if(not ch.isupper() and ch != '(' and ch != ')'):
      string = string + ch
  res = " ".join(string.split())
  return res

In [11]:
# EXTRACT LAST NOUN PHRASE FROM TREE STRING
def nounphrase(tree_string):
  split_text = tree_string.split('NP')[-1]
  string = ''
  for ch in split_text:
    if(not ch.isupper() and ch != '(' and ch != ')'):
      string = string + ch
  res = " ".join(string.split())
  return res

## Spliting the original sentence at last verb or noun phrase depending upon the length of phrase.

We decide between using noun and verb phrase by looking at their lenghts. As removal of longer phrase will result in generated sentence being too far from original sentence

In [12]:
np = nounphrase(tree_string)
vp = verbphrase(tree_string)
sent_to_gpt = ''
if(len(vp) > len(np)):
  sent_to_gpt = vp
else:
  sent_to_gpt = np
sent_to_gpt = text.split(sent_to_gpt)[0]

In [13]:
sent_to_gpt

'The owner of the house was '

## Generate false statements of the sentence using GPT-2

In [19]:
!pip install --quiet transformers==4.2.2

In [15]:
import tensorflow as tf
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer

GPT2tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
GPT2model = TFGPT2LMHeadModel.from_pretrained("gpt2",pad_token_id=GPT2tokenizer.eos_token_id)


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1042301.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355256.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=665.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=497933648.0, style=ProgressStyle(descri…




All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [16]:
partial_sentence = sent_to_gpt
input_ids = GPT2tokenizer.encode(partial_sentence,return_tensors='tf')
print (input_ids)
maximum_length = len(partial_sentence.split())+20

tf.Tensor([[ 464 4870  286  262 2156  373  220]], shape=(1, 7), dtype=int32)


https://huggingface.co/blog/how-to-generate

In [17]:
# Activate top_k sampling and top_p sampling with only from 90% most likely words
sample_outputs = GPT2model.generate(
    input_ids, 
    do_sample=True, 
    max_length=maximum_length, 
    top_p=0.80,  
    top_k=60,  
    repetition_penalty  = 10.0,
    num_return_sequences=5
)

In [18]:
import nltk
nltk.download('punkt')
generated_sentences=[]

for i, sample_output in enumerate(sample_outputs):
    decoded_sentence = GPT2tokenizer.decode(sample_output, skip_special_tokens=True)
    final_sentence = tokenize.sent_tokenize(decoded_sentence)[0]
    generated_sentences.append(final_sentence)
    print (i,": ",final_sentence)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
0 :  The owner of the house was  the one who bought it and he's now asking for $500,000.
1 :  The owner of the house was  in possession, and in his place were two young children who had been shot to death by
2 :  The owner of the house was iced up.
3 :  The owner of the house was iced tea by a friend who had made him drink and then his wife took it to them.
4 :  The owner of the house was  unaware that he would have to remove his belongings after being evacuated.
