# Load Models and Libraries

In [None]:
!pip install transformers



In [None]:
!git clone https://github.com/amontgomerie/question_generator

fatal: destination path 'question_generator' already exists and is not an empty directory.


In [None]:
%cd question_generator/
%load questiongenerator.py
from questiongenerator import QuestionGenerator
from questiongenerator import print_qa

/content/question_generator


In [None]:
!pip3 install wikipedia

In [None]:
!pip install sentence-transformers

Load Question Generator

In [None]:
from questiongenerator import QuestionGenerator
qg = QuestionGenerator()

Downloading:   0%|          | 0.00/850M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/413M [00:00<?, ?B/s]

Load long answer generator

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# Long-answer generator: tokenizer and seq2seq model both come from the
# 'yjernite/bart_eli5' checkpoint; the model is moved to the first GPU.
qa_s2s_tokenizer = AutoTokenizer.from_pretrained('yjernite/bart_eli5')
qa_s2s_model = AutoModelForSeq2SeqLM.from_pretrained('yjernite/bart_eli5')
qa_s2s_model = qa_s2s_model.to('cuda:0')
# Binding the return value to `_` keeps the notebook from echoing the model repr.
_ = qa_s2s_model.eval()

Downloading:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.51G [00:00<?, ?B/s]

Load paraphrasing model

In [None]:
import torch
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
# Paraphrasing model: Pegasus checkpoint, placed on the GPU when one is
# available and on the CPU otherwise.
model_name = 'tuner007/pegasus_paraphrase'
if torch.cuda.is_available():
  torch_device = 'cuda'
else:
  torch_device = 'cpu'
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)
model = model.to(torch_device)

def get_response(input_text,num_return_sequences,num_beams):
  """Generate paraphrases of ``input_text`` with the module-level Pegasus model.

  Args:
    input_text: sentence to paraphrase; the encoding is truncated to 60 tokens.
    num_return_sequences: how many candidates ``model.generate`` should return.
    num_beams: beam width handed to ``model.generate``.

  Returns:
    The decoded candidates as a list of strings, special tokens removed.
  """
  encoded = tokenizer(
      [input_text],
      truncation=True,
      padding='longest',
      max_length=60,
      return_tensors="pt",
  ).to(torch_device)
  generated = model.generate(
      **encoded,
      max_length=60,
      num_beams=num_beams,
      num_return_sequences=num_return_sequences,
      temperature=1.5,
  )
  return tokenizer.batch_decode(generated, skip_special_tokens=True)

In [None]:
import torch
import wikipedia
import numpy as np
import pandas as pd
from time import time


In [None]:
#transform scraped files into paragraphs
def text_to_paragraph(text):
  """Flatten a scraped article into a single marker-separated string.

  The article is cut at the first trailing-section heading that is present
  (tried in order: "== See also ==", "== Bibliography ==", "== References ==").
  What remains is split on "==", and only chunks longer than 100 characters
  are kept, which drops heading text and other short fragments.

  Args:
    text: raw article text using "== Heading ==" section markers.

  Returns:
    The kept chunks concatenated, each prefixed with a backslash followed by
    "<P>", with newlines replaced by spaces and "</s>" removed. Returns ""
    when no chunk is long enough.
  """
  # Keep only what precedes the first trailing-section heading that occurs.
  main_content = text
  for end_marker in ("== See also ==", "== Bibliography ==", "== References =="):
    parts = text.split(end_marker)
    if len(parts) > 1:
      main_content = parts[0]
      break

  # Written with an explicit escaped backslash: the previous "\<P>" spelling is
  # an invalid escape sequence and triggers a SyntaxWarning on recent Pythons.
  # NOTE(review): the backslash looks like a leftover Markdown escape - confirm
  # whether the downstream models expect a plain "<P>" separator instead.
  paragraph_marker = "\\<P>"

  kept = [
      paragraph_marker + chunk.replace("\n", " ").replace("</s>", "")
      for chunk in main_content.split("==")
      if len(chunk) > 100  # headings and short fragments are discarded
  ]
  return "".join(kept)

# Long Answer Generation

In [None]:
def make_qa_s2s_batch(qa_list, tokenizer, max_len=64, max_a_len=360, device="cuda:0"):
    """Encode (question, answer) pairs into seq2seq model inputs.

    Args:
        qa_list: iterable of ``(question, answer)`` string pairs.
        tokenizer: object exposing ``batch_encode_plus`` that returns
            ``"input_ids"`` and ``"attention_mask"`` lists.
        max_len: padded/truncated length of the question encoding.
        max_a_len: upper bound for the answer encoding; the effective answer
            length is ``min(max_len, max_a_len)``.
        device: torch device the resulting tensors are created on.

    Returns:
        dict with ``input_ids`` / ``attention_mask`` for the questions,
        ``decoder_input_ids`` (answer ids without the last position) and
        ``lm_labels`` (answer ids without the first position, with padded
        positions set to -100).
    """
    q_ls = [q for q, _ in qa_list]
    a_ls = [a for _, a in qa_list]

    # Padding and truncation are requested explicitly. The previous
    # `pad_to_max_length=True` spelling is deprecated, and truncation was only
    # happening through the tokenizer's implicit fallback (with a warning).
    q_toks = tokenizer.batch_encode_plus(
        q_ls, max_length=max_len, padding="max_length", truncation=True
    )
    q_ids = torch.tensor(q_toks["input_ids"], dtype=torch.long, device=device)
    q_mask = torch.tensor(q_toks["attention_mask"], dtype=torch.long, device=device)

    a_toks = tokenizer.batch_encode_plus(
        a_ls, max_length=min(max_len, max_a_len), padding="max_length", truncation=True
    )
    a_ids = torch.tensor(a_toks["input_ids"], dtype=torch.long, device=device)
    a_mask = torch.tensor(a_toks["attention_mask"], dtype=torch.long, device=device)

    # Labels are the answer shifted left by one; padded positions become -100.
    lm_labels = a_ids[:, 1:].contiguous().clone()
    lm_labels[a_mask[:, 1:].contiguous() == 0] = -100

    return {
        "input_ids": q_ids,
        "attention_mask": q_mask,
        "decoder_input_ids": a_ids[:, :-1].contiguous(),
        "lm_labels": lm_labels,
    }

In [None]:
def qa_s2s_generate(
    question_doc,
    qa_s2s_model,
    qa_s2s_tokenizer,
    num_answers=1,
    num_beams=None,
    min_len=64,
    max_len=256,
    do_sample=False,
    temp=1.0,
    top_p=None,
    top_k=None,
    max_input_length=512,
    device="cuda:0",
):
    """Generate ``num_answers`` answers for one question document.

    Args:
        question_doc: the full input string fed to the model.
        qa_s2s_model: model exposing ``generate``.
        qa_s2s_tokenizer: tokenizer used for encoding and decoding.
        num_answers: number of sequences to return.
        num_beams: beam width; raised to at least ``num_answers``, and
            replaced by 1 when ``do_sample`` is true.
        min_len, max_len: length bounds passed to ``generate``.
        do_sample, temp, top_p, top_k: sampling options passed to ``generate``.
        max_input_length: encoding length for the input document.
        device: device on which the input tensors are created.

    Returns:
        List of decoded, whitespace-stripped answer strings.
    """
    # The batch helper works on (question, answer) pairs; "A" is a placeholder
    # answer, only the encoded question side is used below.
    batch = make_qa_s2s_batch(
        [(question_doc, "A")], qa_s2s_tokenizer, max_input_length, device=device
    )

    if num_beams is None:
        beam_width = num_answers
    else:
        beam_width = max(num_beams, num_answers)
    if do_sample:
        beam_width = 1

    generation_kwargs = {
        "input_ids": batch["input_ids"],
        "attention_mask": batch["attention_mask"],
        "min_length": min_len,
        "max_length": max_len,
        "do_sample": do_sample,
        "early_stopping": True,
        "num_beams": beam_width,
        "temperature": temp,
        "top_k": top_k,
        "top_p": top_p,
        "eos_token_id": qa_s2s_tokenizer.eos_token_id,
        "no_repeat_ngram_size": 3,
        "num_return_sequences": num_answers,
        "decoder_start_token_id": qa_s2s_tokenizer.bos_token_id,
    }
    generated_ids = qa_s2s_model.generate(**generation_kwargs)

    answers = []
    for ans_ids in generated_ids:
        decoded = qa_s2s_tokenizer.decode(ans_ids, skip_special_tokens=True)
        answers.append(decoded.strip())
    return answers


In [None]:
def get_long_answer(question, context, device="cuda:0"):
  """Generate one long-form answer for ``question`` given ``context``.

  Uses the module-level ``qa_s2s_model`` / ``qa_s2s_tokenizer`` with an
  8-beam search, an answer length between 64 and 128 tokens and an input
  length of 1024.

  Args:
    question: the question text.
    context: supporting text placed after the question in the model input.
    device: device on which the input tensors are created. Defaults to
      "cuda:0" (the previously hard-coded value); pass the device the model
      actually lives on when it differs.

  Returns:
    The first generated answer as a string.
  """
  question_doc = "question: {} context: {}".format(question, context)
  answers = qa_s2s_generate(
      question_doc, qa_s2s_model, qa_s2s_tokenizer,
      num_answers=1,
      num_beams=8,
      min_len=64,
      max_len=128,
      max_input_length=1024,
      device=device,
  )
  return answers[0]

# Scraping Wikipedia by topic
For now it is a basic utility for getting content for question/answer generation

In [None]:
def get_wikepedia_content(topic):
  """Search Wikipedia for ``topic`` and collect the content of matching pages.

  The search results for ``topic`` are merged with the results for the
  suggested spelling (when the search returns one). Each result is then
  loaded; results whose page cannot be loaded, or whose resolved title does
  not start with the same letter as the search result, are skipped.

  Args:
    topic: free-text topic to search for.

  Returns:
    ``(contents, sub_intents)``: two parallel lists holding the page content
    and the search-result name of every page that was kept.
  """
  # TODO: improve topic search and filter irrelevant replies
  results, suggested = wikipedia.search(topic, suggestion=True)
  print(suggested)
  if suggested:
    results_2 = wikipedia.search(suggested)
    print(results_2)
    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # returned lists come out in the same order on every run (a set does not
    # guarantee that).
    results = list(dict.fromkeys([suggested] + results + results_2))

  print(results)

  contents = []
  sub_intents = []
  for res in results:
    try:
      page = wikipedia.page(res)
    except Exception as err:
      # Best-effort scrape: report and skip pages that cannot be loaded, but
      # let KeyboardInterrupt / SystemExit propagate.
      print("skipped: ", (res, type(err).__name__))
      continue

    title = page.title

    # basic check if titles match in first character
    # (slicing instead of indexing so an empty string cannot raise)
    if title[:1].lower() != res[:1].lower():
      print("misimatch: ", (res, title))
      continue
    contents.append(page.content)
    print("done : ", (res, title))
    sub_intents.append(res)

  return contents, sub_intents
    

In [None]:
main_intent = "exercise"
contents, sub_intents = get_wikepedia_content(main_intent)

None
['Exercise', 'Military exercise', 'Kegel exercise', 'Squat (exercise)', 'Exercise RIMPAC', 'Aerobic exercise', 'Crunch (exercise)', 'Exercise Armageddon', 'Exercise physiology', 'Exercise machine']
done :  ('Exercise', 'Exercise')
done :  ('Military exercise', 'Military exercise')
done :  ('Kegel exercise', 'Kegel exercise')
done :  ('Exercise RIMPAC', 'Exercise RIMPAC')
done :  ('Aerobic exercise', 'Aerobic exercise')
done :  ('Crunch (exercise)', 'Crunch (exercise)')
done :  ('Exercise Armageddon', 'Exercise Armageddon')
done :  ('Exercise physiology', 'Exercise physiology')
done :  ('Exercise machine', 'Exercise machine')


In [None]:
sub_intents

['Exercise',
 'Military exercise',
 'Kegel exercise',
 'Exercise RIMPAC',
 'Aerobic exercise',
 'Crunch (exercise)',
 'Exercise Armageddon',
 'Exercise physiology',
 'Exercise machine']

In [None]:
import string

In [None]:
# Turn page titles into identifier-style names: punctuation removed, spaces
# replaced by underscores, lower-cased.
# NOTE: this rebinds `sub_intents`; running the cell a second time would also
# strip the underscores it just inserted, because "_" is in string.punctuation.
punctuation_remover = str.maketrans('', '', string.punctuation)
sub_intents = [
    title.translate(punctuation_remover).replace(" ", "_").lower()
    for title in sub_intents
]
sub_intents

['exercise',
 'military_exercise',
 'kegel_exercise',
 'exercise_rimpac',
 'aerobic_exercise',
 'crunch_exercise',
 'exercise_armageddon',
 'exercise_physiology',
 'exercise_machine']

# Generate Questions, short and long Answers

The qg model generates questions based on portions extracted from the context. Those portions are the answers.

In [None]:
from tqdm.notebook import tqdm

In [None]:
qg.SEQ_LENGTH = 1024

In [None]:
# Run the question generator on every scraped article, asking for up to
# 30 questions per article; `qas` keeps one result per article, in order.
qas = []
for article in tqdm(contents):
  context = text_to_paragraph(article)
  qas.append(qg.generate(context, num_questions=30))


  0%|          | 0/9 [00:00<?, ?it/s]

Generating questions...

Evaluating QA pairs...

Generating questions...

Evaluating QA pairs...

Generating questions...

Evaluating QA pairs...



Token indices sequence length is longer than the specified maximum sequence length for this model (969 > 512). Running this sequence through the model will result in indexing errors


Generating questions...

Evaluating QA pairs...

Generating questions...

Evaluating QA pairs...

Generating questions...

Evaluating QA pairs...

('\nWas only able to generate 12 questions.', 'For more questions, please input a longer text.')
Generating questions...

Evaluating QA pairs...

Generating questions...

Evaluating QA pairs...

Generating questions...

Evaluating QA pairs...

('\nWas only able to generate 17 questions.', 'For more questions, please input a longer text.')


In [None]:
# Split the generator output into two parallel structures: for every context,
# one list of questions and one list of answers (same positions).
all_generated_answers = []
all_generated_questions = []
for qa_pairs in qas:
  context_answers = []
  context_questions = []
  for pair in qa_pairs:
    answer = pair["answer"]
    question = pair["question"]
    if type(answer) == list:
      # The answer is a list of options: keep the text of the one flagged
      # "correct" (the last flagged one wins). If none is flagged, the list
      # itself is kept unchanged.
      options = answer
      for option in options:
        if option["correct"]:
          answer = option["answer"]

    context_answers.append(answer)
    context_questions.append(question)

  all_generated_answers.append(context_answers)
  all_generated_questions.append(context_questions)

In [None]:
# De-duplicate the questions of each context, keeping first-occurrence order.
# `indices` stores, per context, the first-occurrence positions so the
# matching answers can be selected later; `sub_intents_id` records the context
# number of every kept question.
unique_questions = []
indices = []
sub_intents_id = []
for context_id, gen_q in enumerate(all_generated_questions):
  _, idx = np.unique(gen_q, return_index=True)
  first_seen_order = np.sort(idx)
  unique_ = np.array(gen_q)[first_seen_order]
  sub_intents_id.extend([context_id] * len(unique_))
  unique_questions.append(unique_)
  indices.append(idx)

In [None]:
# Pick, for each context, the answers that sit at the positions of the
# questions kept above (same first-occurrence order).
unique_answers = [
    np.array(gen_a)[np.sort(idx)]
    for gen_a, idx in zip(all_generated_answers, indices)
]
  

Now we generate long answers for the previous questions

In [None]:
# For every context, generate one long answer per unique question, using the
# flattened article text as the supporting context.
all_long_answers = []
for i, questions in enumerate(tqdm(unique_questions)):
  context_ = text_to_paragraph(contents[i])
  long_answers = [get_long_answer(query, context_) for query in tqdm(questions)]
  all_long_answers.append(long_answers)

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


  0%|          | 0/28 [00:00<?, ?it/s]

  0%|          | 0/28 [00:00<?, ?it/s]

  0%|          | 0/29 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/29 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

In [None]:
import itertools

In [None]:
# Flatten the per-context results and drop questions that repeat across
# contexts, keeping the first occurrence. The same positions are then applied
# to the short answers, the long answers and the context ids so all four stay
# aligned.
# NOTE: this cell rebinds its own inputs (`unique_questions`, `unique_answers`,
# `sub_intents_id`), so it must be run exactly once after the cells above.
flat_questions = list(itertools.chain.from_iterable(unique_questions))
_, idx = np.unique(flat_questions, return_index=True)
keep = np.sort(idx)
unique_questions = np.array(flat_questions)[keep]

flat_answers = list(itertools.chain.from_iterable(unique_answers))
unique_answers = np.array(flat_answers)[keep]

flat_long_answers = list(itertools.chain.from_iterable(all_long_answers))
unique_long_answers = np.array(flat_long_answers)[keep]

sub_intents_id = np.array(sub_intents_id)[keep]

Save results to a dataframe

In [None]:
df = pd.DataFrame()

In [None]:
# One row per unique question; column order matters because later cells
# unpack the rows positionally.
df = pd.DataFrame({
    'query': unique_questions,
    'short_answer': unique_answers,
    'long_answer': unique_long_answers,
    'sub_intent': sub_intents_id,
})

In [None]:
# Map each context id to its sub-intent name by position. The ids in
# `sub_intents_id` are positions in the scraped-content list, which is parallel
# to `sub_intents`. Zipping against np.unique(sub_intents_id) instead would
# shift every later name whenever one context ends up with no surviving
# question.
dict_ = dict(enumerate(sub_intents))
dict_, sub_intents

({0: 'exercise',
  1: 'military_exercise',
  2: 'kegel_exercise',
  3: 'exercise_rimpac',
  4: 'aerobic_exercise',
  5: 'crunch_exercise',
  6: 'exercise_armageddon',
  7: 'exercise_physiology',
  8: 'exercise_machine'},
 ['exercise',
  'military_exercise',
  'kegel_exercise',
  'exercise_rimpac',
  'aerobic_exercise',
  'crunch_exercise',
  'exercise_armageddon',
  'exercise_physiology',
  'exercise_machine'])

In [None]:
df = df.replace({"sub_intent": dict_})

In [None]:
df.head()

Unnamed: 0,query,short_answer,long_answer,sub_intent
0,What is the role of contracting muscles in pro...,"That is, contracting muscles release multiple ...","When you work out, your body releases endorphi...",exercise
1,What is the city of Bogotá doing to make it ea...,"The city of Bogotá, Colombia, for example, blo...",The city of Bogotá is not doing anything to ma...,exercise
2,What is the evidence that school-based interve...,"However, there is some evidence that school-ba...",There is a lot of evidence that school-based i...,exercise
3,Who said that exercise is the only thing that ...,"Dating back to 65 BCE, it was Marcus Cicero, R...",It's not that exercise is the only thing that ...,exercise
4,What was the link between exercise and health?,The link between physical health and exercise ...,Exercise increases your heart rate and blood p...,exercise


In [None]:
df.to_csv("generated_qas_exercise.csv")

# TODO: Compare answers and select the most relevant

# Create Rasa Files

In [None]:
pip install -U PyYAML

In [None]:
import yaml

class MyDumper(yaml.Dumper):
    """``yaml.Dumper`` variant that never emits "indentless" blocks."""

    def increase_indent(self, flow=False, indentless=False):
        # The caller's `indentless` value is deliberately ignored and replaced
        # by False - presumably so list items nested under a mapping key are
        # indented in the generated files (confirm against the output).
        return super().increase_indent(flow, False)

Creating nlu file

In [None]:
# Default intents: hand-written example phrases for the bot's built-in
# intents, keyed by intent name (insertion order is the output order).
# NOTE(review): `AsLiteral` is not defined in any visible cell - confirm it is
# defined (together with its YAML representer) before this cell runs.
default_intent_examples = {
    "greet": [
        "hey", "hello", "hi", "hi there", "good morning", "good evening",
        "hey there", "Hiiii", " what's up",
    ],
    "goodbye": [
        "bye", "goodbye", "see you", "see you later", "see ya", "ok then",
        "got to go", "talk to you later",
    ],
    "affirm": [
        "yes", "yeah", "sure", "right", "why not", "ok", "okay", "alright",
        "positively", "yes please", "y", "indeed", "of course",
        "that sounds good", "correct",
    ],
    "deny": [
        "no", "n", "never", "I don't think so", "don't like that",
        "denied", "i deny", "i doubt", "no way", "not really",
        "could have been better",
    ],
    "bot_challenge": [
        "are you a bot?", "are you a human?", "am I talking to a bot?",
        "am I talking to a human?", "who are you ?",
    ],
    "ask_questions": [
        "I would like to ask a question", "I would like to ask some questions",
        "A question", "question", "wanna ask", "ask question", "ask questions",
        "Could you answer a question", "question about fitness",
        "yes, question", "yes question",
    ],
    "out_of_scope": [
        "that's not what I want to do", "wait for stop", "you're no help",
        "this is no help at all", "how old are you?", "don't you know this?",
        "this isn't working", "I already told you that", "don't like that",
        "I don't want to tell you that", "none of your business",
        "that's not right", "stop asking", "nevermind",
        "I want to do something else", "I changed my mind",
        "kshwf geg aifa ?", "sgri qqwp twrq", "whw tra; bdo d",
        "tis ege pw", "gte hhe ew",
    ],
    "thankyou": [
        "thanks", "thank you", "oh, great thanks", "ok thanks", "thanks bye",
    ],
}

# Each intent entry stores its examples as a YAML-dumped list wrapped in
# AsLiteral.
predefined_intents = []
for intent_name, examples in default_intent_examples.items():
  predefined_intents.append({"intent": intent_name,
                             "examples": AsLiteral(yaml.dump(examples))})

# Individual names kept for cells that refer to a single intent.
(greet, goodbye, affirm, deny, bot_challenge,
 ask_questions, out_of_scope, thankyou) = predefined_intents

In [None]:
def create_nlu(df, predefined_intents):
  """Build the NLU training dictionary from the generated questions.

  Every row of ``df`` becomes one intent entry. Rows are numbered within each
  run of consecutive identical intents (``<intent>_0``, ``<intent>_1``, ...),
  and the entry's examples are the original question plus paraphrases.

  Args:
    df: DataFrame whose four columns are, in order, the question, the short
      answer, the long answer and the intent name.
    predefined_intents: list of ready-made intent entries placed first in the
      output. The list itself is not modified.

  Returns:
    dict with ``"version"`` and ``"nlu"`` (predefined entries followed by one
    entry per row).
  """
  # Copy the list: the original referenced a misspelled name (NameError) and,
  # once corrected, would have appended to the caller's list, so re-running
  # the cell duplicated every generated intent.
  all_intents = list(predefined_intents)
  previous_intent = ""
  i = 0
  with tqdm(total=df.shape[0]) as progress_bar:
    for query, _, _, intent in df.itertuples(index=False):
      # Restart the counter whenever the intent changes from the previous row.
      if intent == previous_intent:
        i += 1
      else:
        i = 0
        previous_intent = intent

      sub_intent = intent + "_" + str(i)
      # Generate 15 paraphrases and keep every third one (5 in total); this
      # was observed to give more varied examples.
      preds = [query] + get_response(query, 15, 15)[::3]
      # NOTE(review): `get_intent` is not defined in any visible cell -
      # confirm it is available before calling this function.
      intent_dict = get_intent(intent, sub_intent, preds)

      all_intents.append(intent_dict)
      progress_bar.update(1)

  return {
      "version": "2.0",
      "nlu": all_intents,
  }

In [None]:
intent_yaml = create_nlu(df, predefined_intents)

In [None]:
# `with` guarantees the file is closed even if yaml.dump raises part-way.
with open("nlu_file.yml", "w") as nlu_file:
  yaml.dump(intent_yaml, nlu_file, Dumper=MyDumper, default_flow_style=False, sort_keys=False)

Creating rules file

In [None]:
unique_intents = df.sub_intent.unique()
unique_intents

In [None]:
# Fixed rules: each one maps a single intent to the action that answers it.
# Listed as (rule name, intent, action).
predefined_rules = [
    {"rule": rule_name, "steps": [{"intent": intent}, {"action": action}]}
    for rule_name, intent, action in [
        ("Greeting", "greet", "utter_greet"),
        ("goodbye", "goodbye", "utter_goodbye"),
        ("Thanks", "thankyou", "utter_no_worries"),
        ("user challenges", "bot_challenge", "utter_iamabot"),
        ("out of scope", "out_of_scope", "utter_rephrase"),
        ("Ask blender bot", "nlu_fallback", "action_blenderbot_chat"),
    ]
]

In [None]:
def create_rules(unique_intents, predefined_rules):
  """Build the rules dictionary: predefined rules plus one rule per intent.

  Args:
    unique_intents: iterable of intent names; each gets a rule named
      "respond to <intent>" that answers with the action "utter_<intent>".
    predefined_rules: list of ready-made rule entries placed first in the
      output. The list itself is not modified.

  Returns:
    dict with ``"version"`` and ``"rules"``.
  """
  # Copy so that repeated calls do not keep appending to the caller's list
  # (which previously duplicated the generated rules on every re-run).
  all_rules = list(predefined_rules)
  all_rules.extend(
      {
          "rule": "respond to " + intent,
          "steps": [{"intent": intent}, {"action": "utter_" + intent}],
      }
      for intent in unique_intents
  )

  return {
      "version": "2.0",
      "rules": all_rules,
  }

In [None]:
# Fixed misspelled argument name (was a NameError); the list defined above is
# `predefined_rules`.
rules_yaml = create_rules(unique_intents, predefined_rules)

In [None]:
# `with` guarantees the file is closed even if yaml.dump raises part-way.
with open("rules_file_exercise.yml", "w") as rules_file:
  yaml.dump(rules_yaml, rules_file, Dumper=MyDumper, default_flow_style=False, sort_keys=False)

Create domain file

In [None]:
# Canned responses for the built-in actions: each action name maps to a
# one-element list holding its text.
predefined_responses = {
    action: [{"text": text}]
    for action, text in (
        ("utter_greet", "Hi!"),
        ("utter_goodbye", "Bye"),
        ("utter_iamabot", "I am a bot, powered by Rasa."),
        ("utter_no_worries", "No problem :)"),
        ("utter_rephrase", "I didn't quite understand that. Can you rephrase?"),
    )
}

In [None]:
def create_domain(df, predefined_responses, short=True):
  """Build the domain dictionary with one response per generated question.

  Rows are numbered within each run of consecutive identical intents, and
  each row's answer is stored under the key
  ``"utter_<intent>/ask_<intent>_<n>"``.

  Args:
    df: DataFrame whose four columns are, in order, the question, the short
      answer, the long answer and the intent name.
    predefined_responses: dict of ready-made responses included in the
      output. The dict itself is not modified.
    short: use the short answer when true, the long answer otherwise.

  Returns:
    dict with the keys ``version``, ``config``, ``session_config``,
    ``responses`` and ``actions``.
  """
  # Copy so the caller's dict is not extended in place; otherwise a second
  # call (e.g. with short=False) would start from the first call's output.
  all_responses = dict(predefined_responses)
  previous_intent = ""
  i = 0
  with tqdm(total=df.shape[0]) as progress_bar:
    for _, short_answer, long_answer, intent in df.itertuples(index=False):
      answer = short_answer if short else long_answer

      # Restart the counter whenever the intent changes from the previous row.
      if intent == previous_intent:
        i += 1
      else:
        i = 0
        previous_intent = intent

      sub_intent = intent + "_" + str(i)
      key = "utter_" + intent + "/ask_" + sub_intent
      all_responses[key] = [{"text": answer}]

      progress_bar.update(1)

  return {
      "version": '2.0',
      "config": {"store_entities_as_slots": True},
      "session_config": {
          "session_expiration_time": 0,
          "carry_over_slots_to_new_session": True,
      },
      "responses": all_responses,
      "actions": ["action_blenderbot_chat"],
  }

In [None]:
domain_yaml = create_domain(df, predefined_responses, short=True)

In [None]:
# `with` guarantees the file is closed even if yaml.dump raises part-way.
with open("domain_file_exercise.yml", "w") as domain_file:
  yaml.dump(domain_yaml, domain_file, Dumper=MyDumper, default_flow_style=False, sort_keys=False)