In [None]:
#https://github.com/AMontgomerie/question_generator/blob/master/examples/question_generation_example.ipynb

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!git clone https://github.com/amontgomerie/question_generator/

fatal: destination path 'question_generator' already exists and is not an empty directory.


In [None]:
# %load questiongenerator.py
import json
import random
import re
from typing import Any, List, Mapping, Optional, Tuple

import en_core_web_sm
import numpy as np
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    AutoModelForSequenceClassification,
)


class QuestionGenerator:
    """A transformer-based NLP system for generating reading comprehension-style questions from
    texts. It can generate full sentence questions, multiple choice questions, or a mix of the
    two styles.

    To filter out low quality questions, questions are assigned a score and ranked once they have
    been generated. Only the top k questions will be returned. This behaviour can be turned off
    by setting use_evaluator=False.
    """

    def __init__(self) -> None:
        """Loads the pretrained question-generation tokenizer/model and the QA evaluator."""
        QG_PRETRAINED = "iarfmoose/t5-base-question-generator"
        self.ANSWER_TOKEN = "<answer>"
        self.CONTEXT_TOKEN = "<context>"
        self.SEQ_LENGTH = 512

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.qg_tokenizer = AutoTokenizer.from_pretrained(
            QG_PRETRAINED, use_fast=False)
        self.qg_model = AutoModelForSeq2SeqLM.from_pretrained(QG_PRETRAINED)
        self.qg_model.to(self.device)
        self.qg_model.eval()

        self.qa_evaluator = QAEvaluator()

    def generate(
        self,
        article: str,
        use_evaluator: bool = True,
        num_questions: Optional[int] = None,
        answer_style: str = "all"
    ) -> List:
        """Takes an article and generates a set of question and answer pairs.

        Args:
            article: the source text.
            use_evaluator: if True, QA pairs are ranked by the evaluator and only the best
                ones are returned; if False, every generated pair is returned unranked.
            num_questions: how many ranked pairs to return. A falsy value (None or 0) falls
                back to the default of _get_ranked_qa_pairs. Ignored when use_evaluator is
                False.
            answer_style: one of ["all", "sentences", "multiple_choice"].

        Returns:
            A list of {"question": ..., "answer": ...} dicts.

        Raises:
            ValueError: if answer_style is not a valid style.
        """

        print("Generating questions...\n")

        qg_inputs, qg_answers = self.generate_qg_inputs(article, answer_style)
        generated_questions = self.generate_questions_from_inputs(qg_inputs)

        message = "{} questions doesn't match {} answers".format(
            len(generated_questions), len(qg_answers)
        )
        assert len(generated_questions) == len(qg_answers), message

        if use_evaluator:
            print("Evaluating QA pairs...\n")
            encoded_qa_pairs = self.qa_evaluator.encode_qa_pairs(
                generated_questions, qg_answers
            )
            scores = self.qa_evaluator.get_scores(encoded_qa_pairs)

            if num_questions:
                qa_list = self._get_ranked_qa_pairs(
                    generated_questions, qg_answers, scores, num_questions
                )
            else:
                qa_list = self._get_ranked_qa_pairs(
                    generated_questions, qg_answers, scores
                )

        else:
            print("Skipping evaluation step.\n")
            qa_list = self._get_all_qa_pairs(generated_questions, qg_answers)

        return qa_list

    def generate_qg_inputs(self, text: str, answer_style: str) -> Tuple[List[str], List[Any]]:
        """Given a text, returns a list of model inputs and a list of corresponding answers.
        Model inputs take the form "answer_token <answer text> context_token <context text>" where
        the answer is a string extracted from the text, and the context is the wider text
        surrounding the answer.

        Answers are plain strings for the "sentences" style and lists of choice dicts for the
        "multiple_choice" style.

        Raises:
            ValueError: if answer_style is not one of the valid styles.
        """

        VALID_ANSWER_STYLES = ["all", "sentences", "multiple_choice"]

        if answer_style not in VALID_ANSWER_STYLES:
            raise ValueError(
                "Invalid answer style {}. Please choose from {}".format(
                    answer_style, VALID_ANSWER_STYLES
                )
            )

        inputs = []
        answers = []

        if answer_style in ("sentences", "all"):
            segments = self._split_into_segments(text)

            for segment in segments:
                sentences = self._split_text(segment)
                prepped_inputs, prepped_answers = self._prepare_qg_inputs(
                    sentences, segment
                )
                inputs.extend(prepped_inputs)
                answers.extend(prepped_answers)

        if answer_style in ("multiple_choice", "all"):
            sentences = self._split_text(text)
            prepped_inputs, prepped_answers = self._prepare_qg_inputs_MC(
                sentences
            )
            inputs.extend(prepped_inputs)
            answers.extend(prepped_answers)

        return inputs, answers

    def generate_questions_from_inputs(self, qg_inputs: List) -> List[str]:
        """Given a list of concatenated answers and contexts, with the form:
        "answer_token <answer text> context_token <context text>", generates a list of
        questions (one per input, in the same order).
        """
        return [self._generate_question(qg_input) for qg_input in qg_inputs]

    def _split_text(self, text: str) -> List[str]:
        """Splits the text into sentences, and attempts to split long sentences.

        Sentences longer than MAX_SENTENCE_LEN characters are additionally split on
        ",", ";", ":" and ")"; the resulting fragments are kept only when they have more than
        MIN_FRAGMENT_WORDS space-separated words. The result contains each distinct string
        once, in first-seen order (whole sentences first, then fragments).
        """
        MAX_SENTENCE_LEN = 128
        MIN_FRAGMENT_WORDS = 5
        sentences = re.findall(r".*?[.!?]", text)
        cut_sentences = []

        for sentence in sentences:
            if len(sentence) > MAX_SENTENCE_LEN:
                cut_sentences.extend(re.split("[,;:)]", sentence))

        # remove useless post-quote sentence fragments; the filter must run over the
        # fragments themselves, otherwise the split above is discarded
        cut_sentences = [
            s for s in cut_sentences if len(s.split(" ")) > MIN_FRAGMENT_WORDS
        ]

        stripped = [s.strip(" ") for s in sentences + cut_sentences]

        # dict.fromkeys de-duplicates while keeping a deterministic order
        return list(dict.fromkeys(stripped))

    def _split_into_segments(self, text: str) -> List[str]:
        """Splits a long text into segments short enough to be input into the transformer network.
        Segments are used as context for question generation.

        Paragraphs (split on newlines) are tokenized and appended to the current segment until
        it holds at least MAX_TOKENS tokens, so a segment can exceed MAX_TOKENS by part of its
        last paragraph.
        """
        MAX_TOKENS = 490
        paragraphs = text.split("\n")
        tokenized_paragraphs = [
            self.qg_tokenizer(p)["input_ids"] for p in paragraphs if len(p) > 0
        ]
        segments = []

        while len(tokenized_paragraphs) > 0:
            segment = []

            while len(segment) < MAX_TOKENS and len(tokenized_paragraphs) > 0:
                paragraph = tokenized_paragraphs.pop(0)
                segment.extend(paragraph)
            segments.append(segment)

        return [self.qg_tokenizer.decode(s, skip_special_tokens=True) for s in segments]

    def _prepare_qg_inputs(
        self,
        sentences: List[str],
        text: str
    ) -> Tuple[List[str], List[str]]:
        """Uses sentences as answers and the text as context. Returns a tuple of (model inputs, answers).
        Model inputs are "answer_token <answer text> context_token <context text>"
        """
        inputs = [
            f"{self.ANSWER_TOKEN} {sentence} {self.CONTEXT_TOKEN} {text}"
            for sentence in sentences
        ]
        answers = list(sentences)

        return inputs, answers

    def _prepare_qg_inputs_MC(self, sentences: List[str]) -> Tuple[List[str], List[Any]]:
        """Performs NER on the text, and uses extracted entities as candidate answers for
        multiple-choice questions. Sentences are used as context, and entities as answers.
        Returns a tuple of (model inputs, answers).
        Model inputs are "answer_token <answer text> context_token <context text>"
        """
        spacy_nlp = en_core_web_sm.load()
        docs = list(spacy_nlp.pipe(sentences, disable=["parser"]))
        inputs_from_text = []
        answers_from_text = []

        for doc, sentence in zip(docs, sentences):
            for entity in doc.ents:
                qg_input = f"{self.ANSWER_TOKEN} {entity} {self.CONTEXT_TOKEN} {sentence}"
                answers = self._get_MC_answers(entity, docs)
                inputs_from_text.append(qg_input)
                answers_from_text.append(answers)

        return inputs_from_text, answers_from_text

    def _get_MC_answers(self, correct_answer: Any, docs: Any) -> List[Mapping[str, Any]]:
        """Finds a set of alternative answers for a multiple-choice question. Will attempt to find
        alternatives of the same entity type as correct_answer if possible.

        Args:
            correct_answer: an entity exposing .text and .label_.
            docs: iterable of documents exposing .ents (entities with .text and .label_).

        Returns:
            A shuffled list of up to four {"answer": str, "correct": bool} dicts, exactly one
            of which is marked correct.
        """
        entities = []

        for doc in docs:
            entities.extend({"text": e.text, "label_": e.label_} for e in doc.ents)

        # remove duplicate elements; JSON strings are hashable and dict.fromkeys keeps a
        # deterministic first-seen order (a plain set cannot be passed to random.sample on
        # Python 3.11+)
        pool = list(dict.fromkeys(json.dumps(kv) for kv in entities))
        num_choices = max(
            0, min(4, len(pool)) - 1
        )  # -1 because we already have the correct answer

        # add the correct answer
        correct_label = correct_answer.label_
        final_choices = [{"answer": correct_answer.text, "correct": True}]
        correct_key = json.dumps(
            {"text": correct_answer.text, "label_": correct_label}
        )
        if correct_key in pool:
            pool.remove(correct_key)

        candidates = [json.loads(s) for s in pool]

        # find answers with the same NER label (compare the label field itself, not a
        # substring of the serialized entity)
        matches = [c for c in candidates if c["label_"] == correct_label]

        # if we don't have enough then add some other random answers
        if len(matches) < num_choices:
            others = [c for c in candidates if c["label_"] != correct_label]
            choices = matches + random.sample(others, num_choices - len(matches))
        else:
            choices = random.sample(matches, num_choices)

        for choice in choices:
            final_choices.append({"answer": choice["text"], "correct": False})

        random.shuffle(final_choices)
        return final_choices

    @torch.no_grad()
    def _generate_question(self, qg_input: str) -> str:
        """Takes qg_input which is the concatenated answer and context, and uses it to generate
        a question sentence. The generated question is decoded and then returned.
        """
        encoded_input = self._encode_qg_input(qg_input)
        output = self.qg_model.generate(input_ids=encoded_input["input_ids"])
        question = self.qg_tokenizer.decode(
            output[0],
            skip_special_tokens=True
        )
        return question

    def _encode_qg_input(self, qg_input: str) -> Mapping[str, torch.Tensor]:
        """Tokenizes a string (padded/truncated to SEQ_LENGTH) and returns the tokenizer output,
        moved to self.device. Its "input_ids" entry holds the indices of tokens in the vocab.
        """
        return self.qg_tokenizer(
            qg_input,
            padding='max_length',
            max_length=self.SEQ_LENGTH,
            truncation=True,
            return_tensors="pt",
        ).to(self.device)

    def _get_ranked_qa_pairs(
        self, generated_questions: List[str], qg_answers: List[Any], scores, num_questions: int = 10
    ) -> List[Mapping[str, Any]]:
        """Ranks generated questions according to scores, and returns the top num_questions examples.

        scores is a list of indices into generated_questions/qg_answers, best first. If fewer
        than num_questions are available, all of them are returned and a notice is printed.
        """
        if num_questions > len(scores):
            num_questions = len(scores)
            print(
                f"\nWas only able to generate {num_questions} questions. "
                "For more questions, please input a longer text."
            )

        qa_list = []

        for index in scores[:num_questions]:
            qa_list.append({
                # keep only the text up to the first question mark
                "question": generated_questions[index].split("?")[0] + "?",
                "answer": qg_answers[index]
            })

        return qa_list

    def _get_all_qa_pairs(self, generated_questions: List[str], qg_answers: List[Any]):
        """Formats question and answer pairs without ranking or filtering."""
        return [
            {"question": question.split("?")[0] + "?", "answer": answer}
            for question, answer in zip(generated_questions, qg_answers)
        ]


class QAEvaluator:
    """Wrapper for a transformer model which evaluates the quality of question-answer pairs.
    Given a QA pair, the model will generate a score. Scores can be used to rank and filter
    QA pairs.
    """

    def __init__(self) -> None:
        """Loads the pretrained QA-evaluator tokenizer and sequence-classification model."""
        QAE_PRETRAINED = "iarfmoose/bert-base-cased-qa-evaluator"
        self.SEQ_LENGTH = 512

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.qae_tokenizer = AutoTokenizer.from_pretrained(QAE_PRETRAINED)
        self.qae_model = AutoModelForSequenceClassification.from_pretrained(
            QAE_PRETRAINED
        )
        self.qae_model.to(self.device)
        self.qae_model.eval()

    def encode_qa_pairs(self, questions: List[str], answers: List[Any]) -> List[Any]:
        """Takes a list of questions and a list of answers and encodes them pairwise.

        Each encoded pair is the tokenizer output for (question, answer), moved to
        self.device. Answers may be plain strings or multiple-choice lists (see _encode_qa).
        """
        return [
            self._encode_qa(question, answer).to(self.device)
            for question, answer in zip(questions, answers)
        ]

    def get_scores(self, encoded_qa_pairs: List[Any]) -> List[int]:
        """Scores a list of encoded QA pairs and returns their indices ranked best-first.

        The returned list contains positions into encoded_qa_pairs, ordered by descending
        score; pairs with equal scores keep their original relative order.
        """
        # Convert each score to a plain float once so sorting does not compare tensors.
        scores = [float(self._evaluate_qa(pair)) for pair in encoded_qa_pairs]

        return sorted(range(len(scores)), key=scores.__getitem__, reverse=True)

    def _encode_qa(self, question: str, answer: Any) -> Any:
        """Tokenizes a question together with its answer as a text pair (padded/truncated to
        SEQ_LENGTH) and returns the tokenizer output.

        If answer is a list of {"answer": ..., "correct": ...} choices, the text of the choice
        flagged correct is used (the last one, if several are flagged).

        Raises:
            ValueError: if answer is a list in which no choice is flagged correct.
        """
        if isinstance(answer, list):
            correct_answer = None
            for a in answer:
                if a["correct"]:
                    correct_answer = a["answer"]
            if correct_answer is None:
                raise ValueError(
                    "Multiple-choice answer list contains no option marked as correct."
                )
        else:
            correct_answer = answer

        return self.qae_tokenizer(
            text=question,
            text_pair=correct_answer,
            padding="max_length",
            max_length=self.SEQ_LENGTH,
            truncation=True,
            return_tensors="pt",
        )

    @torch.no_grad()
    def _evaluate_qa(self, encoded_qa_pair: Any) -> Any:
        """Takes an encoded QA pair and returns a score.

        The score is element [0][1] of the model's first output.
        NOTE(review): presumably the logit of the "valid pair" class for the single batch
        item - confirm against the model card.
        """
        output = self.qae_model(**encoded_qa_pair)
        return output[0][0][1]


def print_qa(qa_list: List[Mapping[str, str]], show_answers: bool = True) -> None:
    """Formats and prints a list of generated questions and answers.

    Args:
        qa_list: dicts with a "question" string and an "answer" that is either a plain
            string or a list of {"answer": ..., "correct": ...} multiple-choice options.
        show_answers: for multiple-choice questions, whether the correct option is tagged
            with "(correct)"; for full-sentence questions, whether the answer is printed
            at all.
    """

    for i, qa in enumerate(qa_list):
        # wider space for 2 digit q nums
        space = " " * (3 if i < 9 else 4)

        print(f"{i + 1}) Q: {qa['question']}")

        answer = qa["answer"]

        # print a list of multiple choice answers
        if isinstance(answer, list):
            for j, option in enumerate(answer):
                # only the first option carries the "A:" label; the rest are aligned under it
                prefix = f"{space}A: " if j == 0 else f"{space}   "
                line = f"{prefix}{j + 1}. {option['answer']}"

                if show_answers:
                    line += " " + ("(correct)" if option["correct"] else "")

                print(line)

            print("")

        # print full sentence answers
        elif show_answers:
            print(f"{space}A: {answer}\n")
        

In [None]:
import torch

# torch.cuda.is_available is a function: it must be called. The bare function object is
# always truthy, which made this cell report a GPU even on CPU-only runtimes.
if torch.cuda.is_available():
  print('GPU available')
else:
  print('Please set GPU via Edit -> Notebook Settings.')

GPU available


In [None]:
import torch

# Call is_available(); without the parentheses the condition is always truthy, so the
# device was always 'cuda' and the assertion below could never fail.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
assert device == torch.device('cuda'), "Not using CUDA. Set: Runtime > Change runtime type > Hardware Accelerator: GPU"

In [None]:
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Load the UnifiedQA T5 tokenizer and model from the Hugging Face hub.
# NOTE(review): `tokenizer` and `model` are not referenced by the cells visible in this
# part of the notebook - confirm a later cell uses them before keeping the download.
from transformers import AutoTokenizer, T5ForConditionalGeneration

model_name = "allenai/unifiedqa-t5-small" # you can specify the model size here
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

Downloading pytorch_model.bin:   0%|          | 0.00/231M [00:00<?, ?B/s]

In [None]:
qg = QuestionGenerator()
"""
with open('indian_history.txt', 'r') as a:
    article = a.read()
"""

import json
import requests
request=requests.get("https://generate-questions.devbyopeneyes.com/api/getFileData/62fb41beef4751bbf70c6272")
resp= request.json()
file_path=(resp["data"]["file_path"])
_id=(resp["data"]["_id"])
number_of_question=(resp["data"]["number_of_question"])
file_type=(resp["data"]["file_type"])

!pip install PyPDF2

if file_type=="txt":
  import urllib.request
  response = urllib.request.urlopen(file_path)
  html = response.read()
  text=html.decode('utf8')
  print(text)
elif file_type=="pdf":
  import requests, PyPDF2

  url = file_path
  response = requests.get(url)
  my_raw_data = response.content

  with open("my_pdf.pdf", 'wb') as my_data:
      my_data.write(my_raw_data)

  open_pdf_file = open("my_pdf.pdf", 'rb')
  read_pdf = PyPDF2.PdfFileReader(open_pdf_file)
  if read_pdf.isEncrypted:
      read_pdf.decrypt("")
      print(read_pdf.getPage(0).extractText())

  else:
      print(read_pdf.getPage(0).extractText())
elif file_type=="docx":
  None
elif file_type=="doc":
  None
else:
  print("Invalid File Type")


"""
import urllib.request
response = urllib.request.urlopen(file_path)
html = response.read()
text=html.decode('utf8')
print(text)
"""

Downloading tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading added_tokens.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/121 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/850M [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/413M [00:00<?, ?B/s]

According to consensus in modern genetics, anatomically modern humans first arrived on the Indian subcontinent from Africa between 73,000 and 55,000 years ago.[1] However, the earliest known human remains in South Asia date to 30,000 years ago. Settled life, which involves the transition from foraging to farming and pastoralism, began in South Asia around 7,000 BCE. At the site of Mehrgarh presence can be documented of the domestication of wheat and barley, rapidly followed by that of goats, sheep, and cattle.[2] By 4,500 BCE, settled life had spread more widely,[2] and began to gradually evolve into the Indus Valley Civilization, an early civilization of the Old world, which was contemporaneous with Ancient Egypt and Mesopotamia. This civilisation flourished between 2,500 BCE and 1900 BCE in what today is Pakistan and north-western India, and was noted for its urban planning, baked brick houses, elaborate drainage, and water supply.[3]

Early on in the second millennium BCE, persist

In [None]:
# Download NLTK's "punkt" tokenizer data, which sent_tokenize needs in the next cell.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Split the loaded document text into sentences with NLTK's sentence tokenizer.
from nltk import sent_tokenize
import pandas as pd
import numpy as np
sentences = sent_tokenize(text)
print(sentences)
# Keep the sentences as a NumPy array; the next cell iterates over it word by word.
text_to_sentence= np.array(sentences)
# Number of sentences found (shown as the cell result).
len(text_to_sentence)

['According to consensus in modern genetics, anatomically modern humans first arrived on the Indian subcontinent from Africa between 73,000 and 55,000 years ago.', '[1] However, the earliest known human remains in South Asia date to 30,000 years ago.', 'Settled life, which involves the transition from foraging to farming and pastoralism, began in South Asia around 7,000 BCE.', 'At the site of Mehrgarh presence can be documented of the domestication of wheat and barley, rapidly followed by that of goats, sheep, and cattle.', '[2] By 4,500 BCE, settled life had spread more widely,[2] and began to gradually evolve into the Indus Valley Civilization, an early civilization of the Old world, which was contemporaneous with Ancient Egypt and Mesopotamia.', 'This civilisation flourished between 2,500 BCE and 1900 BCE in what today is Pakistan and north-western India, and was noted for its urban planning, baked brick houses, elaborate drainage, and water supply.', '[3]\r\n\r\nEarly on in the sec

23

In [None]:
# Flatten the sentences into one list of whitespace-separated words.
sentence_to_words = [
    word
    for sentence in text_to_sentence
    for word in sentence.split()
]
sentence_to_words

['According',
 'to',
 'consensus',
 'in',
 'modern',
 'genetics,',
 'anatomically',
 'modern',
 'humans',
 'first',
 'arrived',
 'on',
 'the',
 'Indian',
 'subcontinent',
 'from',
 'Africa',
 'between',
 '73,000',
 'and',
 '55,000',
 'years',
 'ago.',
 '[1]',
 'However,',
 'the',
 'earliest',
 'known',
 'human',
 'remains',
 'in',
 'South',
 'Asia',
 'date',
 'to',
 '30,000',
 'years',
 'ago.',
 'Settled',
 'life,',
 'which',
 'involves',
 'the',
 'transition',
 'from',
 'foraging',
 'to',
 'farming',
 'and',
 'pastoralism,',
 'began',
 'in',
 'South',
 'Asia',
 'around',
 '7,000',
 'BCE.',
 'At',
 'the',
 'site',
 'of',
 'Mehrgarh',
 'presence',
 'can',
 'be',
 'documented',
 'of',
 'the',
 'domestication',
 'of',
 'wheat',
 'and',
 'barley,',
 'rapidly',
 'followed',
 'by',
 'that',
 'of',
 'goats,',
 'sheep,',
 'and',
 'cattle.',
 '[2]',
 'By',
 '4,500',
 'BCE,',
 'settled',
 'life',
 'had',
 'spread',
 'more',
 'widely,[2]',
 'and',
 'began',
 'to',
 'gradually',
 'evolve',
 'into'

In [None]:
import re

# Numeric citation markers such as "[1]" or "[23]".
CITATION_MARKER = re.compile(r"\[\d+\]")

tokens = []
for w in sentence_to_words:
    # Remove the marker from the word instead of discarding the whole word, so a word
    # with a glued citation ("widely,[2]") keeps its text ("widely,").
    w = CITATION_MARKER.sub("", w)
    # Still drop anything that is empty or remains bracketed at either end.
    if w and w[0] != '[' and w[-1] != ']':
        tokens.append(w)
tokens

['According',
 'to',
 'consensus',
 'in',
 'modern',
 'genetics,',
 'anatomically',
 'modern',
 'humans',
 'first',
 'arrived',
 'on',
 'the',
 'Indian',
 'subcontinent',
 'from',
 'Africa',
 'between',
 '73,000',
 'and',
 '55,000',
 'years',
 'ago.',
 'However,',
 'the',
 'earliest',
 'known',
 'human',
 'remains',
 'in',
 'South',
 'Asia',
 'date',
 'to',
 '30,000',
 'years',
 'ago.',
 'Settled',
 'life,',
 'which',
 'involves',
 'the',
 'transition',
 'from',
 'foraging',
 'to',
 'farming',
 'and',
 'pastoralism,',
 'began',
 'in',
 'South',
 'Asia',
 'around',
 '7,000',
 'BCE.',
 'At',
 'the',
 'site',
 'of',
 'Mehrgarh',
 'presence',
 'can',
 'be',
 'documented',
 'of',
 'the',
 'domestication',
 'of',
 'wheat',
 'and',
 'barley,',
 'rapidly',
 'followed',
 'by',
 'that',
 'of',
 'goats,',
 'sheep,',
 'and',
 'cattle.',
 'By',
 '4,500',
 'BCE,',
 'settled',
 'life',
 'had',
 'spread',
 'more',
 'and',
 'began',
 'to',
 'gradually',
 'evolve',
 'into',
 'the',
 'Indus',
 'Valley',


In [None]:
import string
import re

# Characters to delete: all ASCII punctuation except ".", which is kept so sentence
# boundaries survive for the question generator.
remove = string.punctuation.replace(".", "")

# str.maketrans expects the literal characters to delete, not a regex character class,
# so the deletion set is passed directly.
table = str.maketrans('', '', remove)
stripped = [w.translate(table) for w in tokens]
print(stripped[:])

['According', 'to', 'consensus', 'in', 'modern', 'genetics', 'anatomically', 'modern', 'humans', 'first', 'arrived', 'on', 'the', 'Indian', 'subcontinent', 'from', 'Africa', 'between', '73000', 'and', '55000', 'years', 'ago.', 'However', 'the', 'earliest', 'known', 'human', 'remains', 'in', 'South', 'Asia', 'date', 'to', '30000', 'years', 'ago.', 'Settled', 'life', 'which', 'involves', 'the', 'transition', 'from', 'foraging', 'to', 'farming', 'and', 'pastoralism', 'began', 'in', 'South', 'Asia', 'around', '7000', 'BCE.', 'At', 'the', 'site', 'of', 'Mehrgarh', 'presence', 'can', 'be', 'documented', 'of', 'the', 'domestication', 'of', 'wheat', 'and', 'barley', 'rapidly', 'followed', 'by', 'that', 'of', 'goats', 'sheep', 'and', 'cattle.', 'By', '4500', 'BCE', 'settled', 'life', 'had', 'spread', 'more', 'and', 'began', 'to', 'gradually', 'evolve', 'into', 'the', 'Indus', 'Valley', 'Civilization', 'an', 'early', 'civilization', 'of', 'the', 'Old', 'world', 'which', 'was', 'contemporaneous',

In [None]:
# Re-join the cleaned words into one space-separated string; this is the text passed to
# qg.generate in a later cell.
words_to_sentense=' '.join(stripped)
words_to_sentense

'According to consensus in modern genetics anatomically modern humans first arrived on the Indian subcontinent from Africa between 73000 and 55000 years ago. However the earliest known human remains in South Asia date to 30000 years ago. Settled life which involves the transition from foraging to farming and pastoralism began in South Asia around 7000 BCE. At the site of Mehrgarh presence can be documented of the domestication of wheat and barley rapidly followed by that of goats sheep and cattle. By 4500 BCE settled life had spread more and began to gradually evolve into the Indus Valley Civilization an early civilization of the Old world which was contemporaneous with Ancient Egypt and Mesopotamia. This civilisation flourished between 2500 BCE and 1900 BCE in what today is Pakistan and northwestern India and was noted for its urban planning baked brick houses elaborate drainage and water supply. Early on in the second millennium BCE persistent drought caused the population of the Ind

In [None]:
type(words_to_sentense)

str

In [None]:
# Generate twice as many multiple-choice questions as requested
# (duplicates are removed in a later cell).
qa_list = qg.generate(
    words_to_sentense,
    num_questions=2 * int(number_of_question),
    answer_style='multiple_choice',
)
# Keep the batch wrapped in a list: the reshaping cell iterates over batches.
MCQ = [qa_list]
print_qa(qa_list)

Generating questions...





Evaluating QA pairs...

1) Q: What religions were synthesised with the preexisting cultures of the subcontinent?
   A: 1. Vedic Brahmanism (correct)
      2. Hindu 
      3. Central Asia 
      4. around 7000 

2) Q: What Empire ruled the Indian subcontinent?
   A: 1. Classical 
      2. Vedas 
      3. BCE 
      4. the Maurya Empire (correct)

3) Q: What was the first civilization of the Old world?
   A: 1. IndoAryans 
      2. the Indus Valley Civilization (correct)
      3. IndoAryan 
      4. Gangetic 

4) Q: What other civilizations were contemporaneous with the Indus Valley Civilization?
   A: 1. Classical 
      2. Ancient Egypt and Mesopotamia (correct)
      3. Vedas 
      4. BCE 

5) Q: What religions were associated with the urbanisation of Greater Magadha?
   A: 1. Greater Magadha 
      2. the Indus Valley 
      3. Mediterranean 
      4. Vedic (correct)

6) Q: What plains did the IndoAryans spread from the Punjab into?
   A: 1. IndoAryans 
      2. Punjab 
      3. Hin

In [None]:
MCQ

[[{'question': 'What religions were synthesised with the preexisting cultures of the subcontinent?',
   'answer': [{'answer': 'Vedic Brahmanism', 'correct': True},
    {'answer': 'Hindu', 'correct': False},
    {'answer': 'Central Asia', 'correct': False},
    {'answer': 'around 7000', 'correct': False}]},
  {'question': 'What Empire ruled the Indian subcontinent?',
   'answer': [{'answer': 'Classical', 'correct': False},
    {'answer': 'Vedas', 'correct': False},
    {'answer': 'BCE', 'correct': False},
    {'answer': 'the Maurya Empire', 'correct': True}]},
  {'question': 'What was the first civilization of the Old world?',
   'answer': [{'answer': 'IndoAryans', 'correct': False},
    {'answer': 'the Indus Valley Civilization', 'correct': True},
    {'answer': 'IndoAryan', 'correct': False},
    {'answer': 'Gangetic', 'correct': False}]},
  {'question': 'What other civilizations were contemporaneous with the Indus Valley Civilization?',
   'answer': [{'answer': 'Classical', 'correct'

In [None]:
# Reshape each generated QA pair into {"options": [...], "answer": ..., "question": ...}.
output = []
for qa_batch in MCQ:
    for qa in qa_batch:
        choices = qa["answer"]
        row = {"options": [choice["answer"] for choice in choices]}
        # Record the text of the option flagged as correct.
        for choice in choices:
            if choice["correct"]:
                row["answer"] = choice["answer"]
        row["question"] = qa["question"]
        output.append(row)
print(output)

[{'options': ['Vedic Brahmanism', 'Hindu', 'Central Asia', 'around 7000'], 'answer': 'Vedic Brahmanism', 'question': 'What religions were synthesised with the preexisting cultures of the subcontinent?'}, {'options': ['Classical', 'Vedas', 'BCE', 'the Maurya Empire'], 'answer': 'the Maurya Empire', 'question': 'What Empire ruled the Indian subcontinent?'}, {'options': ['IndoAryans', 'the Indus Valley Civilization', 'IndoAryan', 'Gangetic'], 'answer': 'the Indus Valley Civilization', 'question': 'What was the first civilization of the Old world?'}, {'options': ['Classical', 'Ancient Egypt and Mesopotamia', 'Vedas', 'BCE'], 'answer': 'Ancient Egypt and Mesopotamia', 'question': 'What other civilizations were contemporaneous with the Indus Valley Civilization?'}, {'options': ['Greater Magadha', 'the Indus Valley', 'Mediterranean', 'Vedic'], 'answer': 'Vedic', 'question': 'What religions were associated with the urbanisation of Greater Magadha?'}, {'options': ['IndoAryans', 'Punjab', 'Hindu

In [None]:
pip install language_tool_python

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting language_tool_python
  Downloading language_tool_python-2.7.1-py3-none-any.whl (34 kB)
Installing collected packages: language-tool-python
Successfully installed language-tool-python-2.7.1


In [None]:
# Drop duplicate question dicts while keeping first-seen order
# (dicts are unhashable, so membership is checked against the list).
new_questions = []
for candidate in output:
    if candidate not in new_questions:
        new_questions.append(candidate)
len(new_questions)

10

In [None]:
# Python repr of the whole question list as a single string.
questions=str(new_questions)

In [None]:
from nltk.internals import ReadError

import language_tool_python


def is_bad_rule(rule):
    """Return True for spelling suggestions whose first replacement starts with an
    uppercase letter; those are skipped when applying corrections."""
    return (
        rule.message == 'Possible spelling mistake found.'
        and len(rule.replacements)
        and rule.replacements[0][0].isupper()
    )


tool = language_tool_python.LanguageTool('en-US')

# Grammar-check each question text on its own. Iterating over str(new_questions) would
# visit single characters, and offsets found for one character must not be applied to the
# full string.
corrected_questions = []
for item in new_questions:
    matches = [
        rule for rule in tool.check(item["question"]) if not is_bad_rule(rule)
    ]
    corrected = dict(item)
    corrected["question"] = language_tool_python.utils.correct(
        item["question"], matches
    )
    corrected_questions.append(corrected)

# Kept as a one-element list holding the repr of the question list, because the
# following cells parse that string format.
grammer_correction = [str(corrected_questions)]

Downloading LanguageTool 5.7: 100%|██████████| 225M/225M [00:09<00:00, 23.1MB/s]
INFO:language_tool_python.download_lt:Unzipping /tmp/tmpx9z6qlqt.zip to /root/.cache/language_tool_python.
INFO:language_tool_python.download_lt:Downloaded https://www.languagetool.org/download/LanguageTool-5.7.zip to /root/.cache/language_tool_python.


In [None]:
grammer_correction

["]{'options': ['Vedic Brahmanism', 'Hindu', 'Central Asia', 'around 7000'], 'answer': 'Vedic Brahmanism', 'question': 'What religions were synthesised with the preexisting cultures of the subcontinent?'}, {'options': ['Classical', 'Vedas', 'BCE', 'the Maurya Empire'], 'answer': 'the Maurya Empire', 'question': 'What Empire ruled the Indian subcontinent?'}, {'options': ['IndoAryans', 'the Indus Valley Civilization', 'IndoAryan', 'Gangetic'], 'answer': 'the Indus Valley Civilization', 'question': 'What was the first civilization of the Old world?'}, {'options': ['Classical', 'Ancient Egypt and Mesopotamia', 'Vedas', 'BCE'], 'answer': 'Ancient Egypt and Mesopotamia', 'question': 'What other civilizations were contemporaneous with the Indus Valley Civilization?'}, {'options': ['Greater Magadha', 'the Indus Valley', 'Mediterranean', 'Vedic'], 'answer': 'Vedic', 'question': 'What religions were associated with the urbanisation of Greater Magadha?'}, {'options': ['IndoAryans', 'Punjab', 'Hin

In [None]:
# Strip any leading/trailing quote and square-bracket characters from the corrected
# string, then wrap the result in a nested list for the parsing cell that follows.
special_character = [[grammer_correction[0].strip("'[]")]]

In [None]:
special_character

[["{'options': ['Vedic Brahmanism', 'Hindu', 'Central Asia', 'around 7000'], 'answer': 'Vedic Brahmanism', 'question': 'What religions were synthesised with the preexisting cultures of the subcontinent?'}, {'options': ['Classical', 'Vedas', 'BCE', 'the Maurya Empire'], 'answer': 'the Maurya Empire', 'question': 'What Empire ruled the Indian subcontinent?'}, {'options': ['IndoAryans', 'the Indus Valley Civilization', 'IndoAryan', 'Gangetic'], 'answer': 'the Indus Valley Civilization', 'question': 'What was the first civilization of the Old world?'}, {'options': ['Classical', 'Ancient Egypt and Mesopotamia', 'Vedas', 'BCE'], 'answer': 'Ancient Egypt and Mesopotamia', 'question': 'What other civilizations were contemporaneous with the Indus Valley Civilization?'}, {'options': ['Greater Magadha', 'the Indus Valley', 'Mediterranean', 'Vedic'], 'answer': 'Vedic', 'question': 'What religions were associated with the urbanisation of Greater Magadha?'}, {'options': ['IndoAryans', 'Punjab', 'Hin

In [None]:
import ast

# special_character is a list of lists of strings; each string is expected to
# hold one or more comma-separated dict literals (assumption based on the
# displayed output -- TODO confirm against the cell that builds it).
#
# Parse each string directly as the contents of a list literal. The previous
# approach rendered the whole structure with str(), replaced every '"' with a
# space and re-parsed it, which (a) rewrote any double quote that appeared
# inside a question or answer into a space and (b) returned the raw text
# unparsed whenever it contained both quote characters, because repr() then
# switches to single-quote delimiters.
special_character_rm = [
    [entry for text in group for entry in ast.literal_eval("[" + text + "]")]
    for group in special_character
]
special_character_rm

[[{'options': ['Vedic Brahmanism', 'Hindu', 'Central Asia', 'around 7000'],
   'answer': 'Vedic Brahmanism',
   'question': 'What religions were synthesised with the preexisting cultures of the subcontinent?'},
  {'options': ['Classical', 'Vedas', 'BCE', 'the Maurya Empire'],
   'answer': 'the Maurya Empire',
   'question': 'What Empire ruled the Indian subcontinent?'},
  {'options': ['IndoAryans',
    'the Indus Valley Civilization',
    'IndoAryan',
    'Gangetic'],
   'answer': 'the Indus Valley Civilization',
   'question': 'What was the first civilization of the Old world?'},
  {'options': ['Classical', 'Ancient Egypt and Mesopotamia', 'Vedas', 'BCE'],
   'answer': 'Ancient Egypt and Mesopotamia',
   'question': 'What other civilizations were contemporaneous with the Indus Valley Civilization?'},
  {'options': ['Greater Magadha',
    'the Indus Valley',
    'Mediterranean',
    'Vedic'],
   'answer': 'Vedic',
   'question': 'What religions were associated with the urbanisation of 

In [None]:
import itertools
from itertools import chain

# Flatten the list-of-lists of question dicts into one flat list.
# The flattening is done in a single expression so that the imported name
# `chain` is not rebound to a one-shot iterator (which list() would leave
# exhausted in the kernel namespace).
total_quesions = list(chain.from_iterable(special_character_rm))

In [None]:
# Number of entries in the flattened question list.
len(total_quesions)

10

In [None]:
# Display every entry of the flattened question list for inspection.
total_quesions

[{'options': ['Vedic Brahmanism', 'Hindu', 'Central Asia', 'around 7000'],
  'answer': 'Vedic Brahmanism',
  'question': 'What religions were synthesised with the preexisting cultures of the subcontinent?'},
 {'options': ['Classical', 'Vedas', 'BCE', 'the Maurya Empire'],
  'answer': 'the Maurya Empire',
  'question': 'What Empire ruled the Indian subcontinent?'},
 {'options': ['IndoAryans',
   'the Indus Valley Civilization',
   'IndoAryan',
   'Gangetic'],
  'answer': 'the Indus Valley Civilization',
  'question': 'What was the first civilization of the Old world?'},
 {'options': ['Classical', 'Ancient Egypt and Mesopotamia', 'Vedas', 'BCE'],
  'answer': 'Ancient Egypt and Mesopotamia',
  'question': 'What other civilizations were contemporaneous with the Indus Valley Civilization?'},
 {'options': ['Greater Magadha', 'the Indus Valley', 'Mediterranean', 'Vedic'],
  'answer': 'Vedic',
  'question': 'What religions were associated with the urbanisation of Greater Magadha?'},
 {'options

In [None]:
# Re-check the size of the question pool; this is the upper bound the
# requested question count is compared against below.
len(total_quesions)

10

In [None]:
# Convert the requested question count to an int. number_of_question is
# defined outside this section (presumably user-supplied -- TODO confirm);
# int() raises ValueError if it is a string that is not a valid integer.
generate_ques=int(number_of_question)

In [None]:
# Display the parsed question count.
generate_ques

5

In [None]:
import json
import random

import requests

# Randomly pick `generate_ques` questions from `total_quesions`, then POST the
# picked questions together with the remaining ones to the GenerateQuestions
# endpoint. Reads generate_ques, total_quesions and _id from earlier cells.
#
# Reject every count outside 1..len(total_quesions). The lower bound is `< 1`
# rather than `== 0` so that a negative count is reported with the same
# message instead of making random.sample raise ValueError.
if generate_ques < 1 or generate_ques > len(total_quesions): #number_of_question
  print("please give value between 1 to {}".format(len(total_quesions)))
else:
  # Sample positions instead of items: partitioning by dict equality would
  # drop any question equal to a selected one from both lists, and would
  # rescan the selection for every item in the pool.
  selected_positions = random.sample(range(len(total_quesions)), k=generate_ques) #number_of_question
  selected_lookup = set(selected_positions)
  display_question = [total_quesions[pos] for pos in selected_positions]
  other_questions = [
      item for pos, item in enumerate(total_quesions) if pos not in selected_lookup
  ]

  url="https://generate-questions.devbyopeneyes.com/api/GenerateQuestions"
  headers = {'Content-Type':'application/json','Accept':'application/json'}
  myobjs ={"id" : _id,
         "questions" : display_question,
         "other_questions" : other_questions
    }
  # A timeout keeps the cell from hanging indefinitely if the service does
  # not answer; the status is checked so a rejected upload is not silent.
  yo = requests.post(url,headers=headers,data=json.dumps(myobjs),timeout=30)
  if not yo.ok:
    print("GenerateQuestions request failed with HTTP status {}".format(yo.status_code))
  print(len(display_question))
  print(len(other_questions))
  # Kept from the original cell: empties the list in place, which also empties
  # the list referenced by myobjs["other_questions"].
  other_questions.clear()

5
5
