<a href="https://colab.research.google.com/github/vishal-burman/PyTorch-Architectures/blob/master/misc/C4_FAQs_Extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install -Uq transformers sentence-transformers datasets sentencepiece

In [15]:
from datasets import load_dataset
from tqdm.auto import tqdm
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize # We need a fast sentence-tokenizer
from typing import List
import pandas as pd
import random
# Download the NLTK "punkt" package. Presumably this is the data backing the
# sent_tokenize / word_tokenize functions imported above -- confirm against the
# installed NLTK version.
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
"""
Basic structure (the dataset is a list of dicts, each shaped like the one below):
{
  "url": url_link,
  "text": whole_text,
  "faq_pairs": [
    {"question": question, "answer": answer},
    {"question": question, "answer": answer}
  ]
}
"""

In [4]:
# The legacy script-based "c4" id is deprecated on the Hugging Face Hub; the corpus
# is published as "allenai/c4". streaming=True iterates over records lazily instead
# of downloading the whole English training split up front.
dataset = load_dataset("allenai/c4", "en", split="train", streaming=True)

Downloading builder script:   0%|          | 0.00/3.29k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.40M [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.77k [00:00<?, ?B/s]

In [5]:
def tokenize_text(text: str):
  """Split ``text`` into sentences and return whatever ``sent_tokenize`` produces."""
  return sent_tokenize(text)

def num_word_tokens(text: str):
  """Return how many tokens ``word_tokenize`` yields for ``text``."""
  word_tokens = word_tokenize(text)
  return len(word_tokens)

def flow(sentence_tokens: List[str]):
  """Pair every question sentence with the non-question sentences that follow it.

  A sentence counts as a question when it ends with "?". Its answer is the run of
  sentences after it, up to (not including) the next question, joined by single
  spaces. Sentences that appear before the first question are ignored.

  Args:
    sentence_tokens: Sentences of one document, in reading order.

  Returns:
    A list of ``(question, answer)`` tuples, in document order, keeping only the
    pairs whose question has more than 2 word tokens and whose answer has more
    than 4 word tokens (as counted by ``num_word_tokens``).
  """
  candidate_pairs = []
  current_question = None
  answer_sentences = []

  # Single pass: a new question closes the pair that was being collected, so the
  # tail of the list is never re-sliced for each question.
  for sent in sentence_tokens:
    if sent.endswith("?"):
      if current_question is not None:
        candidate_pairs.append((current_question, " ".join(answer_sentences)))
      current_question, answer_sentences = sent, []
    elif current_question is not None:
      answer_sentences.append(sent)

  # The last question has no following question to close it.
  if current_question is not None:
    candidate_pairs.append((current_question, " ".join(answer_sentences)))

  # Drop very short questions and questions with no substantial answer
  # (e.g. two questions back to back leave the first with an empty answer).
  return [
    (question, answer)
    for question, answer in candidate_pairs
    if num_word_tokens(question) > 2 and num_word_tokens(answer) > 4
  ]

In [6]:
# Scan the streamed corpus for pages whose URL looks like an FAQ page and collect
# the question/answer pairs extracted from each one into dataset_list.
count_faqs = 0  # FAQ-like pages examined so far, whether or not they yielded pairs
done_urls = set()  # URLs already processed, so a repeated URL is not handled twice
limit = 100
# The bar is advanced by hand, once per FAQ-like page, so give tqdm a total
# rather than an iterable.
progress_bar = tqdm(total=limit)
dataset_list = []
all_questions, all_answers = [], []  # not used by this cell; kept in case other cells read them
for record in dataset:
  if count_faqs >= limit:
    break

  url = record["url"]
  # Cheap URL heuristic for FAQ pages; skip everything else and any repeated URL.
  if "faq" not in url.lower() or url in done_urls:
    continue

  done_urls.add(url)
  count_faqs += 1
  progress_bar.update(1)

  text = record["text"]
  qa_pairs = flow(tokenize_text(text))
  if not qa_pairs:
    # The page matched the URL filter but produced no usable Q/A pair.
    continue

  dataset_list.append({
    "url": url,
    "text": text,
    "faq_pairs": [{"question": q, "answer": a} for q, a in qa_pairs],
  })
progress_bar.close()

  0%|          | 0/100 [00:00<?, ?it/s]

In [8]:
# dataset_list only holds pages that yielded at least one Q/A pair, so this can be
# lower than the number of FAQ-like pages scanned.
print(f"Total filtered faq pages: {len(dataset_list)}")

Total filtered faq pages: 77


In [18]:
# Visualize a random dataset_list entry
# randrange picks the index directly; no need to sample a one-element list.
random_index = random.randrange(len(dataset_list))
dataset_list[random_index]

{'url': 'http://askus.oceancitylibrary.org/faq/243148',
 'text': "Q. How do i check my E mail?\nIf I understand your question, to check your email, go on internet, go to your internet provider's website and log in using your passwork. If you have further questions, please call us at 399-2434, ext. 5231.",
 'faq_pairs': [{'question': 'How do i check my E mail?',
   'answer': "If I understand your question, to check your email, go on internet, go to your internet provider's website and log in using your passwork. If you have further questions, please call us at 399-2434, ext. 5231."}]}

In [20]:
# Add up the number of Q/A pairs across every kept FAQ page.
total_faq_pairs = sum(len(page["faq_pairs"]) for page in dataset_list)
print(f"Total faq pairs: {total_faq_pairs}")

Total faq pairs: 607


In [11]:
# Write one JSON object per line (JSON Lines), one record per kept FAQ page.
df = pd.DataFrame.from_records(dataset_list)
df.to_json("c4-faqs.jsonl", orient="records", lines=True)