<a href="https://colab.research.google.com/github/vishal-burman/PyTorch-Architectures/blob/master/misc/C4_FAQs_Extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install -Uq transformers sentence-transformers datasets sentencepiece

In [1]:
from datasets import load_dataset
from tqdm.auto import tqdm
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize # We need a fast sentence-tokenizer
from typing import List
import pandas as pd
import random
from huggingface_hub import notebook_login
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
"""
Basic structure of one record (the dataset is a list of such dicts):
{
  "faq_pairs": [
    {"question": question, "answer": answer},
    {"question": question, "answer": answer}
  ]
}
"""

In [2]:
# Open the "en" configuration of C4 (train split) with streaming enabled, so
# records are iterated one at a time by the extraction loop further down.
# NOTE(review): newer `datasets` releases may require the "allenai/c4" repo id
# instead of "c4" — confirm against the installed version.
dataset = load_dataset("c4", "en", split="train", streaming=True)

In [3]:
def tokenize_text(text: str):
  """Split ``text`` into sentences using NLTK's ``sent_tokenize``.

  Args:
    text: Raw document text.

  Returns:
    The list of sentence strings produced by the tokenizer.
  """
  return sent_tokenize(text)

def num_sentence_token(text: str):
  """Return how many sentences ``tokenize_text`` finds in ``text``."""
  sentences = tokenize_text(text)
  return len(sentences)

def num_word_tokens(text: str):
  """Return the number of tokens NLTK's ``word_tokenize`` yields for ``text``."""
  tokens = word_tokenize(text)
  return len(tokens)

def flow(sentence_tokens: List[str]):
  """Extract (question, answer) pairs from an ordered list of sentences.

  Every sentence ending in "?" is treated as a question. Its answer is the
  run of sentences that follows it, up to (but not including) the next
  sentence ending in "?", joined with single spaces. A question directly
  followed by another question (or by nothing) gets an empty answer, which
  the length filter below then drops.

  Args:
    sentence_tokens: Sentences of one document, in reading order.

  Returns:
    A list of ``(question, answer)`` tuples, keeping only pairs whose
    question has more than 2 word tokens and whose answer has more than 16
    word tokens (both counted with ``num_word_tokens``).
  """
  candidates = []
  current_question = None
  answer_sents = []

  # Single pass: a new question closes the previous one, so the tail of the
  # list is never re-sliced for each question.
  for sent in sentence_tokens:
    if sent.endswith("?"):
      if current_question is not None:
        candidates.append((current_question, " ".join(answer_sents)))
      current_question, answer_sents = sent, []
    elif current_question is not None:
      # Sentences before the first question belong to no answer.
      answer_sents.append(sent)

  if current_question is not None:
    candidates.append((current_question, " ".join(answer_sents)))

  return [
      (question, answer)
      for question, answer in candidates
      if num_word_tokens(question) > 2 and num_word_tokens(answer) > 16
  ]

In [4]:
def save_dataset_list_and_flush(dataset_list, count_save):
  """Write the collected records to a numbered JSON Lines shard.

  The records are written, one JSON object per line, to
  ``c4-faqs_{count_save}.jsonl`` in the current working directory.
  Despite the name, this function does not empty ``dataset_list``; the
  caller is expected to clear it afterwards.

  Args:
    dataset_list: List of record dicts to persist.
    count_save: Shard number used in the output file name.
  """
  shard_path = f"c4-faqs_{count_save}.jsonl"
  records_frame = pd.DataFrame.from_records(dataset_list)
  records_frame.to_json(shard_path, orient="records", lines=True)

In [11]:
# Scan the streamed corpus for pages whose URL contains "faq", extract their
# question/answer pairs with `flow`, and write the resulting records to disk
# in shards of `checkpoint_limit` pages.
count_faqs = 0
done_urls = set()  # URLs of every "faq" page inspected (with or without pairs)
limit = 100000  # stop after this many pages that yielded at least one pair
checkpoint_limit = 10000  # pages per saved shard
# NOTE(review): 364868892 is presumably the number of documents in the split —
# confirm; it is only used as the progress-bar total.
progress_bar_total = tqdm(total=364868892, desc="total")
progress_bar_faqs = tqdm(total=limit, desc="faq-pages")
progress_bar_pairs = tqdm(total=10**6, desc="qa-pairs")
dataset_list = []
all_questions, all_answers = [], []  # never filled in this cell
count_save = 1
for record in dataset:
  if count_faqs >= limit:
    break
  progress_bar_total.update(1)

  url = record["url"]
  if "faq" not in url.lower():
    continue

  done_urls.add(url)
  qa_pairs = flow(tokenize_text(record["text"]))
  if not qa_pairs:
    continue

  count_faqs += 1
  progress_bar_faqs.update(1)
  faq_list = [{"question": q, "answer": a} for q, a in qa_pairs]
  progress_bar_pairs.update(len(faq_list))
  dataset_list.append({"faq_pairs": faq_list})

  # The buffer is emptied after every save, so reaching the checkpoint size
  # means a full shard is ready.
  if len(dataset_list) >= checkpoint_limit:
    save_dataset_list_and_flush(dataset_list, count_save)
    dataset_list.clear()
    count_save += 1

# Persist whatever was collected since the last checkpoint instead of
# discarding it (stream ended early, or limit is not a multiple of the
# checkpoint size).
if dataset_list:
  save_dataset_list_and_flush(dataset_list, count_save)
  count_save += 1
dataset_list.clear()

for progress_bar in (progress_bar_total, progress_bar_faqs, progress_bar_pairs):
  progress_bar.close()

total:   0%|          | 0/364868892 [00:00<?, ?it/s]

faq-pages:   0%|          | 0/100 [00:00<?, ?it/s]

qa-pairs:   0%|          | 0/1000000 [00:00<?, ?it/s]

In [15]:
# Optional manual cleanup of previously written shards: ! rm *.jsonl
# Load every .jsonl file in the working directory as a single "train" split.
hf_dataset = load_dataset("json", data_files="*.jsonl", split="train")



In [17]:
# Each row is one saved page record (a dict holding a "faq_pairs" list).
print(f"Total filtered faq pages: {hf_dataset.num_rows}")

Total filtered faq pages: 100


In [18]:
# Preview one randomly chosen record from the loaded dataset
(random_index,) = random.sample(range(hf_dataset.num_rows), 1)
hf_dataset[random_index]

{'faq_pairs': [{'question': 'How do you know which company I am working with?',
   'answer': 'When you are completing your profile in the Leads Manager System you will be able to input the name of the company you are associated with. This prevents the system ever sending you the same leads as anyone else working with your company.'},
  {'question': 'Cool right?',
   'answer': 'You can Create/Login to the Leads Manager here. Our leads are generated via search engine traffic and advertising on large networks. People see the advertisements and are taken to a survey form. Once they fill out the form, they are a "lead" and they are sent to you expecting to be contacted very soon about a home business opportunity. All leads have volunteered the information and have not been given incentives to fill out the form. Sometimes you will contact a lead who claims they were only filling out a survey form for an ipod or some other type of incentive. They well may have done that either before or after

In [19]:
# Count question/answer pairs across all page records
total_faq_pairs = sum(len(row["faq_pairs"]) for row in hf_dataset)
print(f"Total faq pairs: {total_faq_pairs}")

Total faq pairs: 679


In [None]:
# Display the dataset object's summary as the cell output.
hf_dataset

In [None]:
# Authenticate with the Hugging Face Hub from inside the notebook; needed
# before the dataset can be pushed.
notebook_login()

In [None]:
# Upload the dataset to the Hub as a private repository named "c4-faqs"
# (presumably under the account that was just logged in — confirm).
hf_dataset.push_to_hub("c4-faqs", private=True)

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]