# Step-1: Setup

In [None]:
!pip install datasets 
!pip install transformers[sentencepiece]
!pip install pandas

In [18]:
import pandas as pd

# Step-2: Dataset

In [19]:
from datasets import get_dataset_config_names

In [20]:
# Fetch the names of the configurations available for the "subjqa" dataset.
domains = get_dataset_config_names("subjqa")

In [21]:
# Display the configuration names fetched for SubjQA.
domains

['books', 'electronics', 'grocery', 'movies', 'restaurants', 'tripadvisor']

In [22]:
from datasets import load_dataset

.

In [23]:
# Load the "books" configuration of SubjQA; the result is indexed by split name
# (e.g. subjqa["train"]) in the cells that follow.
subjqa = load_dataset("subjqa", name="books")

Downloading and preparing dataset subjqa/books (download: 10.86 MiB, generated: 3.42 MiB, post-processed: Unknown size, total: 14.27 MiB) to C:\Users\Taimoor M. Gondal\.cache\huggingface\datasets\subjqa\books\1.1.0\e5588f9298ff2d70686a00cc377e4bdccf4e32287459e3c6baf2dc5ab57fe7fd...


Generating train split:   0%|          | 0/1314 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/345 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/256 [00:00<?, ? examples/s]

Dataset subjqa downloaded and prepared to C:\Users\Taimoor M. Gondal\.cache\huggingface\datasets\subjqa\books\1.1.0\e5588f9298ff2d70686a00cc377e4bdccf4e32287459e3c6baf2dc5ab57fe7fd. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

**For our use case, we’ll focus on building a QA system for the "Books" domain. To
download the books subset, we just need to pass this value to the name argument
of the load_dataset() function:**

In [24]:
# Show the nested `answers` dictionary of the second training example.
# Index the row first and the column second: `["answers"][1]` would read the whole
# `answers` column just to keep a single entry, while `[1]["answers"]` reads one row.
print(subjqa["train"][1]["answers"])

{'text': ['the subject matter would be interesting'], 'answer_start': [262], 'answer_subj_level': [1], 'ans_subj_score': [0.4166666567325592], 'is_ans_subjective': [False]}


**Like other question answering datasets on the Hub, SubjQA stores the answers to
each question as a nested dictionary. For example, if we inspect one of the rows in the
answers column:**

In [25]:
# Flatten the nested columns of every split and turn each split into a pandas DataFrame,
# keyed by split name.
dfs = {
    split_name: split_data.to_pandas()
    for split_name, split_data in subjqa.flatten().items()
}


Flatten these nested columns with the flatten() method and convert each split to a Pandas
DataFrame as follows:

In [26]:
# Report the number of distinct question ids found in each split.
for split, df in dfs.items():
    print(f"Number of questions in {split}: {df['id'].nunique()}")

Number of questions in train: 1314
Number of questions in test: 345
Number of questions in validation: 256


In [27]:
# Columns used to inspect a question/answer pair alongside its context.
qa_cols = [
    "title",
    "question",
    "answers.text",
    "answers.answer_start",
    "context",
]
# Draw two training rows with a fixed seed so the sample is reproducible, then display them.
train_qa = dfs["train"][qa_cols]
sample_df = train_qa.sample(n=2, random_state=7)
sample_df

Unnamed: 0,title,question,answers.text,answers.answer_start,context
463,312576463,Why none is likeable?,"[The characters were not likable, and it often...","[119, 156]",It's the classic &#34;disconnected from parent...
284,307265439,What is the sentiment of the story?,[],[],I rate books on how much of an impact they hav...


**Let’s focus on these columns and take a look at a few of the training examples. We can
use the sample() method to select a random sample:**

In [28]:
# Recover the first answer of the first sampled row by slicing its context
# from the stored start offset over the length of the answer text.
first_row = sample_df.iloc[0]
start_idx = first_row["answers.answer_start"][0]
end_idx = start_idx + len(first_row["answers.text"][0])
first_row["context"][start_idx:end_idx]

'The characters were not likable, and it often felt forced and a little pretentious'

In [29]:
# Question-opening words to tally, and the (initially empty) mapping that will hold the tallies.
question_types = [
    "What", "How", "Is", "Does",
    "Do", "Was", "Where", "Why",
]
counts = dict()

# Step-3: Check Question Type

In [30]:
# Tally how many training questions start with each prefix in `question_types`.
# Summing the boolean mask yields 0 when nothing matches, whereas
# `value_counts()[True]` raises a KeyError in that case.
# NOTE: this is a plain prefix match, so "Do" also counts questions that start with "Does".
train_questions = dfs["train"]["question"]  # hoisted: the column does not change per prefix
for q in question_types:
    counts[q] = int(train_questions.str.startswith(q).sum())

In [31]:
# Display the per-prefix question counts.
counts

{'What': 265,
 'How': 712,
 'Is': 113,
 'Does': 54,
 'Do': 86,
 'Was': 15,
 'Where': 42,
 'Why': 25}

In [32]:
# Print three reproducibly sampled training questions for each of three question prefixes.
train_df = dfs["train"]
for question_type in ["How", "What", "Is"]:
    matching = train_df[train_df.question.str.startswith(question_type)]
    sampled_questions = matching.sample(n=3, random_state=42)["question"]
    for question in sampled_questions:
        print(question)

How is the choice?
How is the child?
How do you like the end?
What you can infer about life from this story ?
What do you think about book?
What is author?
Is the story appealing to adults?
Is it a good book?
Is the story true?


In [33]:
from transformers import AutoTokenizer

# Step-4: Tokenizer
To encode our texts, we’ll load the tokenizer of the MiniLM model checkpoint from the Hugging Face
Hub.

In [34]:
# Checkpoint identifier; it is reused later in the notebook to load the QA model.
model_ckpt = "deepset/minilm-uncased-squad2"
# Load the tokenizer associated with that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

Downloading:   0%|          | 0.00/107 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/477 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [35]:
# A toy question and a short passage that contains its answer.
question = "How much music can this hold?"
# The trailing backslash joins the two source lines, so the string contains no newline.
context = """An MP3 is about 1 MB/minute, so about 6000 hours depending on \
file size."""
# Encode the question/context pair together and return PyTorch tensors ("pt").
inputs = tokenizer(question, context, return_tensors="pt")

**In this tutorial, we will extract answers from a small passage.**

In [36]:
# Decode the encoded ids of the single sequence back to text to see how the
# question and context were combined.
print(tokenizer.decode(inputs["input_ids"][0]))

[CLS] how much music can this hold? [SEP] an mp3 is about 1 mb / minute, so about 6000 hours depending on file size. [SEP]


In [37]:
import torch
from transformers import AutoModelForQuestionAnswering

# Step-5: Model

In [38]:
# Load the question-answering model from the same checkpoint as the tokenizer.
model = AutoModelForQuestionAnswering.from_pretrained(model_ckpt)

Downloading:   0%|          | 0.00/127M [00:00<?, ?B/s]

In [39]:
# Run a forward pass with gradient tracking disabled (inference only),
# then print the raw output object (start and end logits).
with torch.no_grad():
    outputs = model(**inputs)
print(outputs)

QuestionAnsweringModelOutput(loss=None, start_logits=tensor([[-0.9862, -4.7750, -5.4025, -5.2378, -5.2863, -5.5117, -4.9819, -6.1880,
         -0.9862,  0.2596, -0.2144, -1.7136,  3.7806,  4.8561, -1.0546, -3.9097,
         -1.7374, -4.5944, -1.4278,  3.9949,  5.0391, -0.2018, -3.0193, -4.8549,
         -2.3107, -3.5110, -3.5713, -0.9862]]), end_logits=tensor([[-0.9623, -5.4733, -5.0326, -5.1639, -5.4278, -5.5151, -5.1749, -4.6233,
         -0.9623, -3.7855, -0.8715, -3.7745, -3.0161, -1.1780,  0.1758, -2.7365,
          4.8934,  0.3046, -3.1761, -3.2762,  0.8937,  5.6606, -0.3623, -4.9554,
         -3.2531, -0.0914,  1.6211, -0.9623]]), hidden_states=None, attentions=None)


**Two ways to complete the remaining work**<br>
**1) Conventional way**<br>
**2) HuggingFace Pipeline**

In [40]:
# Pull the start and end logits out of the model output.
start_logits, end_logits = outputs.start_logits, outputs.end_logits

In [41]:
# Show the shape of the encoded input ids for comparison with the logits shapes.
print(f"Input IDs shape: {inputs.input_ids.size()}")

Input IDs shape: torch.Size([1, 28])


In [42]:
# Show the shape of the start logits.
print(f"Start logits shape: {start_logits.size()}")

Start logits shape: torch.Size([1, 28])


In [43]:
# Show the shape of the end logits.
print(f"End logits shape: {end_logits.size()}")

End logits shape: torch.Size([1, 28])


In [44]:
# Position of the highest start logit. With no `dim` given, argmax works on the
# flattened tensor, which here holds a single sequence.
start_idx = torch.argmax(start_logits)

In [45]:
# Position of the highest end logit, plus one so it can serve as an exclusive slice bound.
# NOTE: start and end are chosen independently, so nothing here guarantees end_idx > start_idx.
end_idx = torch.argmax(end_logits) + 1

In [46]:
# Slice the token ids of the single input sequence between the chosen start and end positions.
answer_span = inputs["input_ids"][0][start_idx:end_idx]

In [47]:
# Convert the selected token ids back into text.
answer = tokenizer.decode(answer_span)

In [48]:
# Print the question that was asked.
print(f"Question: {question}")

Question: How much music can this hold?


In [49]:
# Print the answer text decoded from the selected span.
print(f"Answer: {answer}")

Answer: 6000 hours


In [50]:
from transformers import pipeline

**Use the Hugging Face pipeline to complete the work with only the model, tokenizer, question, and context.**

In [51]:
# Build a question-answering pipeline from the already-loaded model and tokenizer.
pipe = pipeline("question-answering", model=model, tokenizer=tokenizer)

In [52]:
# Ask the pipeline for its three highest-scoring answer spans.
# `top_k` is the current name of this argument; the older `topk` spelling is deprecated.
pipe(question=question, context=context, top_k=3)



[{'score': 0.26516205072402954,
  'start': 38,
  'end': 48,
  'answer': '6000 hours'},
 {'score': 0.22082941234111786,
  'start': 16,
  'end': 48,
  'answer': '1 MB/minute, so about 6000 hours'},
 {'score': 0.10253491997718811,
  'start': 16,
  'end': 27,
  'answer': '1 MB/minute'}]