# 1. Building a Review-Based QA System

In [1]:
from datasets import get_dataset_config_names

In [2]:
# Fetch the configuration names available for the "subjqa" dataset.
domain = get_dataset_config_names("subjqa")

Downloading builder script:   0%|          | 0.00/2.65k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

In [3]:
domain

['books', 'electronics', 'grocery', 'movies', 'restaurants', 'tripadvisor']

In [4]:
from datasets import load_dataset

In [5]:
# Load only the "electronics" configuration of SubjQA.
subjqa = load_dataset("subjqa", name="electronics")

Downloading and preparing dataset subjqa/electronics (download: 10.86 MiB, generated: 3.01 MiB, post-processed: Unknown size, total: 13.86 MiB) to /root/.cache/huggingface/datasets/subjqa/electronics/1.1.0/e5588f9298ff2d70686a00cc377e4bdccf4e32287459e3c6baf2dc5ab57fe7fd...


Downloading data: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/1295 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/358 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/255 [00:00<?, ? examples/s]

Dataset subjqa downloaded and prepared to /root/.cache/huggingface/datasets/subjqa/electronics/1.1.0/e5588f9298ff2d70686a00cc377e4bdccf4e32287459e3c6baf2dc5ab57fe7fd. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
subjqa["train"]["answers"][1]

{'text': ['Bass is weak as expected',
  'Bass is weak as expected, even with EQ adjusted up'],
 'answer_start': [1302, 1302],
 'answer_subj_level': [1, 1],
 'ans_subj_score': [0.5083333253860474, 0.5083333253860474],
 'is_ans_subjective': [True, True]}

In [7]:
import pandas as pd

In [8]:
subjqa.items()

dict_items([('train', Dataset({
    features: ['domain', 'nn_mod', 'nn_asp', 'query_mod', 'query_asp', 'q_reviews_id', 'question_subj_level', 'ques_subj_score', 'is_ques_subjective', 'review_id', 'id', 'title', 'context', 'question', 'answers'],
    num_rows: 1295
})), ('test', Dataset({
    features: ['domain', 'nn_mod', 'nn_asp', 'query_mod', 'query_asp', 'q_reviews_id', 'question_subj_level', 'ques_subj_score', 'is_ques_subjective', 'review_id', 'id', 'title', 'context', 'question', 'answers'],
    num_rows: 358
})), ('validation', Dataset({
    features: ['domain', 'nn_mod', 'nn_asp', 'query_mod', 'query_asp', 'q_reviews_id', 'question_subj_level', 'ques_subj_score', 'is_ques_subjective', 'review_id', 'id', 'title', 'context', 'question', 'answers'],
    num_rows: 255
}))])

In [9]:
subjqa.flatten().items()

dict_items([('train', Dataset({
    features: ['domain', 'nn_mod', 'nn_asp', 'query_mod', 'query_asp', 'q_reviews_id', 'question_subj_level', 'ques_subj_score', 'is_ques_subjective', 'review_id', 'id', 'title', 'context', 'question', 'answers.text', 'answers.answer_start', 'answers.answer_subj_level', 'answers.ans_subj_score', 'answers.is_ans_subjective'],
    num_rows: 1295
})), ('test', Dataset({
    features: ['domain', 'nn_mod', 'nn_asp', 'query_mod', 'query_asp', 'q_reviews_id', 'question_subj_level', 'ques_subj_score', 'is_ques_subjective', 'review_id', 'id', 'title', 'context', 'question', 'answers.text', 'answers.answer_start', 'answers.answer_subj_level', 'answers.ans_subj_score', 'answers.is_ans_subjective'],
    num_rows: 358
})), ('validation', Dataset({
    features: ['domain', 'nn_mod', 'nn_asp', 'query_mod', 'query_asp', 'q_reviews_id', 'question_subj_level', 'ques_subj_score', 'is_ques_subjective', 'review_id', 'id', 'title', 'context', 'question', 'answers.text', 'answ

In [10]:
# Flatten every split so the nested `answers` column becomes top-level
# `answers.*` columns, then keep the (split name, dataset) pairs.
dset_flatten = subjqa.flatten().items()

In [11]:
# Convert each flattened split into its own pandas DataFrame, keyed by split name.
dfs = {split: dset.to_pandas() for split, dset in dset_flatten}

In [12]:
dfs

{'train':            domain       nn_mod         nn_asp   query_mod   query_asp  \
 0     electronics        great  bass response   excellent        bass   
 1     electronics        harsh           high  not strong        bass   
 2     electronics      neutral          sound     present        bass   
 3     electronics        muddy           bass     awesome        bass   
 4     electronics      perfect           bass  incredible       sound   
 ...           ...          ...            ...         ...         ...   
 1290  electronics        great     impression     rubbery        feel   
 1291  electronics        tight           case       great  protection   
 1292  electronics         good        texture        good        grip   
 1293  electronics  easy to use       keyboard   removable    keyboard   
 1294  electronics  easy to use       keyboard   removable    keyboard   
 
                           q_reviews_id  question_subj_level  ques_subj_score  \
 0     0514ee34b6726

In [22]:
# Report how many distinct `id` values each split contains.
for split in dfs:
    df = dfs[split]
    print(f"number of qs in {split}: {df['id'].nunique()}")

number of qs in train: 1295
number of qs in test: 358
number of qs in validation: 255


In [23]:
# Columns to keep when inspecting question/answer pairs.
qa_cols = [
    "title",
    "question",
    "answers.text",
    "answers.answer_start",
    "context",
]

### 1.2 Extracting Answers from Text

In [19]:
from transformers import AutoTokenizer

In [20]:
model_ckpt = "deepset/minilm-uncased-squad2"

In [21]:
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

Downloading:   0%|          | 0.00/107 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/477 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [73]:
# Encode a toy (question, context) pair as PyTorch tensors.
question = "how much music can this hold?"
context = "an mp3 is about 1mb/minute, about 6000 hours"
inputs = tokenizer(question, context, return_tensors="pt")

In [74]:
inputs

{'input_ids': tensor([[  101,  2129,  2172,  2189,  2064,  2023,  2907,  1029,   102,  2019,
         23378,  2003,  2055,  1015, 14905,  1013,  3371,  1010,  2055, 25961,
          2847,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [48]:
inputs.input_ids.shape

torch.Size([1, 22])

In [49]:
tokenizer.decode(inputs["input_ids"][0])

'[CLS] how much music can this hold? [SEP] an mp3 is about 1mb / minute, about 6000 hours [SEP]'

In [51]:
import torch
from transformers import AutoModelForQuestionAnswering

In [52]:
model_ckpt

'deepset/minilm-uncased-squad2'

In [53]:
model = AutoModelForQuestionAnswering.from_pretrained(model_ckpt)

Downloading:   0%|          | 0.00/127M [00:00<?, ?B/s]

In [54]:
# Inference only: skip gradient tracking for this forward pass.
with torch.no_grad():
    outputs = model(**inputs)

In [55]:
outputs

QuestionAnsweringModelOutput(loss=None, start_logits=tensor([[-1.2935, -4.7204, -5.3716, -5.2326, -5.2625, -5.3654, -4.9590, -6.0611,
         -1.2935, -0.2044, -0.1700, -1.9096,  4.1928,  5.7759, -2.2901, -3.7464,
         -1.2213, -3.4085,  2.6456,  3.3822, -0.8894, -1.2935]]), end_logits=tensor([[-1.2809, -5.4610, -5.0965, -5.2911, -5.5386, -5.4595, -5.3126, -4.6202,
         -1.2809, -3.4540, -0.8159, -3.7237, -2.7964, -1.0135,  1.1267, -2.5709,
          5.6335,  0.0113, -3.9191,  0.5685,  5.2458, -1.2809]]), hidden_states=None, attentions=None)

In [67]:
outputs.start_logits

tensor([[-1.2935, -4.7204, -5.3716, -5.2326, -5.2625, -5.3654, -4.9590, -6.0611,
         -1.2935, -0.2044, -0.1700, -1.9096,  4.1928,  5.7759, -2.2901, -3.7464,
         -1.2213, -3.4085,  2.6456,  3.3822, -0.8894, -1.2935]])

In [68]:
# Keep the per-token start and end scores under shorter names.
start_logits, end_logits = outputs.start_logits, outputs.end_logits

In [69]:
# Index of the highest start logit. With no `dim`, argmax indexes the flattened
# tensor, which matches the token position here because the batch has one row.
start_idx = torch.argmax(start_logits)

In [71]:
# Index of the highest end logit; +1 because the slice that extracts the answer
# treats its upper bound as exclusive.
end_idx = torch.argmax(end_logits) + 1

In [72]:
end_idx

tensor(17)

In [76]:
# Token ids of the predicted answer: positions start_idx .. end_idx - 1 of the
# single encoded sequence.
answer_span = inputs["input_ids"][0][start_idx:end_idx]

In [77]:
answer = tokenizer.decode(answer_span)

In [78]:
answer

'1mb / minute'

### Dealing with long passages

##### Example 1

In [28]:
# Take the first training row and keep only its question and context fields.
example = dfs["train"].iloc[0][["question", "context"]]

In [31]:
example

question                                     How is the bass?
context     I have had Koss headphones in the past, Pro 4A...
Name: 0, dtype: object

In [32]:
# Split the (question, context) pair into overlapping windows of at most 100
# tokens. `truncation="only_second"` makes the truncation explicit (instead of
# relying on the implicit fallback triggered by `max_length`) and ensures that
# only the context is ever cut, never the question. `stride=25` is the number
# of tokens shared between consecutive windows.
tokenized_example = tokenizer(
    text=example["question"], text_pair=example["context"],
    truncation="only_second", return_overflowing_tokens=True,
    max_length=100, stride=25
)

In [33]:
tokenized_example

{'input_ids': [[101, 2129, 2003, 1996, 3321, 1029, 102, 1045, 2031, 2018, 12849, 4757, 2132, 19093, 1999, 1996, 2627, 1010, 4013, 26424, 2050, 1998, 1053, 2480, 1011, 5585, 1012, 1996, 12849, 4757, 3417, 9331, 3217, 2003, 12109, 1998, 2038, 2307, 3321, 3433, 1012, 1996, 2147, 2307, 2007, 2026, 11924, 3042, 1998, 2064, 2022, 1000, 4565, 2039, 1000, 2000, 2022, 3344, 1999, 2026, 9055, 6598, 2030, 3274, 4524, 2302, 2893, 24514, 2098, 1012, 2027, 2024, 2200, 2422, 1998, 2079, 2025, 2514, 3082, 2030, 4562, 2091, 2006, 2115, 5551, 2130, 2044, 5962, 2000, 2189, 2007, 2068, 2006, 2035, 2154, 1012, 1996, 2614, 2003, 102], [101, 2129, 2003, 1996, 3321, 1029, 102, 1998, 2079, 2025, 2514, 3082, 2030, 4562, 2091, 2006, 2115, 5551, 2130, 2044, 5962, 2000, 2189, 2007, 2068, 2006, 2035, 2154, 1012, 1996, 2614, 2003, 2305, 1998, 2154, 2488, 2084, 2151, 4540, 1011, 13007, 2071, 2022, 1998, 2024, 2471, 2004, 2204, 2004, 1996, 4013, 26424, 2050, 1012, 2027, 2024, 1000, 2330, 2250, 1000, 2132, 19093, 2061,

In [34]:
# Report the length of every window produced by the tokenizer.
windows = tokenized_example["input_ids"]
for idx, window in enumerate(windows):
    print(f"window #{idx} has {len(window)} tokens")

window #0 has 100 tokens
window #1 has 88 tokens


In [35]:
# Decode each window back to text so the overlap between windows is visible.
for window in tokenized_example["input_ids"]:
    print(f"{tokenizer.decode(window)}\n")

[CLS] how is the bass? [SEP] i have had koss headphones in the past, pro 4aa and qz - 99. the koss portapro is portable and has great bass response. the work great with my android phone and can be " rolled up " to be carried in my motorcycle jacket or computer bag without getting crunched. they are very light and don't feel heavy or bear down on your ears even after listening to music with them on all day. the sound is [SEP]

[CLS] how is the bass? [SEP] and don't feel heavy or bear down on your ears even after listening to music with them on all day. the sound is night and day better than any ear - bud could be and are almost as good as the pro 4aa. they are " open air " headphones so you cannot match the bass to the sealed types, but it comes close. for $ 32, you cannot go wrong. [SEP]

