# Question answering (PyTorch)

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [21]:
!pip install datasets evaluate transformers[sentencepiece]
!pip install accelerate

!apt install git-lfs

Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting sentencepiece!=0.1.92,>=0.1.91 (from transforme

Loader

In [22]:
from datasets import load_dataset
import datasets

Load Model

In [None]:
from transformers import AutoTokenizer

model_checkpoint = "deepset/deberta-v3-large-squad2"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Preprocessing Train Data

For Compatible with model

In [23]:
max_length = 384
stride = 128


def preprocess_training_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

Load Dataset

In [24]:
import pandas as pd
df_train=pd.read_csv('/content/train.csv')
df_test=pd.read_csv('/content/test.csv')
df_train

Unnamed: 0,item_id,domain,nn_mod,nn_asp,query_mod,query_asp,q_review_id,q_reviews_id,question,question_subj_level,ques_subj_score,is_ques_subjective,review_id,review,human_ans_spans,human_ans_indices,answer_subj_level,ans_subj_score,is_ans_subjective
0,B00BVMXBDO,movies,addictive,show,full,series,d9a9615d45df2f6e6108db4ca46bfded,399f1046fe6bd97990107f9d7aa86f4a,Who is the author of this series?,1,0.0,False,090671369dddfeb02db9bf7125a47c79,Whether it be in her portrayal of a nerdy lesb...,ANSWERNOTFOUND,"(251, 265)",1,0.000,False
1,1404918051,movies,enough simple,film,charming,movie,06ffe37a8023636a3ce00b020a517e87,42d9dd5b0c67150cac1e13308811cbb5,Can we enjoy the movie along with our family ?,1,0.5,False,a29821121e74d319cb93f77101e99c88,"An outstanding romantic comedy, 13 Going on 30...",ANSWERNOTFOUND,"(1195, 1209)",1,0.000,False
2,B0000633ZP,movies,weak,plot,bad,one,3b625c68e91b9e6987a08b84a9a9d234,32d06ccf2132cda644aea791fa688c53,Does this one good?,5,0.6,True,12a1b821f761bd19a75be7b16cef4a7c,"To let the truth be known, I watched this movi...",ANSWERNOTFOUND,"(1476, 1490)",5,0.000,False
3,B0000AQS0F,movies,outstanding,show,wonderful,series,f3abfa98b011127e7cb49bcd07f8deeb,e546636f0bb9f93d5f24b4ade9ebab45,Is this series good and excelent?,1,0.6,True,cd0f92322e67cc9d70de6674caace78c,"At the time of my review, there had been 910 c...",this show is OUTSTANDING,"(296, 320)",1,0.875,True
4,B003Y5H5FG,movies,great,production design,great,costume design,1b03744e764b257592c2c768345c14bc,a0a97e460a194bcb3286fe68d20aadc2,How is the costume design?,1,0.0,False,f6b5024393ebc70287befdaf47a50b75,"""Fright Night"" is great! This is how the story...",The costume design by Susan Matheson is great,"(1254, 1299)",1,0.750,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2496,B00FZM8Z7I,movies,awesome,soundtrack,more,respect,41b28a42a25d9bbf466bee35db18fda8,0b61aca21a7a7a7330281087e837ab65,Who do you have more respect for?,1,0.5,False,5f4940e66691b0414fb4abbfa15dcc78,This movie is just great. The soundtrack is aw...,ANSWERNOTFOUND,"(371, 385)",1,0.000,False
2497,B00005JNTI,movies,good,moment,great,potential,4aad1be24631680e40b174f2cba620d9,27a4780e3330a6f47e71ce952d7bfe76,How is the potential?,1,1.0,True,765f22bb7d2b7e8b98f905e8a4c1a442,"Tom Cruise (Collateral, Vanilla Sky) stars as ...",ANSWERNOTFOUND,"(1787, 1801)",1,0.000,False
2498,0790731487,movies,few,laugh,funny,bit,a4d1520852e799ab1b0830956fcead5b,1cc8f6187a6cc2e3a264538479ed42d2,How is the bit?,1,0.0,False,705900af450309f7328ed78e812092bd,This review is for the Blu Ray version of Blaz...,ANSWERNOTFOUND,"(1339, 1353)",1,0.000,False
2499,630575067X,movies,excellent,story,fine,story,59036d77f1a9e034ec1c2ba46aeede9b,9fc9edc022f6eba0a00ed228636c8d84,Do you know Maria's story?,5,0.0,False,26caf6113662f003c0a1bc46509bcb58,"Hey, i know youve already seen this movie, if ...",ANSWERNOTFOUND,"(1063, 1077)",5,0.000,False


In [25]:
df_train.iloc[0].question

'Who is the author of this series?'

In [26]:
df_train.iloc[0].review

"Whether it be in her portrayal of a nerdy lesbian or a punk rock rebel, Maslany's plural personalities, (though very stereotypical), are entertaining eye-candy. Combined with a complex and unpredictable plot line, this show is surprisingly addictive. ANSWERNOTFOUND"

In [27]:
df_train.iloc[0].human_ans_indices

'(251, 265)'

In [28]:
df_train.iloc[0].review[251:265]

'ANSWERNOTFOUND'

Customization in Dataset

In [29]:
df_train=df_train[['question','human_ans_indices','review','human_ans_spans']]
df_test=df_test[['question','human_ans_indices','review','human_ans_spans']]

In [30]:
import numpy as np

df_train['id']=np.linspace(0,len(df_train)-1,len(df_train))
df_test['id']=np.linspace(0,len(df_test)-1,len(df_test))

df_train['id']=df_train['id'].astype(str)
df_test['id']=df_test['id'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['id']=np.linspace(0,len(df_train)-1,len(df_train))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['id']=df_train['id'].astype(str)


In [31]:
df_train['answers']=df_train['human_ans_spans']
df_test['answers']=df_test['human_ans_spans']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['answers']=df_train['human_ans_spans']


In [32]:
for i in range(0,len(df_train)):
  answer1={}
  si=int(df_train.iloc[i].human_ans_indices.split('(')[1].split(',')[0])
  ei=int(df_train.iloc[i].human_ans_indices.split('(')[1].split(',')[1].split(' ')[1].split(')')[0])
  answer1['text']=[df_train.iloc[i].review[si:ei]]
  answer1['answer_start']=[si]
  df_train.at[i, 'answers']=answer1
  # print(df_train.iloc[i].answers,df_train.iloc[i].human_ans_spans)

In [33]:
for i in range(0,len(df_test)):
  answer1={}
  si=int(df_test.iloc[i].human_ans_indices.split('(')[1].split(',')[0])
  ei=int(df_test.iloc[i].human_ans_indices.split('(')[1].split(',')[1].split(' ')[1].split(')')[0])
  answer1['text']=[df_test.iloc[i].review[si:ei]]
  answer1['answer_start']=[si]
  df_test.at[i, 'answers']=answer1
  #print(df_train.iloc[i].answers,df_train.iloc[i].human_ans_spans)

In [34]:
df_train.columns=['question', 'human_ans_indices', 'context', 'human_ans_spans', 'id',
       'answers']

df_test.columns=['question', 'human_ans_indices', 'context', 'human_ans_spans','id',
       'answers']

In [35]:
val_dataset2 = datasets.Dataset.from_pandas(df_test)
train_dataset2 = datasets.Dataset.from_pandas(df_train)

Datset Mapping

In [None]:
train_dataset = train_dataset2.map(
    preprocess_training_examples,
    batched=True,
    remove_columns=train_dataset2.column_names,
)

Preprocessing Validation data

In [None]:
def preprocess_validation_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    inputs["example_id"] = example_ids
    return inputs

In [None]:
validation_dataset = val_dataset2.map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=val_dataset2.column_names,
)

AutoModelForQuestionAnswering Pipline

In [None]:
import torch
from transformers import AutoModelForQuestionAnswering


In [None]:
import collections



In [None]:
import evaluate

metric = evaluate.load("squad")

Evaluate metrics

In [None]:
from tqdm.auto import tqdm


def compute_metrics(start_logits, end_logits, features, examples):
    example_to_features = collections.defaultdict(list)
    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(idx)

    predicted_answers = []
    for example in tqdm(examples):
        example_id = example["id"]
        context = example["context"]
        answers = []

        # Loop through all features associated with that example
        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip answers that are not fully in the context
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Skip answers with a length that is either < 0 or > max_answer_length
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    answer = {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                    answers.append(answer)

        # Select the answer with the best score
        if len(answers) > 0:
            best_answer = max(answers, key=lambda x: x["logit_score"])
            predicted_answers.append(
                {"id": example_id, "prediction_text": best_answer["text"]}
            )
        else:
            predicted_answers.append({"id": example_id, "prediction_text": ""})

    theoretical_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]
    return metric.compute(predictions=predicted_answers, references=theoretical_answers)

Training

In [None]:
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

In [None]:
from transformers import TrainingArguments

args = TrainingArguments(
    "roberta-finetuned-subjqa-movies_2",
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=35,
    weight_decay=0.01,
    fp16=True
)

In [None]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
)


In [None]:
import numpy as np
n_best=20
max_answer_length = 30

Evaluation before Training

In [None]:
predictions, _, _ = trainer.predict(validation_dataset)
start_logits, end_logits = predictions
compute_metrics(start_logits, end_logits, validation_dataset, val_dataset2)

  0%|          | 0/582 [00:00<?, ?it/s]

{'exact_match': 2.7491408934707904, 'f1': 9.852373992728378}

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,1.2841,No log
2,1.0444,No log
3,0.8994,No log
4,0.7658,No log
5,0.6674,No log
6,0.5889,No log
7,0.5237,No log
8,0.4945,No log
9,0.4723,No log
10,0.4327,No log


TrainOutput(global_step=21280, training_loss=0.4245112999937588, metrics={'train_runtime': 6066.4597, 'train_samples_per_second': 28.051, 'train_steps_per_second': 3.508, 'total_flos': 3.334865182032384e+16, 'train_loss': 0.4245112999937588, 'epoch': 35.0})

In [None]:
trainer.save_model("/content/drive/MyDrive/Colab Notebooks")

Evaluation after Training

In [None]:
predictions, _, _ = trainer.predict(validation_dataset)
start_logits, end_logits = predictions
compute_metrics(start_logits, end_logits, validation_dataset, val_dataset2)

  0%|          | 0/582 [00:00<?, ?it/s]

{'exact_match': 65.97938144329896, 'f1': 66.91166797703713}

In [9]:
!pip install tensorflow_probability==0.12.2

Collecting tensorflow_probability==0.12.2
  Downloading tensorflow_probability-0.12.2-py2.py3-none-any.whl (4.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.8/4.8 MB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorflow_probability
  Attempting uninstall: tensorflow_probability
    Found existing installation: tensorflow-probability 0.22.0
    Uninstalling tensorflow-probability-0.22.0:
      Successfully uninstalled tensorflow-probability-0.22.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
dopamine-rl 4.0.6 requires tensorflow-probability>=0.13.0, but you have tensorflow-probability 0.12.2 which is incompatible.[0m[31m
[0mSuccessfully installed tensorflow_probability-0.12.2


#Inference!

In [10]:
from transformers import pipeline

In [11]:
# Replace this with your own checkpoint
model_checkpoint2 = "/content/drive/MyDrive/drive/Roberta-finturned-subjqa"
question_answerer = pipeline("question-answering", model=model_checkpoint2)

In [13]:
import pandas as pd
df_train1=pd.read_csv('/content/train.csv')
df_test1=pd.read_csv('/content/test.csv')

In [14]:
df_train1.iloc[3].question

'Is this series good and excelent?'

In [15]:
context = df_train1.iloc[3].review
question = df_train1.iloc[3].question
question_answerer(question=question, context=context)

{'score': 0.2530508041381836,
 'start': 296,
 'end': 320,
 'answer': 'this show is OUTSTANDING'}

In [16]:
# Replace this with your own checkpoint
model_checkpoint_o = "deepset/deberta-v3-large-squad2"
question_answerer_old = pipeline("question-answering", model=model_checkpoint_o)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/392 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/8.65M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/18.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

In [17]:
# context = df_train1.iloc[3].review
# question = df_train1.iloc[3].question
question_answerer_old(question=question, context=context)

{'score': 0.7293888330459595,
 'start': 308,
 'end': 321,
 'answer': ' OUTSTANDING!'}

In [18]:
qa_new = question_answerer(question=question, context=context)
qa_old = question_answerer_old(question=question, context=context)
print(f'''Finetuned_model-{qa_new['answer']}, Original_model-{qa_old['answer']}''')

Finetuned_model-this show is OUTSTANDING, Original_model- OUTSTANDING!


In [37]:
df_train.iloc[3].question

'Is this series good and excelent?'

In [38]:
ans = df_train.iloc[3].answers
ans['text']

['this show is OUTSTANDING']

In [39]:
df_train[['id','question','context','answers']].head()

Unnamed: 0,id,question,context,answers
0,0.0,Who is the author of this series?,Whether it be in her portrayal of a nerdy lesb...,"{'text': ['ANSWERNOTFOUND'], 'answer_start': [..."
1,1.0,Can we enjoy the movie along with our family ?,"An outstanding romantic comedy, 13 Going on 30...","{'text': ['ANSWERNOTFOUND'], 'answer_start': [..."
2,2.0,Does this one good?,"To let the truth be known, I watched this movi...","{'text': ['ANSWERNOTFOUND'], 'answer_start': [..."
3,3.0,Is this series good and excelent?,"At the time of my review, there had been 910 c...","{'text': ['this show is OUTSTANDING'], 'answer..."
4,4.0,How is the costume design?,"""Fright Night"" is great! This is how the story...",{'text': ['The costume design by Susan Matheso...


In [None]:
import csv
import os

base_folder = 'static/'
if not os.path.isdir(base_folder):
    os.mkdir(base_folder)
output_file = base_folder+"QA.csv"
with open(output_file, "w", newline="", encoding="utf-8") as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(["Question", "Answer", "Finetuned_Answer", "Base_Answer"])  # Writing the header row

    print("Question: ", question)
    print("Answer: ", ans['text'])
    print("Finetuned_Answer:", qa_new['answer'])
    print("Base_Answer:", qa_old['answer'])
    print("--------------------------------------------------\n\n")

    # Save answer to CSV file
    csv_writer.writerow([question, ans['text'], qa_new['answer'], qa_old['answer']])