In [None]:
# Mount Google Drive so that files under /content/drive can be read by later cells.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
%%capture
# Install the Hugging Face libraries imported below, plus sentencepiece.
# The %%capture magic on the first line hides pip's output.
!pip install transformers
!pip install sentencepiece
!pip install datasets

In [None]:
import pandas as pd
import numpy as np
from transformers import AutoModelForQuestionAnswering, AutoTokenizer,TrainingArguments, Trainer
import seaborn as sns
from datasets import Dataset
import warnings
warnings.filterwarnings('ignore')
import collections

In [None]:
max_length = 384 # The maximum length of a feature (question and context)
doc_stride = 128 # The authorized overlap between two parts of the context when splitting it is needed.
batch_size = 16 # Per-device batch size handed to TrainingArguments for both training and evaluation.

In [None]:
# Read the QA table from Drive, discard the leftover CSV index column,
# and drop every row that has a missing value.
df = (
    pd.read_csv("/content/drive/MyDrive/qa_data.csv")
    .drop(columns=['Unnamed: 0'])
    .dropna()
)

In [None]:
# Checkpoint name shared by the tokenizer here and the model loaded further down.
model_name = "distilbert-base-cased-distilled-squad"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# True when the tokenizer pads on the right; the feature-preparation functions use it
# to decide whether the question or the context is passed to the tokenizer first.
pad_on_right = tokenizer.padding_side == "right"

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [None]:
# Convert each row's answer into the SQuAD-style format used by the feature preparation below.
def convert_answers(r):
    """Wrap one row's answer offset and answer text in SQuAD-style lists.

    Parameters
    ----------
    r : sequence or pandas.Series
        At least two values, in order: the answer's start offset, then the answer text.

    Returns
    -------
    dict
        ``{'answer_start': [start], 'text': [text]}``.
    """
    # Unpack by iteration instead of ``r[0]`` / ``r[1]``: on a label-indexed Series
    # (as produced by ``DataFrame.apply(axis=1)``) integer keys are deprecated as
    # positional lookups, while iteration always yields the values in order.
    start, text, *_ = r
    return {
        'answer_start': [start],
        'text': [text]
    }
# Shuffle all rows with a fixed seed, then attach the SQuAD-style answers dict to every row.
df = df.sample(frac=1, random_state=42)
df['answers'] = df[['answer_start', 'text']].apply(convert_answers, axis=1)

In [None]:
# Positional splits of the shuffled frame: the first 40,000 rows train, the next 10,000 test.
df_train=df[:40000].reset_index(drop=True)
df_test = df[40000:50000].reset_index(drop=True)
# NOTE(review): validation starts at row 85,000, so rows 50,000-84,999 land in no split —
# confirm that this gap is intended.
df_val=df[85000:].reset_index(drop=True)

In [None]:
# Wrap each pandas split in a Hugging Face Dataset (train, test, validation — in that order).
train_dataset, test_dataset, valid_dataset = (
    Dataset.from_pandas(split_frame) for split_frame in (df_train, df_test, df_val)
)

In [None]:
def prepare_train_features(examples):
    """Tokenize a batch of question/context pairs and label every feature with answer token positions.

    Because overflowing tokens are returned with a stride of ``doc_stride``, one example with a
    long context can yield several features, so the output may hold more rows than ``examples``.

    Parameters
    ----------
    examples : mapping of column name -> list
        Must provide ``"question"``, ``"context"`` and ``"answers"``. Each answers entry is read
        as a dict with ``"answer_start"`` and ``"text"`` lists. ``examples["question"]`` is
        overwritten with the left-stripped questions.

    Returns
    -------
    The tokenizer output with ``overflow_to_sample_mapping`` and ``offset_mapping`` popped and
    ``start_positions`` / ``end_positions`` added (one entry per feature).
    """
    examples["question"] = [q.lstrip() for q in examples["question"]]

    # Tokenize our examples with truncation and padding, but keep the overflows using a stride. This results
    # in one example possibly giving several features when a context is long, each of those features having a
    # context that overlaps a bit the context of the previous feature.
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )
    # sample_mapping[i] is the index of the example that feature i was cut from;
    # offset_mapping[i] holds the (start, end) character offsets of feature i's tokens.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized_examples.pop("offset_mapping")

    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized_examples["input_ids"][i]
        # Position of the CLS token: used as the label when this feature has no usable answer.
        cls_index = input_ids.index(tokenizer.cls_token_id)
        sequence_ids = tokenized_examples.sequence_ids(i)

        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]

        if len(answers["answer_start"]) == 0:
            # No answer given for this example: label the feature with the CLS position.
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Character span [start_char, end_char) of the first answer.
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])
            # Advance to the first token whose sequence id marks it as context.
            token_start_index = 0
            while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
                token_start_index += 1

            # Walk back from the end to the last token whose sequence id marks it as context.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
                token_end_index -= 1

            # If the answer is not fully covered by this feature's context window, label with CLS.
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Step past every token that starts at or before start_char, then back up one:
                # that token is recorded as the answer start.
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                # Mirror image for the end: step back past every token ending at or after
                # end_char, then move forward one.
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples


In [None]:
# Tokenize the training and validation splits. Each call removes the raw columns of the
# dataset it is applied to, so only the tokenizer outputs and the position labels remain.
tokenized_train_ds = train_dataset.map(prepare_train_features, batched=True, remove_columns=train_dataset.column_names)
tokenized_valid_ds = valid_dataset.map(prepare_train_features, batched=True, remove_columns=valid_dataset.column_names)

Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1820 [00:00<?, ? examples/s]

In [None]:
# Inspect the tokenized training set (feature names and row count).
tokenized_train_ds

Dataset({
    features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 40546
})

In [None]:
# Load the question-answering model for the same checkpoint name as the tokenizer.
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

In [None]:
# Training configuration: one epoch, with evaluation and checkpointing at every epoch end.
# Gradients are accumulated over 8 steps of `batch_size` examples per device.
args = TrainingArguments(
    output_dir="qa",
    num_train_epochs=1,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=8,
    learning_rate=3e-5,
    warmup_ratio=0.1,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
)

In [None]:
from transformers import default_data_collator

# Features are already padded to max_length during tokenization; the default collator is used.
data_collator = default_data_collator

# Bundle the model, its training configuration and the tokenized splits into one Trainer.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_valid_ds,
    tokenizer=tokenizer,
)

In [None]:
# trainer.train()

In [None]:
# trainer.save_model("bert-trained")

In [None]:
import torch

# Restore the fine-tuned weights saved on Drive into `model`, the same object the Trainer wraps.
# map_location="cpu" lets the file load even when the device it was saved from is unavailable;
# load_state_dict then copies the tensors into the model's existing parameters.
checkpoint = torch.load('/content/drive/MyDrive/bert-trained/pytorch_model.bin', map_location="cpu")
# load_state_dict returns a report of missing/unexpected keys, not a Trainer. Its result must
# therefore NOT be assigned to `trainer`, which later cells still use for trainer.predict(...).
model.load_state_dict(checkpoint)

POSTPROCESSING

In [None]:
def prepare_validation_features(examples):
    """Tokenize a batch of question/context pairs for inference.

    Long contexts overflow into several features (stride ``doc_stride``). Every feature gets an
    ``example_id`` naming the example it came from, and its ``offset_mapping`` keeps character
    offsets only for context tokens (all other positions are set to ``None``).

    ``examples`` must provide ``"question"``, ``"context"`` and ``"id"``; its ``"question"``
    entry is overwritten with the left-stripped questions. Returns the tokenizer output.
    """
    examples["question"] = [question.lstrip() for question in examples["question"]]

    # The question goes first when the tokenizer pads on the right, otherwise the context does.
    if pad_on_right:
        first_column, second_column, truncate = "question", "context", "only_second"
    else:
        first_column, second_column, truncate = "context", "question", "only_first"

    # Truncate only the context and keep the overflow as extra, overlapping features.
    encoded = tokenizer(
        examples[first_column],
        examples[second_column],
        truncation=truncate,
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # feature_to_example[i] is the index of the example that feature i was cut from.
    feature_to_example = encoded.pop("overflow_to_sample_mapping")
    # Sequence id carried by context tokens, given the argument order chosen above.
    context_index = 1 if pad_on_right else 0

    encoded["example_id"] = []
    for feature_idx in range(len(encoded["input_ids"])):
        seq_ids = encoded.sequence_ids(feature_idx)
        encoded["example_id"].append(examples["id"][feature_to_example[feature_idx]])

        # Blank out offsets of every non-context token so they can be recognised later.
        masked_offsets = []
        for position, span in enumerate(encoded["offset_mapping"][feature_idx]):
            masked_offsets.append(span if seq_ids[position] == context_index else None)
        encoded["offset_mapping"][feature_idx] = masked_offsets

    return encoded

In [None]:
# Build inference features for the validation split; the raw columns are dropped, leaving the
# tokenizer outputs plus example_id and the masked offset_mapping.
validation_features = valid_dataset.map(
    prepare_validation_features,
    batched=True,
    remove_columns=valid_dataset.column_names
)

Map:   0%|          | 0/1820 [00:00<?, ? examples/s]

In [None]:
# Inspect the validation features (feature names and row count).
validation_features

Dataset({
    features: ['input_ids', 'attention_mask', 'offset_mapping', 'example_id'],
    num_rows: 1835
})

In [None]:
# Keep only the model inputs: drop the bookkeeping columns directly with remove_columns
# instead of running an identity map over every row just to discard them.
valid_feats_small = validation_features.remove_columns(['example_id', 'offset_mapping'])
valid_feats_small

Map:   0%|          | 0/1835 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 1835
})

In [None]:
# Build inference features for the test split, dropping the test split's own raw columns
# (previously the validation split's column list was used here).
test_features = test_dataset.map(
    prepare_validation_features,
    batched=True,
    remove_columns=test_dataset.column_names
)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [None]:
# Keep only the model inputs: drop the bookkeeping columns directly with remove_columns
# instead of running an identity map over every row just to discard them.
test_feats_small = test_features.remove_columns(['example_id', 'offset_mapping'])

Map:   0%|          | 0/10135 [00:00<?, ? examples/s]

In [None]:
# Run the model over the test features; `.predictions` of the result is post-processed further down.
test_predictions = trainer.predict(test_feats_small)

In [None]:
# Run the model over the validation features; `.predictions` of the result is post-processed further down.
raw_predictions = trainer.predict(valid_feats_small)

In [None]:
# Module-level value; postprocess_qa_predictions has its own max_answer_length parameter
# (default 30) and does not read this global.
max_answer_length = 30

In [None]:
import collections

# Group feature row indices by the position of the example they were cut from.
# NOTE(review): postprocess_qa_predictions rebuilds this same mapping from its own arguments,
# and no later visible cell reads these module-level names — this cell looks like scratch work.
examples = valid_dataset
features = validation_features

# Example id -> position of that example in `examples`.
example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
# Example position -> list of feature row indices belonging to it.
features_per_example = collections.defaultdict(list)
for i, feature in enumerate(features):
    features_per_example[example_id_to_index[feature["example_id"]]].append(i)

In [None]:
from tqdm.auto import tqdm

def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size = 20, max_answer_length = 30):
    """Turn per-feature start/end logits into one answer string per example.

    Parameters
    ----------
    examples
        Collection of examples. ``examples["id"]`` must list every id, and iterating it must
        yield mappings with ``"id"`` and ``"context"``.
    features
        Indexable, iterable collection of features. Each one provides ``"example_id"`` and an
        ``"offset_mapping"`` whose entries are ``(start_char, end_char)`` pairs, or ``None``
        for positions that must not be part of an answer.
    raw_predictions
        Pair ``(all_start_logits, all_end_logits)``, each indexable by feature position.
    n_best_size : int
        How many of the highest start logits and end logits are combined per feature.
    max_answer_length : int
        Maximum answer length in tokens (``end_index - start_index + 1``).

    Returns
    -------
    collections.OrderedDict
        Example id -> text of the highest-scoring valid span, or ``""`` when an example has no
        valid span (including examples for which no feature was supplied).
    """
    all_start_logits, all_end_logits = raw_predictions

    # Map every example position to the feature rows that were cut from that example.
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[example_id_to_index[feature["example_id"]]].append(i)

    predictions = collections.OrderedDict()

    print(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

    for example_index, example in enumerate(tqdm(examples)):
        context = example["context"]
        valid_answers = []

        for feature_index in features_per_example[example_index]:
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            offset_mapping = features[feature_index]["offset_mapping"]

            # Indices of the n_best_size largest logits, best first.
            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
            for start_index in start_indexes:
                # A start outside the offsets, or on a masked position, can never begin an answer.
                if start_index >= len(offset_mapping) or offset_mapping[start_index] is None:
                    continue
                for end_index in end_indexes:
                    if end_index >= len(offset_mapping) or offset_mapping[end_index] is None:
                        continue
                    # Reject reversed spans and spans longer than max_answer_length tokens.
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append(
                        {
                            "score": start_logits[start_index] + end_logits[end_index],
                            "text": context[start_char: end_char]
                        }
                    )

        # max() returns the first highest-scoring candidate, the same one a stable
        # descending sort would put first, without sorting the whole list.
        if valid_answers:
            best_text = max(valid_answers, key=lambda answer: answer["score"])["text"]
        else:
            best_text = ""

        predictions[example["id"]] = best_text

    return predictions


In [None]:
# Convert the validation logits into one predicted answer string per example id.
final_predictions = postprocess_qa_predictions(valid_dataset, validation_features, raw_predictions.predictions)

Post-processing 1820 example predictions split into 1835 features.


  0%|          | 0/1820 [00:00<?, ?it/s]

In [None]:
# Ground truth for scoring: each validation example's id and its first answer text.
references = [{"id": ex["id"], "answer": ex["answers"]['text'][0]} for ex in valid_dataset]

In [None]:
def jaccard(row):
    """Word-level Jaccard similarity between the first two items of ``row``.

    Both strings are lower-cased and split on whitespace; the score is the number of
    shared distinct words divided by the number of distinct words in either string.

    Parameters
    ----------
    row : sequence or pandas.Series
        At least two strings, e.g. (reference answer, predicted answer).

    Returns
    -------
    float
        Value in ``[0.0, 1.0]``. Two strings without any words (for example two empty
        strings) count as identical and score ``1.0`` instead of dividing by zero.
    """
    # Unpack by iteration instead of ``row[0]`` / ``row[1]``: integer keys on a
    # label-indexed Series are deprecated as positional lookups.
    str1, str2, *_ = row
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    union_size = len(a | b)
    if union_size == 0:
        # Neither string has any words: treat them as a perfect match.
        return 1.0
    return float(len(a & b)) / union_size

In [None]:
# Pair every validation reference with its predicted text and score the word overlap per row.
res = pd.DataFrame(references)
res['prediction'] = [final_predictions[example_id] for example_id in res['id']]
res['jaccard'] = res[['answer', 'prediction']].apply(jaccard, axis=1)
res

Unnamed: 0,id,answer,prediction,jaccard
0,5726a7a0f1498d1400e8e64c,1523,1523,1.000000
1,57278b875951b619008f8d03,"""conservatives.""","Party ""conservatives",0.000000
2,56e132ebcd28a01900c67699,14th century,14th century,1.000000
3,57291cb21d0469140077905e,genetic,genetic,1.000000
4,57060ec552bb891400689829,Liberal Democrats,Liberal Democrats,1.000000
...,...,...,...,...
1815,56d99d69dc89441400fdb5f4,English Mastiff,English Mastiff,1.000000
1816,572818e3ff5b5019007d9d31,"Graphite, OpenType, or AAT technologies","Graphite, OpenType, or AAT technologies",1.000000
1817,5733bc38d058e614000b6187,resistance of bacteria,resistance,0.333333
1818,56cbdea66d243a140015edae,7,1817,0.000000


In [None]:
# Mean word-level Jaccard score over the validation split.
res.jaccard.mean()

0.8627660148748413

In [None]:
# Convert the test logits into one predicted answer string per example id.
final_test_predictions = postprocess_qa_predictions(test_dataset, test_features, test_predictions.predictions)

In [None]:
# Display the id -> predicted answer mapping for the test split (one entry per test example).
final_test_predictions

In [None]:
# Ground truth for scoring: each test example's id and its first answer text.
test_references = [{"id": ex["id"], "answer": ex["answers"]['text'][0]} for ex in test_dataset]

In [None]:
# Pair every test reference with its predicted text and score the word overlap per row.
test_res = pd.DataFrame(test_references)
test_res['prediction'] = [final_test_predictions[example_id] for example_id in test_res['id']]
test_res['jaccard'] = test_res[['answer', 'prediction']].apply(jaccard, axis=1)
test_res

In [None]:
# Mean word-level Jaccard score over the test split.
test_res.jaccard.mean()

0.8639086062102972

# **DEMO**

In [None]:
# 1-based position of the row to demo; the cells below index with Example_number-1.
Example_number = 12

In [None]:
# Show the raw test example at that position.
test_dataset[Example_number-1]

In [None]:
# Predict on the single feature row at position Example_number-1.
# NOTE(review): this indexes feature rows, while the previous cell indexes test_dataset examples.
# The two positions only coincide if no earlier example was split into several features — confirm.
raw_pred = trainer.predict(test_feats_small.select(range(Example_number-1, Example_number)))

In [None]:
# Id of the example that the selected feature row was cut from.
QA_id = test_features[Example_number-1:Example_number]['example_id'][0]

'57325fade17f3d14004228f9'

In [None]:
# Post-process the one selected feature. The full test_dataset is still passed in, so every other
# example is walked too and gets an empty-string prediction.
# NOTE(review): this rebinds `test_predictions`, which earlier held the trainer.predict output for
# the whole test split; re-running the earlier post-processing cell after this one would break.
test_predictions = postprocess_qa_predictions(test_dataset, test_features.select(range(Example_number-1, Example_number)), raw_pred.predictions)

Post-processing 10000 example predictions split into 1 features.


  0%|          | 0/10000 [00:00<?, ?it/s]

In [None]:
# Predicted answer text for the demo example.
test_predictions[QA_id]

'Oveta Culp Hobby'

# **Custom data**

In [None]:
# Load the custom demo rows from demo.csv (path is relative to the working directory) and lift
# pandas' column-width limit so the long context strings are displayed in full.
demo_data = pd.read_csv('demo.csv')
pd.set_option('display.max_colwidth', None)

In [None]:
# Display the custom demo rows.
demo_data

Unnamed: 0,context,question,id,answer_start,text
0,"Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend ""Venite Ad Me Omnes"". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.",'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',xyz,,Saint Bernadette Soubirous
1,"In recent years, deep learning has become a powerful tool for solving a wide range of problems in various fields, such as computer vision, natural language processing, and speech recognition. Deep learning models are able to learn complex representations of data and often outperform traditional machine learning methods. However, they require large amounts of data and computing resources to train.",What is deep learning?,abc,,"Deep learning is a powerful tool for solving a wide range of problems in various fields, such as computer vision, natural language processing, and speech recognition."
2,"The Great Barrier Reef is the world's largest coral reef system, located in the Coral Sea, off the coast of Australia. It stretches for over 2,300 kilometers and is made up of over 2,900 individual reefs and 900 islands. The reef is home to an incredible diversity of marine life, including over 1,500 species of fish, 30 species of whales and dolphins, and 6 of the world's 7 species of marine turtle.",Where is the Great Barrier Reef located?,opn,,"The Great Barrier Reef is located in the Coral Sea, off the coast of Australia."
3,"The Amazon rainforest is the largest tropical rainforest in the world, spanning over 2.1 million square miles. It is home to an incredible diversity of plant and animal life, with many species found nowhere else on Earth. The rainforest plays a vital role in regulating the Earth's climate, producing oxygen, and storing carbon dioxide. However, the Amazon is facing many threats, including deforestation, mining, and climate change.",What threats is the Amazon rainforest facing?,hjk,,"The Amazon rainforest is facing threats such as deforestation, mining, and climate change."
4,"France is a Western European country famous for its wines, food, and fashion. Its capital city is Paris.",What is the capital city of France?,ikl,,capital city is Paris
5,John is a software developer who works for a company that develops mobile applications. He is responsible for creating new features and fixing bugs in the code. John enjoys his work and is always eager to learn new programming languages and techniques.,What are some of the qualities that make John a good software developer?,fgh,,"he is responsible, enjoys his work, and is always eager to learn new programming languages and techniques."
6,"Music has been an important part of human culture for thousands of years. From ancient civilizations to modern times, people have used music to express themselves, tell stories, and connect with others. Music has the power to evoke strong emotions, whether it's joy, sadness, or nostalgia. It can also have therapeutic benefits, helping people to relax, reduce stress, and improve their mental health.",What are some of the benefits of listening to music?,rty,,"herapeutic benefits, helping people to relax, reduce stress, and improve their mental health"
7,"IIIT Dharwad is an institute of national importance situated in Dharwad, Karnataka. It trains students under the disciplines of ECE, CSE and DSAI.",How many departments does the institute have?,fdk,,three
8,"Penguins are a group of flightless birds that are highly adapted to life in the water. There are 18 different species of penguins, ranging in size from the small fairy penguin (which is only about 16 inches tall) to the emperor penguin (which can grow up to 4 feet tall). Penguins are found primarily in the Southern Hemisphere, and many species live in the frigid waters surrounding Antarctica.",What do pandas eat?,invalid,,Bamboo


In [None]:
# Wrap the demo rows in a Dataset and build inference features from them.
# remove_columns is commented out here, so demo_features keeps the raw columns alongside the
# tokenizer outputs.
demo_dataset = Dataset.from_pandas(demo_data)
demo_features = demo_dataset.map(
    prepare_validation_features,
    batched=True,
    # remove_columns=valid_dataset.column_names
)

# demo_features = demo_features.map(lambda example: example, remove_columns=['example_id', 'offset_mapping'])

Map:   0%|          | 0/9 [00:00<?, ? examples/s]

In [None]:
# Run the model over the demo features (passed with all of their columns).
demo_raw_preds = trainer.predict(demo_features)

In [None]:
# Inspect the raw demo dataset (column names and row count).
demo_dataset

Dataset({
    features: ['context', 'question', 'id', 'answer_start', 'text'],
    num_rows: 9
})

In [None]:
# Inspect the demo features (raw columns plus tokenizer outputs).
demo_features

Dataset({
    features: ['context', 'question', 'id', 'answer_start', 'text', 'input_ids', 'attention_mask', 'offset_mapping', 'example_id'],
    num_rows: 9
})

In [None]:
# Convert the demo logits into one predicted answer string per demo id.
demo_post = postprocess_qa_predictions(demo_dataset, demo_features, demo_raw_preds.predictions)

Post-processing 9 example predictions split into 9 features.


  0%|          | 0/9 [00:00<?, ?it/s]

In [None]:
# Display the id -> predicted answer mapping for the demo rows.
demo_post

OrderedDict([('xyz', 'Saint Bernadette Soubirous'),
             ('abc', 'powerful tool for solving a wide range of problems'),
             ('opn', 'Coral Sea'),
             ('hjk', 'deforestation, mining, and climate change'),
             ('ikl', 'Paris'),
             ('fgh', 'creating new features and fixing bugs in the code'),
             ('rty',
              'therapeutic benefits, helping people to relax, reduce stress, and improve their mental health'),
             ('fdk', 'ECE, CSE and DSAI'),
             ('invalid', 'water')])