<a href="https://colab.research.google.com/github/upashanadutta23/DLPROJECT/blob/main/sp25_UpaHarYam.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install datasets transformers torch

Collecting datasets
  Downloading datasets-3.3.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.0-py3-none-any.whl (484 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m484.9/484.9 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading 

In [None]:
import re
import torch
from datasets import load_dataset, load_from_disk, DatasetDict
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

In [None]:
# Fetch SQuAD v2 from the Hugging Face Hub and show which splits it provides.
raw_dataset = load_dataset(path = "rajpurkar/squad_v2")
print(raw_dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 130319
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 11873
    })
})


In [None]:
# Hold out 10,000 shuffled training rows; the fixed seed keeps the split
# identical from run to run.
split_dataset = raw_dataset["train"].train_test_split(test_size = 10000, shuffle = True, seed = 42)
print(split_dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 120319
    })
    test: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10000
    })
})


In [None]:
# The held-out part of the original train split is used as validation data,
# while the dataset's own "validation" split is kept aside as the test set.
train_dataset, validation_dataset = split_dataset["train"], split_dataset["test"]
test_dataset = raw_dataset["validation"]

print(f"Train Dataset is {train_dataset}\n")
print(f"Validation Dataset is {validation_dataset}\n")
print(f"Test Dataset is {test_dataset}\n")

Train Dataset is Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 120319
})

Validation Dataset is Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 10000
})

Test Dataset is Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 11873
})



In [None]:
def clean_text(text):
  """Collapse every run of whitespace into one space and trim both ends.

  Args:
    text: The string to normalize.

  Returns:
    A new string in which each whitespace run (spaces, tabs, newlines, ...)
    is replaced by a single space and leading/trailing whitespace is removed.
  """
  # Raw string: '\s' inside a normal literal is an invalid escape sequence
  # that Python warns about.
  text = re.sub(r'\s+', ' ', text)
  return text.strip()

In [None]:
def _realign_answers(raw_context, answers):
  """Clean the answer texts and move their start offsets onto the cleaned context.

  Args:
    raw_context: The context string before cleaning.
    answers: Dict with parallel "text" and "answer_start" lists, where each
      start is a character offset into `raw_context`.

  Returns:
    A dict of the same layout whose offsets index into clean_text(raw_context).
  """
  cleaned_texts = []
  cleaned_starts = []
  for answer_text, answer_start in zip(answers["text"], answers["answer_start"]):
    # clean_text() strips whitespace at the start of the answer, so point the
    # offset at the first character that survives cleaning.
    answer_start += len(answer_text) - len(answer_text.lstrip())
    # Cleaning the prefix plus a one-character sentinel gives the cleaned
    # prefix length + 1: the sentinel stops the whitespace right before the
    # answer from being stripped, exactly as in the fully cleaned context.
    cleaned_starts.append(len(clean_text(raw_context[:answer_start] + "#")) - 1)
    cleaned_texts.append(clean_text(answer_text))
  return {"text": cleaned_texts, "answer_start": cleaned_starts}


def apply_clean_text(examples):
  """Normalize whitespace in a batch while keeping the answer spans aligned.

  Cleaning the context removes characters, which shifts every character
  offset behind the removed whitespace. The answer offsets are therefore
  recomputed against the cleaned context instead of being left untouched.

  Args:
    examples: Batch with "context" and "question" lists and, optionally, an
      "answers" list of dicts with "text" and "answer_start" lists.

  Returns:
    Dict with the cleaned "context" and "question" columns and, when the
    batch has answers, an "answers" column with cleaned texts and offsets
    that refer to the cleaned contexts.
  """
  raw_contexts = examples['context']
  cleaned = {
      "context": [clean_text(c) for c in raw_contexts],
      "question": [clean_text(q) for q in examples['question']]
  }
  if "answers" in examples:
    cleaned["answers"] = [
        _realign_answers(c, a) for c, a in zip(raw_contexts, examples['answers'])
    ]
  return cleaned


In [None]:
# Run the same whitespace clean-up over every split (train, validation, test,
# in that order), processing batches in four worker processes.
train_dataset, validation_dataset, test_dataset = (
    split.map(apply_clean_text, batched = True, num_proc = 4)
    for split in (train_dataset, validation_dataset, test_dataset)
)

Map (num_proc=4):   0%|          | 0/120319 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/10000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/11873 [00:00<?, ? examples/s]

In [None]:
# Load the tokenizer that matches the cased BERT base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path = "bert-base-cased")

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [None]:
def preprocess_training_examples(examples):
  """Tokenize question/context pairs and label each feature with its answer span.

  Pairs that do not fit into one input are split into several features whose
  context windows overlap by 128 tokens, so one example can yield more than
  one output row. Uses the module-level `tokenizer`, which must be a fast
  tokenizer (offset mappings and sequence ids are required).

  Args:
    examples: Batch with "question", "context" and "answers" columns. Each
      entry of "answers" is a dict with parallel "text" and "answer_start"
      lists; "answer_start" is a character offset into the matching context.

  Returns:
    The tokenizer output (still containing "offset_mapping") extended with
    "start_positions" and "end_positions". Both hold token indices of the
    first listed answer, or 0 when the example has no answer or the answer
    is not completely inside the feature's context window.
  """
  tokenized = tokenizer(
      examples["question"],
      examples["context"],
      # Only ever cut the context; the question must stay intact in every window.
      truncation = "only_second",
      stride = 128,
      return_overflowing_tokens = True,
      return_offsets_mapping = True,
      padding = "max_length"
  )
  offset_mapping = tokenized["offset_mapping"]
  # Maps every produced feature back to the example it was cut from.
  sample_map = tokenized.pop("overflow_to_sample_mapping")
  start_positions = []
  end_positions = []
  for i, offsets in enumerate(offset_mapping):
    sample_idx = sample_map[i]
    answers = examples["answers"][sample_idx]
    if len(answers["text"]) == 0:
      # Unanswerable question: point both labels at the first token.
      start_positions.append(0)
      end_positions.append(0)
      continue
    answer_text = answers["text"][0]
    answer_start_char = answers["answer_start"][0]
    answer_end_char = answer_start_char + len(answer_text)
    # Question tokens carry offsets relative to the question string, so they
    # must be excluded or they can falsely match the answer's character range.
    sequence_ids = tokenized.sequence_ids(i)
    start_token_idx = None
    end_token_idx = None
    for idx, (start, end) in enumerate(offsets):
      if sequence_ids[idx] != 1:
        continue
      if start <= answer_start_char < end:
        start_token_idx = idx
      if start < answer_end_char <= end:
        end_token_idx = idx
        break
    if start_token_idx is None or end_token_idx is None:
      # The answer is missing from, or only partly inside, this window:
      # label it like an unanswerable feature instead of a half-open span.
      start_token_idx = 0
      end_token_idx = 0
    start_positions.append(start_token_idx)
    end_positions.append(end_token_idx)
  tokenized["start_positions"] = start_positions
  tokenized["end_positions"] = end_positions
  return tokenized

In [None]:
# Tokenize each split (train, test, validation, in that order) and drop the
# raw text columns so only the model inputs and labels remain.
tokenized_train, tokenized_test, tokenized_validation = (
    split.map(
        preprocess_training_examples,
        batched = True,
        remove_columns = split.column_names
    )
    for split in (train_dataset, test_dataset, validation_dataset)
)

Map:   0%|          | 0/120319 [00:00<?, ? examples/s]

Map:   0%|          | 0/11873 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [None]:
# Expose only the model inputs and labels, returned as torch tensors.
for tokenized_split in (tokenized_train, tokenized_test, tokenized_validation):
  tokenized_split.set_format(
      "torch",
      columns = ["input_ids", "attention_mask", "start_positions", "end_positions"]
  )

In [None]:
# Only the training batches are shuffled. Evaluation loaders keep the dataset
# order so results are reproducible and predictions stay aligned with the
# features they were computed from.
train_dataloader = DataLoader(tokenized_train, shuffle = True, batch_size = 8)
test_dataloader = DataLoader(tokenized_test, shuffle = False, batch_size = 8)
validation_dataloader = DataLoader(tokenized_validation, shuffle = False, batch_size = 8)

In [None]:
# Pull one training batch and list the shape of every tensor in it.
sample_batch = next(iter(train_dataloader))
for name, tensor in sample_batch.items():
  print(name, tensor.shape)

input_ids torch.Size([8, 512])
attention_mask torch.Size([8, 512])
start_positions torch.Size([8])
end_positions torch.Size([8])


In [None]:
# Bundle the three tokenized splits under their names and write them to disk.
processed_dataset = DatasetDict(
    train = tokenized_train,
    test = tokenized_test,
    validation = tokenized_validation,
)
processed_dataset.save_to_disk("processed_squad_v2")


Saving the dataset (0/4 shards):   0%|          | 0/120522 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/11974 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/10022 [00:00<?, ? examples/s]

In [None]:
# Read the saved splits back from disk.
reload_processed_dataset = load_from_disk(dataset_path = "processed_squad_v2")