# Installs

In [None]:
!pip install datasets --quiet

In [None]:
!pip install PyMuPDF pdfminer.six --quiet

In [None]:
!pip install trl --quiet

# Imports

In [None]:
import os

In [None]:
from datasets import Dataset

# Phase 1

In [None]:
from pdfminer.high_level import extract_text

In [None]:
def extract_text_from_pdfs(pdf_paths):
    """Extract the text of every PDF in ``pdf_paths``.

    Args:
        pdf_paths: Iterable of paths to PDF files.

    Returns:
        A list holding the result of ``extract_text`` for each path, in the
        same order as the paths were given.
    """
    return [extract_text(pdf_file) for pdf_file in pdf_paths]

In [None]:
import glob
# Directory that is searched (non-recursively) for the input *.pdf files.
pdf_path = "/data"

In [None]:
# glob returns matches in arbitrary, filesystem-dependent order; sorting makes
# the document order (and everything derived from it) reproducible across runs.
pdf_paths = sorted(glob.glob(os.path.join(pdf_path, "*.pdf")))

In [None]:
# Run text extraction over every discovered PDF; texts[i] corresponds to pdf_paths[i].
texts = extract_text_from_pdfs(pdf_paths)

In [None]:
import re

def clean_text(text):
    """Flatten extracted text into a single, ASCII-only, single-spaced line.

    Each run of non-ASCII characters is replaced by a space, then every run
    of whitespace (spaces, tabs, newlines, ...) is collapsed into one space
    and the ends are trimmed. The result therefore contains no newlines and
    no doubled, leading or trailing spaces.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text.
    """
    # Strip non-ASCII first: each removed run leaves a space behind, so doing
    # this after whitespace normalisation would reintroduce runs of spaces.
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    # \s+ matches newlines too, so this one pass also joins all lines; no
    # separate newline handling is required.
    return re.sub(r'\s+', ' ', text).strip()

In [None]:
# Normalise every extracted document with clean_text, keeping document order.
texts = list(map(clean_text, texts))

In [None]:
def chunk_text(text, chunk_size=512, overlap=50):
    """Split ``text`` into overlapping chunks of at most ``chunk_size`` tokens.

    The text is tokenized with the module-level ``tokenizer``. Windows of
    ``chunk_size`` tokens are taken every ``chunk_size - overlap`` tokens and
    each window is converted back to a string.

    Args:
        text: Text to split.
        chunk_size: Maximum number of tokens per chunk; must be positive.
        overlap: Number of tokens shared by consecutive chunks; must be
            smaller than ``chunk_size``.

    Returns:
        A list of chunk strings; empty when the text produces no tokens.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size`` (the window could never advance).
    """
    if chunk_size <= 0 or overlap >= chunk_size:
        raise ValueError(
            "chunk_size must be positive and overlap must be smaller than "
            f"chunk_size (got chunk_size={chunk_size}, overlap={overlap})"
        )

    tokens = tokenizer.tokenize(text)
    stride = chunk_size - overlap
    chunks = []
    for start in range(0, len(tokens), stride):
        window = tokens[start:start + chunk_size]
        chunks.append(tokenizer.convert_tokens_to_string(window))
        # This window already reached the last token: any further window
        # would only repeat tokens contained in this one, so stop here.
        if start + chunk_size >= len(tokens):
            break
    return chunks

In [None]:
# Define tokenize function
def tokenize_function(examples):
    """Chunk and tokenize a batch of documents into column-wise lists.

    Every string in ``examples['text']`` is split with ``chunk_text`` and each
    chunk is encoded by the module-level ``tokenizer``, padded and truncated
    to 512 tokens. One input document can therefore yield several output rows.

    Args:
        examples: Batch mapping whose ``'text'`` entry is a list of strings.

    Returns:
        A dict mapping each key of the first encoding to a list with one
        entry per chunk.

    Raises:
        IndexError: If the batch produces no chunks at all.
    """
    encodings = [
        tokenizer(piece, padding="max_length", truncation=True, max_length=512)
        for document in examples['text']
        for piece in chunk_text(document)
    ]

    # Pivot the list of per-chunk encodings into one list per field; the
    # field names are taken from the first encoding.
    columns = {field: [] for field in encodings[0].keys()}
    for encoding in encodings:
        for field, values in encoding.items():
            columns[field].append(values)
    return columns

In [None]:
# Wrap the cleaned documents in a one-column dataset, then swap that "text"
# column for the tokenized chunks (batched, since each document may expand
# into several rows).
texts_dataset = Dataset.from_dict({"text": texts})
tokenized_dataset = texts_dataset.map(
    tokenize_function,
    remove_columns=["text"],
    batched=True,
)

In [None]:
# Add labels (in causal language modeling, labels are the same as input_ids)
def add_labels(example):
    """Attach a ``'labels'`` entry that is a copy of ``example['input_ids']``.

    The example is modified in place and also returned.

    Args:
        example: Mapping with an ``'input_ids'`` entry supporting ``.copy()``.

    Returns:
        The same ``example`` object, now containing ``'labels'``.
    """
    token_ids = example['input_ids']
    # Copy so that labels and input_ids are independent objects.
    example['labels'] = token_ids.copy()
    return example

# add_labels works on a single example, hence batched=False (one call per row).
tokenized_dataset = tokenized_dataset.map(add_labels, batched=False)