DOWNLOAD THE PART OF THE CORPUS \
Repository with all APT notes from 2008 to 2024: https://github.com/aptnotes/data/  # accessed 05.05.2025

tools to download the PDFs:
```py -3.10 -m venv aptnotesvenv                    # apt tools requires older python version
aptnotesvenv\Scripts\Activate.ps1
git clone https://github.com/aptnotes/tools.git  # accessed 05.05.2025
pip install -r .\tools\APTnotes_sync_requirements.txt
pip install lxml --prefer-binary

python tools\APTnotes_sync_download.py

In [35]:
import re
import os
import json
import fitz
import unicodedata

working_dir = 'APTnotes'
output_file = 'aptnotes_dataset.jsonl'
entries = []

def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text_list = []

    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        text = page.get_text()
        text_cleaned = ''
        for line in text.splitlines():
            if line.strip():

                # Remove unwanted characters and normalize spaces
                # skip figure captions, page numbers, and dates footers
                if re.search(r'^\s*fig.*', line, re.IGNORECASE) or re.search(r'^\s*\d+', line) or re.search(r'^\s*page\s\d+', line, re.IGNORECASE) or re.search(r'^\s*copyright\s', line, re.IGNORECASE) or re.search(r'\s{5,}', line) or \
                    re.search(r'(?i)(Volume\s+\d+|Number\s+\d+|(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s*\d{4}|\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4})', line, re.IGNORECASE):
                    continue
                line = unicodedata.normalize("NFKC", line)  # normalize unicode characters
                line = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', line)  # remove control and invisible characters
                line = re.sub(r'[^\x00-\x7F]+', ' ', line)  # remove non-ASCII characters
                line = re.sub(r'\.{2,}', ' ', line)  # remove the repeated dots
                line = re.sub(r'\s{2,}', ' ', line)  # remove extra spaces

                text_cleaned += line.strip() + '\n'

        if len(text_cleaned.splitlines()) > 2:  # skip empty pages or pages with only headers/footers
            text_list.append(text_cleaned.strip())

    doc.close()
    return text_list

used_pdfs = 0
for dir in os.listdir(working_dir):
    file_list = os.listdir(os.path.join(working_dir, dir))
    for file in file_list[:13] + file_list[14:]:  # skip the 14th file in each directory as it causese errors
        used_pdfs += 1
        if not file.endswith('.pdf'):
            continue

        pdf_path = os.path.join(working_dir, dir, file)
        for text in extract_text_from_pdf(pdf_path):
            if text.strip():
                entry = {
                    "instruction": 'Analyse the following APT report',
                    "input":  '',
                    "output": text
                }
                entries.append(entry)

with open(output_file, 'w', encoding='utf-8') as f:
    for entry in entries:
        f.write(json.dumps(entry, ensure_ascii=False) + '\n')

print(f"Extracted {len(entries)} entries from {working_dir} and saved to {output_file}.")
print(f"Used {used_pdfs} PDFs.")
print(f"Max output len: {max(len(entry['output']) for entry in entries)}")  # print the max length of the output text

Extracted 12813 entries from APTnotes and saved to aptnotes_dataset.jsonl.
Used 676 PDFs.
Max output len: 7594
