### https://pymupdf.readthedocs.io/en/latest/recipes-annotations.html

In [None]:
!python -V

Python 3.10.12


In [None]:
!mkdir data
!curl -o data/Singapore.pdf https://en.wikipedia.org/api/rest_v1/page/pdf/Singapore
!curl -o data/Afghanistan.pdf https://en.wikipedia.org/api/rest_v1/page/pdf/Afghanistan

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 2882k  100 2882k    0     0   416k      0  0:00:06  0:00:06 --:--:--  778k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 3138k  100 3138k    0     0   434k      0  0:00:07  0:00:07 --:--:--  782k


In [None]:
!pip install pypdf pymupdf

Collecting pypdf
  Downloading pypdf-4.2.0-py3-none-any.whl (290 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.4/290.4 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pymupdf
  Downloading PyMuPDF-1.24.5-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m44.6 MB/s[0m eta [36m0:00:00[0m
Collecting PyMuPDFb==1.24.3 (from pymupdf)
  Downloading PyMuPDFb-1.24.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (15.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.8/15.8 MB[0m [31m45.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf, PyMuPDFb, pymupdf
Successfully installed PyMuPDFb-1.24.3 pymupdf-1.24.5 pypdf-4.2.0


In [None]:
from pypdf import PdfReader

reader = PdfReader("data/Singapore.pdf")
number_of_pages = len(reader.pages)

text = ""
for page in range(number_of_pages):
  page_text = reader.pages[page]
  text += page_text.extract_text()

In [None]:
for page in range(0, number_of_pages-1):
  page = reader.pages[page]
  count = 0

  for image_file_object in page.images:
      with open(str(count) + image_file_object.name, "wb") as fp:
          fp.write(image_file_object.data)
          count += 1

In [None]:
text



## Highlight text from pdf

In [58]:
import os
import fitz
import re
import pandas as pd

data = {'Regex': [r"(?i)\bworld\b", r"\$[0-9]+", r"\((.*)\)"]}
df = pd.DataFrame(data=data)

def extract_sensitive_data(page_text, reg):
    compiled = re.compile(reg)
    return [word[:4] for word in page_text if compiled.search(word[4])]

def redaction():
  path = "data"
  file_list = [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]
  for filename in file_list:
    doc = fitz.open(os.path.join(path, filename ))

    for page in doc:
      for reg in df['Regex']:
        sensitive_data = extract_sensitive_data(page.get_text("words"), reg)
        for area in sensitive_data:
          annotation = page.add_highlight_annot(area)
          annotation.set_colors(stroke=[0.5, 0.8, 0.8]) #Change colours RGB/255
          annotation.update()

      doc.save(f"{os.path.splitext(filename)[0]}_edited.pdf")

redaction()

In [53]:
import os
import pymupdf
import re
import pandas as pd

data = {'Regex': [r"(?i)\b[a-zA-Z]+istan\b"]}
df = pd.DataFrame(data=data)

def extract_sensitive_data(page_text, reg):
    compiled = re.compile(reg)
    return [word[:4] for word in page_text if compiled.search(word[4])]

def redaction(df):
  path = ""
  file_list = ['Afghanistan_edited.pdf']
  for filename in file_list:
    doc = pymupdf.open(os.path.join(path, filename ))
    print("Number of pages: ", doc.page_count)
    for page in doc:
      text = page.get_text("words")
      for phrase in df['Regex']:
        sensitive_data = extract_sensitive_data(page.get_text("words"), phrase)
        for area in sensitive_data:
          # page.add_underline_annot(area) Green underline is quite faint
          # page.add_strikeout_annot(area) Red strikeout
          # page.add_squiggly_annot(area) Purple underline squiggly
          # page.add_highlight_annot(area) # Yellow highlight
          # page.add_redact_annot(area, fill=(0, 0, 0), cross_out=False) Red box
          # page.apply_redactions()
    doc.save(f"{os.path.splitext(filename)[0]}_edited.pdf")


redaction(df)

Number of pages:  80


In [None]:
!pip install pipdeptree



## Summarisation

In [None]:
import os
import fitz
import re
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from heapq import nlargest
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import pandas as pd

punctuation = punctuation + '\n'

def summariser(text):
  nltk.download('punkt')
  nltk.download('stopwords')

  stopWords = set(stopwords.words("english"))

  words = word_tokenize(text)
  freqTable = dict()
  for word in words:
      word = word.lower()
      if word in stopWords:
          continue
      if word in punctuation:
          continue
      if word in freqTable:
          freqTable[word] += 1
      else:
          freqTable[word] = 1

  sentences = sent_tokenize(text)
  sentenceValue = dict()

  for sentence in sentences:
      for word, freq in freqTable.items():
          if word in sentence.lower():
              if sentence in sentenceValue:
                  sentenceValue[sentence] += freq
              else:
                  sentenceValue[sentence] = freq

  sumValues = 0
  for sentence in sentenceValue:
      sumValues += sentenceValue[sentence]

  average = int(sumValues / len(sentenceValue))

  summary = ''
  for sentence in sentences:
      if (sentence in sentenceValue) and (sentenceValue[sentence] > (1.2 * average)):
          summary += " " + sentence

  final_summary = [sentence for sentence in sentences if (
      sentence in sentenceValue) and (sentenceValue[sentence] > (1.2 * average))]
  summary = ' '.join(final_summary)

  return final_summary

def extract_sensitive_data(page_text, reg):
    compiled = re.compile(reg)
    #print(compiled)
    return [word[:4] for word in page_text if compiled.search(word[4])]

_special_chars_map = {i: '\\' + chr(i) for i in b'()[]{}?*+-|^$\\.&~#\t\n\r\v\f'}

def escape(pattern):
    """
    Escape special characters in a string.
    """
    if isinstance(pattern, str):
        return pattern.translate(_special_chars_map)
    else:
        pattern = str(pattern, 'latin1')
        return pattern.translate(_special_chars_map).encode('latin1')

def redaction():
  path = "data"
  file_list = [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]
  for filename in file_list:
    doc = fitz.open(os.path.join(path, filename ))
    page_text = ""
    for page in doc:
      text = page.get_text("text")
      page_text += text

    page_text = re.sub(r"\[[a-zA-Z0-9]+\]", "", page_text)

    reg_list = [r"(?i)\b{}\b".format(escape(item)).replace('\\n','') for item in summariser(page_text)]
    for reg in reg_list:
        sensitive_data = extract_sensitive_data(page.get_text("words"), reg)
        print(reg)
        for area in sensitive_data:
          page.add_highlight_annot(area)

    doc.save(f"{filename}_edited.pdf")
    return
text = redaction()




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


(?i)\bIslamic Emirate of Afghanistan\
امارت يملاسا افغانستان د \(Pashto\)\
Də Afġānistān Islāmī Imārat\
افغانستان اسالمی امارت \(Dari\)\
Imārat\-i Islāmī\-yi Afğānistān\
Flag\
Emblem\
Motto: الله رسول محمد ،الله إال إله ال\
Lā ʾilāha ʾillā llāh, Muhammadun rasūlu llāh\
"There is no god but God; Muhammad is the\
messenger of God\."\b
(?i)\b\(Shahadah\)\
Anthem: کور ونباتورا د دا\
"Dā Də Bātorāno Kor"\
"This Is the Home of the Brave"\
Afghanistan\
Afghanistan, officially the Islamic Emirate of\
Afghanistan, is a landlocked country located at the\
crossroads of Central Asia and South Asia\.\b
(?i)\bAccording to the World Population review, as of 2023,\
Afghanistan's population is 43 million\.\b
(?i)\bThe National\
Statistics Information Authority of Afghanistan estimated\
the population to be 32\.9 million as of 2020\.\b
(?i)\bAfghanistan also served as the source from\
which the Greco\-Bactrians and the Mughals, amongst\
others, rose to form major empires\.\b
(?i)\bThe modern state of Af

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


(?i)\bRepublic of Singapore\
Malay:\
Republik Singapura\
Mandarin:\
新加坡共和国\
Tamil:\
சிங்கப்பூர் குடியரசு\
Flag\
Coat of arms\
Motto: Majulah Singapura \(Malay\)\
"Onward Singapore"\
Anthem: Majulah Singapura \(Malay\)\
"Onward Singapore"\
Capital\
Singapore \(city\-\
state\)\
1°17′N 103°50′E\
Largest planning\
area by population\
Bedok\
Official languages\
English · Malay ·\
Mandarin · Tamil\
National language\
Malay\
Ethnic groups\
\(2020\)\
74\.3% Chinese\
13\.5% Malay\
9\.0% Indian\
3\.2% other\
Religion \(2020\)\
31\.1% Buddhism\
20\.0% no religion\
18\.9% Christianity\
Singapore\
Singapore, officially the Republic of Singapore, is an island\
country and city\-state in maritime Southeast Asia\.\b
(?i)\bIt is located about\
one degree of latitude \(137 kilometres or 85 miles\) north of the\
equator, off the southern tip of the Malay Peninsula, bordering the\
Strait of Malacca to the west, the Singapore Strait to the south along\
with the Riau Islands in Indonesia, the South China Se

In [None]:
import os
import fitz
import re
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from heapq import nlargest

def redaction():
  path = "data"
  file_list = [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]
  for filename in file_list:
    doc = fitz.open(os.path.join(path, filename ))
    page_text = ""
    for page in doc:
      text = page.get_text("text")
      page_text += text
    return page_text

text = redaction()

stopwords = list(STOP_WORDS)
nlp = spacy.load('en_core_web_sm')
doc = nlp(text)
tokens = [token.text for token in doc]
punctuation = punctuation + '\n'


word_frequencies = {}
for word in doc:
  if word.text.lower() not in stopwords:
    if word.text.lower() not in punctuation:
      if word.text not in word_frequencies.keys():
        word_frequencies[word.text] = 1
      else:
        word_frequencies[word.text] += 1

max_frequency = max(word_frequencies.values())
for word in word_frequencies.keys():
  word_frequencies[word] = word_frequencies[word]/max_frequency
sentence_tokens = [sent for sent in doc.sents]


sentence_scores = {}
for sent in sentence_tokens:
  for word in sent:
    if word.text.lower() in word_frequencies.keys():
      if sent not in sentence_scores.keys():
        sentence_scores[sent] = word_frequencies[word.text.lower()]
      else:
        sentence_scores[sent] += word_frequencies[word.text.lower()]

select_length = int(len(sentence_tokens)*0.4)
summary = nlargest(select_length, sentence_scores, key = sentence_scores.get)

final_summary = [word.text for word in summary]
summary = ' '.join(final_summary)

print(summary)

Afghanistan's gross domestic product (GDP) is $81
• Restoration of
the Emirate
1842–1926
• Dost Mohammad
unites
Afghanistan
27 May 1863
• Anglo-Afghan
Agreement
26 May 1879
• Independence
19 August 1919
• Kingdom
9 June 1926
• Republic
17 July 1973
• Democratic
Republic
27–28 April 1978
• Islamic State
28 April 1992
• Islamic Emirate
27 September 1996
• Islamic Republic
26 January 2004
• Restoration of
Islamic Emirate
15 August 2021
Area
• Total
652,867[19] km2
(252,073 sq mi) (40th)
• Water (%)
negligible
Population
• 2023 estimate
 41,128,771 [20]
(37th)
• Density
48.08/km2
(124.5/sq mi)
GDP (PPP)
2020 estimate
• Total
$81.007 billion[21]
• Per capita
$2,459[21]
GDP (nominal)
2020 estimate
• Total
$20.136 billion[21]
• Per capita
$611[21]
HDI (2021)
  On 7 October 2023, a 6.3 magnitude earthquake struck northwest
of Herat, killing over 1,400 people.[271]
Afghanistan has a continental climate with harsh winters
in the central highlands, the glaciated northeast (around
Nuristan), and t