<a href="https://colab.research.google.com/github/wsGit7/Invest/blob/main/kisPodobszaryHE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [4]:
!pip install reportlab

Collecting reportlab
  Downloading reportlab-4.4.1-py3-none-any.whl.metadata (1.8 kB)
Downloading reportlab-4.4.1-py3-none-any.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: reportlab
Successfully installed reportlab-4.4.1


In [8]:
import os
import glob
import difflib
from PyPDF2 import PdfReader
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas

# Parameters:
SIMILARITY_THRESHOLD = 0.001       # Only consider paragraphs with similarity ratio of 0.8 or above
MIN_SUBSTRING_LENGTH = 6        # Minimal length (in characters) of the common phrase to be noted

def extract_paragraphs(pdf_path):
    """
    Extracts text from each page of a PDF and splits the text into paragraphs.
    Returns a list of tuples: (page_number, paragraph_index, paragraph_text).
    """
    paragraphs = []
    reader = PdfReader(pdf_path)
    for page_idx, page in enumerate(reader.pages):
        text = page.extract_text() or ""
        # We split the page text based on newlines.
        # Depending on your PDFs, you might want to refine the splitting mechanism.
        para_list = [p.strip() for p in text.split("\n") if p.strip() != ""]
        for par_idx, para in enumerate(para_list):
            paragraphs.append((page_idx + 1, par_idx + 1, para))
    return paragraphs

def longest_common_substring(s1, s2):
    """
    Uses difflib to retrieve the longest common substring between two strings.
    """
    matcher = difflib.SequenceMatcher(None, s1, s2)
    match = matcher.find_longest_match(0, len(s1), 0, len(s2))
    return s1[match.a: match.a + match.size]

def main():
    # Extract paragraphs from asia.pdf.
    asia_pdf_path = "asia.pdf"
    asia_paragraphs = extract_paragraphs(asia_pdf_path)

    # List all PDF files in the folder /andrzej.
    pdf_files = glob.glob(os.path.join("/andrzej", "*.pdf"))

    results = []  # This will hold all matching results

    # Process each document in /andrzej.
    for file_path in pdf_files:
        doc_name = os.path.basename(file_path)
        doc_paragraphs = extract_paragraphs(file_path)

        # Compare every paragraph from asia.pdf with every paragraph from the current document.
        for asia_page, asia_par_index, asia_text in asia_paragraphs:
            for doc_page, doc_par_index, doc_text in doc_paragraphs:
                # Compute a similarity ratio.
                similarity = difflib.SequenceMatcher(None, asia_text, doc_text).ratio()
                if similarity >= SIMILARITY_THRESHOLD:
                    common_phrase = longest_common_substring(asia_text, doc_text)
                    # Only record the result if the common phrase is of meaningful length.
                    if len(common_phrase) >= MIN_SUBSTRING_LENGTH:
                        results.append({
                            "asia_page": asia_page,
                            "asia_paragraph": asia_par_index,
                            "doc_name": doc_name,
                            "doc_page": doc_page,
                            "doc_paragraph": doc_par_index,
                            "common_phrase": common_phrase,
                            "similarity": similarity
                        })

    # Write the matching results to ewa.pdf using ReportLab.
    output_pdf_path = "ewa.pdf"
    c = canvas.Canvas(output_pdf_path, pagesize=letter)
    width, height = letter
    c.setFont("Helvetica", 10)

    text_object = c.beginText(40, height - 40)
    text_object.textLine("Comparison Results Between asia.pdf and PDFs in /andrzej")
    text_object.textLine("Similarity Threshold: {:.2f}".format(SIMILARITY_THRESHOLD))
    text_object.textLine("")

    # Each result line holds the match details.
    for result in results:
        line = ("asia.pdf (Page {}, Paragraph {}) <-> {} (Page {}, Paragraph {}), "
                "Similarity: {:.2f}, Common Phrase: '{}'".format(
            result["asia_page"],
            result["asia_paragraph"],
            result["doc_name"],
            result["doc_page"],
            result["doc_paragraph"],
            result["similarity"],
            result["common_phrase"]
        ))
        text_object.textLine(line)

        # If we are low on space, start a new page.
        if text_object.getY() < 50:
            c.drawText(text_object)
            c.showPage()
            text_object = c.beginText(40, height - 40)
            text_object.setFont("Helvetica", 10)

    c.drawText(text_object)
    c.save()
    print(f"Comparison completed. Results written to {output_pdf_path}")

if __name__ == "__main__":
    main()


Comparison completed. Results written to ewa.pdf


In [9]:
import PyPDF2
import re

def extract_text_from_pdf(pdf_path):
    """Extracts text from all pages of a PDF and returns a single concatenated string."""
    full_text = ""
    with open(pdf_path, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        for page in reader.pages:
            text = page.extract_text() or ""
            full_text += text + " "
    return full_text

def get_trigrams(text):
    """
    Given a string of text, returns a set of all three-word sequences.
    Words are extracted using regex (considering alphanumeric characters) and normalized to lowercase.
    """
    # Extract words and lower-case them.
    words = re.findall(r'\w+', text.lower())
    # Generate three-word sequences (trigrams)
    trigrams = {" ".join(words[i:i+3]) for i in range(len(words) - 2)}
    return trigrams

def main():
    # Extract text from a.pdf and b.pdf in the current directory.
    text_a = extract_text_from_pdf("a.pdf")
    text_b = extract_text_from_pdf("b.pdf")

    # Generate trigrams for both documents.
    trigrams_a = get_trigrams(text_a)
    trigrams_b = get_trigrams(text_b)

    # Compute the common three-word sequences.
    common_trigrams = trigrams_a.intersection(trigrams_b)

    if common_trigrams:
        print(f"Found {len(common_trigrams)} common three-word sequences from a.pdf present in b.pdf:")
        for trigram in common_trigrams:
            print(trigram)
    else:
        print("No matching three-word sequences found.")

if __name__ == "__main__":
    main()


Found 203 common three-word sequences from a.pdf present in b.pdf:
products such as
to provide the
in the field
for the implementation
systems and communication
the work of
life cycle of
reduction of the
do not meet
the scope of
take into account
the availability of
which they are
elements of the
order to achieve
that do not
the needs of
necessary for the
improve the efficiency
be considered as
legal and administrative
of applications and
the development of
and exploitation of
of high quality
accordance with the
taking into account
the context of
the participation of
depending on the
in the eu
facilities and or
early detection of
research areas in
and effectiveness of
in the work
in the early
learning algorithms and
to the development
to monitor and
in line with
aspects of the
of the full
for the purposes
taken into account
for the detection
the quality of
carrying out the
one of the
needs of the
it is the
to achieve the
the eu s
use of materials
the costs of
the research and
access to

In [10]:
import re
import os
import PyPDF2
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas

def extract_paragraphs(pdf_path):
    """
    Extracts text from each page of a PDF file and splits the text into paragraphs
    (using newlines). Returns a list of tuples: (page_number, paragraph_number, paragraph_text).
    """
    paragraphs = []
    with open(pdf_path, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        for page_num, page in enumerate(reader.pages, start=1):
            text = page.extract_text() or ""
            # Split text on newlines (adjust splitting if necessary)
            para_list = [p.strip() for p in text.split("\n") if p.strip() != ""]
            for para_idx, para in enumerate(para_list, start=1):
                paragraphs.append((page_num, para_idx, para))
    return paragraphs

def get_trigrams_from_text(text):
    """
    Given a paragraph text, returns a list of trigrams (three consecutive words).
    The text is normalized to lowercase and split using regex to capture words.
    """
    words = re.findall(r'\w+', text.lower())
    trigrams = [" ".join(words[i:i+3]) for i in range(len(words) - 2)]
    return trigrams

def build_trigram_map(paragraphs):
    """
    For each paragraph in the list, compute its trigrams and store them in a dictionary.
    The dictionary's keys are the trigrams and the value is a list of occurrences,
    where an occurrence is a tuple (page_number, paragraph_number).
    """
    trigram_map = {}
    for page, para_idx, text in paragraphs:
        trigrams = get_trigrams_from_text(text)
        for trigram in trigrams:
            if trigram not in trigram_map:
                trigram_map[trigram] = []
            trigram_map[trigram].append((page, para_idx))
    return trigram_map

def compare_trigrams(map_a, map_b):
    """
    Finds common trigrams between two dictionaries. For each common trigram,
    creates a record for every occurrence combination between a.pdf and b.pdf.
    Returns a list of tuples:
      (common_trigram, (page_a, para_a), (page_b, para_b))
    """
    common_matches = []
    common_keys = set(map_a.keys()).intersection(set(map_b.keys()))
    for trigram in common_keys:
        occurrences_a = map_a[trigram]
        occurrences_b = map_b[trigram]
        for occ_a in occurrences_a:
            for occ_b in occurrences_b:
                common_matches.append((trigram, occ_a, occ_b))
    return common_matches

def write_results_to_pdf(output_path, matches):
    """
    Writes the results (each match with trigram, page and paragraph numbers in a.pdf and b.pdf)
    to an output PDF (using ReportLab).
    """
    c = canvas.Canvas(output_path, pagesize=letter)
    width, height = letter
    c.setFont("Helvetica", 10)
    y = height - 50    # Starting y position
    line_height = 12

    header = "Comparison Results between a.pdf and b.pdf\n" \
             "Matching three-word sequences with their page and paragraph positions:\n"
    c.drawString(50, y, header)
    y -= 30

    for match in matches:
        trigram, pos_a, pos_b = match
        line = (f"Trigram: '{trigram}' | "
                f"a.pdf - Page {pos_a[0]}, Paragraph {pos_a[1]} | "
                f"b.pdf - Page {pos_b[0]}, Paragraph {pos_b[1]}")
        c.drawString(50, y, line)
        y -= line_height

        # Check if we've reached near the bottom of the page
        if y < 50:
            c.showPage()
            y = height - 50
            c.setFont("Helvetica", 10)

    c.save()

def main():
    # Extract paragraphs for each PDF file.
    paragraphs_a = extract_paragraphs("a.pdf")
    paragraphs_b = extract_paragraphs("b.pdf")

    # Build trigram mappings from paragraphs.
    trigram_map_a = build_trigram_map(paragraphs_a)
    trigram_map_b = build_trigram_map(paragraphs_b)

    # Compare the two maps to get matching three-word sequences.
    matches = compare_trigrams(trigram_map_a, trigram_map_b)

    # Write the results into 'apple.pdf'.
    output_pdf_path = "apple.pdf"
    write_results_to_pdf(output_pdf_path, matches)

    print(f"Comparison completed. Found {len(matches)} matches. Results saved to {output_pdf_path}")

if __name__ == "__main__":
    main()


Comparison completed. Found 1491 matches. Results saved to apple.pdf
