In [12]:
import PyPDF2
import regex as re

In [85]:
# Request user to input the filename without the .pdf
# filename = input('What is the file name?\n')
filename = 'F16'

# Enter the sections to be extracted here
section_to_extract = [18,6,7,8,9,12,13,14,15,16,17,20,21,22,23,24,25,27]
section_to_extract.sort()

## Getting the PDF File Text

This section of code will handle obtaining the file and the text within that pdf file.

In [96]:
# Open the File
file = open(f'data/{filename}.pdf','rb')

# Create PyPDF2 Reader Object
reader = PyPDF2.PdfFileReader(file)

# Get the Text from the file
text = ''
text_arr = []

for i in range(reader.numPages):
    # Get the current page
    page = reader.getPage(i)
    # Extract the text
    current_text = page.extract_text()
    # Append to master text variable
    text += current_text
    text += '\n'

print(text)

Overview of Criminal Justice Act 2003: Hearsay Exceptions and Additional 
Safeguards
Blackstone's Criminal Practice 2022  >  PART F EVIDENCE  >  Section F17 Exceptions to the Rule 
against Hearsay (Excluding Confessions)  >  INTRODUCTION
Section F17Exceptions to the Rule against Hearsay (Excluding Confessions)
INTRODUCTION
F17.1
The rule against hearsay, as described in F16, has never been an absolute prohibition. The provisions of the CJA 
2003, Part II, ch. 2, while retaining the concept of the hearsay rule as a rule of exclusion, are designed to ensure 
that, subject to the necessary safeguards, relevant hearsay evidence should be admitted where it is in the interests 
of justice. The provisions of the CJA 2003 constitute a 'crafted code' which, properly applied, is consistent with the 
right to fair trial accorded by the ECHR, Article 6(3)(d) (Horncastle [2009] UKSC 14, [2010] 2 AC 373: see F17.89). 
The rulings of the Grand Chamber in Al-Khawaja and Tahery v UK (2012) 54 EHRR 23 (

## Separate the text by End of Document

This section of code will separate the text by the phrase "End of Document"

In [87]:
section_regex=r"\.\d{1,3}\n"
print(type(section_regex))

<class 'str'>


In [90]:
# EOD Regex
eod_regex = r"End\sof\sDocument"
eod_text = re.split(eod_regex,text)

# Main and Sub Headings for the pages within the document
main_heading = ''
sub_heading = ''

main_dict = {}

for item in eod_text:

    # Title Regex
    title_regex = r"\nBlackstone's\sCriminal\sPractice\s2022"
    item_arr = re.split(title_regex,item)
    
    # Only work on arrays that have a length more than 1
    # At the very end of the document, there will be "End of Document" and the next thing will only be a \n

    
    if len(item_arr) > 1:
        page_heading = item_arr[0].replace('\n','')
        page_text = item_arr[-1]
    
        
        if page_heading.isupper():
            # This is a main heading
            # Update main_heading
            main_heading = page_heading
        else:
            # This is a sub heading
            # Update sub_heading
            sub_heading = page_heading
        
        # Check if rest of text on the page contains sections
        # Section Regex
        section_regex = r'\n' + re.escape(filename) + r'\.\d{1,3}'

        # Create a Regex to Identify section numbers
        sections = re.findall(section_regex,page_text)
        sections = [s.replace('\n','') for s in sections]

        # Split the text into an array
        section_text = re.split(section_regex,page_text)
        
        # Check if there are any sections on the page
        if len(sections) > 0:
            # There are sections. Proceed to split
            # Pop the first item of the array (The text in between the page title and first section)
            section_text.pop(0)
            
            for section, text_item in zip(sections, section_text):
                print(section)
                print(text_item)
                print('-------------')
                text_item = text_item.replace(f'{sub_heading}\n','')
                current_dict = {
                    section: {
                        "section_heading": main_heading,
                        "section_subheading": sub_heading,
                        "section_text": text_item
                    }
                }
                
                main_dict.update(current_dict)
            


F17.1

The rule against hearsay, as described in F16, has never been an absolute prohibition. The provisions of the CJA 
2003, Part II, ch. 2, while retaining the concept of the hearsay rule as a rule of exclusion, are designed to ensure 
that, subject to the necessary safeguards, relevant hearsay evidence should be admitted where it is in the interests 
of justice. The provisions of the CJA 2003 constitute a 'crafted code' which, properly applied, is consistent with the 
right to fair trial accorded by the ECHR, Article 6(3)(d) (Horncastle [2009] UKSC 14, [2010] 2 AC 373: see F17.89). 
The rulings of the Grand Chamber in Al-Khawaja and Tahery v UK (2012) 54 EHRR 23 (807) and Horncastle v UK 
(2015) 60 EHRR 31 (1331) (see F17.89) accept, contrary to previous Strasbourg case law, that the CJA 2003 
contains sufficient safeguards against the risk of wrongful conviction. It is convenient to begin with an overview of 
the Act's provisions before embarking on the detail.
The CJA 2003 applie

## Separate the text by Section

This section of code will separate the text by section and return a dictionary of section and the corresponding text.

In [60]:
# Section Regex
section_regex = re.escape(filename) + r'\.\d{1,2}\n'

# Create a Regex to Identify section numbers
sections_full = re.findall(section_regex,text)

# Split the text into an array
section_text = re.split(section_regex,text)

# Get a list of Sections
sections = [s.strip('\n') for s in sections_full]

# Remove the first section (Before D#.1)
section_text.pop(0)

# Create Dictionary
doc_dict = dict(zip(sections,section_text))
print(doc_dict["D6.1"])

Criminal trials in England and Wales are either trials on indictment in the Crown Court or summary trials in a 
magistrates' court. This section deals with (a) the classification of offences according to whether they: (i) must be 
tried on indictment, or (ii) may be tried either on indictment or summarily, or (iii) must be tried summarily; and (b) the 
procedure for determining the appropriate mode of trial in those cases where there is a choice.
End of Document
CLASSIFICATION OF OFFENCES
Blackstone's Criminal Practice 2022  >  PART D PROCEDURE  >  Section D6 Classification of 
Offences and Determining Allocation (Mode of Trial)
End of Document
Definition of the Classes of Offences
Blackstone's Criminal Practice 2022  >  PART D PROCEDURE  >  Section D6 Classification of 
Offences and Determining Allocation (Mode of Trial)  >  CLASSIFICATION OF OFFENCES



## Extract Target Data

This section of code will search for the sections specified by the user and return it as a string.

In [5]:
def remove_linespace_OLD(text):
    # Split by .\n to get an array
    # linespace_regex = r'(?<=[a-z)])\.\n(?=[A-Z(])'
    linespace_regex = r'[\.;:]\n'
    
    delimiter_arr = re.findall(linespace_regex,text)
    text_arr = re.split(linespace_regex,text)
    
    # Strip '\n'
    text_arr_formatted = [s.replace('\n','') for s in text_arr]
    
    # Replace '\n' with '\r\n'
    delimiter_arr_formatted = [s.replace('\n','') for s in delimiter_arr]
    
    # Join together
    full_text_arr = []
    
    for idx, text_formatted in enumerate(text_arr_formatted):
        if idx == len(text_arr_formatted) - 1:
            
            # Check if the section starts with "End of Document"
            if text_formatted.find("End of Document") == -1:
                # End of Document not Found
                full_text = text_formatted
                
        elif idx > 0:
            # Start implementing delimiter + text
            full_text = delimiter_arr_formatted[idx-1]
            full_text += text_formatted
        
        else:
            # First section of the text won't have a delimiter
            full_text += text_formatted
        
        full_text_arr.append(full_text)

        
    return full_text

In [6]:
def remove_linespace(text_raw) -> list:
    
    linespace_regex = r'[\.;:]\n'
    
    delimiter_arr = re.findall(linespace_regex,text_raw)
    text_arr = re.split(linespace_regex,text_raw)
    
    # Strip '\n'
    text_arr_formatted = [s.replace('\n','') for s in text_arr]
    
    # Replace '\n' with '\r\n'
    delimiter_arr_formatted = [s.replace('\n','') for s in delimiter_arr]
    
    text_delim_zip = zip(text_arr_formatted,delimiter_arr_formatted)
    
    text_arr = []
    
    for text, delimiter in text_delim_zip:
        text_arr.append(text + delimiter)
    
    return text_arr

## Generating Word File

This section will start creating the word file based on the text obtained and sections highlighted.

In [7]:
from docx import Document
from docx.shared import Inches, Pt
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT, WD_BREAK

In [8]:
doc = Document()

In [9]:
# Create Heading for Document
doc.add_heading(f"{filename} Compilation",0)

<docx.text.paragraph.Paragraph at 0x7faa817b18e0>

In [10]:
# Add Sections and Text to Document
for target in section_to_extract:
    section_heading = f'{filename}.{target}'
    section_text = remove_linespace(doc_dict[section_heading])
    
    # Add Heading
    doc.add_heading(section_heading)
    
    # Add text
    for text_item in section_text:
        paragraph = doc.add_paragraph(text_item)
        paragraph.alignment = WD_PARAGRAPH_ALIGNMENT.JUSTIFY
        paragraph.paragraph_format.space_after = Pt(6)

In [11]:
# Save Document
doc.save(f'output/{filename}.docx')