### Set up

In [4]:
import re
import pandas as pd
from PyPDF2 import PdfReader

# Function to extract text from the PDF
def extract_text_from_pdf(file_path):
    """Return the concatenated text of every page of the PDF at ``file_path``.

    The text of each page is followed by two newline characters so that page
    boundaries remain visible in the combined string.
    """
    pdf = PdfReader(file_path)
    return ''.join(page.extract_text() + '\n\n' for page in pdf.pages)

# Function to parse sections considering multiline headings
def parse_sections(text):
    """Split ``text`` into a list of ``(title, content)`` tuples.

    A heading is a number followed by a dot on its own line, then one or more
    capitalised words (which may wrap over several lines), then a newline.
    Anything before the first heading is discarded. Titles and contents are
    returned with surrounding whitespace stripped.
    """
    heading_re = re.compile(r'(\d+\.\n(?:[A-Z][a-z]+\s*)+\n)')

    # Because the pattern is one capturing group, split() yields
    # [preamble, heading, body, heading, body, ...]; drop the preamble.
    pieces = heading_re.split(text)[1:]

    return [
        (heading.strip(), body.strip())
        for heading, body in zip(pieces[0::2], pieces[1::2])
    ]

# Function to convert sections to Excel
def sections_to_excel(sections_data, output_file_path):
    """Write ``sections_data`` to an Excel file at ``output_file_path``.

    ``sections_data`` is turned into a two-column table ('Section Title',
    'Content') and saved without the DataFrame index.
    """
    column_names = ['Section Title', 'Content']
    table = pd.DataFrame(sections_data, columns=column_names)
    table.to_excel(output_file_path, index=False)

# Example usage with placeholders for file paths
# Input PDF and output workbook, relative to the notebook's working directory.
pdf_file_path = "../../Data/Guide/guide_test.pdf"  # Replace with the path to your PDF file
excel_file_path = "../../Data/Guide/guide_test.xlsx"

# Step 1: read the whole PDF into one string (binds the global `text`).
text = extract_text_from_pdf(pdf_file_path)

# Step 2: split the text into (title, content) tuples.
sections_data = parse_sections(text)

# Step 3: write the tuples to the Excel file; this creates or overwrites
# the file at `excel_file_path`.
sections_to_excel(sections_data, excel_file_path)


In [None]:
# PDF read by the PyPDF2 reader cell below. Run this cell first: the reader
# cell raises `NameError: name 'path' is not defined` otherwise.
path = "../../Data/Guide/guide_test.pdf"


# PDF reader

In [5]:
# Read every page with PyPDF2 and join the page texts, adding one newline
# after each page. Requires `path` to be defined by an earlier cell.
reader = PdfReader(path)
text = ''.join(page.extract_text() + '\n' for page in reader.pages)

print(text)

NameError: name 'path' is not defined

In [None]:
# Remove two fixed boilerplate strings from the extracted text.
# Note: this rebinds `text`, so the cell works on whatever `text` currently holds.
text = text.replace("Programmi insegnamenti - Divisione Didattica ", '')
text = text.replace("© Università Bocconi - Via Sarfatti, 25 Milano - ","")
# Matches a "Last change dd/mm/yyyy hh:mm" stamp; the whole stamp is group 1.
date_pattern = re.compile(r'(Last change \d{2}/\d{2}/\d{4} \d{2}:\d{2})')
# Re-emit each stamp followed by a newline so it ends its own line.
modified_text = date_pattern.sub(r'\1\n', text)

# Display the result as the cell output.
modified_text

## General text cleaner 

In [9]:
from pdfminer.high_level import extract_text

# Extract the text of the PDF with pdfminer. This rebinds the global
# `text`, replacing whatever an earlier cell stored there.
text = extract_text("../../Data/Guide/4layerstrial.pdf")

In [10]:
# Remove the multi-line title block. The literal must match the extracted
# text exactly, including the blank lines and the trailing space.
text = text.replace("""Programmi insegnamenti - Divisione Didattica

GUIDES TO THE UNIVERSITY BOCCONI

2023-2024 A.Y.
BACHELOR OF SCIENCE PROGRAMS (3-Y) ""","")
# Remove any remaining occurrences of the one-line header string.
text = text.replace("Programmi insegnamenti - Divisione Didattica", '')
# Replace the copyright line with a '---' marker
# (presumably the per-page footer — confirm against the PDF).
text = text.replace("© Università Bocconi - Via Sarfatti, 25 Milano - PI 03628350153","---")
# Matches page counters of the form "Pag. <n>/<m>".
pagination_pattern = re.compile(r'Pag\. \d+/\d+')

# Drop every page counter; the cleaned text is stored in `modified_text`
# while `text` keeps the counters.
modified_text = pagination_pattern.sub('', text)


In [11]:
modified_text

'\n\n1.\n\nSTUDENT RESOURCES\n\nLast change 01/06/2023 08:00\n\n1.1.\n\nUseful tools for students\n\nTo manage their academic careers, each student can use the following tools:\n\npersonal credentials to access University services;\nyoU@B student Diary;\nPunto Blu;\nEmail account;\nMicrosoft365 applications;\nPersonalized ID card with photo.\n\nLast change 30/06/2023 10:12\n\n1.2.\n\nBocconi personal credentials\n\nAll students are equipped with credentials allowing them to access the network and the\nUniversity IT services.\n\nThe credentials are:\n\nUserID: as a general rule, this is the student ID number assigned to the first\n\n---\n\n\n\n       \n \n \n\x0c\n\nacademic career at Bocconi;\nPassphrase: this is the password initially used for the initial enrollment procedure\nonline.\n\nUsing the credentials allows access to:\n\nyoU@B Student Diary (youatb.unibocconi.it);\nBocconi email account (outlook.unibocconi.it);\nMicrosoft365 applications (office.unibocconi.it);\nWi-Fi Network

# pdfminer

In [None]:
from pdfminer.high_level import extract_text

# Extract the text of guide_test.pdf with pdfminer into the global `text`.
# Note: the next cell rebinds `text` to `modified_text`, discarding this value.
text = extract_text("../../Data/Guide/guide_test.pdf")


In [None]:
text = modified_text
text 

In [None]:
def parse_text_with_depth(text):
    """Split numbered guide text into one record per detected header.

    A header is a dotted number of one to four levels followed by a dot
    (e.g. ``1.`` or ``1.6.2.``), optional whitespace, and the rest of that
    line as the title. The pattern is not anchored to the start of a line.

    Returns a list of dicts with keys 'Number', 'Category', 'Section',
    'Subsection', 'Subsubsection', 'Text' and 'Last Change'. The title is
    stored under the key matching the header's depth; the other three level
    keys hold "General". 'Text' is the section body up to its first
    "Last change ..." stamp. 'Last Change' is that stamp's date, or, when the
    section has no stamp, the date of the immediately preceding section if
    that one had a stamp, else "".
    """
    header_re = re.compile(r'(\d+(?:\.\d+){0,3})\.\s*([^\n]+)')
    last_change_re = re.compile(r'Last change (\d{2}/\d{2}/\d{4} \d{2}:\d{2})')
    level_keys = ('Category', 'Section', 'Subsection', 'Subsubsection')

    matches = list(header_re.finditer(text))
    # Each section stops where the following header starts (or at end of text).
    stops = [m.start() for m in matches[1:]] + [len(text)]

    records = []
    carried_date = ""
    for match, stop in zip(matches, stops):
        section_number, title = match.groups()
        depth = section_number.count('.') + 1

        # Title goes in the slot for this depth; every other slot is "General".
        names = dict.fromkeys(level_keys, "General")
        names[level_keys[min(depth, 4) - 1]] = title

        # Take the raw span, then remove the header itself from the front.
        body = text[match.start():stop].strip()
        body = header_re.sub('', body, count=1).strip()

        stamp = last_change_re.search(body)
        if stamp is None:
            # No stamp here: inherit the previous section's date once only.
            last_change = carried_date
            carried_date = ""
        else:
            last_change = stamp.group(1)
            body = body[:stamp.start()].strip()
            carried_date = last_change

        records.append({
            'Number': section_number,
            **names,
            'Text': body,
            'Last Change': last_change,
        })

    return records

In [None]:
modified_text

In [None]:
# Parse the cleaned text into one dict per detected section header.
parsed_data_with_depth = parse_text_with_depth(modified_text)

# One row per section; the columns are the dict keys returned by the parser.
df_with_depth = pd.DataFrame(parsed_data_with_depth)

# Show the frame as the cell output to check the parsed structure.
df_with_depth

In [None]:
edge_text = "1.6.2.\n\nTheft and loss of ID card\n\nIf the personalized photo ID card has been lost, stolen, damaged or if the magnetic strip no\nlonger works, you should immediately:\n\n1. deactivate it using the “Your ID Card” widget. This will ensure that your card is not\nused improperly by third parties (e.g. to illegitimately take out book loans from the\nLibrary);\nin addition, if the card has been activated for banking operations, you should\nimmediately contact Banca Popolare di Sondrio to block the card at the phone\nnumber: 800.822.056 (+39 02 60843768 for international calls).\n\n2.\n\n---\n\n\n\n \n \n \n\x0c\n\nTo issue a new Bocconi ID card:\n\nif the card is lost or stolen, a replacement fee is charged;\nif the magnetic strip no longer works or the card is damaged, a replacement is free of\ncharge only upon submission of the damaged card when collecting the new one.\n\nFor details about the procedure for re-issuing the Bocconi ID card (students), see\nRequesting ID Card Replacement.\n\nLast change 30/06/2023 12:12\n\n1.6.3.\n\nRequesting ID card replacement\n\nOnly if the ID Card is lost or stolen can a request for a replacement be made from yoU@B\nthrough the “Your ID Card” widget.\nIn this case, a €26 fee will be charged for the issue of a new Bocconi ID card.\nIf a previously lost card is found after the replacement request has been made, it cannot\nbe reactivated.\n\nIf the card is damaged or the magnetic strip is no longer working, the student should\ntake the damaged card to the B in Touch Point so that it can be checked. In this case, there\nis no charge for a replacement. 
If the student does not submit the damaged card, however,\nthe replacement fee will be charged.\n\nLast change 30/06/2023 12:13\n\n1.7.\n\nPersonal computers\n\nLast change 01/06/2023 08:00\n\n---\n\n\n\n \n    \n \n\x0c\n\n1.7.1.\n\nStudent\'s personal laptop computer\n\nEach student must have a personal laptop computer, with the following minimum\ncharacteristics:\n\nWi-Fi with at least IEEE 802.11ac standard;\nintegrated video camera and microphone;\nWindows 10 or later operating system or macOS 10.12 and later\n\nThis computer should be taken to the classroom whenever required by faculty members. Due\nto the progressive innovations to teaching methods adopted, in fact, the use of computers\ncould constitute an essential condition both to participate in the lectures of some courses and\nto take exams. For recommended hardware requirements: Current Students > Services >\nICT > Wi-Fi Activation > Hardware Requirements.\n\nLast change 30/06/2023 12:13\n\n1.7.2" 
edge_text 

In [None]:
import re

def parse_text_with_depth(text):
    """Split numbered guide text into one record per accepted section header.

    A header is a dotted number of one to four levels followed by a dot
    (e.g. ``1.`` or ``1.6.2.``) at the start of a line, then whitespace and
    the rest of that line as the title. A candidate header is accepted only
    if its number comes strictly after the previously accepted one in outline
    order; this rejects numbered list items such as ``1. deactivate ...``
    inside section ``1.6.2``. Text around a rejected candidate stays in the
    body of the section that contains it.

    Args:
        text: The cleaned guide text.

    Returns:
        A list of dicts with keys 'Number', 'Category', 'Section',
        'Subsection', 'Subsubsection', 'Text' and 'Last Change'. The title is
        stored under the key matching the header's depth and the other level
        keys hold "General". 'Text' is the body up to its first
        "Last change ..." stamp. 'Last Change' is that stamp's date, or, when
        the section has no stamp, the date of the immediately preceding
        accepted section if that one had a stamp, else "".
    """
    # `^` with re.MULTILINE anchors headers to the start of a line. A
    # look-behind such as `(?<=\n|^)` cannot be used: `re` only accepts
    # fixed-width look-behind and raises re.error when compiling it.
    header_re = re.compile(r'^(\d+(?:\.\d+){0,3})\.\s+([^\n]+)', re.MULTILINE)
    last_change_re = re.compile(r'Last change (\d{2}/\d{2}/\d{4} \d{2}:\d{2})')
    level_keys = ('Category', 'Section', 'Subsection', 'Subsubsection')

    def is_new_section(previous, current):
        """Return True when `current` comes strictly after `previous`."""
        prev_nums = [int(num) for num in previous.split('.')]
        curr_nums = [int(num) for num in current.split('.')]
        # List ordering is lexicographic, and a longer list with an equal
        # prefix sorts later, so "1.6.2.1" follows "1.6.2" but "1" does not.
        return curr_nums > prev_nums

    # Pass 1: keep only the headers that advance the outline numbering.
    accepted = []
    previous_section = None
    for match in header_re.finditer(text):
        number = match.group(1)
        if previous_section is not None and not is_new_section(previous_section, number):
            continue
        accepted.append(match)
        previous_section = number

    # Pass 2: a section's body runs to the next *accepted* header, so a
    # rejected candidate no longer truncates the section it sits in.
    data = []
    last_change_for_next_section = ""
    for i, match in enumerate(accepted):
        section_number, title = match.groups()
        depth = len(section_number.split('.'))

        names = dict.fromkeys(level_keys, "General")
        names[level_keys[min(depth, 4) - 1]] = title

        end_index = accepted[i + 1].start() if i + 1 < len(accepted) else len(text)
        section_text = text[match.end():end_index].strip()

        last_change_match = last_change_re.search(section_text)
        if last_change_match:
            last_change_for_this_section = last_change_match.group(1)
            section_text = section_text[:last_change_match.start()].strip()
            last_change_for_next_section = last_change_for_this_section
        else:
            # Inherit the previous section's date once, then stop carrying it.
            last_change_for_this_section = last_change_for_next_section
            last_change_for_next_section = ""

        data.append({
            'Number': section_number,
            **names,
            'Text': section_text,
            'Last Change': last_change_for_this_section,
        })

    return data

# Example usage with your provided text

parsed_data = parse_text_with_depth(edge_text)


In [None]:
def is_new_section(previous, current):
    """Return True when section number ``current`` comes strictly after ``previous``.

    Both arguments are dotted numbers such as "1.6.2". They are compared
    level by level as integers; if one is a prefix of the other, the longer
    one counts as later. Equal numbers return False.
    """
    def as_levels(number):
        return [int(part) for part in number.split('.')]

    before = as_levels(previous)
    after = as_levels(current)
    # Python orders lists lexicographically, with a longer list that shares
    # the full prefix sorting later — the same rule as a level-by-level scan.
    return after > before

In [None]:
is_new_section("1.5","2")

In [None]:
# Section header: a dotted number of one to four levels followed by a dot, at
# the start of a line, then whitespace and the title (rest of the line).
# `(?m)` makes `^` match after every newline. A look-behind `(?<=\n|^)` is not
# usable here: `re` requires fixed-width look-behind and refuses to compile it.
header_pattern = r'(?m)^(\d+(?:\.\d+){0,3})\.\s+([^\n]+)'
# "Last change dd/mm/yyyy hh:mm" stamp; group 1 is the date and time.
last_change_pattern = r'Last change (\d{2}/\d{2}/\d{4} \d{2}:\d{2})'

In [None]:
# Section header: a dotted number of one to four levels followed by a dot, at
# the start of a line, then whitespace and the title (rest of the line).
# `(?m)` makes `^` match after every newline. A look-behind `(?<=\n|^)` is not
# usable here: `re` requires fixed-width look-behind and refuses to compile it.
header_pattern = r'(?m)^(\d+(?:\.\d+){0,3})\.\s+([^\n]+)'
# "Last change dd/mm/yyyy hh:mm" stamp; group 1 is the date and time.
last_change_pattern = r'Last change (\d{2}/\d{2}/\d{4} \d{2}:\d{2})'