In [1]:
import os
import pandas as pd
from PyPDF2 import PdfMerger, PdfWriter, PdfReader

# Every path below is anchored at the directory the notebook was started from.
base_dir = os.getcwd()

# Working folders: source legal docs, destination for the extended docs, and
# the folder holding the CSD documents that get appended.
input_folder, output_folder, extend_with = (
    os.path.join(base_dir, relative_path)
    for relative_path in (
        "data/legaldocs_redo/buchi_muller_ldocs",
        "data/legaldocs_redo/ldocs_new",
        "data/legaldocs_redo/extend_with",
    )
)



In [None]:
## find all files with certain keyword in all subfolders of base directory

def search_files_and_folders(base_dir, keyword):
    """Recursively collect files and folders whose name contains ``keyword``.

    The comparison is case-insensitive and is applied to the entry name only,
    not to the directory path leading to it.

    Args:
        base_dir: Directory whose tree is traversed with ``os.walk``.
        keyword: Substring to look for in each file and folder name. An empty
            string is contained in every name and therefore matches everything.

    Returns:
        A dict with the keys ``'files'`` and ``'folders'``; each value is a list
        of matching paths (the walked root joined with the entry name), in the
        order ``os.walk`` produced them.
    """
    # Lower-case the keyword once instead of once per directory entry.
    needle = keyword.lower()
    matches = {'files': [], 'folders': []}

    for root, dirs, files in os.walk(base_dir):
        matches['folders'].extend(
            os.path.join(root, d) for d in dirs if needle in d.lower()
        )
        matches['files'].extend(
            os.path.join(root, f) for f in files if needle in f.lower()
        )

    return matches

# Search root: the current user's home directory, resolved at run time so the
# notebook is not tied to one machine. Joining with "" keeps the trailing
# path separator.
base_directory = os.path.join(os.path.expanduser("~"), "")

# Example usage of search_files_and_folders (left disabled):
# keyword = "Baersol"
# results = search_files_and_folders(base_directory, keyword)

# for folder in results['folders']:
#     print(folder)

# for file in results['files']:
#     print(file)


In [None]:
## extend legal docs of Büchi Müller with the legal docs of CSD
## step 1: read the asset ids from the CSV and print them as a PowerShell array

csv_file = os.path.join(base_dir, "data/legaldocs_redo/Büchi_müller_ag.csv")
df = pd.read_csv(csv_file)

# Rows without an id are dropped; the int cast strips any float formatting
# before the ids are turned into strings.
sgs_ids = df['sgs_id'].dropna().astype(int).astype(str).tolist()

# One quoted id per line; every line except the last one ends with a comma.
quoted_ids = [f'    "{sgs_id}"' for sgs_id in sgs_ids]
array_lines = [quoted + "," for quoted in quoted_ids[:-1]] + quoted_ids[-1:]
print("\n".join(["$sgsIds = @(", *array_lines, ")"]))

In [4]:
import os
import re

# Lower-cased names of all PDF files in the input folder.
pdf_files = [file.lower() for file in os.listdir(input_folder) if file.lower().endswith(".pdf")]
print(len(pdf_files))

missing_docs = []

# An id counts as present when some file name contains it as a whole number.
for sgs_id in sgs_ids:
    # The lookarounds stop the id from matching inside a longer digit run
    # (a plain substring test would "find" id 4410 in 44102.pdf and hide a
    # missing document). Leading zeros are tolerated so zero-padded file
    # names still match.
    id_pattern = re.compile(rf"(?<!\d)0*{re.escape(sgs_id)}(?!\d)")
    if not any(id_pattern.search(file) for file in pdf_files):
        missing_docs.append(sgs_id)

print(f"Total missing legal docs: {len(missing_docs)}")
print("Missing asset IDs:", missing_docs)


179
Total missing legal docs: 3
Missing asset IDs: ['44102', '44136', '44194']


In [5]:
# Source documents for the merged CSD legal doc, as
# (file name, zero-based indices of the pages to take) pairs.
pdfs_to_merge = [
    # first page only
    ("CSD-Liebefeld-20060620.pdf", [0]),
    # first and third page
    ("CSD-VD-19860624.pdf", [0, 2]),
    ("CSD-VD-20070702.pdf", [0, 2]),
    # first page only
    ("CSD-Liebefeld-19810911.pdf", [0]),
    ("CSD-19730309-2.pdf", [0]),
    # first and second page
    ("CSD-19730309-1.pdf", [0, 1]),
    # first page only
    ("CSD-19720525.pdf", [0]),
]

In [6]:
## create merged CSD legal doc
pdf_writer = PdfWriter()
# Derive the output path from `extend_with` (the folder the source documents
# are read from) rather than from a fresh os.getcwd() call, so the merged file
# lands in the same folder even if the working directory changed in between.
merged_output_pdf = os.path.join(extend_with, "CSD_LegalDocs_merged.pdf")

for pdf_file, pages in pdfs_to_merge:
    pdf_path = os.path.join(extend_with, pdf_file)

    # Best effort: a missing source document is reported and skipped.
    if not os.path.exists(pdf_path):
        print(f"Warning: File not found: {pdf_file}")
        continue

    pdf_reader = PdfReader(pdf_path)
    page_count = len(pdf_reader.pages)

    for page_num in pages:
        # Page indices are zero-based; the warning reports the 1-based number.
        if page_num < page_count:
            pdf_writer.add_page(pdf_reader.pages[page_num])
        else:
            print(f"Warning: {pdf_file} does not have page {page_num + 1}")

# Save the merged PDF
with open(merged_output_pdf, "wb") as output_file:
    pdf_writer.write(output_file)

print(f"Merged PDF created at: {merged_output_pdf}")

Merged PDF created at: /home/lillemor/Documents/lgd-utils/asset-data-extraction/data/legaldocs_redo/extend_with/CSD_LegalDocs_merged.pdf


In [None]:
merged_csd_pdf = os.path.join(base_dir, "data/legaldocs_redo/extend_with/CSD_LegalDocs_merged.pdf")
csd_overtake = os.path.join(base_dir, "data/legaldocs_redo/extend_with/CSD_uebernahme.pdf")

# Make sure the destination folder is there before anything is written.
os.makedirs(output_folder, exist_ok=True)

# Both appendix documents are required; stop early if either one is absent.
for description, required_path in (
    ("Merged CSD document", merged_csd_pdf),
    ("CSD takeover document", csd_overtake),
):
    if not os.path.exists(required_path):
        raise FileNotFoundError(f"{description} not found: {required_path}")

# The appendix documents are opened once and reused for every input file.
csd_overtake_reader = PdfReader(csd_overtake)
csd_reader = PdfReader(merged_csd_pdf)

for file in os.listdir(input_folder):
    # Only PDF files are processed; the extension check ignores case.
    if not file.lower().endswith(".pdf"):
        continue

    input_pdf_path = os.path.join(input_folder, file)
    output_pdf_path = os.path.join(output_folder, file)

    input_reader = PdfReader(input_pdf_path)
    pdf_writer = PdfWriter()

    # Page order of the result: the original document, then the CSD takeover
    # document, then the merged CSD legal docs.
    for source_reader in (input_reader, csd_overtake_reader, csd_reader):
        for page in source_reader.pages:
            pdf_writer.add_page(page)

    # The extended document keeps the input file name, in the output folder.
    with open(output_pdf_path, "wb") as output_file:
        pdf_writer.write(output_file)

    print(f"Extended and saved: {file}")

print("\nAll PDFs have been extended with the CSD legal docs.")