In [3]:
%load_ext autoreload
%autoreload 2

import os
import fitz
import boto3
import pandas as pd
from tqdm import tqdm
import random
import sys
from pathlib import Path

repo_root = Path.cwd().parent.resolve()
sys.path.append(str(repo_root))


from src.utils import is_digitally_born

"""Notebook to retrieve files from the S3 bucket"""

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


'Notebook to retrieve files from the S3 bucket'

In [2]:
s3_input = "asset/asset_files_new_ocr/"
s3_aws_profile = "779726271945_swissgeol-assets-ro"
s3_session = boto3.Session(profile_name=s3_aws_profile)
s3 = s3_session.resource('s3')
bucket = s3.Bucket('swissgeol-assets-swisstopo')

In [3]:
objs = list(bucket.objects.filter(Prefix=s3_input))

In [4]:
output_dir =  repo_root / "/data/text_files"

def key_to_filename(key):
    return key.split("/")[-1]

print(output_dir)

/data/text_files


In [None]:
# count page number of pdfs and save to data
output_path = repo_root / "/data/page_numbers.csv"

page_data = []
with tqdm(total=len(objs)) as pbar:

    for obj in objs:
        pbar.set_postfix_str(obj.key)
        
        filename = key_to_filename(obj.key)
        if filename.endswith(".pdf"):
                    tmp_file_path = repo_root / f"data/{filename}"
                    bucket.download_file(obj.key, tmp_file_path)

                    doc = fitz.open(tmp_file_path)
                    
                    num_pages = len(doc)
                    #count digitally born pages
                    digital_pages = sum(is_digitally_born(page) for page in doc)

                    digital_ratio = digital_pages / num_pages
                    if digital_ratio >= 0.8:
                        category = "digital"
                    elif digital_ratio <= 0.2:
                        category = "scanned"
                    else:
                        category = "both"

                    page_data.append({
                        "filename": filename,
                        "pages": num_pages,
                        "digital_pages": digital_pages,
                        "scanned_pages": num_pages - digital_pages,
                        "category": category
                    })

                    os.remove(tmp_file_path)   

        
        pbar.update(1)
        
df = pd.DataFrame(page_data)
df.to_csv(output_path, index=False)

In [None]:
#write text of pdf into text files
with tqdm(total=len(objs)) as pbar:

    for obj in objs:
        pbar.set_postfix_str(obj.key)
        
        filename = key_to_filename(obj.key)
        if filename.endswith(".pdf"):
                    tmp_file_path =  repo_root / f"data/{filename}"
                    bucket.download_file(obj.key, tmp_file_path)

                    doc = fitz.open(tmp_file_path)
                    output_path = repo_root / f"data/{filename}.txt"
                    
                    with open(output_path, "w", encoding = "utf-8") as text_file:
                        for _,page in enumerate(doc):
                            text= page.get_text()
                            text_file.write(text)

                    os.remove(tmp_file_path)   

        
        pbar.update(1)


In [None]:
#download random sample

sampled_objs = random.sample(objs, 10)

with tqdm(total=len(sampled_objs)) as pbar:
    for obj in sampled_objs:
        pbar.set_postfix_str(obj.key)
        
        filename = key_to_filename(obj.key)
        if filename.endswith(".pdf"):
            file_path = repo_root / f"data/input/reports_no_gt/{filename}"
            bucket.download_file(obj.key, file_path)
        
        pbar.update(1)