In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import shutil
import zipfile
from pathlib import Path
from tqdm import tqdm

In [3]:
# --- CONFIGURATION ---
def _env_dir(var_name: str) -> Path:
    """Return the path stored in the environment variable ``var_name``.

    Args:
        var_name: Name of the environment variable to read (without ``$``).

    Returns:
        The variable's value wrapped in a ``Path``.

    Raises:
        KeyError: If the variable is unset or empty. Without this check,
            ``Path(None)`` fails with an unhelpful ``TypeError`` and an empty
            value would silently resolve to the current directory.
    """
    value = os.environ.get(var_name)
    if not value:
        raise KeyError(
            f"Environment variable ${var_name} is not set; "
            "export it before running this notebook."
        )
    return Path(value)


# 1. Source (Permanent Storage)
PROJECT_DIR = _env_dir("PROJECT")  # Read from $PROJECT
SOURCE_ZIPS = PROJECT_DIR / "tligawa/mshauri-fedha-store/cbk/zipped-store"

# 2. Destination (Fast Scratch Storage)
SCRATCH_DIR = _env_dir("SCRATCH")  # Read from $SCRATCH
WORK_DIR = SCRATCH_DIR / "mshauri-fedha/data/cbk"
# NOTE: despite the "PDF" in its name, this folder receives the flattened
# .txt files moved by the unzip-and-flatten step.
FINAL_PDF_DIR = WORK_DIR / "text"
# Scratch area each archive is extracted into before its files are moved.
TEMP_EXTRACT_DIR = WORK_DIR / "temp-unzip-cbk"

In [4]:
# Sanity check: show whether the scratch working directory (WORK_DIR) already exists.
os.path.exists(WORK_DIR)

True

In [5]:
# Setup directories
# Fail fast when the archive folder is missing: otherwise the glob below finds
# nothing and the notebook carries on to report success with 0 files.
if not SOURCE_ZIPS.is_dir():
    raise FileNotFoundError(f"Source zip folder not found: {SOURCE_ZIPS}")

# The target folder is reused when present; the warning tells the reader that
# files from an earlier run may already be in it.
if FINAL_PDF_DIR.exists():
    print(f"⚠️ Warning: Target folder {FINAL_PDF_DIR} already exists.")
else:
    FINAL_PDF_DIR.mkdir(parents=True, exist_ok=True)

# Always start from an empty temp folder so that files left behind by an
# earlier run cannot be picked up by the recursive search after extraction.
if TEMP_EXTRACT_DIR.exists():
    shutil.rmtree(TEMP_EXTRACT_DIR)
TEMP_EXTRACT_DIR.mkdir(parents=True, exist_ok=True)

# --- EXECUTION ---
# Sorted so the batches are processed in a stable, reproducible order.
zips = sorted(SOURCE_ZIPS.glob("*.zip"))
print(f"🚀 Found {len(zips)} batches in {SOURCE_ZIPS}")
print(f"📂 Flattening to: {FINAL_PDF_DIR} ...")

🚀 Found 111 batches in /capstor/store/cscs/director2/g164/tligawa/mshauri-fedha-store/cbk/zipped-store
📂 Flattening to: /capstor/scratch/cscs/tligawa/mshauri-fedha/data/cbk/text ...


In [6]:
# Unzip and flatten
# Every archive is extracted into TEMP_EXTRACT_DIR, its .txt files are moved
# into the single flat folder FINAL_PDF_DIR with the archive name as a prefix,
# and the temp folder is emptied again before the next archive.
count = 0
failed_batches = []  # names of the archives that raised while being processed
used_names = set()   # destination file names already written during this run
for zip_path in tqdm(zips, desc="Unzipping & Flattening"):
    batch_name = zip_path.stem  # zip file name without the ".zip" extension

    try:
        # 1. Extract only the .txt members into the temp folder; nothing else
        #    in the archive is used below, so unpacking it would be wasted I/O.
        with zipfile.ZipFile(zip_path, 'r') as z:
            txt_members = [m for m in z.namelist() if m.lower().endswith(".txt")]
            z.extractall(TEMP_EXTRACT_DIR, members=txt_members)

        # 2. Collect the extracted .txt files, at any depth.
        #    Directories that merely end in ".txt" are skipped. Shallow paths
        #    sort first so they keep the plain "<batch>_<name>" form.
        txt_files = sorted(
            (p for p in TEMP_EXTRACT_DIR.rglob("*.txt") if p.is_file()),
            key=lambda p: (len(p.parts), p),
        )

        # 3. Move and rename
        for txt_file in txt_files:
            # Unique name: batch_name + original file name
            new_name = f"{batch_name}_{txt_file.name}"
            if new_name in used_names:
                # Same file name seen twice (e.g. in two sub-folders of one
                # archive): fold the relative path into the name instead of
                # silently overwriting the file moved earlier.
                flat_rel = "_".join(txt_file.relative_to(TEMP_EXTRACT_DIR).parts)
                new_name = f"{batch_name}_{flat_rel}"
                serial = 1
                while new_name in used_names:
                    new_name = f"{batch_name}_{serial}_{flat_rel}"
                    serial += 1
            used_names.add(new_name)
            dest_path = FINAL_PDF_DIR / new_name

            shutil.move(str(txt_file), str(dest_path))
            count += 1

    except Exception as e:
        # Best effort: report the broken archive and continue with the rest.
        failed_batches.append(zip_path.name)
        print(f"❌ Error processing {zip_path.name}: {e}")
    finally:
        # Clean temp folder for next batch
        for item in TEMP_EXTRACT_DIR.iterdir():
            if item.is_dir():
                shutil.rmtree(item)
            else:
                item.unlink()

print(f"\n✨ Done! {count} files are ready in {FINAL_PDF_DIR}")
if failed_batches:
    # Make failures visible in the final summary, not only in the loop output.
    print(f"⚠️ {len(failed_batches)} of {len(zips)} batches failed: {', '.join(failed_batches)}")
print("🧹 Cleaning up temp dirs...")
if TEMP_EXTRACT_DIR.exists():
    shutil.rmtree(TEMP_EXTRACT_DIR)

Unzipping & Flattening: 100%|██████████| 111/111 [00:22<00:00,  4.84it/s]


✨ Done! 58 files are ready in /capstor/scratch/cscs/tligawa/mshauri-fedha/data/cbk/text
🧹 Cleaning up temp dirs...



