<a href="https://colab.research.google.com/github/waniabbeer/ai-doc-processing-suite/blob/main/Initial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:

# SECTION 1: SETUP & INSTALLATIONS
# Copy this cell and run it first
# ============================================================================

print("📦 Installing required packages...")
print("This may take 2-3 minutes...\n")

# Install OCR and document processing libraries
!pip install -q pytesseract pdf2image pillow
!pip install -q pypdf2 python-docx pyyaml

# Install ML/AI libraries
!pip install -q transformers torch torchvision
!pip install -q sentence-transformers faiss-cpu
!pip install -q langchain langchain-community

# Install system dependencies
!apt-get install -q tesseract-ocr poppler-utils

print("\n✅ All packages installed successfully!")
print("⚠️  If you see any warnings, they're usually safe to ignore.")
print("\n➡️  Now run SECTION 2")

📦 Installing required packages...
This may take 2-3 minutes...

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m41.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.7/64.7 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.9/50.9 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires requests==

In [3]:
# ============================================================================
# SECTION 2: IMPORTS
# Run this cell second
# ============================================================================
# Single import cell for the notebook. The third-party packages pytesseract,
# pdf2image, pillow (PIL), pypdf2 (PyPDF2), pyyaml (yaml), transformers, torch,
# sentence-transformers and faiss-cpu (faiss) are installed by SECTION 1.
# numpy is not installed explicitly there; presumably it is preinstalled in the
# runtime or pulled in as a dependency. TODO confirm.
#
# NOTE(review): within SECTIONS 1-4 only `yaml` and `Path` are used; the other
# names are presumably consumed by later sections. Verify before pruning.

print("📚 Importing libraries...\n")

# Standard libraries
# (yaml and numpy are third-party, although grouped here.)
import os
import yaml
import numpy as np
from pathlib import Path
from typing import List, Dict, Any
import io

# Image and OCR
from PIL import Image
import pytesseract
from pdf2image import convert_from_path

# Document processing
from PyPDF2 import PdfReader

# ML/AI Libraries
import torch
from transformers import pipeline, AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
import faiss
# Reached only if every import above succeeded; a failed import raises first.
print("✅ All imports successful!")
print("➡️  Now run SECTION 3")

📚 Importing libraries...

✅ All imports successful!
➡️  Now run SECTION 3


In [4]:
# ============================================================================
# SECTION 3: PROJECT STRUCTURE SETUP
# Run this cell third
# ============================================================================

print("🏗️  Creating project structure...\n")

def create_project_structure(base_dir: str = 'ai-doc-processing-suite') -> None:
    """Create the project's folder skeleton and print the resulting layout.

    Args:
        base_dir: Root folder of the project, relative to the current working
            directory unless an absolute path is given. Defaults to the
            original hard-coded root, so existing calls behave as before.

    Returns:
        None. The folders are created on disk and listed on stdout.
    """
    # Sub-folders are kept relative so the root is defined in exactly one place.
    subfolders = [
        'data',
        'src/ocr',
        'src/classification',
        'src/retrieval',
        'src/llm',
        'outputs',
    ]
    # Joined with "/" (not os.sep) so the printed listing is identical on
    # every platform; Path accepts forward slashes everywhere.
    folders = [f"{base_dir}/{sub}" for sub in subfolders]

    for folder in folders:
        # parents=True builds intermediate folders; exist_ok=True makes the
        # function safe to re-run on an existing tree.
        Path(folder).mkdir(parents=True, exist_ok=True)

    print("✅ Project structure created!\n")
    print("📦 Your folder structure:")
    for folder in folders:
        print(f"  └── {folder}")

create_project_structure()
print("\n➡️  Now run SECTION 4")

🏗️  Creating project structure...

✅ Project structure created!

📦 Your folder structure:
  └── ai-doc-processing-suite/data
  └── ai-doc-processing-suite/src/ocr
  └── ai-doc-processing-suite/src/classification
  └── ai-doc-processing-suite/src/retrieval
  └── ai-doc-processing-suite/src/llm
  └── ai-doc-processing-suite/outputs

➡️  Now run SECTION 4


In [5]:
# ============================================================================
# SECTION 4: CONFIGURATION
# Run this cell fourth
# ============================================================================
# Builds the settings as a nested dict (`config`), writes them to
# ai-doc-processing-suite/config.yaml, and prints the keys of each section.

print("⚙️  Setting up configuration...\n")

# Create configuration dictionary
# One top-level entry per pipeline stage; the values are presumably read by
# later sections of the notebook (not visible here).
config = {
    'ocr': {
        'language': 'eng',
        'dpi': 300,
        'preprocessing': True
    },
    'classification': {
        'model': 'distilbert-base-uncased',
        'categories': ['loan', 'bank_statement', 'contract', 'invoice', 'other']
    },
    'retrieval': {
        'embedding_model': 'all-MiniLM-L6-v2',
        'chunk_size': 500,
        'chunk_overlap': 50,
        'top_k': 3
    },
    'llm': {
        'model': 'google/flan-t5-base',
        'max_length': 512,
        'temperature': 0.7
    }
}

# Save config to file
CONFIG_PATH = Path('ai-doc-processing-suite') / 'config.yaml'
# Create the project folder if needed so this cell does not fail with
# FileNotFoundError when the structure from SECTION 3 is absent.
CONFIG_PATH.parent.mkdir(parents=True, exist_ok=True)
with CONFIG_PATH.open('w', encoding='utf-8') as f:
    # sort_keys=False keeps the file in the same order as the dict above
    # (the default sorts keys alphabetically); safe_dump emits plain YAML
    # types only, which is all this dict contains.
    yaml.safe_dump(config, f, sort_keys=False)

print("✅ Configuration saved to config.yaml!")
print("\n📋 Your settings:")
for section_name, section_settings in config.items():
    print(f"  • {section_name}: {list(section_settings.keys())}")

print("\n➡️  Now run SECTION 5")



⚙️  Setting up configuration...

✅ Configuration saved to config.yaml!

📋 Your settings:
  • ocr: ['language', 'dpi', 'preprocessing']
  • classification: ['model', 'categories']
  • retrieval: ['embedding_model', 'chunk_size', 'chunk_overlap', 'top_k']
  • llm: ['model', 'max_length', 'temperature']

➡️  Now run SECTION 5
