# Government Contract Processing - DeepSeek Vision with Simple Prompt

This notebook uses the DeepSeek Vision-Language Model with the following prompt:

```
This image is contract form.
Provide a detailed breakdown of the form's information.
For elements that are a set of checkboxes, return the name of the checked box as the value.
Provide the results as a json file.
```

In [None]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import json
import time
import re
import base64
import io
from PIL import Image
import pdf2image
import pandas as pd
from tqdm.auto import tqdm
from typing import List, Dict, Any

print("✅ Libraries imported successfully")

In [None]:
# Notebook-wide settings: model checkpoint, target device, generation limits
# and the directory used as the Hugging Face download cache.
CONFIG = dict(
    model_name="deepseek-ai/deepseek-vl-7b-chat",
    device=("cuda" if torch.cuda.is_available() else "cpu"),
    max_tokens=2048,
    temperature=0.1,
    cache_dir="./deepseek_cache",
)

# Create the cache directory if it does not exist yet.
os.makedirs(CONFIG["cache_dir"], exist_ok=True)

# Echo the two settings that most affect a run.
print(
    f"Device: {CONFIG['device']}",
    f"Model: {CONFIG['model_name']}",
    sep="\n",
)

In [None]:
class SimpleDeepSeekProcessor:
    """Extract form fields from a contract image with a DeepSeek vision model.

    One page image plus a fixed prompt is sent to the model named in
    ``CONFIG["model_name"]``; the first JSON object in the reply is returned
    as the extracted data.

    NOTE(review): the upstream DeepSeek-VL quick-start appears to prepare
    images through ``VLChatProcessor`` from the ``deepseek_vl`` package.
    Confirm that the tokenizer chat template used in ``_generate`` really
    passes the ``image_url`` content to this checkpoint.
    """

    # Longest image side, in pixels, that is sent to the model.
    _MAX_IMAGE_SIDE = 1024

    # Pillow modes the PNG writer can store; anything else (e.g. CMYK scans)
    # is converted to RGB before encoding.
    _PNG_MODES = frozenset({"1", "L", "LA", "I", "I;16", "P", "RGB", "RGBA"})

    def __init__(self):
        # Both are populated lazily by load_model().
        self.tokenizer = None
        self.model = None

        # Your exact prompt
        self.prompt = """This image is contract form.
Provide a detailed breakdown of the form's information.
For elements that are a set of checkboxes, return the name of the checked box as the value.
Provide the results as a json file."""

    def load_model(self):
        """Load the tokenizer and model once; later calls are no-ops."""
        if self.model is not None:
            return

        print("Loading DeepSeek Vision model...")

        self.tokenizer = AutoTokenizer.from_pretrained(
            CONFIG["model_name"],
            cache_dir=CONFIG["cache_dir"],
            trust_remote_code=True
        )

        self.model = AutoModelForCausalLM.from_pretrained(
            CONFIG["model_name"],
            cache_dir=CONFIG["cache_dir"],
            torch_dtype=torch.float16 if CONFIG["device"] == "cuda" else torch.float32,
            device_map="auto" if CONFIG["device"] == "cuda" else None,
            trust_remote_code=True
        )

        # With device_map=None the model must be placed explicitly.
        if CONFIG["device"] == "cpu":
            self.model = self.model.to(CONFIG["device"])

        self.model.eval()
        print("✓ Model loaded successfully")

    def prepare_image(self, image: "Image.Image") -> str:
        """Return ``image`` as a base64-encoded PNG string.

        The image is converted to RGB when its mode cannot be written as PNG,
        and scaled down (aspect ratio kept) when its longest side exceeds
        ``_MAX_IMAGE_SIDE`` pixels.
        """
        if image.mode not in self._PNG_MODES:
            image = image.convert("RGB")

        longest = max(image.size)
        if longest > self._MAX_IMAGE_SIDE:
            ratio = self._MAX_IMAGE_SIDE / longest
            # Clamp to 1 px so very thin images never get a zero-sized side.
            new_size = tuple(max(1, int(dim * ratio)) for dim in image.size)
            image = image.resize(new_size, Image.Resampling.LANCZOS)

        buffered = io.BytesIO()
        image.save(buffered, format="PNG")
        return base64.b64encode(buffered.getvalue()).decode()

    def _generate(self, image: "Image.Image") -> str:
        """Run the model on one image and return the decoded reply text.

        Loads the model on first use. Any exception raised by the tokenizer
        or the model propagates to the caller.
        """
        if self.model is None:
            self.load_model()

        image_b64 = self.prepare_image(image)

        conversation = [{
            "role": "user",
            "content": [
                {"type": "text", "text": self.prompt},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/png;base64,{image_b64}"
                    }
                }
            ]
        }]

        inputs = self.tokenizer.apply_chat_template(
            conversation,
            add_generation_prompt=True,
            tokenize=True,
            return_tensors="pt"
        ).to(CONFIG["device"])

        with torch.no_grad():
            outputs = self.model.generate(
                inputs,
                max_new_tokens=CONFIG["max_tokens"],
                temperature=CONFIG["temperature"],
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id
            )

        # Drop the prompt tokens so only the newly generated text is decoded.
        return self.tokenizer.decode(
            outputs[0][inputs.shape[1]:],
            skip_special_tokens=True
        ).strip()

    def process_image(self, image: "Image.Image") -> Dict:
        """Process a single image and return the parsed model reply.

        Never raises: on any failure the result is
        ``{"error": <message>, "raw_response": ""}``.
        """
        try:
            return self.parse_response(self._generate(image))
        except Exception as e:
            return {"error": str(e), "raw_response": ""}

    def parse_response(self, response: str) -> Dict:
        """Return the JSON object embedded in ``response``.

        Decoding starts at the first ``{`` and stops where that object ends,
        so surrounding prose or Markdown fences (before or after the object)
        do not matter. When no object can be decoded the result is
        ``{"raw_response": response, "parse_error": "Could not parse JSON"}``;
        the return value is therefore always a dict.
        """
        failure = {"raw_response": response, "parse_error": "Could not parse JSON"}

        start = response.find("{")
        if start == -1:
            return failure

        try:
            # raw_decode ignores whatever follows the object, unlike a greedy
            # "{...}" regex, which would swallow any later closing brace.
            parsed, _end = json.JSONDecoder().raw_decode(response, start)
        except json.JSONDecodeError:
            return failure
        return parsed

    def _load_first_page(self, file_path: str) -> "Image.Image":
        """Return the first page of a PDF, or the image stored at ``file_path``.

        Raises ``ValueError`` when a PDF yields no page.
        """
        if file_path.lower().endswith('.pdf'):
            # Only the first page is processed, so only that page is rendered.
            pages = pdf2image.convert_from_path(file_path, dpi=300, first_page=1, last_page=1)
            if not pages:
                raise ValueError("No images found")
            return pages[0]

        # copy() reads the pixel data, so the file handle can be closed here.
        with Image.open(file_path) as img:
            return img.copy()

    def process_document(self, file_path: str) -> Dict:
        """Process the first page of a document (PDF or image).

        Returns a dict with the keys ``filename``, ``status`` (``"success"``
        or ``"failed"``), ``processing_time`` (seconds), ``extracted_data``,
        ``raw_response`` (the model's reply text) and ``error``. Never
        raises: loading and inference failures are reported as
        ``status == "failed"`` with the message in ``error``.
        """
        start_time = time.time()

        result = {
            "filename": os.path.basename(file_path),
            "status": "processing",
            "processing_time": 0,
            "extracted_data": {},
            "raw_response": "",
            "error": None
        }

        try:
            # Load model if needed
            if self.model is None:
                self.load_model()

            image = self._load_first_page(file_path)

            print(f"Processing {result['filename']}...")
            # Call _generate directly (not process_image) so that inference
            # errors reach the handler below instead of counting as success.
            raw_response = self._generate(image)
            extracted = self.parse_response(raw_response)

            result.update({
                "status": "success",
                "processing_time": time.time() - start_time,
                "extracted_data": extracted,
                "raw_response": raw_response
            })

        except Exception as e:
            result.update({
                "status": "failed",
                "error": str(e),
                "processing_time": time.time() - start_time
            })

        return result

# Shared processor instance used by the cells below; the model itself is
# only loaded when a document is first processed.
processor = SimpleDeepSeekProcessor()
print(
    "✓ SimpleDeepSeekProcessor initialized",
    f"📝 Prompt: {processor.prompt}",
    sep="\n",
)

In [None]:
# Single-document smoke test
def test_document(file_path: str):
    """Run the shared ``processor`` on one file and print a short report.

    Returns the result dict produced by ``processor.process_document``, or
    ``None`` when ``file_path`` does not exist.
    """
    # Guard: nothing to do for a missing file.
    if not os.path.exists(file_path):
        print(f"❌ File not found: {file_path}")
        return None

    print(f"🔄 Testing DeepSeek on: {file_path}")
    print("=" * 50)

    result = processor.process_document(file_path)

    print(f"Status: {result['status']}")
    print(f"Processing time: {result['processing_time']:.2f}s")

    # Failed run: show the error and stop.
    if result['status'] != 'success':
        print(f"❌ Error: {result['error']}")
        return result

    print("\n📋 Extracted Data:")

    # Parsed cleanly: pretty-print the extracted fields.
    if 'parse_error' not in result['extracted_data']:
        print(json.dumps(result['extracted_data'], indent=2))
        return result

    # The model replied, but no JSON could be parsed: show the reply's start.
    print("⚠️ JSON parsing failed")
    print(f"Raw response: {result['extracted_data']['raw_response'][:300]}...")
    return result

# Sample document for the single-file smoke test
sample_file = "../../data/raw/_exampleforms/83501-000.pdf"

print(
    "💡 Ready to test!",
    "Uncomment the line below to test:",
    "# test_result = test_document(sample_file)",
    sep="\n",
)

# Uncomment to test:
# test_result = test_document(sample_file)

In [None]:
# Batch processing
def process_directory(input_dir: str) -> pd.DataFrame:
    """Process every PDF/PNG/JPEG found under ``input_dir`` (recursively).

    Files are handled in sorted path order, so the rows of the returned frame
    come out in the same order on every run and every filesystem.

    Returns a DataFrame with one row per file (the dicts returned by
    ``processor.process_document``), or an empty DataFrame when no matching
    file is found.
    """
    extensions = ('.pdf', '.png', '.jpg', '.jpeg')

    # Find files
    file_paths = []
    for root, _dirs, names in os.walk(input_dir):
        for name in names:
            if name.lower().endswith(extensions):
                file_paths.append(os.path.join(root, name))
    # os.walk yields entries in filesystem order; sort for reproducible output.
    file_paths.sort()

    if not file_paths:
        print(f"❌ No files found in {input_dir}")
        return pd.DataFrame()

    print(f"📁 Found {len(file_paths)} files")
    # Rough estimate: the code assumes about two minutes per file.
    print(f"⏱️ Estimated time: {len(file_paths) * 2:.0f} minutes")

    # Process files
    results = [
        processor.process_document(file_path)
        for file_path in tqdm(file_paths, desc="Processing contracts")
    ]

    # Convert to DataFrame
    df = pd.DataFrame(results)

    # Summary
    successful = (df['status'] == 'success').sum()
    print(f"\n📊 Results: {successful}/{len(df)} successful ({successful/len(df)*100:.1f}%)")

    return df

# Root folder scanned by the batch run
INPUT_DIR = "../../data/raw/_exampleforms"

print(
    f"Ready to process directory: {INPUT_DIR}",
    "Uncomment to start batch processing:",
    "# df_results = process_directory(INPUT_DIR)",
    sep="\n",
)

# Uncomment to process:
# df_results = process_directory(INPUT_DIR)

In [None]:
# Export results
def export_results(df: pd.DataFrame, output_dir: str = "./results"):
    """Write batch results to ``output_dir``.

    Produces three outputs:
      * ``deepseek_simple_results.csv`` - the full results frame;
      * ``extracted_json/<stem>.json`` - one file per successful row whose
        data parsed cleanly. Rows sharing a stem (``a.pdf`` / ``a.png``, or
        the same name in different folders) get ``_2``, ``_3``, ... suffixes
        instead of overwriting each other;
      * ``summary.txt`` - counts, success rate, average time and the prompt.

    Does nothing (apart from printing a notice) when ``df`` is empty.
    """
    if df.empty:
        print("No results to export")
        return

    os.makedirs(output_dir, exist_ok=True)

    # Export to CSV
    csv_path = os.path.join(output_dir, "deepseek_simple_results.csv")
    df.to_csv(csv_path, index=False)
    print(f"✓ Exported to CSV: {csv_path}")

    # Export extracted data as separate JSON files
    json_dir = os.path.join(output_dir, "extracted_json")
    os.makedirs(json_dir, exist_ok=True)

    # Names already written, lower-cased so the check also holds on
    # case-insensitive filesystems.
    used_names = set()

    for _, row in df.iterrows():
        if row['status'] != 'success' or 'parse_error' in row['extracted_data']:
            continue

        stem = os.path.splitext(row['filename'])[0]
        filename = stem + '.json'
        suffix = 2
        while filename.lower() in used_names:
            filename = f"{stem}_{suffix}.json"
            suffix += 1
        used_names.add(filename.lower())

        json_path = os.path.join(json_dir, filename)
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(row['extracted_data'], f, indent=2)

    print(f"✓ Exported JSON files to: {json_dir}")

    # Summary report
    successful = df[df['status'] == 'success']
    avg_time = successful['processing_time'].mean() if not successful.empty else 0

    summary_path = os.path.join(output_dir, "summary.txt")
    with open(summary_path, 'w', encoding='utf-8') as f:
        f.write("DeepSeek Simple Processing Summary\n")
        f.write(f"Total files: {len(df)}\n")
        f.write(f"Successful: {len(successful)}\n")
        f.write(f"Success rate: {len(successful)/len(df)*100:.1f}%\n")
        f.write(f"Average time: {avg_time:.1f}s\n")
        f.write(f"\nPrompt used:\n{processor.prompt}\n")

    print(f"✓ Summary saved to: {os.path.join(output_dir, 'summary.txt')}")

print("📁 Export functions ready")
print("Use: export_results(df_results) after processing")