In [None]:
# Install required packages (if not already installed)
# Each install passes -q to keep pip's output quiet.
!pip install requests -q
# NOTE(review): transformers is not imported anywhere in this cell — confirm it is still needed.
!pip install transformers -q
!pip install pillow -q

import requests
import json
import base64
from google.colab import files
import os
from PIL import Image
import io

# Function to upload an image in Colab
def upload_image():
  """Prompt for a file upload in Colab and return the first uploaded file name.

  Returns:
    The first key of the mapping returned by files.upload(). If several files
    are uploaded, only the first one is used.

  Raises:
    ValueError: if nothing was uploaded (for example the dialog was cancelled).
  """
  uploaded = files.upload()
  # An empty mapping would otherwise surface as an opaque IndexError.
  if not uploaded:
    raise ValueError("No file was uploaded; please run the cell again and select an image.")
  file_path = next(iter(uploaded))
  print(f"Uploaded image: {file_path}")
  return file_path

# Function to extract data from image using Hugging Face API
def extract_data_from_image(image_path, api_token):
  """Send an image to the Hugging Face Inference API and return a result dict.

  The primary model is probed with a GET request first. If the probe does not
  answer with HTTP 200, or the request itself fails, the BLIP captioning model
  is used instead.

  Args:
    image_path: Path of the image file to read and upload.
    api_token: Hugging Face API token, sent as a Bearer token.

  Returns:
    A dict with "status" ("success" or "error") and "model". On success it
    also holds "extracted_data" and "raw_response"; on failure it holds
    "error" with a description.

  Raises:
    OSError: if image_path cannot be opened (file access is not wrapped).
  """
  import mimetypes  # local import so this block does not depend on the import cell

  primary_model = "microsoft/git-base-textvqa"
  fallback_model = "Salesforce/blip-image-captioning-base"
  api_root = "https://api-inference.huggingface.co/models/"
  probe_timeout = 30    # seconds; without a timeout the probe could hang forever
  request_timeout = 60  # seconds

  # Read and encode image
  with open(image_path, "rb") as image_file:
    image_data = base64.b64encode(image_file.read()).decode('utf-8')

  # Label the data URI with the file's real type (e.g. image/png) instead of
  # always claiming JPEG; fall back to JPEG when the extension is unknown.
  mime_type, _ = mimetypes.guess_type(str(image_path))
  if not mime_type or not mime_type.startswith("image/"):
    mime_type = "image/jpeg"
  image_uri = f"data:{mime_type};base64,{image_data}"

  # Headers
  headers = {
    "Authorization": f"Bearer {api_token}",
    "Content-Type": "application/json"
  }

  # First, test if the primary model is accessible; otherwise fall back.
  model_id = primary_model
  try:
    test_response = requests.get(api_root + model_id, headers=headers, timeout=probe_timeout)
    if test_response.status_code != 200:
      print(f"Model check failed with status code: {test_response.status_code}")
      print("Trying alternative model...")
      model_id = fallback_model
  except requests.exceptions.RequestException as e:
    print(f"Model check error: {e}")
    model_id = fallback_model

  api_url = api_root + model_id
  print(f"Using model: {model_id}")

  # Prepare payload based on model type
  if "blip" in model_id:
    # BLIP models expect this format
    payload = {
      "inputs": {
        "image": image_uri
      },
      "parameters": {
        "task": "image-to-text"
      }
    }
  else:
    # Standard format for other models
    payload = {
      "inputs": {
        "image": image_uri,
        "prompt": "Extract the data from this image and return it in JSON format."
      }
    }

  # Make API call
  try:
    response = requests.post(api_url, headers=headers, json=payload, timeout=request_timeout)
    if response.status_code != 200:
      return {
        "status": "error",
        "error": f"{response.status_code} {response.reason}: {response.text}",
        "model": model_id
      }
    result = response.json()
  except (requests.exceptions.RequestException, ValueError) as e:
    # ValueError covers a 200 response whose body is not valid JSON.
    return {
      "status": "error",
      "error": str(e),
      "model": model_id
    }

  # Some models return a list of captions/descriptions; take the first
  # generated text when the element is a dict, otherwise keep a string form.
  extracted_data = result
  if isinstance(result, list) and len(result) > 0:
    first = result[0]
    if isinstance(first, dict):
      extracted_data = first.get("generated_text", str(result))
    else:
      extracted_data = str(result)

  return {
    "status": "success",
    "model": model_id,
    "extracted_data": extracted_data,
    "raw_response": result
  }

# Main execution cell
# Imported here so this cell runs without changes to the import block above.
from getpass import getpass

# 1. Get Hugging Face API token
# getpass() hides the typed value; input() would echo the secret into the
# saved notebook output.
api_token = getpass("Enter your Hugging Face API token: ").strip()

# 2. Upload an image
print("Please upload an image:")
image_path = upload_image()

# 3. Extract data from the image
print("Extracting data from image...")
result = extract_data_from_image(image_path, api_token)

# 4. Display the results
print("\nExtracted Data:")
print(json.dumps(result, indent=2))

# 5. Optionally save results to a file
save_output = input("Save output to file? (y/n): ").lower().strip() == 'y'
if save_output:
  output_file = "extracted_data.json"
  with open(output_file, 'w') as f:
    json.dump(result, f, indent=2)
  print(f"Results saved to {output_file}")
  # Download the file
  files.download(output_file)

Enter your Hugging Face API token: [REDACTED]
Please upload an image:


Saving table_World_Population_by_Region_screenshot.png to table_World_Population_by_Region_screenshot.png
Uploaded image: table_World_Population_by_Region_screenshot.png
Extracting data from image...
Using model: microsoft/git-base-textvqa

Extracted Data:
{
  "status": "error",
  "error": "503 Service Temporarily Unavailable: <!DOCTYPE html>\n<html class=\"\" lang=\"en\">\n<head>\n    <meta charset=\"utf-8\" />\n    <meta\n            name=\"viewport\"\n            content=\"width=device-width, initial-scale=1.0, user-scalable=no\"\n    />\n    <meta\n            name=\"description\"\n            content=\"We're on a journey to advance and democratize artificial intelligence through open source and open science.\"\n    />\n    <meta property=\"fb:app_id\" content=\"1321688464574422\" />\n    <meta name=\"twitter:card\" content=\"summary_large_image\" />\n    <meta name=\"twitter:site\" content=\"@huggingface\" />\n    <meta\n            property=\"og:title\"\n            content=\"Hug

In [None]:
# Install required system packages
# Refresh the apt package index before installing.
!apt-get update
# Tesseract OCR engine (-y answers the install prompt automatically);
# presumably the backend for the pytesseract package installed below.
!apt-get install -y tesseract-ocr
# Python packages installed for the OCR cell that follows.
!pip install pytesseract easyocr pillow numpy

Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:2 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:3 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:4 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Hit:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Get:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:8 http://security.ubuntu.com/ubuntu jammy-security/universe amd64 Packages [1,237 kB]
Hit:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Get:11 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [2,692 kB]
Hit:12 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:13 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Pac

In [None]:
# Import required packages

import os
import json
import numpy as np
from PIL import Image
import pytesseract
import easyocr
from google.colab import files
import io
import time

# JSON encoder that makes numpy data types serializable
class NumpyEncoder(json.JSONEncoder):
    """json.JSONEncoder subclass that converts numpy values to native Python types.

    Handles numpy booleans, integers, floating-point scalars and arrays.
    Anything else is passed to the base class, which raises TypeError.
    """

    def default(self, obj):
        """Return a JSON-serializable equivalent of obj.

        Raises:
            TypeError: (from the base class) when obj is not a supported type.
        """
        # np.bool_ is not a subclass of np.integer, so it needs its own branch.
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)

# Function to upload an image in Colab
def upload_image():
  """Prompt for a file upload in Colab and return the first uploaded file name.

  Returns:
    The first key of the mapping returned by files.upload(). If several files
    are uploaded, only the first one is used.

  Raises:
    ValueError: if nothing was uploaded (for example the dialog was cancelled).
  """
  uploaded = files.upload()
  # An empty mapping would otherwise surface as an opaque IndexError.
  if not uploaded:
    raise ValueError("No file was uploaded; please run the cell again and select an image.")
  file_path = next(iter(uploaded))
  print(f"Uploaded image: {file_path}")
  return file_path

def extract_data_from_image(image_path, min_confidence=0.2, row_band_px=20):
  """Run Tesseract and EasyOCR on an image and collect the results.

  Args:
    image_path: Path of the image file to process.
    min_confidence: EasyOCR detections must score strictly above this value
      to be kept.
    row_band_px: Height in pixels of the horizontal bands used to group
      detections into candidate table rows.

  Returns:
    {"status": "success", "extracted_data": {...}} where extracted_data holds
    "image_info", "text_extraction" and, when at least two bands contain
    three or more detections, "possible_tabular_data" (a list of rows, each a
    list of cell texts ordered left to right).
    On any exception: {"status": "error", "error": <message>}.
  """
  results = {}

  try:
    # The context manager releases the file handle once both OCR inputs
    # (the PIL image for Tesseract, the array for EasyOCR) have been produced.
    with Image.open(image_path) as source:
      # convert() returns a new image whose .format is None, so the on-disk
      # format has to be captured before converting.
      source_format = source.format
      # Convert image to RGB if it's in another mode (like RGBA)
      image = source if source.mode == 'RGB' else source.convert('RGB')

      width, height = image.size
      results["image_info"] = {
        "width": width,
        "height": height,
        "format": source_format,
        "mode": image.mode  # mode of the image actually passed to OCR
      }

      # Convert to numpy array for EasyOCR
      image_np = np.array(image)

      # Method 1: Use Tesseract OCR (good for clear text documents)
      print("Extracting text with Tesseract OCR...")
      tesseract_text = pytesseract.image_to_string(image)

    # Method 2: Use EasyOCR (often better for natural scene text)
    print("Extracting text with EasyOCR...")
    reader = easyocr.Reader(['en'])  # Initialize with English language
    easyocr_results = reader.readtext(image_np)

    # Keep confident detections, converting numpy values to native Python types
    easyocr_extracted = []
    for bbox, text, score in easyocr_results:
      if score > min_confidence:
        easyocr_extracted.append({
          "text": text,
          "confidence": float(score),
          "position": [[float(x) for x in point] for point in bbox]
        })

    # Prepare structured output
    results["text_extraction"] = {
      "tesseract": {
        "full_text": tesseract_text.strip(),
        "lines": [line for line in tesseract_text.split('\n') if line.strip()]
      },
      "easyocr": {
        "detections": easyocr_extracted,
        "extracted_text": " ".join(item["text"] for item in easyocr_extracted)
      }
    }

    # Try to detect tables.
    # Simplified heuristic: detections whose centres fall in the same
    # horizontal band are treated as one candidate row.
    print("Checking for possible tabular data...")
    bands = {}
    for item in easyocr_extracted:
      corners = item["position"]
      center_x = sum(p[0] for p in corners) / len(corners)
      center_y = sum(p[1] for p in corners) / len(corners)
      band = round(center_y / row_band_px) * row_band_px
      bands.setdefault(band, []).append((center_x, item["text"]))

    # Bands with 3+ items count as rows; cells are ordered left to right so a
    # row reads correctly even if detections arrive out of order.
    table_rows = []
    for band in sorted(bands):
      cells = bands[band]
      if len(cells) >= 3:
        ordered = sorted(cells, key=lambda cell: cell[0])
        table_rows.append([text for _, text in ordered])

    if len(table_rows) >= 2:
      results["possible_tabular_data"] = table_rows

    return {
      "status": "success",
      "extracted_data": results
    }

  except Exception as e:
    # Deliberate best-effort: callers inspect "status" instead of catching.
    return {
      "status": "error",
      "error": str(e)
    }

# Main execution: upload an image, run OCR on it, and report how long it took.
print("Please upload an image:")
image_path = upload_image()

print("Extracting data from image...")
print("This might take a minute for the OCR processing...")
start_time = time.time()
result = extract_data_from_image(image_path)
end_time = time.time()
print(f"Processing completed in {end_time - start_time:.2f} seconds")

# Summarise the outcome; failures are reported first, then the success path.
print("\nExtracted Data Summary:")
if result["status"] != "success":
  print(f"Error: {result['error']}")
else:
  data = result["extracted_data"]

  # Image dimensions
  image_info = data["image_info"]
  print(f"Image size: {image_info['width']}x{image_info['height']}")

  # Text previews, truncated to 150 characters per engine
  text_extraction = data["text_extraction"]
  tesseract_text = text_extraction["tesseract"]["full_text"]
  easyocr_text = text_extraction["easyocr"]["extracted_text"]

  print("\nText extracted (first 150 chars):")
  if tesseract_text:
    if len(tesseract_text) > 150:
      print(f"Tesseract: {tesseract_text[:150]}...")
    else:
      print(f"Tesseract: {tesseract_text}")

  if easyocr_text:
    if len(easyocr_text) > 150:
      print(f"EasyOCR: {easyocr_text[:150]}...")
    else:
      print(f"EasyOCR: {easyocr_text}")

  # Row count of the table heuristic, when it fired
  if "possible_tabular_data" in data:
    print(f"\nPossible table detected with {len(data['possible_tabular_data'])} rows")

# Serialise the full result; NumpyEncoder converts any numpy values.
formatted_json = json.dumps(result, indent=2, cls=NumpyEncoder)

# Offer to write the detailed results to disk and download them.
save_answer = input("\nSave complete output to file? (y/n): ")
save_output = save_answer.lower().strip() == 'y'
if save_output:
  output_file = "extracted_data.json"
  with open(output_file, 'w') as f:
    f.write(formatted_json)
  print(f"Complete results saved to {output_file}")
  files.download(output_file)

# Display HTML version of the content if requested
display_html = input("Generate HTML preview of extracted content? (y/n): ").lower().strip() == 'y'
if display_html and result["status"] == "success":
  from html import escape
  from IPython.display import HTML, display

  data = result["extracted_data"]
  image_info = data["image_info"]

  # OCR output is arbitrary text taken from the image, so it is escaped before
  # being placed in markup; otherwise "<", ">" or "&" in the recognised text
  # would be interpreted as HTML.
  safe_format = escape(str(image_info['format']))
  safe_tesseract = escape(data['text_extraction']['tesseract']['full_text'])
  safe_easyocr = escape(data['text_extraction']['easyocr']['extracted_text'])

  html_content = f"""
  <div style="font-family: Arial, sans-serif; max-width: 800px; margin: 0 auto; padding: 20px; border: 1px solid #ccc;">
    <h2>Extracted Content</h2>

    <h3>Image Information</h3>
    <p>Dimensions: {image_info['width']} x {image_info['height']} pixels</p>
    <p>Format: {safe_format}</p>

    <h3>Extracted Text</h3>
    <div style="margin-bottom: 20px;">
      <h4>Tesseract OCR</h4>
      <pre style="background-color: #f5f5f5; padding: 10px; border-radius: 5px; white-space: pre-wrap;">{safe_tesseract}</pre>
    </div>

    <div style="margin-bottom: 20px;">
      <h4>EasyOCR</h4>
      <pre style="background-color: #f5f5f5; padding: 10px; border-radius: 5px; white-space: pre-wrap;">{safe_easyocr}</pre>
    </div>
  """

  if "possible_tabular_data" in data:
    html_content += """
    <h3>Possible Tabular Data</h3>
    <table style="border-collapse: collapse; width: 100%; margin-top: 10px;">
    """

    for row in data["possible_tabular_data"]:
      html_content += "<tr>"
      for cell in row:
        html_content += f"<td style='border: 1px solid #ddd; padding: 8px;'>{escape(str(cell))}</td>"
      html_content += "</tr>"

    html_content += "</table>"

  html_content += "</div>"

  display(HTML(html_content))
elif display_html:
  # A preview was requested but there is nothing to render.
  print("HTML preview skipped because the extraction did not succeed.")

Please upload an image:


Saving table_World_Population_by_Region_screenshot.png to table_World_Population_by_Region_screenshot.png
Uploaded image: table_World_Population_by_Region_screenshot.png
Extracting data from image...
This might take a minute for the OCR processing...
Extracting text with Tesseract OCR...




Extracting text with EasyOCR...
Progress: |██████████████████████████████████████████████████| 100.0% Complete



Progress: |--------------------------------------------------| 0.0% CompleteProgress: |--------------------------------------------------| 0.1% CompleteProgress: |--------------------------------------------------| 0.1% CompleteProgress: |--------------------------------------------------| 0.2% CompleteProgress: |--------------------------------------------------| 0.2% CompleteProgress: |--------------------------------------------------| 0.3% CompleteProgress: |--------------------------------------------------| 0.4% CompleteProgress: |--------------------------------------------------| 0.4% CompleteProgress: |--------------------------------------------------| 0.5% CompleteProgress: |--------------------------------------------------| 0.5% CompleteProgress: |--------------------------------------------------| 0.6% CompleteProgress: |--------------------------------------------------| 0.6% CompleteProgress: |--------------------------------------------------| 0.7% Complet

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Generate HTML preview of extracted content? (y/n): y


0,1,2,3,4,5,6,7,8,9,10
Population,Yearly,Net,Density,Land Area,Migrants,Fert:,Med:,Urban,World,
Region,(2024),Change,Change,(PIKm?),(Km?),(net),Rate,Age,Share,Pop
Asia,4806898007,0.6 %,28893521,155,31033131,2335416,1.9,32,52.9 %,58.9 %
Africa,1515140849,2.32 %,34370324,29648481,644272,19,44.5 %,18.6 %,,
Europe,745083824,0.07 %,-519051,22134900,1566027,1.4,43 75.6 %,9.1 %,,
Latin America and the Caribbean,663466072,0.69 %,4574555,33,20139378,382944,1.8,31,85.2 %,8.1 %
Northern America,385295105,0.62 %,2392363,18651660,1654440,1.6,39,82.2 %,4.7 %,
Oceania,46088716,1.15 %,525929,8486460,142167,2.1,33,66.1 %,0.6 %,


In [None]:
# Extract data using the Google Gen AI (Gemini) API.
# The API key is supplied at runtime (environment variable or hidden prompt)
# and must never be written into the notebook source.
# NOTE(review): a key was previously pasted into this cell; treat it as
# compromised and revoke it.
import os
from getpass import getpass

from google import genai
from google.genai import types

import PIL.Image

image = PIL.Image.open('/content/table_World_Population_2025_and_historical_screenshot.png')

# Prefer a key already present in the environment; otherwise ask without echoing.
gemini_api_key = os.environ.get("GEMINI_API_KEY") or getpass("Enter your Gemini API key: ")

client = genai.Client(api_key=gemini_api_key)
response = client.models.generate_content(
    model="gemini-2.0-flash-lite-001",
    contents=["Provide a json output for this image.", image])

print(response.text)


```json
[
  {
    "Year (July 1)": 2025,
    "Population": 8231613070,
    "Yearly % Change": "0.85%",
    "Yearly Change": 69640498,
    "Median Age": 30.9,
    "Fertility Rate": 2.24,
    "Density (P/Km²)": 55
  },
  {
    "Year (July 1)": 2024,
    "Population": 8161972572,
    "Yearly % Change": "0.87%",
    "Yearly Change": 70237642,
    "Median Age": 30.6,
    "Fertility Rate": 2.25,
    "Density (P/Km²)": 55
  },
  {
    "Year (July 1)": 2023,
    "Population": 8091734930,
    "Yearly % Change": "0.88%",
    "Yearly Change": 70327738,
    "Median Age": 30.4,
    "Fertility Rate": 2.25,
    "Density (P/Km²)": 54
  },
  {
    "Year (July 1)": 2022,
    "Population": 8021407192,
    "Yearly % Change": "0.84%",
    "Yearly Change": 66958801,
    "Median Age": 30.1,
    "Fertility Rate": 2.27,
    "Density (P/Km²)": 54
  },
  {
    "Year (July 1)": 2021,
    "Population": 7954448391,
    "Yearly % Change": "0.86%",
    "Yearly Change": 67447099,
    "Median Age": 29.8,
    "Fertility