# Handwritten Museum Collections — Azure OCR Notebook

In [None]:
!pip -q install azure-storage-blob azure-ai-vision-imageanalysis pandas matplotlib wordcloud pillow PyMuPDF pdf2image

In [None]:
import os, io, re, json
import pandas as pd
from azure.storage.blob import BlobServiceClient
from azure.ai.vision.imageanalysis import ImageAnalysisClient
from azure.ai.vision.imageanalysis.models import VisualFeatures
from azure.core.credentials import AzureKeyCredential
from PIL import Image
import fitz  # PyMuPDF

# Azure settings: each value is read from the environment, and the literal
# placeholder string is used when the variable is not set.
AZURE_STORAGE_CONNECTION_STRING = os.environ.get(
    "AZURE_STORAGE_CONNECTION_STRING",
    "YOUR_BLOB_STORAGE_CONNECTION_STRING",
)
AZURE_VISION_ENDPOINT = os.environ.get(
    "AZURE_VISION_ENDPOINT",
    "https://YOUR-RESOURCE-NAME.cognitiveservices.azure.com",
)
AZURE_VISION_KEY = os.environ.get("AZURE_VISION_KEY", "YOUR_VISION_KEY")

# Service clients shared by the cells below.
blob_service = BlobServiceClient.from_connection_string(AZURE_STORAGE_CONNECTION_STRING)
vision_credential = AzureKeyCredential(AZURE_VISION_KEY)
vision_client = ImageAnalysisClient(endpoint=AZURE_VISION_ENDPOINT, credential=vision_credential)

container_name = "museum-records"  # change if needed
container_client = blob_service.get_container_client(container_name)

# Print the names of at most five blobs as a quick connectivity check.
# zip() stops once range(5) is exhausted, so the listing is not fully consumed.
sample_names = []
for position, listed_blob in zip(range(5), container_client.list_blobs()):
    sample_names.append(listed_blob.name)
print("Sample listing (first 5):", sample_names)

In [None]:
def pdf_bytes_to_images(pdf_bytes, zoom=2.0):
    """Render every page of an in-memory PDF to PNG-encoded bytes.

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        zoom: Scale factor applied to both axes when rasterizing each page.

    Returns:
        A list with one PNG byte string per page, in page order.
    """
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        mat = fitz.Matrix(zoom, zoom)
        return [
            page.get_pixmap(matrix=mat, alpha=False).tobytes("png")
            for page in doc
        ]
    finally:
        # Close the document even when a page fails to render, so the
        # handle is not leaked across a long batch run.
        doc.close()

def ocr_image_bytes(img_bytes: bytes) -> str:
    """Run the READ feature on an image and return its text as one string.

    Args:
        img_bytes: Encoded image bytes passed to the vision client.

    Returns:
        The text of every detected line, joined with single spaces and
        stripped. An empty string when no text blocks are returned.
    """
    result = vision_client.analyze(image_data=img_bytes, visual_features=[VisualFeatures.READ])
    read_result = result.read
    if not read_result or not read_result.blocks:
        return ""
    # Walk every block rather than only the first one, so text is not
    # silently dropped if the service returns more than one block.
    text = [
        line.text
        for block in read_result.blocks
        for line in (block.lines or [])
    ]
    return " ".join(text).strip()

# OCR every supported blob in the container: PDFs are rasterized page by
# page, single images are sent as-is. One result row is produced per page.
IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png", ".tif", ".tiff", ".bmp", ".gif", ".webp")

results = []
for blob in container_client.list_blobs():
    name_lower = blob.name.lower()
    is_pdf = name_lower.endswith(".pdf")
    if not is_pdf and not name_lower.endswith(IMAGE_EXTENSIONS):
        # Unsupported file type: skip it before downloading anything.
        continue

    bclient = container_client.get_blob_client(blob)
    data = bclient.download_blob().readall()
    if is_pdf:
        pages = pdf_bytes_to_images(data)
        for i, img_b in enumerate(pages, start=1):
            print(f"OCR: {blob.name} page {i}")
            text = ocr_image_bytes(img_b)
            results.append({"source_file": blob.name, "page": i, "raw_ocr_text": text})
    else:
        print(f"OCR: {blob.name}")
        text = ocr_image_bytes(data)
        results.append({"source_file": blob.name, "page": 1, "raw_ocr_text": text})

# Name the columns explicitly so the frame keeps its schema even when
# nothing was OCRed.
df_raw_ocr = pd.DataFrame(results, columns=["source_file", "page", "raw_ocr_text"])
print("Pages OCRed:", len(df_raw_ocr))
df_raw_ocr.head()

In [None]:
# Work on a copy of the raw OCR table and make sure every catalogue field
# exists as a column, seeded with an empty string.
df = df_raw_ocr.copy()
catalogue_fields = (
    "accession_number",
    "object_name",
    "provenance",
    "site_location",
    "materials",
    "dimensions",
    "date",
    "notes",
)
for field_name in catalogue_fields:
    if field_name in df.columns:
        continue
    df[field_name] = ""

import re

# One regex per catalogue field. Every pattern has exactly two capturing
# groups: group 1 is the label, group 2 is the value that follows it.
# A leading \b keeps a label from matching inside a longer word
# (e.g. "site" inside "website", "date" inside "update").
# NOTE: value groups are length-capped but otherwise greedy, so on text where
# several labels share one line a value can run into the next label.
FIELD_PATTERNS = {
    # The optional "number" / "no." / "#" suffix is consumed as part of the
    # label so it is not captured as the accession value itself.
    "accession_number": (
        r"\b(accession(?:\s*(?:(?:number|no)\b\.?|#))?"
        r"|acc\.?\s*no\b\.?"
        r"|catalog(?:ue)?\s*no\b\.?"
        r"|cat\.?\s*no\b\.?)"
        r"[:\s]*([A-Za-z0-9\-./]+)"
    ),
    "object_name":      r"\b(object\s*name|artifact|item)[:\s]+(.{1,60})",
    "provenance":       r"\b(provenance|provenience)[:\s]+(.{1,200})",
    "site_location":    r"\b(site|find\s*spot|location)[:\s]+(.{1,120})",
    "materials":        r"\b(materials?|medium)[:\s]+(.{1,120})",
    "dimensions":       r"\b(dimensions?|size)[:\s]+(.{1,120})",
    "date":             r"\b(date|dated)[:\s]+(.{1,60})",
    "notes":            r"\b(notes?|remarks?)[:\s]+(.{1,240})",
}


def extract_field(text: str, key: str):
    """Return the value captured for field *key* in *text*.

    Args:
        text: OCR text to search. Non-string input yields "".
        key: A key of FIELD_PATTERNS.

    Returns:
        The value group of the first case-insensitive match, with leading
        and trailing spaces, dots, semicolons and colons removed, or ""
        when the pattern does not match.

    Raises:
        KeyError: If *key* is not a key of FIELD_PATTERNS.
    """
    pattern = FIELD_PATTERNS[key]
    if not isinstance(text, str):
        return ""
    m = re.search(pattern, text, flags=re.IGNORECASE)
    return m.group(m.lastindex).strip(" .;:") if m else ""

# Fill each catalogue column from the row's OCR text. A cell is only
# overwritten when the extractor returned a non-empty value.
for row_label, record in df.iterrows():
    ocr_text = (record.get("raw_ocr_text") or "").strip()
    extracted = {name: extract_field(ocr_text, name) for name in FIELD_PATTERNS}
    for name, value in extracted.items():
        if value:
            df.at[row_label, name] = value

df.head()

In [None]:
def needs_review(row):
    """Decide whether a record should be flagged for manual review.

    A record is flagged when its "accession_number" is missing or falsy, or
    when its "raw_ocr_text" is a string shorter than 50 characters.

    Args:
        row: Mapping-like record supporting ``.get`` (e.g. a DataFrame row).

    Returns:
        True when the record needs review, otherwise False.
    """
    has_accession = bool(row.get("accession_number"))
    ocr_text = row.get("raw_ocr_text")
    text_too_short = isinstance(ocr_text, str) and len(ocr_text) < 50
    return (not has_accession) or text_too_short

# Evaluate the review rule row by row, store the flag, then preview the
# flagged records.
review_flags = df.apply(needs_review, axis=1)
df["needs_review"] = review_flags
df[df["needs_review"]].head()

In [None]:
import re

def tidy(s):
    """Normalize the whitespace of a single cell value.

    Missing values (as judged by ``pd.isna``) are returned unchanged. Any
    other value is converted to ``str``, stripped at both ends, and every
    run of two or more whitespace characters is replaced by one space.
    """
    if pd.isna(s):
        return s
    trimmed = str(s).strip()
    return re.sub(r"\s{2,}", " ", trimmed)

# Tidy the whitespace of every catalogue column that is present.
text_columns = [
    "accession_number",
    "object_name",
    "provenance",
    "site_location",
    "materials",
    "dimensions",
    "date",
    "notes",
]
for column in text_columns:
    if column not in df.columns:
        continue
    df[column] = df[column].apply(tidy)

# Object names are title-cased for display.
if "object_name" in df.columns:
    titled_names = df["object_name"].str.title()
    df["object_name"] = titled_names

# Write the cleaned table in both formats; the final string is the cell's
# displayed value.
df.to_csv("museum_handwritten_records_azure.csv", index=False)
df.to_json("museum_handwritten_records_azure.json", orient="records", indent=2, force_ascii=False)
"Exported museum_handwritten_records_azure.csv and museum_handwritten_records_azure.json"

In [None]:
import matplotlib.pyplot as plt
from collections import Counter
import string

def top_counts(series, n=10, title="Top values"):
    """Display and plot the *n* most frequent non-empty values of *series*.

    Values are converted to ``str`` and stripped before counting. Missing
    values and empty strings are ignored, so fields that were never filled
    in do not show up as the most common "value".

    Args:
        series: Column to summarize.
        n: Maximum number of distinct values to show.
        title: Chart title.

    Returns:
        None. The counts are displayed and a horizontal bar chart is shown.
        When there is nothing to count, a message is printed instead.
    """
    cleaned = series.dropna().astype(str).str.strip()
    cleaned = cleaned[cleaned != ""]
    vc = cleaned.value_counts().head(n)
    if vc.empty:
        # Nothing to plot; skip the chart instead of plotting an empty series.
        print(f"{title}: no non-empty values to plot")
        return
    display(vc)
    plt.figure()
    # Ascending sort puts the largest bar at the top of a barh chart.
    vc.sort_values().plot(kind="barh", edgecolor="black")
    plt.title(title)
    plt.xlabel("Count")
    plt.ylabel(series.name or "Value")
    plt.tight_layout()
    plt.show()

# One frequency chart per field of interest; fields absent from the table
# are skipped.
chart_titles = {
    "materials": "Top 10 Materials",
    "object_name": "Top 10 Object Names",
    "provenance": "Top 10 Provenance Phrases (rough)",
}
for field_name, chart_title in chart_titles.items():
    if field_name not in df.columns:
        continue
    top_counts(df[field_name], n=10, title=chart_title)

# Take the first run of four digits in each date string as the year and plot
# how many records fall in each year.
if "date" in df.columns:
    date_text = df["date"].astype(str)
    df["year"] = date_text.str.extract(r"(\d{4})", expand=False)
    year_counts = df["year"].value_counts().sort_index()
    has_years = not year_counts.empty
    if has_years:
        plt.figure()
        year_counts.plot(kind="line", marker="o")
        plt.title("Objects by Year")
        plt.xlabel("Year")
        plt.ylabel("Count")
        plt.tight_layout()
        plt.show()

# Count missing values per column. Catalogue fields are seeded with "" rather
# than NaN, so blank or whitespace-only cells are counted as missing too;
# otherwise every text field would report zero.
is_blank = df.apply(lambda column: column.astype(str).str.strip().eq(""))
missing = (df.isna() | is_blank).sum().sort_values(ascending=True)
if missing.empty:
    # No columns at all: nothing to chart.
    print("Missing Values by Field: no columns to plot")
else:
    plt.figure()
    missing.plot(kind="barh", edgecolor="black")
    plt.title("Missing Values by Field")
    plt.xlabel("Missing count")
    plt.tight_layout()
    plt.show()