<a href="https://colab.research.google.com/github/viviandonohoe/machinelearningproject/blob/main/ml_spanishnotarialdocuments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This is the code I'm working on for optimizing OCR on Spanish notarial documents.


All imports are collected in one cell up front so the notebook's dependencies are visible in one place:

In [None]:
import cv2
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    classification_report
)
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import os
import collections
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm


Data loading: clone the dataset repository and read the character images

In [None]:
!git clone https://github.com/viviandonohoe/DeepLearningSpanishAmericanCopy.git

In [None]:
# Show the top-level contents of the cloned dataset repository.
!ls /content/DeepLearningSpanishAmericanCopy/

In [None]:

# Locations of the two dataset folders inside the cloned repository.
repo_path = "/content/DeepLearningSpanishAmericanCopy"
dataset_path, character_dataset_path = (
    os.path.join(repo_path, folder_name)
    for folder_name in ("Dataset", "CharactersDataset")
)

# Print what each folder contains.
for folder in (dataset_path, character_dataset_path):
    print(os.listdir(folder))

In [None]:
def load_dataset(root_path, size=(32,32)):
    """Load a folder-per-label image dataset as flattened grayscale vectors.

    Every sub-directory of ``root_path`` is treated as one class, and its name
    becomes the label of the images stored directly inside it.

    Args:
        root_path: Directory containing one sub-directory per label.
        size: Target size handed to ``cv2.resize`` (``(width, height)``).

    Returns:
        A tuple ``(X, y)`` of NumPy arrays: ``X`` holds one flattened image per
        row and ``y`` the matching folder-name labels. Both are empty when no
        image could be read.
    """
    X = []
    y = []
    for label in sorted(os.listdir(root_path)):
        label_path = os.path.join(root_path, label)
        if not os.path.isdir(label_path):
            continue

        # Sorted so the sample order does not depend on the filesystem's
        # directory-listing order.
        for filename in sorted(os.listdir(label_path)):
            if filename in [".DS_Store", "__MACOSX"]:
                continue
            file_path = os.path.join(label_path, filename)
            # Nested folders are not images; skip them before calling OpenCV.
            if not os.path.isfile(file_path):
                continue
            img = cv2.imread(file_path, cv2.IMREAD_GRAYSCALE)
            if img is None:
                # cv2.imread returns None for files it cannot decode.
                continue
            X.append(cv2.resize(img, size).flatten())
            y.append(label)
    return np.array(X), np.array(y)

In [None]:
# load_dataset expects a folder whose sub-folders are the class labels, so it
# is pointed at each split folder rather than at their common parent.
train_path = os.path.join(character_dataset_path, "TrainingSet")
test_path = os.path.join(character_dataset_path, "TestSet")

X_train, y_train = load_dataset(train_path)
X_test, y_test = load_dataset(test_path)

# load_dataset skips unreadable files silently, so fail here rather than
# later inside a model if a split came back empty.
assert len(X_train) > 0, f"No training images loaded from {train_path}"
assert len(X_test) > 0, f"No test images loaded from {test_path}"

# Per-label image counts of the training split, used by the bar chart below.
label_counts = collections.Counter(y_train)

labels = sorted(label_counts.keys())
counts = [label_counts[label] for label in labels]


In [None]:
# Bar chart of how many training images each label has.
fig, ax = plt.subplots(figsize=(12, 5))
ax.bar(labels, counts)
ax.set_title("Number of Images per Letter")
ax.set_xlabel("Letter")
ax.set_ylabel("Count")
ax.tick_params(axis="x", labelrotation=0)
plt.show()


# Report the array shapes of both splits.
for split_name, features, targets in (
    ("Train:", X_train, y_train),
    ("Test:", X_test, y_test),
):
    print(split_name, features.shape, "Labels:", targets.shape)

KNN Approach

In [None]:
# Baseline: 4-nearest-neighbour vote on the flattened pixel vectors.
knn = KNeighborsClassifier(n_neighbors=4)
knn = knn.fit(X_train, y_train)  # fit() returns the estimator itself

y_pred_knn = knn.predict(X_test)
print("Test accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_knn))

KNN Approach with added PCA Dimensionality Reduction

In [None]:
# Fit PCA on the training split only and reuse that projection for the test
# split. random_state is pinned because PCA may select a randomized SVD solver,
# which would otherwise make the reported accuracy vary between runs.
pca = PCA(n_components=50, random_state=42)
X_train_PCA = pca.fit_transform(X_train)
X_test_PCA = pca.transform(X_test)

# Same 4-nearest-neighbour classifier as the baseline, on the reduced features.
knnwithpca = KNeighborsClassifier(n_neighbors=4)
knnwithpca.fit(X_train_PCA, y_train)

y_pred_pca = knnwithpca.predict(X_test_PCA)
print("Test accuracy (using PCA):", accuracy_score(y_test, y_pred_pca))

CNN Approach (using PyTorch)

In [None]:
class CharacterDataset(Dataset):
    """Folder-per-label grayscale image dataset for PyTorch.

    Every sub-directory of ``root_path`` is one class; the files directly
    inside it are that class's samples. Images are read lazily in
    ``__getitem__``.

    Args:
        root_path: Directory containing one sub-directory per label.
        img_size: Target size handed to ``cv2.resize`` (``(width, height)``).
        label_encoder: Optional already-fitted encoder exposing ``transform``.
            Pass the training set's encoder when building the test set so both
            share one label-to-index mapping. When omitted, a new
            ``LabelEncoder`` is fitted on this dataset's labels.
    """

    # Names filtered out of every label folder before indexing.
    _IGNORED_NAMES = (".DS_Store", "__MACOSX")

    def __init__(self, root_path, img_size=(32, 32), label_encoder=None):
        self.image_paths = []
        self.labels = []
        self.img_size = img_size

        for label in sorted(os.listdir(root_path)):
            label_path = os.path.join(root_path, label)
            if not os.path.isdir(label_path):
                continue
            # Sorted so sample indices are stable across filesystems.
            for filename in sorted(os.listdir(label_path)):
                if filename in self._IGNORED_NAMES:
                    continue
                file_path = os.path.join(label_path, filename)
                # Only regular files can be images; a nested folder would
                # otherwise be indexed and fail when it is loaded.
                if not os.path.isfile(file_path):
                    continue
                self.image_paths.append(file_path)
                self.labels.append(label)

        if label_encoder is None:
            self.label_encoder = LabelEncoder()
            self.label_encoder.fit(self.labels)
        else:
            self.label_encoder = label_encoder

        self.encoded_labels = self.label_encoder.transform(self.labels)

    def __len__(self):
        """Return the number of indexed image files."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for sample ``idx``.

        The image is a float32 tensor with a leading channel dimension and
        pixel values divided by 255; the label is the encoded class index as
        a long tensor.

        Raises:
            RuntimeError: If OpenCV cannot decode the file.
        """
        path = self.image_paths[idx]
        img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
        if img is None:
            # cv2.imread signals failure by returning None; name the file
            # instead of letting cv2.resize fail with an opaque error.
            raise RuntimeError(f"Could not read image: {path}")
        img = cv2.resize(img, self.img_size)
        img = img.astype(np.float32) / 255.0
        img = torch.tensor(img).unsqueeze(0)
        label = torch.tensor(self.encoded_labels[idx], dtype=torch.long)
        return img, label

def train_and_evaluate(model, model_name, train_loader, test_loader,
                       label_encoder, device, epochs=10, lr=0.001):
    """Train ``model`` with Adam and cross-entropy, then score it on the test set.

    Args:
        model: ``nn.Module`` producing one score per class for each input.
        model_name: Name used in the progress and result messages.
        train_loader: Iterable of ``(images, labels)`` training batches.
        test_loader: Iterable of ``(images, labels)`` evaluation batches.
        label_encoder: Not used by this function; kept so existing calls
            keep working.
        device: Torch device the model and batches are moved to.
        epochs: Number of passes over ``train_loader``.
        lr: Learning rate for the Adam optimizer.

    Returns:
        dict with keys ``model_name``, ``test_accuracy`` and
        ``train_accuracy`` (percentages), ``parameters`` (total parameter
        count), ``model`` (the trained model on ``device``), and ``y_true`` /
        ``y_pred`` (per-sample encoded labels and predictions on the test
        set). An accuracy is NaN when no sample was available to compute it.
    """
    print(f"Training: {model_name}")

    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # Stays NaN if no epoch runs, so the returned dict is always well-formed.
    train_acc = float("nan")

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        correct = 0
        total = 0
        num_batches = 0

        for images, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
            images, labels = images.to(device), labels.to(device)

            outputs = model(images)
            loss = criterion(outputs, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            _, predicted = outputs.max(1)
            correct += predicted.eq(labels).sum().item()
            total += labels.size(0)
            num_batches += 1

        # Guard the divisions so an empty loader reports NaN instead of
        # raising ZeroDivisionError.
        train_acc = 100. * correct / total if total else float("nan")
        avg_loss = total_loss / num_batches if num_batches else float("nan")
        print(f"Epoch {epoch+1}: Loss={avg_loss:.4f}, Train Acc={train_acc:.2f}%")

    # Evaluation mode disables dropout; no_grad skips gradient bookkeeping.
    model.eval()

    correct = 0
    total = 0

    y_true = []
    y_pred = []

    with torch.no_grad():
        for images, labels in tqdm(test_loader, desc="Testing"):
            images = images.to(device)
            labels = labels.to(device)

            outputs = model(images)
            preds = torch.argmax(outputs, dim=1)

            correct += preds.eq(labels).sum().item()
            total += labels.size(0)

            y_true.extend(labels.cpu().numpy())
            y_pred.extend(preds.cpu().numpy())

    test_acc = 100. * correct / total if total else float("nan")
    params = sum(p.numel() for p in model.parameters())

    print(f"\n{model_name} Results:")
    print(f"  Test Accuracy: {test_acc:.2f}%")
    print(f"  Parameters: {params:,}")

    return {
        'model_name': model_name,
        'test_accuracy': test_acc,
        'train_accuracy': train_acc,
        'parameters': params,
        'model': model,
        'y_true': y_true,
        'y_pred': y_pred
    }

In [None]:
train_dataset = CharacterDataset(train_path)
test_dataset = CharacterDataset(test_path, label_encoder=train_dataset.label_encoder)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

num_classes = len(train_dataset.label_encoder.classes_)
print(f"Number of classes: {num_classes}")
print(f"Classes: {train_dataset.label_encoder.classes_}")

shallow_cnn = nn.Sequential(
    nn.Conv2d(1, 32, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
    nn.Conv2d(32, 64, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
    nn.Flatten(),
    nn.Linear(64 * 8 * 8, 256), nn.ReLU(), nn.Dropout(0.5),
    nn.Linear(256, num_classes)
)

basic_cnn = nn.Sequential(
    nn.Conv2d(1, 32, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
    nn.Conv2d(32, 64, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
    nn.Conv2d(64, 128, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
    nn.Flatten(),
    nn.Linear(128 * 4 * 4, 256), nn.ReLU(), nn.Dropout(0.5),
    nn.Linear(256, num_classes)
)

deep_cnn = nn.Sequential(
    nn.Conv2d(1, 32, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
    nn.Conv2d(32, 64, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
    nn.Conv2d(64, 128, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
    nn.Conv2d(128, 256, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
    nn.Flatten(),
    nn.Linear(256 * 2 * 2, 512), nn.ReLU(), nn.Dropout(0.5),
    nn.Linear(512, num_classes)
)

models = {
    'Shallow CNN (2 layers)': shallow_cnn,
    'Basic CNN (3 layers)': basic_cnn,
    'Deep CNN (4 layers)': deep_cnn
}

results = []

for name, model in models.items():
    result = train_and_evaluate(
        model, name,
        train_loader, test_loader,
        train_dataset.label_encoder, device,
        epochs=10
    )

    results.append(
        compute_metrics(
            result["y_true"],
            result["y_pred"],
            name
        )
    )


In [None]:
def compute_metrics(y_true, y_pred, model_name):
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average="weighted", zero_division=0
    )

    return {
        "Model": model_name,
        "Accuracy": accuracy_score(y_true, y_pred),
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1
    }


In [None]:

# Build the table from a fresh list instead of appending to `results`, so
# re-running this cell does not add duplicate KNN rows.
knn_results = [
    compute_metrics(y_test, y_pred_knn, "KNN (k=4)"),
    compute_metrics(y_test, y_pred_pca, "KNN + PCA (50 comps)"),
]

# Best model first, ranked by weighted F1.
results_df = pd.DataFrame(results + knn_results).sort_values(
    "F1 Score", ascending=False
)

results_df


In [None]:
metrics = ["Accuracy", "Precision", "Recall", "F1 Score"]

# Grouped bars: one group per model, one bar per metric.
fig, ax = plt.subplots(figsize=(12, 6))
results_df.set_index("Model")[metrics].plot(
    kind="bar",
    ax=ax,
    rot=0  # horizontal labels, centred under their bar group
)

ax.set_title("Model Performance Comparison")
ax.set_ylabel("Score")
ax.set_ylim(0, 1)  # every metric here is a fraction in [0, 1]
fig.tight_layout()
plt.show()
