In [2]:
import os
import shutil
import cv2
import pandas as pd
import random
import numpy as np
import torchvision.transforms as transforms
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt


from torchvision import transforms
from torch.utils.data import DataLoader
from torchvision.datasets import ImageFolder
from ultralytics import YOLO
from PIL import Image


In [35]:

def get_character_boxes(img, min_width=5, min_height=10):
    """Find candidate character bounding boxes in a BGR image.

    The image is converted to grayscale, binarised with an inverted Otsu
    threshold, cleaned with one 3x3 morphological opening, and the bounding
    rectangle of every external contour is collected.

    Args:
        img: Image array accepted by ``cv2.cvtColor(..., COLOR_BGR2GRAY)``.
        min_width: Rectangles narrower than this are discarded.
        min_height: Rectangles shorter than this are discarded.

    Returns:
        List of ``(x, y, w, h)`` tuples sorted by ``x`` (left to right).
    """
    grayscale = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    threshold_mode = cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
    _, binary = cv2.threshold(grayscale, 0, 255, threshold_mode)

    # Opening removes specks smaller than the 3x3 structuring element.
    structuring = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
    cleaned = cv2.morphologyEx(binary, cv2.MORPH_OPEN, structuring, iterations=1)

    contours, _ = cv2.findContours(cleaned, cv2.RETR_EXTERNAL,
                                   cv2.CHAIN_APPROX_SIMPLE)
    rects = (cv2.boundingRect(contour) for contour in contours)
    kept = [
        (x, y, w, h)
        for (x, y, w, h) in rects
        if w >= min_width and h >= min_height
    ]
    kept.sort(key=lambda rect: rect[0])
    return kept


def get_fixed_boxes(img, num_chars, min_width=5, min_height=10):
    """Split an image into ``num_chars`` equal-width, full-height boxes.

    The last slice absorbs the columns left over by the integer division,
    so the boxes always cover the full image width.

    Args:
        img: Array whose ``shape[:2]`` is ``(height, width)``.
        num_chars: Number of vertical slices requested.
        min_width: Slices narrower than this are dropped.
        min_height: If the image is shorter than this, no box is returned.

    Returns:
        List of ``(x, y, w, h)`` tuples ordered left to right. Empty when
        ``num_chars`` is zero or negative.
    """
    # An empty label would otherwise divide by zero below.
    if num_chars <= 0:
        return []

    h, w = img.shape[:2]
    slice_w = w // num_chars
    boxes = []
    for i in range(num_chars):
        x1 = i * slice_w
        # The final slice runs to the right edge to absorb the remainder.
        x2 = w if i == num_chars - 1 else x1 + slice_w
        w_box = x2 - x1
        if w_box >= min_width and h >= min_height:
            boxes.append((x1, 0, w_box, h))
    return boxes


def normalize_and_pad(crop, size=32, border_color=(255,255,255)):
    """Resize a crop to fit a ``size`` x ``size`` square and pad it to that size.

    The crop is scaled (up or down) by a single factor so its longer side
    fits ``size``, then centred by adding a constant-colour border.

    Args:
        crop: Image array; ``shape[:2]`` is read as ``(height, width)``.
        size: Side length of the square output.
        border_color: Colour passed to ``cv2.copyMakeBorder`` as the fill value.

    Returns:
        The padded image returned by ``cv2.copyMakeBorder``.
    """
    src_h, src_w = crop.shape[:2]

    # One common factor keeps the aspect ratio.
    factor = min(size / src_h, size / src_w)
    # Truncation could produce 0 for very thin crops; clamp to one pixel.
    dst_w = max(1, int(src_w * factor))
    dst_h = max(1, int(src_h * factor))
    scaled = cv2.resize(crop, (dst_w, dst_h), interpolation=cv2.INTER_AREA)

    # Split the leftover space; any odd pixel goes to the bottom/right.
    pad_x = size - dst_w
    pad_y = size - dst_h
    pad_top = pad_y // 2
    pad_left = pad_x // 2
    return cv2.copyMakeBorder(scaled,
                              pad_top, pad_y - pad_top,
                              pad_left, pad_x - pad_left,
                              cv2.BORDER_CONSTANT, value=list(border_color))


def segment_and_save_split(csv_path, img_dir, output_base_dir,
                           train_ratio=0.8, max_width=25, seed=42):
    """Segment labelled CAPTCHA images into per-character crops, split train/val.

    Reads ``csv_path`` (columns ``filename`` and ``label``), loads each image
    from ``img_dir``, assigns the whole image to ``train`` or ``val`` at random
    and writes one 32x32 grayscale PNG per character to
    ``output_base_dir/<split>/<char>/<basename>_<index>.png``.

    Args:
        csv_path: CSV file with ``filename`` and ``label`` columns.
        img_dir: Directory containing the files named in the CSV.
        output_base_dir: Root under which ``train/`` and ``val/`` are created.
        train_ratio: Probability that an image is assigned to ``train``.
        max_width: Accepted for interface compatibility; not used by the body.
            NOTE(review): a maximum-width filter seems to have been intended —
            confirm before implementing.
        seed: Seed applied to the global ``random`` module.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    # Honour the caller-supplied seed (it used to be hard-coded to 42).
    random.seed(seed)
    # Read labels as text so digit-only labels keep their leading zeros.
    df = pd.read_csv(csv_path, dtype={'label': str})

    # Create one folder per split and per character class.
    classes = sorted({c for lbl in df['label'].astype(str) for c in lbl})
    for split in ('train', 'val'):
        for char in classes:
            os.makedirs(os.path.join(output_base_dir, split, char), exist_ok=True)

    for _, row in df.iterrows():
        fname = row['filename']
        label = str(row['label'])
        img_path = os.path.join(img_dir, fname)
        img = cv2.imread(img_path)
        if img is None:
            print(f"Aviso: não encontrei {img_path}")
            continue

        img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

        # Draw the split per image so all of its characters stay together.
        split = 'train' if random.random() < train_ratio else 'val'

        # Try contour segmentation first; fall back to equal-width slices
        # when it does not give exactly one box per character.
        boxes = get_character_boxes(img)
        if len(boxes) != len(label):
            boxes = get_fixed_boxes(img, len(label))

        # If the fallback also disagrees (slices filtered out by the minimum
        # size), zip() would pair crops with the wrong characters: skip.
        if len(boxes) != len(label):
            print(f"Aviso: {len(boxes)} caixas para {len(label)} caracteres em {img_path}; imagem ignorada")
            continue

        base, _ = os.path.splitext(fname)
        for i, ((x, y, w, h), char) in enumerate(zip(boxes, label)):
            crop = img_gray[y:y+h, x:x+w]
            norm_crop = normalize_and_pad(crop, size=32, border_color=(255,255,255))
            char_dir = os.path.join(output_base_dir, split, char)
            out_name = f"{base}_{i:02d}.png"
            cv2.imwrite(os.path.join(char_dir, out_name), norm_crop)

    print("Segmentação concluída! Pastas:")
    print(f"  {os.path.join(output_base_dir, 'train')}")
    print(f"  {os.path.join(output_base_dir, 'val')}")


if __name__ == '__main__':
    # Label CSV, source images and output root for the wave-distorted dataset.
    CSV_PATH, IMG_DIR, OUT_DIR = (
        'TextBasedWave/train.csv',
        'TextBasedWave/train',
        'TextBasedWave/chars_by_class',
    )
    segment_and_save_split(
        csv_path=CSV_PATH,
        img_dir=IMG_DIR,
        output_base_dir=OUT_DIR,
        train_ratio=0.8,
        max_width=25,
        seed=42,
    )


Segmentação concluída! Pastas:
  TextBasedWave/chars_by_class\train
  TextBasedWave/chars_by_class\val


In [4]:

# ─── Configurações ──────────────────────────────────────────────
# Run on the GPU when one is available.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
ALPHABET = "abcdefghijklmnopqrstuvwxyz0123456789"
NUM_CLASSES = len(ALPHABET)
BATCH_SIZE, EPOCHS, LR = 64, 20, 1e-3

# ─── Transforms ─────────────────────────────────────────────────
# Train and validation use the same pipeline: single-channel, 32x32,
# values normalised with mean 0.5 and std 0.5. Two separate Compose
# objects are built, one per split.
train_tf, val_tf = (
    transforms.Compose([
        transforms.Grayscale(num_output_channels=1),
        transforms.Resize((32, 32)),
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,)),
    ])
    for _ in range(2)
)

# ─── Datasets & loaders ─────────────────────────────────────────
train_ds = ImageFolder(root="TextBased/chars_by_class/train", transform=train_tf)
val_ds = ImageFolder(root="TextBased/chars_by_class/val", transform=val_tf)
train_loader = DataLoader(dataset=train_ds, batch_size=BATCH_SIZE,
                          shuffle=True, num_workers=4)
val_loader = DataLoader(dataset=val_ds, batch_size=BATCH_SIZE,
                        shuffle=False, num_workers=4)

# ─── Modelo CNN + LSTM ─────────────────────────────────────────
class CharCNN_LSTM(nn.Module):
    """Single-character classifier: two conv/pool stages followed by an LSTM.

    The feature map is read column by column (left to right) as a sequence;
    the LSTM output at the last column feeds a linear classifier.

    Args:
        num_classes: Number of output logits.
        lstm_hidden: Hidden size of the LSTM.

    The LSTM input size is fixed at ``128 * 8``, which matches a 32x32 input
    (two 2x poolings leave an 8-row feature map with 128 channels).
    """

    def __init__(self, num_classes, lstm_hidden=256):
        super().__init__()
        conv_stack = [
            nn.Conv2d(1, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),      # 32 -> 16
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),      # 16 -> 8
        ]
        self.cnn = nn.Sequential(*conv_stack)
        self.lstm = nn.LSTM(input_size=128 * 8,
                            hidden_size=lstm_hidden,
                            batch_first=True)
        self.classifier = nn.Linear(lstm_hidden, num_classes)

    def forward(self, x):
        """Return class logits of shape ``[B, num_classes]`` for input ``x``."""
        feat = self.cnn(x)
        batch, channels, height, width = feat.shape
        # Each image column becomes one timestep with channels*height features.
        columns = feat.permute(0, 3, 1, 2).reshape(batch, width, channels * height)
        hidden_states, _ = self.lstm(columns)
        return self.classifier(hidden_states[:, -1])

# ─── Instanciação ──────────────────────────────────────────────
model = CharCNN_LSTM(num_classes=NUM_CLASSES).to(DEVICE)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=LR)

# ─── Train / validation loop ────────────────────────────────────
for epoch in range(1, EPOCHS + 1):
    # Training pass: accumulate the per-batch loss and the hit count.
    model.train()
    running_loss = 0
    total = 0
    correct = 0
    for imgs, labels in train_loader:
        imgs = imgs.to(DEVICE)
        labels = labels.to(DEVICE)

        optimizer.zero_grad()
        out = model(imgs)
        loss = criterion(out, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        preds = out.argmax(dim=1)
        total += labels.size(0)
        correct += (preds == labels).sum().item()
    train_acc = correct / total

    # Validation pass: no gradients, accuracy only.
    model.eval()
    v_total = 0
    v_correct = 0
    with torch.no_grad():
        for imgs, labels in val_loader:
            imgs = imgs.to(DEVICE)
            labels = labels.to(DEVICE)
            preds = model(imgs).argmax(dim=1)
            v_total += labels.size(0)
            v_correct += (preds == labels).sum().item()
    val_acc = v_correct / v_total

    # The reported loss is the mean of the per-batch losses.
    print(f"Ep {epoch:02d}  Loss={running_loss/len(train_loader):.3f}  "
          f"TrainAcc={train_acc:.3f}  ValAcc={val_acc:.3f}")

# ─── Save the weights of the final epoch ────────────────────────
os.makedirs("models", exist_ok=True)
save_path = os.path.join("models", "charcnn_lstm.pt")
torch.save(model.state_dict(), save_path)
print(f"Modelo salvo em: {save_path}")


Ep 01  Loss=3.036  TrainAcc=0.179  ValAcc=0.354
Ep 02  Loss=2.171  TrainAcc=0.455  ValAcc=0.498
Ep 03  Loss=1.909  TrainAcc=0.539  ValAcc=0.537
Ep 04  Loss=1.775  TrainAcc=0.571  ValAcc=0.549
Ep 05  Loss=1.683  TrainAcc=0.591  ValAcc=0.554
Ep 06  Loss=1.601  TrainAcc=0.606  ValAcc=0.558
Ep 07  Loss=1.519  TrainAcc=0.623  ValAcc=0.558
Ep 08  Loss=1.453  TrainAcc=0.638  ValAcc=0.567
Ep 09  Loss=1.374  TrainAcc=0.650  ValAcc=0.563
Ep 10  Loss=1.301  TrainAcc=0.666  ValAcc=0.565
Ep 11  Loss=1.224  TrainAcc=0.683  ValAcc=0.564
Ep 12  Loss=1.149  TrainAcc=0.699  ValAcc=0.567
Ep 13  Loss=1.077  TrainAcc=0.713  ValAcc=0.564
Ep 14  Loss=1.009  TrainAcc=0.730  ValAcc=0.558
Ep 15  Loss=0.930  TrainAcc=0.751  ValAcc=0.562
Ep 16  Loss=0.862  TrainAcc=0.770  ValAcc=0.555
Ep 17  Loss=0.803  TrainAcc=0.785  ValAcc=0.548
Ep 18  Loss=0.819  TrainAcc=0.777  ValAcc=0.553
Ep 19  Loss=0.708  TrainAcc=0.809  ValAcc=0.546
Ep 20  Loss=0.648  TrainAcc=0.830  ValAcc=0.546
Modelo salvo em: models\charcnn_lstm.pt


In [5]:
# ─── Configurações ──────────────────────────────────────────────
VAL_DIR = "TextBased/val"            # folder with the original CAPTCHA images
OUTPUT_DIR = "TextBased/segments"    # root folder; each image gets its own sub-folder
MIN_W = 5                            # minimum segment width
MIN_H = 10                           # minimum segment height
SPLIT_W = 30                         # boxes wider than this are split in two
TARGET_SIZE = 32                     # side of the final padded square

def get_character_boxes(img, min_w=MIN_W, min_h=MIN_H):
    """Return left-to-right bounding boxes of the external contours of ``img``.

    The BGR image is converted to grayscale, binarised with an inverted Otsu
    threshold and opened with a 3x3 rectangle before contours are extracted.

    Args:
        img: Image array accepted by ``cv2.cvtColor(..., COLOR_BGR2GRAY)``.
        min_w: Minimum box width to keep.
        min_h: Minimum box height to keep.

    Returns:
        List of ``(x, y, w, h)`` tuples sorted by ``x``.
    """
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    _, bw = cv2.threshold(gray, 0, 255,
                          cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
    opened = cv2.morphologyEx(bw, cv2.MORPH_OPEN, kernel)
    cnts, _ = cv2.findContours(opened, cv2.RETR_EXTERNAL,
                               cv2.CHAIN_APPROX_SIMPLE)

    boxes = []
    for contour in cnts:
        x, y, w, h = cv2.boundingRect(contour)
        # Drop boxes that are too small in either dimension.
        if w < min_w or h < min_h:
            continue
        boxes.append((x, y, w, h))
    boxes.sort(key=lambda box: box[0])
    return boxes

# ─── Loop sobre cada CAPTCHA ────────────────────────────────────
# For every CAPTCHA in VAL_DIR: segment it, and if the number of crops equals
# the length of the file stem (used as the label), save each crop centred on
# a white TARGET_SIZE x TARGET_SIZE canvas under OUTPUT_DIR/<stem>/NN.png.
for fname in sorted(os.listdir(VAL_DIR)):
    name, ext = os.path.splitext(fname)
    img_path = os.path.join(VAL_DIR, fname)
    img = cv2.imread(img_path)
    if img is None:
        print(f"[ERRO] não foi possível ler {fname}")
        continue

    boxes = get_character_boxes(img)
    crops = []
    for (x, y, w, h) in boxes:
        if w > SPLIT_W:
            # A wide box is cut at its midpoint into two crops.
            mid = w // 2
            crops.append(img[y:y+h, x    :x+mid])
            crops.append(img[y:y+h, x+mid:x+w])
        else:
            crops.append(img[y:y+h, x:x+w])

    # Only keep images whose crop count matches the label length.
    if len(crops) != len(name):
        continue

    out_folder = os.path.join(OUTPUT_DIR, name)
    os.makedirs(out_folder, exist_ok=True)

    for idx, crop in enumerate(crops, start=1):
        h, w = crop.shape[:2]

        # 1) Shrink (never enlarge) to fit TARGET_SIZE, keeping the aspect ratio.
        scale = min(1.0, TARGET_SIZE / h, TARGET_SIZE / w)
        if scale < 1.0:
            # Clamp to one pixel: truncation can give 0 for a very thin crop,
            # and cv2.resize rejects a zero-sized target.
            new_w = max(1, int(w * scale))
            new_h = max(1, int(h * scale))
            crop = cv2.resize(crop, (new_w, new_h), interpolation=cv2.INTER_AREA)
        else:
            new_w, new_h = w, h

        # 2) White square canvas.
        canvas = np.full((TARGET_SIZE, TARGET_SIZE, 3), 255, dtype=crop.dtype)

        # 3) Centre the crop on the canvas.
        y_off = (TARGET_SIZE - new_h) // 2
        x_off = (TARGET_SIZE - new_w) // 2
        canvas[y_off:y_off+new_h, x_off:x_off+new_w] = crop

        # 4) Save with a 1-based, zero-padded index.
        out_path = os.path.join(out_folder, f"{idx:02d}.png")
        cv2.imwrite(out_path, canvas)

print("Processamento de segmentação e padding concluído.")


Processamento de segmentação e padding concluído.


In [6]:
import os
import cv2
import torch
from torchvision import transforms
from torchvision.datasets import ImageFolder

# ── Configurações ────────────────────────────────────────────────
BASE_FOLDER = "TextBased/segments"
MODEL_PATH = "models/charcnn_lstm.pt"
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
EXTENSIONS = (".png", ".jpg", ".jpeg")

# ── Load the trained model ───────────────────────────────────────
# Relies on CharCNN_LSTM and NUM_CLASSES defined in an earlier cell.
model = CharCNN_LSTM(num_classes=NUM_CLASSES).to(DEVICE)
model.load_state_dict(torch.load(f=MODEL_PATH, map_location=DEVICE))
model.eval()

# ── Index -> label ───────────────────────────────────────────────
# ImageFolder's class list maps a predicted index back to a character.
train_ds = ImageFolder(root="TextBased/chars_by_class/train", transform=train_tf)
class_names = train_ds.classes

# ── Inference transform ──────────────────────────────────────────
# Same steps as the training transform, preceded by a conversion from a
# NumPy array to a PIL image.
infer_tf = transforms.Compose(
    transforms=[
        transforms.ToPILImage(),
        transforms.Grayscale(num_output_channels=1),
        transforms.Resize((32, 32)),
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,)),
    ]
)
def prepare_img(gray_np):
    """Apply the inference transform ``infer_tf`` to one image array."""
    tensor = infer_tf(gray_np)
    return tensor

# ── Inicializa contadores ────────────────────────────────────────
total_sequences = 0
correct_sequences = 0
overall_char_correct = 0
overall_char_total = 0

# ── Loop over each CAPTCHA (one folder per label) ────────────────
for label in sorted(os.listdir(BASE_FOLDER)):
    folder = os.path.join(BASE_FOLDER, label)
    if not os.path.isdir(folder):
        continue

    total_sequences += 1
    seq_correct = 0
    seq_total = 0

    for fname in sorted(os.listdir(folder)):
        if not fname.lower().endswith(EXTENSIONS):
            continue

        # The file stem is the 1-based position of the character in the label.
        stem = os.path.splitext(fname)[0]
        seg_idx = int(stem) - 1
        true_char = label[seg_idx]

        # Inference on a single-image batch.
        gray = cv2.imread(os.path.join(folder, fname), cv2.IMREAD_GRAYSCALE)
        inp = prepare_img(gray).unsqueeze(0).to(DEVICE)
        with torch.no_grad():
            out = model(inp)
        pred_char = class_names[out.argmax(dim=1).item()]

        # Update per-sequence and global counters.
        is_corr = int(pred_char == true_char)
        seq_correct += is_corr
        seq_total += 1
        overall_char_correct += is_corr
        overall_char_total += 1

    # A sequence counts as correct only if it has segments and all match.
    if seq_total > 0 and seq_correct == seq_total:
        correct_sequences += 1

# ── Metrics (0 when nothing was evaluated) ───────────────────────
if total_sequences:
    sequence_accuracy = correct_sequences / total_sequences
else:
    sequence_accuracy = 0
if overall_char_total:
    character_accuracy = overall_char_correct / overall_char_total
else:
    character_accuracy = 0

# ── Report ───────────────────────────────────────────────────────
print(f"Acurácia de sequência (todas as letras corretas): "
      f"{correct_sequences}/{total_sequences} = {sequence_accuracy:.2%}")
print(f"Acurácia de caractere (individual): "
      f"{overall_char_correct}/{overall_char_total} = {character_accuracy:.2%}")


Acurácia de sequência (todas as letras corretas): 3076/3745 = 82.14%
Acurácia de caractere (individual): 17928/18725 = 95.74%


# Treinamento YOLO segmentação + classificação de caracteres

In [19]:
# ── Configurações ────────────────────────────────────────────────
BASE_FOLDER = "C:/Users/joao_/captcha/TextBased/segments"   # per-CAPTCHA segment folders
DATA_DIR = "../TextBased/chars_by_class"                    # training data directory
PROJECT = "../runs/captcha_yolo"
EXP_NAME = "exp"
EXTENSIONS = (".png", ".jpg", ".jpeg")
# Device is kept as a plain string here.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
# Build the classifier from the 'yolo11n-cls' model name.
model = YOLO('yolo11n-cls')

# Train on the per-character folders; results go to PROJECT/EXP_NAME.
# NOTE(review): in Ultralytics `lrf` is the final learning rate as a fraction
# of `lr0`, not the initial learning rate — confirm that 1e-3 is intended here.
model.train(
    data=DATA_DIR,
    project=PROJECT,
    name=EXP_NAME,
    epochs=50,
    batch=64,
    imgsz=32,
    lrf=1e-3,
)



In [15]:
# ── 2) CARREGA O MODELO TREINADO ────────────────────────────────
# Load the best checkpoint written by the training run.
best_weight = os.path.join(PROJECT, EXP_NAME, "weights", "best.pt")
model = YOLO(best_weight)

# Class names ordered by class index, so a predicted index can be used directly.
class_names = [class_name for _, class_name in sorted(model.names.items())]


# ── Função de predição para cada segmento ──────────────────────


def predict_char(segment_path):
    """Classify one segment image with the loaded YOLO model.

    Args:
        segment_path: Path of the segment image passed to ``model.predict``.

    Returns:
        The entry of ``class_names`` with the highest predicted probability.
    """
    prediction = model.predict(source=segment_path, imgsz=32,
                               device=DEVICE, verbose=False)[0]
    # Move the probability tensor to the CPU before converting to NumPy.
    scores = prediction.probs.data.cpu().numpy()
    return class_names[int(scores.argmax())]

# ── 3) INFERÊNCIA & MÉTRICAS ────────────────────────────────────
total_seq = 0       # folders processed
correct_seq = 0     # folders whose segments were all predicted correctly
total_chars = 0     # segments evaluated
correct_chars = 0   # segments predicted correctly

for label in sorted(os.listdir(BASE_FOLDER)):
    folder = os.path.join(BASE_FOLDER, label)
    if not os.path.isdir(folder):
        continue

    total_seq += 1
    seq_preds = []

    for fname in sorted(os.listdir(folder)):
        if not fname.lower().endswith(EXTENSIONS):
            continue

        # The file stem is the 1-based position of the character in the label.
        seg_idx = int(os.path.splitext(fname)[0]) - 1
        true_char = label[seg_idx]

        path = os.path.join(folder, fname)
        pred_char = predict_char(path)

        seq_preds.append((true_char, pred_char))
        total_chars += 1
        correct_chars += 1 if pred_char == true_char else 0

    # Correct sequence: at least one segment and every (true, pred) pair equal.
    if seq_preds and all(expected == predicted for expected, predicted in seq_preds):
        correct_seq += 1

# ── Results (0 when nothing was evaluated) ───────────────────────
if total_seq:
    seq_acc = correct_seq / total_seq
else:
    seq_acc = 0
if total_chars:
    char_acc = correct_chars / total_chars
else:
    char_acc = 0

print(f"Acurácia de sequência (tudo certo): {correct_seq}/{total_seq} = {seq_acc:.2%}")
print(f"Acurácia de caractere (isolado):   {correct_chars}/{total_chars} = {char_acc:.2%}")

Acurácia de sequência (tudo certo): 3175/3745 = 84.78%
Acurácia de caractere (isolado):   18071/18725 = 96.51%


# Treinamento com ResNet50

In [22]:
import os
import torch
import torch.nn as nn
import torchvision.models as models
from torchvision import datasets, transforms

# ── 0) Defina DATA_DIR e carregue o dataset para descobrir num_classes ──
DATA_DIR = "C:/Users/joao_/captcha/TextBased/chars_by_class"

# 64x64 input, normalised per channel with the given mean and std.
train_tf = transforms.Compose([
    transforms.Resize((64, 64)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485,0.456,0.406],
                         std=[0.229,0.224,0.225]),
])

# The dataset is loaded here so the number of classes can be read from it.
train_ds = datasets.ImageFolder(root=os.path.join(DATA_DIR, "train"),
                                transform=train_tf)
num_classes = len(train_ds.classes)

# ── 1) Definição da classe sem usar valor padrão indefinido ───────────
class TemporalResNet50(nn.Module):
    """ResNet-50 feature extractor followed by a bidirectional LSTM classifier.

    The backbone is ``torchvision.models.resnet50()`` without its last two
    children. Every spatial position of its feature map is treated as one
    timestep; the LSTM output at the last timestep is classified.

    Args:
        num_classes: Number of output logits.
    """

    def __init__(self, num_classes):
        super().__init__()
        resnet = models.resnet50()
        # Keep everything except the final two child modules.
        trunk = list(resnet.children())[:-2]
        self.backbone = nn.Sequential(*trunk)
        self.lstm = nn.LSTM(input_size=2048, hidden_size=256,
                            num_layers=1, batch_first=True,
                            bidirectional=True)
        # Bidirectional output concatenates both directions: 2 * 256.
        self.classifier = nn.Linear(256 * 2, num_classes)

    def forward(self, x):
        """Return logits of shape ``[B, num_classes]``."""
        feat = self.backbone(x)                      # [B, 2048, H, W]
        # Flatten H and W into one axis and put it before channels: [B, H*W, C].
        tokens = feat.flatten(2).transpose(1, 2)
        states, _ = self.lstm(tokens)                # [B, H*W, 512]
        return self.classifier(states[:, -1, :])     # last timestep

# ── 2) Instancie o modelo passando num_classes ────────────────────────
# Build the model on the GPU when available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = TemporalResNet50(num_classes=num_classes)
model = model.to(device)


In [None]:
# ── CÉLULA: TREINAMENTO DO TemporalResNet50 ────────────────────────
import os
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from tqdm import tqdm

# ── Configurações ─────────────────────────────────────────────────
DATA_DIR = "C:/Users/joao_/captcha/TextBased/chars_by_class"   # contains train/ and val/
PROJECT = "models/captcha_resnet50"
EXP_NAME = "exp"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

num_epochs = 50
batch_size = 64
learning_rate = 1e-3
patience = 15        # epochs without improvement before early stopping
gamma = 0.97         # decay factor for ExponentialLR

# ── Transforms ────────────────────────────────────────────────────
# Train and validation share the same steps; one Compose is built per split.
train_tf, val_tf = (
    transforms.Compose([
        transforms.Resize((64, 64)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225]),
    ])
    for _ in range(2)
)

# ── Datasets & loaders ────────────────────────────────────────────
train_ds = datasets.ImageFolder(root=os.path.join(DATA_DIR, "train"), transform=train_tf)
val_ds = datasets.ImageFolder(root=os.path.join(DATA_DIR, "val"), transform=val_tf)
trainloader = DataLoader(dataset=train_ds, batch_size=batch_size, shuffle=True,
                         num_workers=4, pin_memory=True)
testloader = DataLoader(dataset=val_ds, batch_size=batch_size, shuffle=False,
                        num_workers=4, pin_memory=True)

# ── Output directory for checkpoints ──────────────────────────────
weights_dir = os.path.join(PROJECT, EXP_NAME, "weights")
os.makedirs(weights_dir, exist_ok=True)

# ── Model, optimiser, scheduler, loss ─────────────────────────────
model = TemporalResNet50(num_classes=len(train_ds.classes)).to(DEVICE)
optimizer = torch.optim.SGD(params=model.parameters(), lr=learning_rate, momentum=0.9)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer=optimizer, gamma=gamma)
criterion = nn.CrossEntropyLoss()

# ── Loop de Treino com Early Stopping ──────────────────────────────
# Train with early stopping on validation accuracy. The best weights are
# written to weights_dir/best.pth and also kept in memory so they can be
# restored into `model` once training ends.
best_val_acc      = 0.0
epochs_no_improve = 0
best_state_dict   = None

for epoch in range(1, num_epochs + 1):
    model.train()
    running_loss, running_corrects, running_total = 0.0, 0, 0

    for images, labels in tqdm(trainloader, desc=f"[Epoch {epoch}] Treino", leave=False):
        images, labels = images.to(DEVICE), labels.to(DEVICE)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Weight the batch loss by batch size so the epoch loss is per sample.
        running_loss     += loss.item() * images.size(0)
        preds = outputs.argmax(dim=1)
        running_corrects += (preds == labels).sum().item()
        running_total    += labels.size(0)

    train_loss = running_loss / running_total
    train_acc  = 100 * running_corrects / running_total

    # --- Validation ---
    model.eval()
    val_corrects, val_total = 0, 0
    with torch.no_grad():
        for images, labels in tqdm(testloader, desc=f"[Epoch {epoch}] Validação", leave=False):
            images, labels = images.to(DEVICE), labels.to(DEVICE)
            outputs = model(images)
            preds = outputs.argmax(dim=1)
            val_corrects += (preds == labels).sum().item()
            val_total    += labels.size(0)

    val_acc = 100 * val_corrects / val_total
    scheduler.step()

    print(f"Epoch {epoch:2d} | "
          f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.2f}% | "
          f"Val Acc: {val_acc:.2f}%")

    # Early stopping
    if val_acc > best_val_acc:
        best_val_acc      = val_acc
        epochs_no_improve = 0
        # state_dict() returns references to the live parameters, which later
        # optimiser steps overwrite in place. Clone them so the snapshot
        # really holds this epoch's weights.
        best_state_dict   = {
            key: tensor.detach().cpu().clone()
            for key, tensor in model.state_dict().items()
        }
        torch.save(model.state_dict(), os.path.join(weights_dir, "best.pth"))
        print(f"📈 Nova melhor Val Acc: {best_val_acc:.2f}% — modelo salvo.")
    else:
        epochs_no_improve += 1
        print(f"⏸️ Sem melhora por {epochs_no_improve} época(s).")
        if epochs_no_improve >= patience:
            print(f"🛑 Early stopping ativado após {patience} épocas sem melhora.")
            break

# ── Restore the best model ────────────────────────────────────────
# best_state_dict stays None if validation accuracy never rose above 0.
if best_state_dict is not None:
    model.load_state_dict(best_state_dict)
print("✔️ Treinamento concluído. Melhor modelo carregado.")


In [26]:
# ── CÉLULA: VALIDAÇÃO SEGMENTO-A-SEGMENTO COM TemporalResNet50 ──

# ── 1) Configurações ─────────────────────────────────────────────
BASE_FOLDER = "TextBased/segments"                  # folders named after the sequence label
EXTENSIONS = (".png", ".jpg", ".jpeg")
best_model_path = "models/ResNet_TextBase.pth"      # adjust to your checkpoint path
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# ── 2) Load the trained model ────────────────────────────────────
# Relies on TemporalResNet50 and num_classes from an earlier cell.
model = TemporalResNet50(num_classes=num_classes).to(device)
model.load_state_dict(torch.load(f=best_model_path, map_location=device))
model.eval()

# ── 3) Validation transform (same steps as training) ─────────────
val_tf = transforms.Compose(
    transforms=[
        transforms.Resize((64, 64)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485,0.456,0.406],
                             std=[0.229,0.224,0.225]),
    ]
)

# ── 4) Função de predição para um segmento ───────────────────────
def predict_char(segment_path):
    """Classify one segment image with the loaded ResNet model.

    Args:
        segment_path: Path of the image; it is opened with PIL as RGB.

    Returns:
        The entry of ``train_ds.classes`` at the index of the largest logit.
    """
    rgb = Image.open(segment_path).convert("RGB")
    batch = val_tf(rgb).unsqueeze(0).to(device)      # [1, C, H, W]
    with torch.no_grad():
        scores = model(batch)                        # [1, num_classes]
    best = int(scores.argmax(dim=1).cpu().item())
    return train_ds.classes[best]

# ── 5) Inferência & métricas ────────────────────────────────────
total_seq = 0       # sequences (folders) seen
correct_seq = 0     # sequences with every character correct
total_chars = 0     # segments evaluated
correct_chars = 0   # segments predicted correctly

for label in sorted(os.listdir(BASE_FOLDER)):
    folder = os.path.join(BASE_FOLDER, label)
    if not os.path.isdir(folder):
        continue

    total_seq += 1
    seq_preds = []

    for fname in sorted(os.listdir(folder)):
        if not fname.lower().endswith(EXTENSIONS):
            continue

        # The file stem is the 1-based position of the character in the label.
        seg_idx = int(os.path.splitext(fname)[0]) - 1
        true_char = label[seg_idx]
        path = os.path.join(folder, fname)
        pred_char = predict_char(path)

        seq_preds.append((true_char, pred_char))
        total_chars += 1
        correct_chars += 1 if pred_char == true_char else 0

    # Correct sequence: non-empty and every (true, pred) pair equal.
    if seq_preds and all(expected == predicted for expected, predicted in seq_preds):
        correct_seq += 1

# ── 6) Results (0 when nothing was evaluated) ────────────────────
if total_seq:
    seq_acc = correct_seq / total_seq
else:
    seq_acc = 0
if total_chars:
    char_acc = correct_chars / total_chars
else:
    char_acc = 0

print(f"Acurácia de sequência (tudo certo): {correct_seq}/{total_seq} = {seq_acc:.2%}")
print(f"Acurácia de caractere (isolado):   {correct_chars}/{total_chars} = {char_acc:.2%}")


Acurácia de sequência (tudo certo): 2409/3745 = 64.33%
Acurácia de caractere (isolado):   16952/18725 = 90.53%


# Teste com Dataset com Wave Distortion

In [32]:
import os
import random
import shutil
from collections import defaultdict

# Parâmetros
# Move the images of every class folder directly under BASE_DIR into
# BASE_DIR/train/<class> and BASE_DIR/val/<class>.
BASE_DIR      = "C:/Users/joao_/captcha/TextBasedWave/chars_by_class"
SPLITS        = ['train', 'val']
TRAIN_RATIO   = 0.8
RANDOM_SEED   = 42

random.seed(RANDOM_SEED)

# Class folders are all sub-directories except the split folders themselves.
# os.listdir returns entries in arbitrary order, so sort: otherwise the
# seeded shuffle below would not give the same split on every run.
class_dirs = sorted(
    d for d in os.listdir(BASE_DIR)
    if os.path.isdir(os.path.join(BASE_DIR, d)) and d not in SPLITS
)

# Create the split/class folder structure.
for split in SPLITS:
    for cls in class_dirs:
        os.makedirs(os.path.join(BASE_DIR, split, cls), exist_ok=True)

# Train/val split, class by class.
for cls in class_dirs:
    cls_dir = os.path.join(BASE_DIR, cls)
    images  = sorted(
        f for f in os.listdir(cls_dir)
        if f.lower().endswith(('.png', '.jpg', '.jpeg'))
    )
    random.shuffle(images)

    split_idx = int(len(images) * TRAIN_RATIO)
    train_imgs = images[:split_idx]
    val_imgs   = images[split_idx:]

    for fname in train_imgs:
        src = os.path.join(cls_dir, fname)
        dst = os.path.join(BASE_DIR, 'train', cls, fname)
        shutil.move(src, dst)

    for fname in val_imgs:
        src = os.path.join(cls_dir, fname)
        dst = os.path.join(BASE_DIR, 'val', cls, fname)
        shutil.move(src, dst)

    # Remove the old class folder only if nothing is left in it; os.rmdir
    # raises on a non-empty directory (e.g. one holding non-image files).
    if not os.listdir(cls_dir):
        os.rmdir(cls_dir)

print("✅ Divisão finalizada: 80% treino, 20% validação em subpastas por classe.")


FileNotFoundError: [WinError 3] O sistema não pode encontrar o caminho especificado: 'C:/Users/joao_/captcha/TextBasedWave/chars_by_class'

In [None]:

# ─── Configurações ──────────────────────────────────────────────
# Run on the GPU when one is available.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
ALPHABET = "abcdefghijklmnopqrstuvwxyz0123456789"
NUM_CLASSES = len(ALPHABET)
BATCH_SIZE, EPOCHS, LR = 64, 20, 1e-3

# ─── Transforms ─────────────────────────────────────────────────
# Train and validation use the same pipeline: single-channel, 32x32,
# values normalised with mean 0.5 and std 0.5. One Compose per split.
train_tf, val_tf = (
    transforms.Compose([
        transforms.Grayscale(num_output_channels=1),
        transforms.Resize((32, 32)),
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,)),
    ])
    for _ in range(2)
)

# ─── Datasets & loaders (wave-distorted characters) ─────────────
train_ds = ImageFolder(root="TextBasedWave/chars_by_class/train", transform=train_tf)
val_ds = ImageFolder(root="TextBasedWave/chars_by_class/val", transform=val_tf)
train_loader = DataLoader(dataset=train_ds, batch_size=BATCH_SIZE,
                          shuffle=True, num_workers=4)
val_loader = DataLoader(dataset=val_ds, batch_size=BATCH_SIZE,
                        shuffle=False, num_workers=4)

# ─── Modelo CNN + LSTM ─────────────────────────────────────────
class CharCNN_LSTM(nn.Module):
    """Single-character classifier: two conv/pool stages followed by an LSTM.

    The feature map is read column by column (left to right) as a sequence;
    the LSTM output at the last column feeds a linear classifier.

    Args:
        num_classes: Number of output logits.
        lstm_hidden: Hidden size of the LSTM.

    The LSTM input size is fixed at ``128 * 8``, which matches a 32x32 input
    (two 2x poolings leave an 8-row feature map with 128 channels).
    """

    def __init__(self, num_classes, lstm_hidden=256):
        super().__init__()
        conv_stack = [
            nn.Conv2d(1, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),      # 32 -> 16
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),      # 16 -> 8
        ]
        self.cnn = nn.Sequential(*conv_stack)
        self.lstm = nn.LSTM(input_size=128 * 8,
                            hidden_size=lstm_hidden,
                            batch_first=True)
        self.classifier = nn.Linear(lstm_hidden, num_classes)

    def forward(self, x):
        """Return class logits of shape ``[B, num_classes]`` for input ``x``."""
        feat = self.cnn(x)
        batch, channels, height, width = feat.shape
        # Each image column becomes one timestep with channels*height features.
        columns = feat.permute(0, 3, 1, 2).reshape(batch, width, channels * height)
        hidden_states, _ = self.lstm(columns)
        return self.classifier(hidden_states[:, -1])

# ─── Instanciação ──────────────────────────────────────────────
model = CharCNN_LSTM(num_classes=NUM_CLASSES).to(DEVICE)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=LR)

# ─── Train / validation loop ────────────────────────────────────
for epoch in range(1, EPOCHS + 1):
    # Training pass: accumulate the per-batch loss and the hit count.
    model.train()
    running_loss = 0
    total = 0
    correct = 0
    for imgs, labels in train_loader:
        imgs = imgs.to(DEVICE)
        labels = labels.to(DEVICE)

        optimizer.zero_grad()
        out = model(imgs)
        loss = criterion(out, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        preds = out.argmax(dim=1)
        total += labels.size(0)
        correct += (preds == labels).sum().item()
    train_acc = correct / total

    # Validation pass: no gradients, accuracy only.
    model.eval()
    v_total = 0
    v_correct = 0
    with torch.no_grad():
        for imgs, labels in val_loader:
            imgs = imgs.to(DEVICE)
            labels = labels.to(DEVICE)
            preds = model(imgs).argmax(dim=1)
            v_total += labels.size(0)
            v_correct += (preds == labels).sum().item()
    val_acc = v_correct / v_total

    # The reported loss is the mean of the per-batch losses.
    print(f"Ep {epoch:02d}  Loss={running_loss/len(train_loader):.3f}  "
          f"TrainAcc={train_acc:.3f}  ValAcc={val_acc:.3f}")

# ─── Save the weights of the final epoch ────────────────────────
# NOTE(review): this is the same path the earlier TextBased training cell
# writes to, so running this cell overwrites that checkpoint.
os.makedirs("models", exist_ok=True)
save_path = os.path.join("models", "charcnn_lstm.pt")
torch.save(model.state_dict(), save_path)
print(f"Modelo salvo em: {save_path}")


In [33]:
# ─── Configurações ──────────────────────────────────────────────
VAL_DIR = "C:/Users/joao_/captcha/TextBasedWave/train"          # folder with the original CAPTCHA images
OUTPUT_DIR = "C:/Users/joao_/captcha/TextBasedWave/segments"    # root folder; each image gets its own sub-folder
MIN_W = 5                            # minimum segment width
MIN_H = 10                           # minimum segment height
SPLIT_W = 30                         # boxes wider than this are split in two
TARGET_SIZE = 32                     # side of the final padded square

def get_character_boxes(img, min_w=MIN_W, min_h=MIN_H):
    """Return bounding boxes of candidate characters, ordered left to right.

    The image is converted from BGR to grayscale, binarised with an inverted
    Otsu threshold, cleaned with a 3x3 rectangular morphological opening, and
    the external contours of the result are boxed.

    Args:
        img: BGR image array (as accepted by ``cv2.cvtColor`` with
            ``COLOR_BGR2GRAY``).
        min_w: minimum box width to keep.
        min_h: minimum box height to keep.

    Returns:
        List of ``(x, y, w, h)`` tuples sorted by ``x``.
    """
    grayscale = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    threshold_mode = cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
    _, binary = cv2.threshold(grayscale, 0, 255, threshold_mode)

    struct_elem = cv2.getStructuringElement(cv2.MORPH_RECT, (3,3))
    cleaned = cv2.morphologyEx(binary, cv2.MORPH_OPEN, struct_elem)

    contours, _ = cv2.findContours(cleaned, cv2.RETR_EXTERNAL,
                                   cv2.CHAIN_APPROX_SIMPLE)

    kept = []
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        # Discard blobs that are too small to be a character.
        if w >= min_w and h >= min_h:
            kept.append((x, y, w, h))

    kept.sort(key=lambda box: box[0])
    return kept

# ─── Loop over every CAPTCHA ────────────────────────────────────
# For each file in VAL_DIR: segment it into one crop per character of the file
# stem, centre each crop on a white TARGET_SIZE×TARGET_SIZE canvas and write it
# to OUTPUT_DIR/<stem>/<NN>.png (NN is the 1-based character position).
for fname in sorted(os.listdir(VAL_DIR)):
    name = os.path.splitext(fname)[0]
    img_path = os.path.join(VAL_DIR, fname)
    img = cv2.imread(img_path)
    if img is None:
        print(f"[ERRO] não foi possível ler {fname}")
        continue

    # Contour-based segmentation; a box wider than SPLIT_W is cut in half.
    boxes = get_character_boxes(img)
    crops = []
    for (x, y, w, h) in boxes:
        if w > SPLIT_W:
            mid = w // 2
            crops.append(img[y:y+h, x    :x+mid])
            crops.append(img[y:y+h, x+mid:x+w])
        else:
            crops.append(img[y:y+h, x:x+w])

    # The file stem is the label: keep the contour crops only when their
    # count matches its length.
    n_chars = len(name)
    if len(crops) != n_chars:
        # Fallback: slice the image into n_chars vertical strips.
        img_w = img.shape[1]
        part_w = img_w // n_chars
        if part_w == 0:
            # Fewer pixel columns than characters: no usable strip exists.
            print(f"[ERRO] {fname}: largura {img_w}px insuficiente para {n_chars} caracteres")
            continue
        # The last strip runs to the right edge so the img_w % n_chars
        # leftover columns are not discarded.
        crops = [
            img[:, i*part_w:(img_w if i == n_chars - 1 else (i+1)*part_w)]
            for i in range(n_chars)
        ]

    out_folder = os.path.join(OUTPUT_DIR, name)
    os.makedirs(out_folder, exist_ok=True)

    for idx, crop in enumerate(crops, start=1):
        crop_h, crop_w = crop.shape[:2]

        # 1) shrink (never enlarge) so the crop fits in TARGET_SIZE, keeping
        #    the aspect ratio; clamp to 1 px so a very thin crop cannot
        #    round down to a zero-sized resize target.
        scale = min(1.0, TARGET_SIZE / crop_h, TARGET_SIZE / crop_w)
        if scale < 1.0:
            new_w = max(1, int(crop_w * scale))
            new_h = max(1, int(crop_h * scale))
            crop = cv2.resize(crop, (new_w, new_h), interpolation=cv2.INTER_AREA)
        else:
            new_w, new_h = crop_w, crop_h

        # 2) white square canvas
        canvas = np.full((TARGET_SIZE, TARGET_SIZE, 3), 255, dtype=crop.dtype)

        # 3) centre the crop on the canvas
        y_off = (TARGET_SIZE - new_h) // 2
        x_off = (TARGET_SIZE - new_w) // 2
        canvas[y_off:y_off+new_h, x_off:x_off+new_w] = crop

        # 4) save
        out_path = os.path.join(out_folder, f"{idx:02d}.png")
        cv2.imwrite(out_path, canvas)

print("Processamento de segmentação e padding concluído.")


Processamento de segmentação e padding concluído.


In [None]:
import os
import cv2
import torch
from torchvision import transforms
from torchvision.datasets import ImageFolder

# ── Settings ─────────────────────────────────────────────────────
BASE_FOLDER    = "TextBasedWave/segments"
MODEL_PATH     = "models/charcnn_lstm.pt"
DEVICE         = torch.device("cuda" if torch.cuda.is_available() else "cpu")
EXTENSIONS     = (".png", ".jpg", ".jpeg")

# ── Load the model ───────────────────────────────────────────────
# CharCNN_LSTM and NUM_CLASSES are not defined in this cell; they must already
# exist in the kernel.
model = CharCNN_LSTM(num_classes=NUM_CLASSES).to(DEVICE)
model.load_state_dict(torch.load(MODEL_PATH, map_location=DEVICE))
model.eval()

# ── Index → label ────────────────────────────────────────────────
# The dataset is built only to read its class list; `train_tf` must already
# exist in the kernel.
train_ds    = ImageFolder("TextBasedWave/chars_by_class/train", transform=train_tf)
class_names = train_ds.classes

# ── Inference transform ──────────────────────────────────────────
# NOTE(review): presumably mirrors the preprocessing used when the model was
# trained (1 channel, 32x32, mean/std 0.5) — confirm against the training cell.
infer_tf = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Grayscale(num_output_channels=1),
    transforms.Resize((32,32)),
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])
def prepare_img(gray_np):
    """Apply `infer_tf` to one image array and return the resulting tensor."""
    return infer_tf(gray_np)

# ── Counters ─────────────────────────────────────────────────────
total_sequences = correct_sequences = 0
overall_char_correct = overall_char_total = 0

# ── Walk every CAPTCHA folder (the folder name is the label) ─────
for label in sorted(os.listdir(BASE_FOLDER)):
    folder = os.path.join(BASE_FOLDER, label)
    if not os.path.isdir(folder):
        continue
    total_sequences += 1

    # One boolean per evaluated segment of this folder.
    hits = []
    for fname in sorted(os.listdir(folder)):
        if fname.lower().endswith(EXTENSIONS):
            # The 1-based number in the file name is the position of the
            # expected character inside the label.
            seg_idx = int(os.path.splitext(fname)[0]) - 1
            true_char = label[seg_idx]

            # Inference on the grayscale segment.
            gray = cv2.imread(os.path.join(folder, fname), cv2.IMREAD_GRAYSCALE)
            inp = prepare_img(gray).unsqueeze(0).to(DEVICE)
            with torch.no_grad():
                out = model(inp)
                pred_char = class_names[out.argmax(dim=1).item()]

            hits.append(pred_char == true_char)

    overall_char_correct += sum(hits)
    overall_char_total += len(hits)

    # A sequence is correct only when it has segments and all of them match.
    if hits and all(hits):
        correct_sequences += 1

# ── Metrics ──────────────────────────────────────────────────────
if total_sequences:
    sequence_accuracy = correct_sequences / total_sequences
else:
    sequence_accuracy = 0
if overall_char_total:
    character_accuracy = overall_char_correct / overall_char_total
else:
    character_accuracy = 0

# ── Report ───────────────────────────────────────────────────────
print(f"Acurácia de sequência (todas as letras corretas): "
      f"{correct_sequences}/{total_sequences} = {sequence_accuracy:.2%}")
print(f"Acurácia de caractere (individual): "
      f"{overall_char_correct}/{overall_char_total} = {character_accuracy:.2%}")


In [38]:
# ── Settings ─────────────────────────────────────────────────────
# All paths are relative, so they resolve against the kernel's working directory.
BASE_FOLDER    = "TextBasedWave/segments"
DATA_DIR       = "TextBasedWave/chars_by_class"              # training data directory
PROJECT        = "runs/captcha_wave_yolo"
EXP_NAME       = "exp"
DEVICE         = "cuda" if torch.cuda.is_available() else "cpu"
EXTENSIONS     = (".png", ".jpg", ".jpeg")

In [None]:
# Train an Ultralytics classification model on the per-character dataset.
# NOTE(review): in Ultralytics `lrf` is the final learning rate expressed as a
# fraction of `lr0`, not the initial learning rate — confirm this is intended.
# NOTE(review): DEVICE is not passed here, so device selection is left to
# Ultralytics' default — confirm.
model = YOLO('yolo11n-cls')
model.train(
    data=DATA_DIR,
    epochs=50,
    imgsz=32,
    batch=64,
    lrf=1e-3,
    project=PROJECT,
    name=EXP_NAME
)


In [None]:
# ── 2) LOAD THE TRAINED MODEL ───────────────────────────────────
# NOTE(review): Ultralytics appends a numeric suffix to the run name when the
# run directory already exists (unless exist_ok=True), so after a re-train this
# path may point at an older run's weights — confirm.
best_weight = os.path.join(PROJECT, EXP_NAME, "weights", "best.pt")

model = YOLO(best_weight)

# Class names listed in ascending key order of `model.names`, so a predicted
# index can be used to look the name up directly.
class_names = [model.names[i] for i in sorted(model.names)]


# ── Função de predição para cada segmento ──────────────────────


def predict_char(segment_path):
    """Classify one segment image with the loaded YOLO model.

    Args:
        segment_path: path of the segment image, passed as ``source`` to
            ``model.predict``.

    Returns:
        The entry of ``class_names`` at the index of the highest score.
    """
    first_result = model.predict(
        source=segment_path,
        imgsz=32,
        device=DEVICE,
        verbose=False
    )[0]
    # Pull the raw score tensor to the host as a NumPy array before taking
    # the arg-max, then map the winning index to its class name.
    scores = first_result.probs.data.cpu().numpy()
    return class_names[int(scores.argmax())]

# ── 3) INFERENCE & METRICS ──────────────────────────────────────
total_seq = correct_seq = 0        # folders processed / folders fully correct
total_chars = correct_chars = 0    # segments processed / segments correct

for label in sorted(os.listdir(BASE_FOLDER)):
    folder = os.path.join(BASE_FOLDER, label)
    if not os.path.isdir(folder):
        continue
    total_seq += 1

    # Per-folder tallies; the folder name is the expected character sequence.
    n_segments = 0
    n_hits = 0
    for fname in sorted(os.listdir(folder)):
        if fname.lower().endswith(EXTENSIONS):
            # The 1-based number in the file name selects the expected character.
            seg_idx = int(os.path.splitext(fname)[0]) - 1
            true_char = label[seg_idx]
            pred_char = predict_char(os.path.join(folder, fname))

            n_segments += 1
            if pred_char == true_char:
                n_hits += 1

    total_chars += n_segments
    correct_chars += n_hits

    # Correct sequence: at least one segment and every segment matched.
    if n_segments > 0 and n_hits == n_segments:
        correct_seq += 1

# ── 4) RESULTS ──────────────────────────────────────────────────
if total_seq:
    seq_acc = correct_seq / total_seq
else:
    seq_acc = 0
if total_chars:
    char_acc = correct_chars / total_chars
else:
    char_acc = 0

print(f"Acurácia de sequência (tudo certo): {correct_seq}/{total_seq} = {seq_acc:.2%}")
print(f"Acurácia de caractere (isolado):   {correct_chars}/{total_chars} = {char_acc:.2%}")

In [None]:
import os
import torch
import torch.nn as nn
import torchvision.models as models
from torchvision import datasets, transforms

# ── 0) Set DATA_DIR and load the dataset to discover num_classes ──
# Same relative location that every other cell of this notebook uses for the
# per-character dataset (previously "../TextBasedWave/...", which resolved to a
# different directory than the one the training cell reads).
DATA_DIR = "TextBasedWave/chars_by_class"
# 32x32 input normalised with 3-channel mean/std values.
train_tf = transforms.Compose([transforms.Resize((32,32)),
                               transforms.ToTensor(),
                               transforms.Normalize([0.485,0.456,0.406],
                                                    [0.229,0.224,0.225])])
train_ds = datasets.ImageFolder(os.path.join(DATA_DIR, "train"), transform=train_tf)
# One class per sub-folder of DATA_DIR/train.
num_classes = len(train_ds.classes)  # e.g. 36

# ── 1) Definição da classe sem usar valor padrão indefinido ───────────
class TemporalResNet50(nn.Module):
    """ResNet-50 feature extractor followed by a bidirectional LSTM and a linear head.

    The spatial positions of the backbone's feature map are flattened into a
    sequence, run through the LSTM, and the output at the last position is
    classified.

    Args:
        num_classes: number of output logits.
    """

    def __init__(self, num_classes):
        super().__init__()
        resnet = models.resnet50()
        # Keep every child module except the last two, so the backbone
        # returns a spatial feature map instead of class scores.
        feature_layers = list(resnet.children())[:-2]
        self.backbone = nn.Sequential(*feature_layers)
        self.lstm = nn.LSTM(
            input_size=2048,
            hidden_size=256,
            num_layers=1,
            batch_first=True,
            bidirectional=True,
        )
        # Bidirectional → forward and backward hidden states are concatenated.
        self.classifier = nn.Linear(2 * 256, num_classes)

    def forward(self, x):
        """Return logits of shape [B, num_classes] for an image batch ``x``."""
        feats = self.backbone(x)                      # [B, 2048, H, W]
        batch, channels, height, width = feats.size()
        # Channels-last, then merge H and W into one sequence axis.
        seq = feats.permute(0, 2, 3, 1).contiguous()  # [B, H, W, C]
        seq = seq.view(batch, height * width, channels)  # [B, T, C]
        seq_out, _ = self.lstm(seq)                   # [B, T, 512]
        last_step = seq_out[:, -1, :]                 # output at the final timestep
        return self.classifier(last_step)

# ── 2) Instantiate the model, passing num_classes ─────────────────────
# `num_classes` is the class count read from the ImageFolder dataset above.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TemporalResNet50(num_classes=num_classes).to(device)


In [None]:
# ── CÉLULA: TREINAMENTO DO TemporalResNet50 ────────────────────────
import os
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from tqdm import tqdm

# ── Settings ──────────────────────────────────────────────────────
DATA_DIR      = "TextBasedWave/chars_by_class"                # layout: chars_by_class/train and chars_by_class/val
PROJECT       = "models/captcha_Wave_resnet50"
EXP_NAME      = "exp"
DEVICE        = "cuda" if torch.cuda.is_available() else "cpu"

num_epochs    = 100
batch_size    = 64
learning_rate = 1e-3
patience      = 15
gamma         = 0.97    # decay factor for ExponentialLR

# ── Transforms ────────────────────────────────────────────────────
# Train and validation use the same pipeline: resize to 32x32, convert to
# tensor, normalise with 3-channel mean/std.
train_tf = transforms.Compose([
    transforms.Resize((32,32)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225])
])
val_tf   = transforms.Compose([
    transforms.Resize((32,32)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225])
])

# ── Datasets & DataLoaders ────────────────────────────────────────
train_ds   = datasets.ImageFolder(os.path.join(DATA_DIR, "train"), transform=train_tf)
val_ds     = datasets.ImageFolder(os.path.join(DATA_DIR, "val"),   transform=val_tf)
# Only the training loader shuffles; `testloader` iterates the validation split.
trainloader = DataLoader(train_ds, batch_size=batch_size, shuffle=True,  num_workers=4, pin_memory=True)
testloader  = DataLoader(val_ds,   batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True)

# ── Output directories ────────────────────────────────────────────
weights_dir = os.path.join(PROJECT, EXP_NAME, "weights")
os.makedirs(weights_dir, exist_ok=True)

# ── Model, optimizer, scheduler, criterion ─────────────────────────
# TemporalResNet50 is not defined in this cell; it must already exist in the kernel.
model     = TemporalResNet50(num_classes=len(train_ds.classes)).to(DEVICE)
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=gamma)
criterion = nn.CrossEntropyLoss()

# ── Training loop with early stopping ──────────────────────────────
# Tracks the best validation accuracy, writes the matching weights to
# weights_dir/best.pth, stops after `patience` epochs without improvement and
# finally restores the best weights into `model`.
best_val_acc     = 0.0
epochs_no_improve = 0
best_state_dict  = None

for epoch in range(1, num_epochs + 1):
    model.train()
    running_loss, running_corrects, running_total = 0.0, 0, 0
    
    for images, labels in tqdm(trainloader, desc=f"[Epoch {epoch}] Treino", leave=False):
        images, labels = images.to(DEVICE), labels.to(DEVICE)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        # Weight the batch-mean loss by batch size so the epoch loss is a
        # true per-sample average even when the last batch is smaller.
        running_loss     += loss.item() * images.size(0)
        preds = outputs.argmax(dim=1)
        running_corrects += (preds == labels).sum().item()
        running_total    += labels.size(0)
    
    train_loss = running_loss / running_total
    train_acc  = 100 * running_corrects / running_total
    
    # --- Validation ---
    model.eval()
    val_corrects, val_total = 0, 0
    with torch.no_grad():
        for images, labels in tqdm(testloader, desc=f"[Epoch {epoch}] Validação", leave=False):
            images, labels = images.to(DEVICE), labels.to(DEVICE)
            outputs = model(images)
            preds = outputs.argmax(dim=1)
            val_corrects += (preds == labels).sum().item()
            val_total    += labels.size(0)
    
    val_acc = 100 * val_corrects / val_total
    # One learning-rate decay step per epoch.
    scheduler.step()
    
    print(f"Epoch {epoch:2d} | "
          f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.2f}% | "
          f"Val Acc: {val_acc:.2f}%")
    
    # Early stopping
    if val_acc > best_val_acc:
        best_val_acc      = val_acc
        epochs_no_improve = 0
        # state_dict() hands back references to the live parameter tensors,
        # which later optimizer steps modify in place. Clone every tensor so
        # the snapshot really keeps this epoch's weights.
        best_state_dict   = {
            key: tensor.detach().clone()
            for key, tensor in model.state_dict().items()
        }
        torch.save(best_state_dict, os.path.join(weights_dir, "best.pth"))
        print(f"📈 Nova melhor Val Acc: {best_val_acc:.2f}% — modelo salvo.")
    else:
        epochs_no_improve += 1
        print(f"⏸️ Sem melhora por {epochs_no_improve} época(s).")
        if epochs_no_improve >= patience:
            print(f"🛑 Early stopping ativado após {patience} épocas sem melhora.")
            break

# ── Restore the best model ────────────────────────────────────────
if best_state_dict is not None:
    model.load_state_dict(best_state_dict)
    print("✔️ Treinamento concluído. Melhor modelo carregado.")
else:
    # No epoch ever exceeded the initial best_val_acc of 0.0, so there is no
    # snapshot to restore; the model keeps its current weights.
    print("⚠️ Treinamento concluído sem melhora de Val Acc; pesos finais mantidos.")


In [None]:
# ── CÉLULA: VALIDAÇÃO SEGMENTO-A-SEGMENTO COM TemporalResNet50 ──

# ── 1) Settings ──────────────────────────────────────────────────
BASE_FOLDER    = "TextBasedWave/segments"               # folders named after the sequence label
EXTENSIONS     = (".png", ".jpg", ".jpeg")
best_model_path = "models/captcha_Wave_resnet50/exp/weights/best.pth"         # adjust to your own path
device         = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ── 2) Load the trained model ────────────────────────────────────
# `num_classes` and TemporalResNet50 are not defined in this cell; they must
# already exist in the kernel.
model = TemporalResNet50(num_classes=num_classes).to(device)
model.load_state_dict(torch.load(best_model_path, map_location=device))
model.eval()

# ── 3) Validation transforms (same pipeline as training) ─────────
val_tf = transforms.Compose([
    transforms.Resize((32,32)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485,0.456,0.406],
                         std=[0.229,0.224,0.225])
])

# ── 4) Função de predição para um segmento ───────────────────────
def predict_char(segment_path):
    """Classify one segment image with the loaded TemporalResNet50.

    Args:
        segment_path: path of the segment image file.

    Returns:
        The entry of ``train_ds.classes`` at the arg-max logit index.
    """
    rgb_image = Image.open(segment_path).convert("RGB")
    batch = val_tf(rgb_image).unsqueeze(0).to(device)    # [1,C,H,W]
    with torch.no_grad():
        best = model(batch).argmax(dim=1)                # [1]
    # ImageFolder class list: index → class name.
    return train_ds.classes[int(best.cpu().item())]

# ── 5) Inference & metrics ──────────────────────────────────────
total_seq = correct_seq = 0        # sequences (folders) seen / fully correct
total_chars = correct_chars = 0    # segments evaluated / segments correct

for label in sorted(os.listdir(BASE_FOLDER)):
    folder = os.path.join(BASE_FOLDER, label)
    if not os.path.isdir(folder):
        continue
    total_seq += 1

    seg_files = [f for f in sorted(os.listdir(folder))
                 if f.lower().endswith(EXTENSIONS)]

    # (expected, predicted) per segment; the 1-based number in the file name
    # is the position of the expected character inside the folder name.
    seq_preds = [
        (label[int(os.path.splitext(f)[0]) - 1],
         predict_char(os.path.join(folder, f)))
        for f in seg_files
    ]

    n_hits = sum(1 for true_char, pred_char in seq_preds if pred_char == true_char)
    total_chars += len(seq_preds)
    correct_chars += n_hits

    # Correct sequence: non-empty and every segment matched.
    if seq_preds and n_hits == len(seq_preds):
        correct_seq += 1

# ── 6) Results ───────────────────────────────────────────────────
if total_seq:
    seq_acc = correct_seq / total_seq
else:
    seq_acc = 0
if total_chars:
    char_acc = correct_chars / total_chars
else:
    char_acc = 0

print(f"Acurácia de sequência (tudo certo): {correct_seq}/{total_seq} = {seq_acc:.2%}")
print(f"Acurácia de caractere (isolado):   {correct_chars}/{total_chars} = {char_acc:.2%}")
