In [1]:
# @title Dependencies

import torch
import torchvision
# Select the compute device once: CUDA when PyTorch reports it as available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'
print(f"device: {device}")

device: cuda


In [3]:
# @title Model definition { display-mode: "form" }

# model = torch.hub.load('pytorch/vision:v0.10.0', 'alexnet', pretrained=False)
# Replacing the last 1000-class linear layer with a 2-class layer
# model.classifier[-1] = torch.nn.Linear(in_features=4096, out_features=2)

from torch import nn

class ClassifierModel(nn.Module):
  """Convolutional classifier that maps an RGB image batch to two class logits.

  Four conv -> max-pool -> ReLU stages produce a feature map that is flattened
  and passed through a three-layer fully connected head with sigmoid
  activations. The first linear layer expects 11552 features, which is what
  the feature extractor yields for 512x512 inputs (see the shape trace below).
  """

  def __init__(self):
    super().__init__()

    # Shape trace per sample (channels x height x width) for a 3x512x512 input.
    self.features = nn.Sequential(
        nn.Conv2d(3, 128, kernel_size=3),       # 3x512x512   -> 128x510x510
        nn.MaxPool2d(kernel_size=2, stride=2),  # 128x510x510 -> 128x255x255
        nn.ReLU(),
        nn.Conv2d(128, 256, kernel_size=4),     # 128x255x255 -> 256x252x252
        nn.MaxPool2d(kernel_size=3, stride=3),  # 256x252x252 -> 256x84x84
        nn.ReLU(),
        nn.Conv2d(256, 64, kernel_size=3),      # 256x84x84   -> 64x82x82
        nn.MaxPool2d(kernel_size=2, stride=2),  # 64x82x82    -> 64x41x41
        nn.ReLU(),
        nn.Conv2d(64, 32, kernel_size=3),       # 64x41x41    -> 32x39x39
        nn.MaxPool2d(kernel_size=2, stride=2),  # 32x39x39    -> 32x19x19
        nn.ReLU(),
    )

    # 32 * 19 * 19 = 11552 features per sample after flattening.
    self.converter = nn.Flatten()

    # Fully connected head ending in 2 raw logits (no final activation).
    # An earlier, much deeper sigmoid stack (5776 -> 4096 -> ... -> 2) was
    # tried here and replaced by this three-layer version.
    self.classifier = nn.Sequential(
        nn.Linear(in_features=11552, out_features=8192),
        nn.Sigmoid(),
        nn.Linear(in_features=8192, out_features=1024),
        nn.Sigmoid(),
        nn.Linear(in_features=1024, out_features=2),
    )

  def forward(self, x):
    """Return the (N, 2) logits for a batch of images `x`."""
    feature_maps = self.features(x)
    flat = self.converter(feature_maps)
    return self.classifier(flat)

# Build the network, move its parameters to the selected device and switch it
# to evaluation mode. `Module.to` returns the module itself, so chaining is
# equivalent to a separate call; the final expression echoes the module repr.
model = ClassifierModel().to(device)
model.eval()

SyntaxError: invalid syntax (2062907247.py, line 14)

In [None]:
# @title Loading data { display-mode: "form" }

from torchvision.transforms import v2
from torch import Tensor

# Preprocessing pipeline: centre-crop to 512x512, convert to an image tensor,
# then cast to float32 with value scaling (scale=True).
preprocess = v2.Compose([
    v2.CenterCrop(512),
    v2.ToImage(),
    v2.ToDtype(torch.float32, scale=True)
])
# Normalisation is currently disabled:
# transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

# Seed the global RNG so the random train/test split below is reproducible.
torch.manual_seed(0)
dataset = torchvision.datasets.ImageFolder(
    # Raw string with single backslashes: the previous r'...\\...' form put
    # literal double backslashes into the path.
    root=r'C:\Users\bes-s\OneDrive\Документы\NN\datasets\deepscrape-v1',
    transform=preprocess
    )

trainsplit = 0.7
batch_size = 4

dataset_size = len(dataset)
print(f"Size of dataset is: {dataset_size}, train/test split ratio: {trainsplit}, batch size: {batch_size}")
# Fractional lengths: random_split divides the dataset by these proportions.
trainset, testset = torch.utils.data.random_split(dataset, [trainsplit, 1-trainsplit])

# The training loader reshuffles every epoch so SGD does not see the same
# batch composition and order each time; the test loader keeps a fixed order.
train_loader = torch.utils.data.DataLoader(dataset=trainset, batch_size=batch_size, shuffle=True, pin_memory=True, pin_memory_device=device, num_workers=1)
test_loader = torch.utils.data.DataLoader(dataset=testset, batch_size=batch_size, pin_memory=True, pin_memory_device=device, num_workers=1)

In [None]:
%%time
# Benchmark: walk through every training batch once without doing any work,
# so the reported time is the cost of one full pass over train_loader.
for batch_ind, data in enumerate(train_loader):
  pass

In [None]:
# @title Training loop { display-mode: "form" }

import torch.optim as optim

import torch
import datetime
from torch.utils.tensorboard import SummaryWriter

def calculate_epoch(writer: "SummaryWriter", epoch, optimizer, loss_func, is_train=False):
  """Run one pass over the train or test loader and log metrics to `writer`.

  Relies on the notebook-level globals `model`, `device`, `train_loader` and
  `test_loader`.

  Args:
    writer: object exposing `add_scalar(tag, value, step)`.
    epoch: zero-based epoch index, used to place points on the X axis.
    optimizer: optimizer stepped after every batch when `is_train` is true.
    loss_func: callable taking `(output, one_hot_targets)` and returning a
      scalar loss tensor.
    is_train: when true, iterate `train_loader` in training mode and update
      the weights; otherwise iterate `test_loader` in evaluation mode.

  Every 10th batch logs `loss/<stage>` (mean loss over the last 10 batches)
  and `accuracy/<stage>` (running accuracy since the start of the pass) at
  X = 100 * (epoch + batch_ind / number_of_batches).
  """
  if is_train:
    model.train(True)
    data_loader = train_loader
  else:
    model.eval()
    data_loader = test_loader

  # Use the loader's real batch count instead of estimating it from
  # dataset_size * split ratio, which is fractional and ignores how
  # random_split rounds the subset sizes.
  batch_count = len(data_loader)
  stage = "train" if is_train else "test"

  total = 0
  correct = 0
  running_loss = 0.0

  for batch_ind, (inputs, targets) in enumerate(data_loader):
    inputs, targets = inputs.to(device), targets.to(device)

    # Gradients only need clearing when a backward pass will follow.
    if is_train:
      optimizer.zero_grad()
    output = model(inputs)

    total += len(output)
    # argmax over the class dimension gives one prediction per sample; without
    # `dim` it returns a single flat index for the whole batch tensor.
    correct += (output.argmax(dim=1) == targets).sum().item()

    one_hot_targets = torch.nn.functional.one_hot(targets, num_classes = 2).to(torch.float)
    loss = loss_func(output, one_hot_targets)
    if is_train:
      loss.backward()
      optimizer.step()

    running_loss += loss.item()

    if batch_ind % 10 == 9:
      plot_x = 100 * (batch_ind / batch_count  + epoch)
      avg_loss = running_loss / 10
      writer.add_scalar(f"loss/{stage}", avg_loss, plot_x)
      writer.add_scalar(f"accuracy/{stage}", correct / total, plot_x)
      running_loss = 0.0


def train_loop(model, learn_rate, momentum, epoch_count = 10):
  """Train for `epoch_count` epochs, evaluating on the test loader after each.

  Args:
    model: module whose parameters are optimised with SGD. Note that
      `calculate_epoch` runs the notebook-level `model` global, so this must
      be that same object.
    learn_rate: SGD learning rate (also embedded in the run name).
    momentum: SGD momentum (also embedded in the run name).
    epoch_count: number of train + test passes to run.

  Creates a TensorBoard run directory named after the start time and the
  hyperparameters, logs via `calculate_epoch`, and prints per-epoch timing.
  """
  loss_func = torch.nn.CrossEntropyLoss()
  optimizer = optim.SGD(model.parameters(), lr=learn_rate, momentum=momentum)

  time_now = datetime.datetime.now()
  # Custom TensorBoard layout: train and test curves share one chart per metric.
  layout = {
      "Model statistics" : {
      "loss": ["Multiline", ["loss/train", "loss/test"]],
      "accuracy": ["Multiline", ["accuracy/train", "accuracy/test"]]
      }
  }

  run_name = "C:\\Users\\bes-s\\OneDrive\\Документы\\NN\\runs\\run" + time_now.strftime("%d-%m %H_%M_%S") + " lr(" + str(round(learn_rate,3)) + ") - m(" + str(round(momentum,3)) + ")"
  writer = SummaryWriter(log_dir=run_name)
  # try/finally guarantees the event file is closed even if an epoch raises
  # or the cell is interrupted.
  try:
    writer.add_custom_scalars(layout)

    print('Starting at ' + time_now.strftime("%d-%m %Hh %Mm %Ss"))

    for epoch in range(epoch_count):  # loop over the dataset multiple times
      time_start = datetime.datetime.now()
      print(f"epoch #{epoch+1}/{epoch_count}...", end = "")
      calculate_epoch(writer, epoch, optimizer=optimizer, loss_func=loss_func, is_train=True)
      # No gradients are needed for the evaluation pass.
      with torch.no_grad():
        calculate_epoch(writer, epoch, optimizer=optimizer, loss_func=loss_func, is_train=False)

      time_end = datetime.datetime.now()
      print("done in " + str(time_end - time_start))
      writer.flush()
  finally:
    writer.close()

  # Report the actual finish time; the start time was printed above.
  print('done training at ' + datetime.datetime.now().strftime("%d-%m %Hh %Mm %Ss"))

In [None]:
# Launch a 5-epoch SGD run on the notebook-level model.
# NOTE(review): learn_rate=5.0 is very large for SGD — confirm this is intentional.
train_loop(
    model,
    learn_rate=5.0,
    momentum=0.5,
    epoch_count=5,
)

Исправил в коде несколько критических ошибок:
1. Все это время модель училась и тестировалась на учебном наборе :facepalm:
2. Некорректно рассчитывался номер батча внутри эпохи

Изменения:
1. Теперь выводится время обсчета эпохи
2. В названии рана сохраняются гиперпараметры модели
3. На графиках теперь каждая эпоха выпадает на целое число по X, а дробная часть обозначает номер батча

После изменений столкнулся с ошибкой, которую отправил вам в телеграм.\
Возможно, она решится перезагрузкой ядра.