In [None]:
# pip install torch torchvision onnx

import copy
import json
import os
import random
from matplotlib.image import imread

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import transforms
from torch.utils.data import Dataset

In [None]:
class Boats(Dataset):
    """Image dataset that pairs every file in a directory with a JSON label.

    The label file is looked up by image file name, so it is expected to be a
    JSON object whose keys are the file names found in ``root_dir``.

    Args:
        root_dir: Directory containing the image files.
        transform: Optional callable applied to each loaded image.
        gt_json_path: Path to the JSON file holding the ground-truth labels.
    """

    def __init__(self, root_dir, transform=None, gt_json_path=""):
        self.root_dir = root_dir
        self.transform = transform
        self.gt_json_path = gt_json_path
        # Context manager closes the label file immediately instead of
        # leaving the handle open until garbage collection.
        with open(gt_json_path, "r") as label_file:
            self.labels = json.load(label_file)
        # Sorted so that index -> file name is stable across runs.
        self.image_list = sorted(os.listdir(root_dir))
        self.image_ids = dict(enumerate(self.image_list, start=0))

    def __len__(self):
        """Return the number of image files found in ``root_dir``."""
        return len(self.image_ids)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for the image at position ``idx``.

        Raises:
            KeyError: If ``idx`` is not a known index or the image file name
                has no entry in the label file.
        """
        img = self.load_image(idx)
        img_name = self.image_ids[idx]
        label = self.labels[img_name]
        if self.transform:
            img = self.transform(img)
        sample = (img, label)
        return sample

    def load_image(self, image_index):
        """Read the image at position ``image_index`` from disk with ``imread``."""
        image_name = self.image_ids[image_index]
        path = os.path.join(self.root_dir, image_name)
        img = imread(path)
        return img


# Improved Neural Network version
class Net(nn.Module):  # TODO 9) create your own NN architecture
    """Small CNN that maps an RGB image to a single probability.

    Three conv + ReLU + max-pool stages are followed by two fully connected
    layers. The size of ``fc1`` assumes a (3, 108, 192) input, which pools
    down to a (64, 13, 24) feature map.
    """

    def __init__(self):
        super().__init__()
        # Baseline layer from the assignment, kept for reference:
        #self.fc1 = nn.Linear(3 * 192 * 108, 1)#TODO Question 1)

        # Feature extractor; the padding keeps height/width unchanged in each conv
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=16,
                               kernel_size=5, stride=1, padding=2)
        self.conv2 = nn.Conv2d(in_channels=16, out_channels=32,
                               kernel_size=5, stride=1, padding=2)
        self.conv3 = nn.Conv2d(in_channels=32, out_channels=64,
                               kernel_size=3, stride=1, padding=1)

        # Shared 2x2 max-pool, halves height and width after every conv stage
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)

        # Classifier head
        self.fc1 = nn.Linear(in_features=64 * 13 * 24, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=1)

    def forward(self, x):
        """Return a (batch, 1) tensor of sigmoid outputs for image batch ``x``."""
        # For a (3, 108, 192) input the stages produce
        # (16, 54, 96) -> (32, 27, 48) -> (64, 13, 24)
        for conv in (self.conv1, self.conv2, self.conv3):
            x = self.pool(F.relu(conv(x)))

        # Keep the batch dimension, collapse everything else
        x = torch.flatten(x, start_dim=1)

        hidden = F.relu(self.fc1(x))

        # Squash the single score into (0, 1)
        return torch.sigmoid(self.fc2(hidden))


def train(log_interval, model, device, train_loader, optimizer, criterion, epoch, dry_run):
    """Run one optimisation pass over ``train_loader``.

    Example code: https://pytorch.org/tutorials/beginner/basics/quickstart_tutorial.html

    Args:
        log_interval: Print a progress line every ``log_interval`` batches.
        model: Network to optimise; switched to training mode here.
        device: Device the batches are moved to before the forward pass.
        train_loader: Iterable yielding ``(data, target)`` batches.
        optimizer: Optimizer that updates the parameters of ``model``.
        criterion: Loss callable, called as ``criterion(output, target)``.
        epoch: Epoch number, only used in the progress line.
        dry_run: If true, stop right after the first progress line.
    """
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data = data.to(device)
        target = target.to(device).float()

        optimizer.zero_grad()#TODO Question 4)
        output = model(data)
        # Targets arrive as (batch,), the model emits (batch, 1)
        loss = criterion(output, torch.unsqueeze(target, 1))#TODO Question 5)
        loss.backward()
        optimizer.step()

        # Nothing to report for this batch
        if batch_idx % log_interval != 0:
            continue

        samples_seen = batch_idx * len(data)
        samples_total = len(train_loader.dataset)
        percent_done = 100. * batch_idx / len(train_loader)
        print("Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
            epoch, samples_seen, samples_total, percent_done, loss.item()))
        if dry_run:
            break


def test(model, device, test_loader, criterion):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device).float()
            output = model(data)
            test_loss += criterion(output, torch.unsqueeze(target, 1)).item()  # sum up batch loss
            pred = torch.round(output)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print("\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n".format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))
    return 100. * correct / len(test_loader.dataset)

In [None]:
def main():
    """Train ``Net`` on Boat-MNIST, keep the best weights and export to ONNX.

    The network is trained for ``epochs`` epochs and validated after each one.
    The weights with the highest validation accuracy are restored at the end,
    optionally saved to ``model.pth`` and always exported to
    ``ship_example.onnx``.
    """
    # Training settings #you can mess around with change these values!
    batch_size = 64
    test_batch_size = 1000
    epochs = 20
    learning_rate = 0.0001
    # NOTE: despite its name, True requests GPU training; False (the default)
    # stays on the CPU.
    no_cuda = False
    dry_run = False
    seed = random.randint(1, 1000)  # random seed. Set to constant if you want to train on the same data
    log_interval = 10  # how many batches to wait before logging training status
    save_model = False

    torch.manual_seed(seed)
    # Only pick the GPU when it was requested AND is actually available, so a
    # wrong flag falls back to the CPU instead of crashing.
    use_cuda = no_cuda and torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    # Shuffle training batches on every device (files are listed in sorted
    # order); validation does not need shuffling.
    train_kwargs = {"batch_size": batch_size, "shuffle": True}
    val_kwargs = {"batch_size": test_batch_size}
    if use_cuda:
        cuda_kwargs = {"num_workers": 1,
                       "pin_memory": True}
        train_kwargs.update(cuda_kwargs)
        val_kwargs.update(cuda_kwargs)

    # Create transform
    transform = transforms.Compose([
        transforms.ToTensor(),
        # This normalization is used on the test server
        transforms.Normalize([0.2404, 0.2967, 0.3563], [0.0547, 0.0527, 0.0477])
        ])

    # Create train and test set
    path_to_dataset = "/courses/CS5330.202510/data/Boat-MNIST"  # global path to the data on Discovery
    labels_path = path_to_dataset + "/boat_mnist_labels_trainval.json"
    train_set = Boats(root_dir=path_to_dataset + "/train", transform=transform,
                      gt_json_path=labels_path)
    val_set = Boats(root_dir=path_to_dataset + "/val", transform=transform,
                    gt_json_path=labels_path)

    # Create data loaders
    train_loader = torch.utils.data.DataLoader(train_set, **train_kwargs)
    test_loader = torch.utils.data.DataLoader(val_set, **val_kwargs)

    # Create network, optimizer and loss
    model = Net().to(device)#TODO Question 6)
    #optimizer = optim.SGD(model.parameters(), lr=learning_rate)#TODO Question 7)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.MSELoss()#TODO Question 8)

    # Train and validate, remembering the weights of the best epoch
    best_acc = 0
    best_model_wts = copy.deepcopy(model.state_dict())
    for epoch in range(1, epochs + 1):
        train(log_interval, model, device, train_loader, optimizer, criterion, epoch, dry_run)
        acc = test(model, device, test_loader, criterion)
        if acc > best_acc:
            best_acc = acc
            best_model_wts = copy.deepcopy(model.state_dict())

    # Load best model weights
    model.load_state_dict(best_model_wts)
    print(f"Best accuracy (val): {best_acc}")

    if save_model:
        torch.save(model.state_dict(), "model.pth")
    
    # --- Do not touch -----
    # Save model as onnx file
    dummy_input = torch.randn(1, 3, 108, 192, device=device)
    input_names = ["img_1"]
    output_names = ["output1"]
    torch.onnx.export(model, dummy_input, "ship_example.onnx", input_names=input_names, output_names=output_names)
    # ----------------------


# Run the whole training pipeline as soon as this cell/script is executed.
# The usual entry-point guard below is commented out, so main() also runs if
# this file is imported as a module.
#if __name__ == "__main__":
main()



## Answers to questions:
- **Q1:** The line `self.fc1 = nn.Linear(3 * 192 * 108, 1)` defines a fully connected layer that takes an input of size `3 * 192 * 108`, which is the total number of values in a flattened RGB image of size `192x108` (3 colour channels × 192 × 108 pixels). The output size is 1, producing a single value that can be used for binary classification (e.g., predicting if an image contains a boat or not).

- **Q2:** The line `x = torch.flatten(x, start_dim=1)` reshapes the tensor `x` into a 2D tensor by flattening all dimensions starting from dimension 1 (keeping the batch size dimension intact). This is necessary to convert the multi-dimensional output from convolutional layers into a flat vector that can be input into a fully connected layer.

- **Q3:** The line `x = self.fc1(x)` passes the flattened input `x` through the fully connected layer `fc1`. This layer applies a linear transformation to the input data, producing an output that can be further processed or used for predictions.

- **Q4:** The line `optimizer.zero_grad()` clears the gradients of all optimized tensors. This is important because gradients are accumulated by default in PyTorch, so we need to zero them out at the start of each training iteration to prevent accumulation from previous iterations.

- **Q5:** The line `loss = criterion(output, torch.unsqueeze(target, 1))` computes the loss between the model's predicted output and the actual target values. The `torch.unsqueeze(target, 1)` function adds an extra dimension to the target tensor to match the shape of the output tensor, as the loss function expects both inputs to have the same shape.

- **Q6:** The line `model = Net().to(device)` initializes the neural network model and moves it to the specified device (`device`), which can be either a CPU or GPU. This ensures that all model computations are performed on the chosen hardware.

- **Q7:** The line `optimizer = optim.SGD(model.parameters(), lr=learning_rate)` creates a Stochastic Gradient Descent (SGD) optimizer that adjusts the parameters of the model (`model.parameters()`) during training. The `learning_rate` controls how much the model updates its parameters in each training step. In short, it sets up the mechanism that optimizes the model using the SGD algorithm with a defined learning rate.

- **Q8:** The line `criterion = nn.MSELoss()` defines a variable that will be used to calculate the Mean Squared Error (MSE) loss between the predicted values of the neural network and the actual target values; essentially, it sets up a loss function that measures how far off predictions are from the ground truth on average, by squaring the difference between them and then taking the mean.


## Improved Neural Network Architecture

**Convolutional Layers:**

- **`conv1` Layer:**
  - Input Channels: 3 (RGB image)
  - Output Channels: 16
  - Kernel Size: 5x5
  - Purpose: Extract low-level features like edges and textures.

- **`conv2` Layer:**
  - Input Channels: 16
  - Output Channels: 32
  - Kernel Size: 5x5
  - Purpose: Capture more complex patterns by building upon features from `conv1`.

- **`conv3` Layer:**
  - Input Channels: 32
  - Output Channels: 64
  - Kernel Size: 3x3
  - Purpose: Further abstract features and detect higher-level representations.

**Pooling Layer:**

- **MaxPool2d with Kernel Size 2x2:**
  - Reduces the spatial dimensions by half each time it's applied.
  - Helps in reducing computational load and controls overfitting.

**Activation Function:**

- **ReLU (Rectified Linear Unit):**
  - Applied after each convolutional layer and after the `fc1` layer.
  - Adds non-linearity to the model, enabling it to learn complex patterns.

**Fully Connected Layers:**

- **`fc1` Layer:**
  - Input Features: `64 * 13 * 24` (after flattening the output from `conv3`)
  - Output Features: 128
  - Purpose: Acts as a classifier on the features extracted by the convolutional layers.

- **`fc2` Layer:**
  - Input Features: 128
  - Output Features: 1
  - Purpose: Produces the final output score for binary classification.

**Output Layer:**

- **Sigmoid Activation Function:**
  - Applied to the output of `fc2`.
  - Converts the output score to a probability between 0 and 1.

**Loss Function:**

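- **Mean Squared Error Loss (`nn.MSELoss`):**
  - This is the loss the code actually uses (`criterion = nn.MSELoss()`); it measures the squared difference between the sigmoid output and the binary label.
  - Binary Cross Entropy Loss (`nn.BCELoss`) is the more conventional choice for binary classification with a sigmoid output and could be used in its place.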

**Optimizer:**

- **Adam Optimizer:**
  - An adaptive learning rate method.
  - Combines the benefits of AdaGrad and RMSProp.
  - Generally provides better performance compared to basic SGD.

**Why This Architecture Improves Accuracy:**

- **Feature Extraction:**
  - Convolutional layers are effective at extracting hierarchical features from images.
  - By stacking multiple convolutional layers, the network can learn both simple and complex patterns.

- **Dimensionality Reduction:**
  - Pooling layers reduce the spatial dimensions, making the computations more efficient and reducing the risk of overfitting.

- **Non-Linearity:**
  - ReLU activation functions introduce non-linearity, enabling the network to learn complex mappings between inputs and outputs.

- **Classification Power:**
  - Fully connected layers interpret the features extracted by convolutional layers and make predictions.

- **Optimization:**
  - Using the Adam optimizer helps in faster and more reliable convergence.