## Faster R-CNN training process
1. Set up
    - 1.1 Mapping the training data
    - 1.2 preparing the training data 
2. Training precess
    - 2.1 Prepare PyTorch dataset and the loader
    - 2.2 Split the training and testing data
    - 2.3 Define a Faster R-CNN model
    - 2.4 choose the optimizer
    - 2.5 train the model

In [1]:
import os
from collections import defaultdict

annotation_dir = 'dataset/train/train_annotation'

class_count = defaultdict(int)

# get the class name from the annotation files
for filename in os.listdir(annotation_dir):
    if filename.endswith('.txt'):
        file_path = os.path.join(annotation_dir, filename)
        with open(file_path, 'r') as f:
            for line in f:
                parts = line.strip().split()
                if len(parts) >= 2:
                    class_name = parts[1]  # the second column is the class name
                    class_count[class_name] += 1

print("category of knife:")
for cls, count in class_count.items():
    print(f"{cls}: {count}")


category of knife:
Straight_Knife: 809
Folding_Knife: 1589
Scissor: 1494
Utility_Knife: 1635
Multi-tool_Knife: 1612


In [2]:
# pip install torch torchvision
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, random_split

### 1. Set up

#### 1.1 Mapping the training data

In [3]:
image_dir = 'dataset/train/train_image'
anno_dir = 'dataset/train/train_annotation'
# Example: 009000.jpg Straight_Knife 763 486 840 549

data_list = []

for txt_file in os.listdir(anno_dir):
    if txt_file.endswith('.txt'):
        txt_path = os.path.join(anno_dir, txt_file)
        with open(txt_path, 'r') as f:
            for line in f:
                parts = line.strip().split()
                image_name = parts[0]
                label = parts[1]
                image_path = os.path.join(image_dir, image_name)
                data_list.append((image_path, label))

print(f"the number of trainingset: {len(data_list)}")
print("the first three:", data_list[:3])


the number of trainingset: 7139
the first three: [('dataset/train/train_image\\009000.jpg', 'Straight_Knife'), ('dataset/train/train_image\\009002.jpg', 'Straight_Knife'), ('dataset/train/train_image\\009003.jpg', 'Straight_Knife')]


#### 1.2 Preparing the training data

In [4]:
labeled_data_list = []

label_to_id = {
    'Straight_Knife': 0,
    'Folding_Knife': 1,
    'Scissor': 2,
    'Utility_Knife': 3,
    'Multi-tool_Knife': 4
}

def is_valid_box(bbox, max_width=1225, max_height=954):
    x1, y1, x2, y2 = bbox
    return (
        x2 > x1 and y2 > y1 and
        x1 >= 0 and y1 >= 0 and
        x2 <= max_width and y2 <= max_height
    )


for txt_file in os.listdir(anno_dir):
    if txt_file.endswith('.txt'):
        txt_path = os.path.join(anno_dir, txt_file)
        with open(txt_path, 'r') as f:
            for line in f:
                parts = line.strip().split()
                if len(parts) == 6:
                    image_name = parts[0]
                    label_name = parts[1]
                    bbox = list(map(int, parts[2:6]))
                    if is_valid_box(bbox):
                        label = label_to_id[label_name]
                        image_path = os.path.join(image_dir, image_name)
                        labeled_data_list.append((image_path, label, bbox))

print(f"the number of trainingset: {len(labeled_data_list)} ")
print("the first three:")
for item in labeled_data_list[:3]:
    print("image_path:", item[0])
    print("category:", item[1])
    print("framework:", item[2])


the number of trainingset: 7133 
the first three:
image_path: dataset/train/train_image\009000.jpg
category: 0
framework: [763, 486, 840, 549]
image_path: dataset/train/train_image\009002.jpg
category: 0
framework: [650, 289, 717, 357]
image_path: dataset/train/train_image\009003.jpg
category: 0
framework: [458, 336, 580, 389]


### 2. Training process

#### 2.1 Prepare PyTorch dataset and the loader

In [5]:
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import transforms
from PIL import Image
import torchvision.transforms.functional as F
import torch


class KnifeDataset(Dataset):
    #def __init__(self, data_list, transform=None, resize=(400, 300)):
    def __init__(self, data_list, transform=None, resize=None):
        self.data_list = data_list
        self.transform = transform
        self.resize = resize

    def __len__(self):
        return len(self.data_list)

    def __getitem__(self, idx):
        img_path, label, bbox = self.data_list[idx]
        image = Image.open(img_path).convert('RGB')
        orig_w, orig_h = image.size  # original width and height

        # resize image
        if self.resize:
            new_w, new_h = self.resize
            image = image.resize((new_w, new_h))  # PIL resize

            # resize bbox
            scale_x = new_w / orig_w
            scale_y = new_h / orig_h
            bbox = [
                int(bbox[0] * scale_x),
                int(bbox[1] * scale_y),
                int(bbox[2] * scale_x),
                int(bbox[3] * scale_y)
            ]

        if self.transform:
            image = self.transform(image)

        boxes = torch.tensor([bbox], dtype=torch.float32)  # shape [1, 4]
        labels = torch.tensor([label], dtype=torch.int64)  # shape [1]

        target = {
            "boxes": boxes,
            "labels": labels
        }

        return image, target

transform = transforms.Compose([
    #transforms.Resize((400, 300)),
    transforms.ToTensor(),
    transforms.Normalize([0.5]*3, [0.5]*3)
])


#### 2.2 split the training and testing data

In [6]:
# split the dataset into training and validation sets
dataset = KnifeDataset(labeled_data_list, transform=transform)

train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# define collate_fn to prepare for the Faster R-CNN
def collate_fn(batch):
    return tuple(zip(*batch))

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False, collate_fn=collate_fn)


#### 2.3 Define a Faster R-CNN model

In [7]:
import torchvision.models as models
import torch.nn as nn
from torchvision.models.detection import fasterrcnn_resnet50_fpn

# pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("current device:", device)


current device: cuda


In [8]:
# use the pre-trained ResNet18 model
model = fasterrcnn_resnet50_fpn(pretrained=True)
model = model.to(device)




#### 2.4 choose the optimizer

In [9]:
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.Adam(params, lr=1e-4)   # learning rate: 0.0001

#### 2.5 training the CNN

In [10]:
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch_idx, (images, targets) in enumerate(train_loader):
        try:
            images = [img.to(device) for img in images]
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

            loss_dict = model(images, targets)
            losses = sum(loss for loss in loss_dict.values())

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()

            total_loss += losses.item()

        except Exception as e:
            print(f"Batch {batch_idx} error: {e}")
            continue

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss:.4f}")

torch.save(model.state_dict(), "fasterrcnn_opixray.pth")
print("model saved!")


Epoch 1/10, Loss: 197.0167
Epoch 2/10, Loss: 153.2321
Epoch 3/10, Loss: 146.7032
Epoch 4/10, Loss: 135.0985
Epoch 5/10, Loss: 127.8599
Epoch 6/10, Loss: 119.3529
Epoch 7/10, Loss: 114.3767
Epoch 8/10, Loss: 101.5684
Epoch 9/10, Loss: 91.1660
Epoch 10/10, Loss: 83.8986
model saved!
