<a href="https://colab.research.google.com/github/yuhao831068/Vgg11_for_Classify_CCTV_with_Gun_appearance/blob/main/model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# define dataset
from torch.utils.data import Dataset
from pathlib import Path
from PIL import Image
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import torch

class ImageDataset(Dataset):
  """Image-folder dataset that pairs every image with a TF-IDF vector of its text description.

  Files read from ``root``:

  * ``train/`` or ``test/`` -- searched recursively for files; the name of each
    file's parent directory is used as its class name.
  * ``classnames.txt`` -- one class name per line; a name's position in the file
    is its class index.
  * ``descriptions.txt`` -- one ``<filename> <description>`` entry per line,
    split at the first space.

  Each item is ``(image, class_idx, description, description_vector)``.
  """

  def __init__(self, root, train, transform=None):
    """Index the image files and fit the TF-IDF vectorizer on all descriptions.

    Args:
      root: Dataset root directory (str or Path).
      train: If truthy read images from ``root/train``, otherwise ``root/test``.
      transform: Optional callable applied to the RGB image in ``__getitem__``.
    """
    root = Path(root)
    image_root = root / ('train' if train else 'test')

    # Every regular file below the split directory, skipping '.DS_Store' entries.
    self.paths = [p for p in image_root.rglob('*') if p.is_file() and p.name != '.DS_Store']
    self.transform = transform

    # Built in one expression so `self.classes` exists even for an empty file.
    # Blank lines are skipped: an empty name can never match a directory name
    # and would only inflate the class count.
    with open(root / 'classnames.txt', 'r') as f:
      self.classes = [line.strip() for line in f if line.strip()]

    # filename -> description; lines without a space-separated description are ignored.
    self.descriptions = {}
    with open(root / 'descriptions.txt', 'r') as f:
      for line in f:
        parts = line.strip().split(' ', 1)
        if len(parts) == 2:
          filename, description = parts
          self.descriptions[filename] = description

    # Fit once, then vectorize every description in a single batch call
    # instead of one transform() call per file.
    self.vectorizer = TfidfVectorizer()
    all_descriptions = list(self.descriptions.values())
    self.vectorizer.fit(all_descriptions)
    dense = self.vectorizer.transform(all_descriptions).toarray()
    self.vectorized_descriptions = dict(zip(self.descriptions, dense))

    # Fallback for images without a description. Built once here rather than
    # on every __getitem__ call.
    self._zero_description_vector = np.zeros(len(self.vectorizer.get_feature_names_out()))

  def __getitem__(self, index):
    """Return ``(image, class_idx, description, description_vector)`` for ``index``.

    ``image`` is the RGB image, passed through ``self.transform`` when one is
    set. ``description`` falls back to "No description available" and
    ``description_vector`` to an all-zero vector when the file name has no
    entry in ``descriptions.txt``.

    Raises:
      ValueError: If the image's parent directory name is not in ``self.classes``.
    """
    img_path = self.paths[index]

    # The context manager releases the file handle; convert() returns a new,
    # fully loaded image that remains valid after the file is closed.
    with Image.open(img_path) as raw:
      img = raw.convert('RGB')

    class_idx = self.classes.index(img_path.parent.name)
    img_name = img_path.name

    description = self.descriptions.get(img_name, "No description available")
    vector = self.vectorized_descriptions.get(img_name, self._zero_description_vector)
    description_vector = torch.FloatTensor(vector)  # copies into a float32 tensor

    if self.transform:
      img = self.transform(img)
    return img, class_idx, description, description_vector

  def __len__(self):
    """Number of image files found for the selected split."""
    return len(self.paths)

In [2]:
# import model
import torchvision.models as models
# `pretrained=True` is deprecated in torchvision; request the pretrained weights
# through the weights enum, matching the other cells of this notebook.
model = models.vgg11_bn(weights=models.VGG11_BN_Weights.DEFAULT)

Downloading: "https://download.pytorch.org/models/vgg11_bn-6002323d.pth" to /root/.cache/torch/hub/checkpoints/vgg11_bn-6002323d.pth
100%|██████████| 507M/507M [00:02<00:00, 211MB/s]


In [3]:
 # import weights and set transformer
import torchvision.transforms as transforms
# Preprocessing callable bundled with the default VGG11-BN weights.
vgg11_bn_transforms = models.VGG11_BN_Weights.DEFAULT.transforms()
# Keep a handle on the weights enum member itself as well.
weights = models.VGG11_BN_Weights.DEFAULT

In [4]:
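# NOTE: unused alternative augmentation pipeline, kept for reference only;
# the datasets below are built with `vgg11_bn_transforms` instead.
# resnet_transform = transforms.Compose([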
#     transforms.RandomResizedCrop(224),
#     transforms.RandomHorizontalFlip(),
#     transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4, hue=0.2),
#     transforms.ToTensor(),
#     transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
# ])

In [5]:
# Single source of truth for the dataset location (previously repeated in both calls).
DATA_ROOT = '/content/drive/MyDrive/final dataset-sr-224-ordered'

# Both splits share the same root and the same preprocessing; only `train` differs.
train_dataset = ImageDataset(root=DATA_ROOT,
                             train=True,
                             transform=vgg11_bn_transforms)
test_dataset = ImageDataset(root=DATA_ROOT,
                            train=False,
                            transform=vgg11_bn_transforms)

In [6]:
# Inspect the first training sample:
# (transformed image, class index, description text, description vector).
train_dataset[0]

(tensor([[[-1.5699, -1.5357, -1.5185,  ...,  0.3652, -1.6555, -1.5185],
          [-1.5528, -1.5699, -1.5528,  ...,  0.5536, -1.6384, -1.5699],
          [-1.5014, -1.5528, -1.5357,  ...,  0.5707, -1.7069, -1.4500],
          ...,
          [-0.6965, -0.6794, -0.1314,  ...,  0.4679,  0.4508,  0.5536],
          [-0.7308, -0.6623,  0.0569,  ...,  0.4166,  0.3652,  0.4679],
          [-0.7650, -0.7308, -0.0287,  ...,  0.3994,  0.3994,  0.5022]],
 
         [[-1.4580, -1.4230, -1.4055,  ...,  0.4853, -1.5455, -1.4055],
          [-1.4405, -1.4580, -1.4405,  ...,  0.6779, -1.5630, -1.4930],
          [-1.3880, -1.4405, -1.4405,  ...,  0.6429, -1.6681, -1.3704],
          ...,
          [-0.7227, -0.7227, -0.1450,  ...,  0.5553,  0.5028,  0.5903],
          [-0.7052, -0.6352,  0.1176,  ...,  0.5028,  0.4153,  0.4853],
          [-0.7227, -0.6527,  0.0826,  ...,  0.4678,  0.4503,  0.5203]],
 
         [[-1.3339, -1.2990, -1.2816,  ...,  0.8099, -1.2467, -1.1073],
          [-1.3164, -1.3339,

In [7]:
import torch
from torch.utils.data import DataLoader

BATCH_SIZE = 16

# Training data is shuffled; the test loader keeps the dataset order.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import torchvision.models as models
import torchvision.transforms as transforms

class MultiModalModel(nn.Module):
    """Two-branch classifier that fuses image features with a text feature vector.

    The image branch runs the VGG11-BN convolutional features followed by a
    2048-unit projection; the text branch projects the text vector to 512
    units. Both outputs are concatenated and classified by a small MLP head.
    """

    def __init__(self, text_feature_size, num_classes):
        """Build the two branches and the fusion head.

        Args:
            text_feature_size: Width of the text feature vector fed to the text branch.
            num_classes: Number of output logits.
        """
        super().__init__()

        # Image branch: pretrained conv trunk -> 7x7 pooling -> flatten -> 2048-unit projection.
        backbone = models.vgg11_bn(weights=models.VGG11_BN_Weights.DEFAULT)
        image_layers = [
            backbone.features,
            nn.AdaptiveAvgPool2d((7, 7)),
            nn.Flatten(),
            nn.Linear(512 * 7 * 7, 2048),
            nn.ReLU(),
            nn.Dropout(0.5),
        ]
        self.image_model = nn.Sequential(*image_layers)

        # Text branch: single hidden projection to 512 units.
        text_layers = [
            nn.Linear(text_feature_size, 512),
            nn.ReLU(),
            nn.Dropout(0.5),
        ]
        self.text_model = nn.Sequential(*text_layers)

        # Fusion head over the concatenated (2048 + 512)-wide feature vector.
        head_layers = [
            nn.Linear(512 + 2048, 512),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, num_classes),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, images, text_features):
        """Return class logits for a batch of images and their text feature vectors."""
        image_embedding = self.image_model(images)
        text_embedding = self.text_model(text_features)
        # Image features first, then text features, joined along the feature axis.
        fused = torch.cat((image_embedding, text_embedding), dim=1)
        return self.classifier(fused)

In [9]:
# Show the feature names (vocabulary terms) of the fitted TF-IDF vectorizer.
train_dataset.vectorizer.get_feature_names_out()

array(['abandoned', 'air', 'airport', 'aisle', 'alleyway', 'an', 'and',
       'another', 'apartment', 'are', 'around', 'at', 'attacked', 'away',
       'back', 'background', 'backpack', 'backyard', 'bag', 'balloons',
       'bank', 'bar', 'basketball', 'bathroom', 'being', 'bench', 'bike',
       'billboard', 'black', 'block', 'blue', 'both', 'bottle', 'box',
       'boxes', 'bricks', 'bucket', 'building', 'bus', 'by', 'camera',
       'car', 'carried', 'carrying', 'cars', 'cart', 'cashier', 'cell',
       'chair', 'chairs', 'child', 'circular', 'city', 'classroom',
       'computer', 'corner', 'couch', 'counter', 'courtroom', 'crossing',
       'crowd', 'crowded', 'dark', 'dashboard', 'day', 'dealership',
       'desk', 'desks', 'display', 'dog', 'door', 'doors', 'down',
       'dress', 'drinking', 'drinks', 'driver', 'driveway', 'driving',
       'eating', 'elevator', 'empty', 'encampment', 'factory', 'falling',
       'fence', 'fighting', 'flashlight', 'floor', 'footage', 'found',


In [10]:
import torch
from torch.utils.data import DataLoader
import torch.optim as optim

# Build the model: the text input width is the TF-IDF vocabulary size and the
# output width is the number of classes.
num_classes = len(train_dataset.classes)
text_feature_size = len(train_dataset.vectorizer.get_feature_names_out())
model = MultiModalModel(text_feature_size, num_classes)

# Loss, optimizer and plateau-based learning-rate scheduler.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.1,
    patience=10,
)


In [11]:
# Train for a fixed number of epochs, evaluating on the held-out split after each one.
num_epochs = 15
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    train_correct = 0
    train_total = 0
    running_loss = 0.0
    for images, labels, _, text_features in train_dataloader:
        images = images.to(device)
        labels = labels.to(device)
        text_features = text_features.to(device)

        optimizer.zero_grad()
        outputs = model(images, text_features)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # loss.item() is the batch mean; weight by batch size so the epoch
        # average stays correct when the last batch is smaller.
        running_loss += loss.item() * images.size(0)
        train_total += labels.size(0)
        train_correct += (outputs.argmax(dim=1) == labels).sum().item()

    epoch_loss = running_loss / len(train_dataloader.dataset)
    train_accuracy = 100 * train_correct / train_total
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}')
    print(f'Train Accuracy: {train_accuracy:.2f}%')

    # ---- evaluation pass (the held-out split doubles as the validation set here) ----
    model.eval()
    test_correct = 0
    test_total = 0
    test_running_loss = 0.0
    with torch.no_grad():
        for images, labels, _, text_features in test_dataloader:
            images = images.to(device)
            labels = labels.to(device)
            text_features = text_features.to(device)

            outputs = model(images, text_features)
            test_running_loss += criterion(outputs, labels).item() * images.size(0)
            predicted = outputs.argmax(dim=1)  # same prediction rule as the training pass
            test_total += labels.size(0)
            test_correct += (predicted == labels).sum().item()

    test_loss = test_running_loss / test_total
    test_accuracy = 100 * test_correct / test_total
    print(f'Validation Accuracy: {test_accuracy:.2f}%')

    # ReduceLROnPlateau only lowers the learning rate when it is stepped with
    # the monitored metric; previously the scheduler was created but never
    # stepped, so the learning rate could never decay.
    scheduler.step(test_loss)

print('Finished Training')


Epoch 1/15, Loss: 0.8380
Train Accuracy: 47.50%
Validation Accuracy: 50.00%
Epoch 2/15, Loss: 0.6900
Train Accuracy: 54.00%
Validation Accuracy: 63.00%
Epoch 3/15, Loss: 0.6732
Train Accuracy: 61.75%
Validation Accuracy: 60.00%
Epoch 4/15, Loss: 0.5987
Train Accuracy: 72.75%
Validation Accuracy: 67.00%
Epoch 5/15, Loss: 0.5088
Train Accuracy: 76.75%
Validation Accuracy: 76.00%
Epoch 6/15, Loss: 0.4849
Train Accuracy: 78.75%
Validation Accuracy: 58.00%
Epoch 7/15, Loss: 0.4713
Train Accuracy: 78.50%
Validation Accuracy: 76.00%
Epoch 8/15, Loss: 0.4189
Train Accuracy: 81.25%
Validation Accuracy: 70.00%
Epoch 9/15, Loss: 0.4274
Train Accuracy: 81.50%
Validation Accuracy: 61.00%
Epoch 10/15, Loss: 0.4165
Train Accuracy: 80.50%
Validation Accuracy: 72.00%
Epoch 11/15, Loss: 0.3450
Train Accuracy: 85.25%
Validation Accuracy: 71.00%
Epoch 12/15, Loss: 0.3400
Train Accuracy: 82.00%
Validation Accuracy: 68.00%
Epoch 13/15, Loss: 0.3244
Train Accuracy: 85.25%
Validation Accuracy: 70.00%
Epoch 14