# pytorch_CAM_v2.py

# original code from: https://github.com/metalbubble/CAM/blob/master/pytorch_CAM.py
# modified by Jinwoo Choi (jinchoi@vt.edu)

# simple implementation of CAM in PyTorch for the networks such as ResNet, DenseNet, SqueezeNet, Inception

import io
import requests
from PIL import Image
import torch
from torchvision import models, transforms
from torch.autograd import Variable
from torch.nn import functional as F
from model import generate_model
from mean import get_mean, get_std
from spatial_transforms import (
    Compose, Normalize, Scale, CenterCrop, CornerCrop, MultiScaleCornerCrop,
    MultiScaleRandomCrop, RandomHorizontalFlip, ToTensor)
from temporal_transforms import LoopPadding, TemporalRandomCrop
from target_transforms import ClassLabel, VideoID
from target_transforms import Compose as TargetCompose
from dataset import get_training_set, get_validation_set, get_test_set

from opts import parse_opts
import numpy as np
import cv2
import os
import pdb

# sample label/image URLs (these appear to be leftovers from the original
# image-based CAM demo; nothing below in this script reads them)
LABELS_URL = 'https://s3.amazonaws.com/outcome-blog/imagenet/labels.json'
IMG_URL = 'http://media.mlive.com/news_impact/photo/9933031-large.jpg'

# Parse the command-line options and derive the settings the rest of the
# script relies on (absolute paths, crop scales, architecture name, mean/std).
opt = parse_opts()
if opt.root_path != '':
    # these three paths are always given relative to root_path
    for attr in ('video_path', 'annotation_path', 'result_path'):
        setattr(opt, attr, os.path.join(opt.root_path, getattr(opt, attr)))
    if not os.path.exists(opt.result_path):
        os.makedirs(opt.result_path)
    # checkpoint paths are optional; only rewrite the ones that were supplied
    for attr in ('resume_path', 'pretrain_path'):
        value = getattr(opt, attr)
        if value:
            setattr(opt, attr, os.path.join(opt.root_path, value))

# geometric series of scales: initial_scale * scale_step ** k, k = 0 .. n_scales - 1
opt.scales = [opt.initial_scale]
while len(opt.scales) < opt.n_scales:
    opt.scales.append(opt.scales[-1] * opt.scale_step)

opt.arch = '{}-{}'.format(opt.model, opt.model_depth)
opt.mean = get_mean(opt.norm_value, dataset=opt.mean_dataset)
opt.std = get_std(opt.norm_value)
print(opt)

# Networks such as GoogLeNet, ResNet and DenseNet already use global average
# pooling at the end, so CAM can be applied to them directly.

net, parameters = generate_model(opt)
# name of the layer whose output activations are used to build the CAM
finalconv_name = 'layer4'
print(net)

net.eval()

# hook the feature extractor
features_blobs = []


def hook_feature(module, input, output):
    """Forward hook that stores the hooked layer's output as a NumPy array.

    Every forward pass through the hooked layer appends one array to the
    module-level ``features_blobs`` list; ``module`` and ``input`` are
    unused but required by the forward-hook signature.
    """
    features_blobs.append(output.data.cpu().numpy())


# The network may be wrapped in a container that exposes the real model as its
# 'module' child (as nn.DataParallel does); fall back to the network itself
# when it is not wrapped instead of failing with a KeyError.
base_net = net._modules.get('module', net)
getattr(base_net, finalconv_name).register_forward_hook(hook_feature)

# get the softmax weight
# NOTE(review): assumes the second-to-last parameter is the weight matrix of
# the final classification layer (followed by its bias) -- confirm for models
# other than the ResNet family.
params = list(net.parameters())
weight_softmax = np.squeeze(params[-2].data.cpu().numpy())

def returnCAM(feature_conv, weight_softmax, class_idx):
    """Compute class activation maps (CAMs) for the requested classes.

    Args:
        feature_conv: NumPy array of shape (bz, nc, h, w) holding the
            activations of the hooked convolutional layer. The array is
            reshaped to (nc, h*w), so bz must be 1.
        weight_softmax: NumPy array indexed by class; each row holds the nc
            classifier weights of that class.
        class_idx: iterable of class indices to generate maps for.

    Returns:
        A list with one 256x256 uint8 map per entry of ``class_idx``, each
        min-max normalized to the range [0, 255].

    Raises:
        ValueError: if ``feature_conv`` cannot be reshaped to (nc, h*w).
    """
    # generate the class activation maps upsample to 256x256
    size_upsample = (256, 256)
    _, nc, h, w = feature_conv.shape
    flat_features = feature_conv.reshape((nc, h * w))
    output_cam = []
    for idx in class_idx:
        # weighted sum of the feature maps using the class' classifier weights
        cam = weight_softmax[idx].dot(flat_features).reshape(h, w)
        cam = cam - np.min(cam)
        peak = np.max(cam)
        # A constant map has peak == 0 after the shift; dividing would yield
        # NaNs (0/0), so leave it as all zeros instead.
        if peak > 0:
            cam = cam / peak
        cam_img = np.uint8(255 * cam)
        output_cam.append(cv2.resize(cam_img, size_upsample))
    return output_cam


# torchvision preprocessing for a single 224x224 image, normalized with fixed
# per-channel mean/std values
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
preprocess = transforms.Compose(
    [transforms.Resize((224, 224)), transforms.ToTensor(), normalize])

# choose the per-channel normalization for the video frames from the options:
# std_norm -> mean and std, no_mean_norm -> identity, otherwise mean only
if opt.std_norm:
    norm_method = Normalize(opt.mean, opt.std)
elif opt.no_mean_norm:
    norm_method = Normalize([0, 0, 0], [1, 1, 1])
else:
    norm_method = Normalize(opt.mean, [1, 1, 1])

# per-frame pipeline: scale, center crop, convert to tensor, normalize
spatial_transform = Compose([
    Scale(opt.sample_size),
    CenterCrop(opt.sample_size),
    ToTensor(opt.norm_value),
    norm_method,
])
temporal_transform = LoopPadding(opt.sample_duration)
target_transform = ClassLabel()

validation_data = get_validation_set(
    opt, spatial_transform, temporal_transform, target_transform)

# iterate the validation clips in their original order
loader_kwargs = dict(
    batch_size=opt.val_batch_size,
    shuffle=False,
    num_workers=opt.n_threads,
    pin_memory=True,
)
val_loader = torch.utils.data.DataLoader(validation_data, **loader_kwargs)

vis_root = os.path.join(opt.result_path, 'vis')

# Render the CAM of the top-1 prediction for every validation clip, overlaid on
# the clip's center frame. Only the first sample of each batch is visualized,
# so run with a validation batch size of 1 to cover every clip.
for i, (inputs, targets, path, frame_indices) in enumerate(val_loader):
    if i % 100 == 0:
        print('clip {}/{}'.format(i, len(val_loader)))

    # Discard activations captured during earlier forward passes. Without
    # this, features_blobs[0] would always hold the first clip's features
    # (giving wrong CAMs for every later clip) and the list would grow
    # without bound.
    del features_blobs[:]

    # forward pass; inference only, so no autograd graph is needed
    with torch.no_grad():
        logit = net(inputs)

    # class probabilities of the first sample, sorted in descending order
    h_x = F.softmax(logit, dim=1).data[0]
    probs, idx = h_x.sort(0, True)
    probs = probs.cpu().numpy()
    idx = idx.cpu().numpy()

    is_correct = bool(idx[0] == int(targets[0]))
    pred_cls = validation_data.class_names[idx[0]]

    # generate class activation mapping for the top1 prediction
    # features_blobs[0][0] is the first sample's activation; its first two
    # axes are swapped before being handed to returnCAM
    CAMs = returnCAM(np.moveaxis(features_blobs[0][0], 0, 1), weight_softmax, [idx[0]])

    # visualize only the center frame (index 7 for a 16-frame clip)
    center = (len(frame_indices) - 1) // 2
    frame_no = int(frame_indices[center].cpu().numpy()[0])

    # render the CAM and output
    frame_path = os.path.join(path[0], 'image_{:05d}.jpg'.format(frame_no))
    img = cv2.imread(frame_path)
    if img is None:
        # cv2.imread signals failure by returning None rather than raising
        raise IOError('could not read frame image: {}'.format(frame_path))
    height, width, _ = img.shape
    heatmap = cv2.applyColorMap(cv2.resize(CAMs[0], (width, height)), cv2.COLORMAP_JET)
    result = heatmap * 0.3 + img * 0.5

    # the clip path is expected to end in .../<class name>/<video name>
    cur_cls = path[0].split('/')[-2]
    cur_vid = path[0].split('/')[-1]
    output_path = os.path.join(vis_root, 'true' if is_correct else 'false', cur_cls, cur_vid)
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    cv2.imwrite(os.path.join(output_path, 'CAM_frame_{:05d}_pred_cls_{}.jpg'.format(frame_no, pred_cls)), result)

print('Finished CAM visualization!')