# Cervix detector with [Single Shot Detector (SSD)](https://github.com/weiliu89/caffe/tree/ssd)

Uses the Caffe-MLSL build (installed in `/opt/caffe-mlsl`).


Useful links : 

- [Train SSD on custom dataset](https://github.com/weiliu89/caffe/wiki/Train-SSD-on-custom-dataset)

- [SSD detect notebook](https://github.com/weiliu89/caffe/blob/ssd/examples/ssd_detect.ipynb)

- [SSD repository (Caffe `ssd` branch)](https://github.com/weiliu89/caffe/tree/ssd)



In [1]:
# https://ipython.org/ipython-doc/3/config/extensions/autoreload.html
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys

In [3]:
# Make the sibling ../common folder importable
# (presumably where data_utils and qsub_utils live -- confirm).
sys.path.append(os.path.join(os.pardir, "common"))

## Check available `Caffe` distributions

In [4]:
# List everything under /opt whose name starts with "caffe"
!ls /opt/caffe*

/opt/caffe-master:
build		 data	     include	      Makefile.config.example  src
caffe.cloc	 distribute  INSTALL.md       matlab		       tools
cmake		 docker      LICENSE	      models		       xbyak
CMakeLists.txt	 docs	     Makefile	      python
CONTRIBUTING.md  examples    Makefile.bak     README.md
CONTRIBUTORS.md  external    Makefile.config  scripts

/opt/caffe-mlsl:
build		 data	     include		      matlab	 tools
caffe.cloc	 distribute  INSTALL.md		      models	 xbyak
cmake		 docker      LICENSE		      python
CMakeLists.txt	 docs	     Makefile		      README.md
CONTRIBUTING.md  examples    Makefile.config	      scripts
CONTRIBUTORS.md  external    Makefile.config.example  src


In [5]:
#!diff /opt/caffe-master/Makefile.config  /opt/caffe-mlsl/Makefile.config

## Setup `Caffe` and check SSD512\* model

In [6]:
# Select the Caffe distribution used by the rest of the notebook.
#os.environ["CAFFE_ROOT"] = "/opt/caffe-master"
os.environ["CAFFE_ROOT"] = "/opt/caffe-mlsl"

# Put <CAFFE_ROOT>/build/tools on PATH so later cells can call its binaries by name.
# The entry is added only when missing, so re-running this cell does not
# keep growing PATH, and an unset PATH does not raise KeyError.
caffe_tools_dir = "%s/build/tools" % os.environ["CAFFE_ROOT"]
_current_path = os.environ.get("PATH", "")
if caffe_tools_dir not in _current_path.split(os.pathsep):
    os.environ["PATH"] = os.pathsep.join(
        [entry for entry in (_current_path, caffe_tools_dir) if entry]
    )

# LD_LIBRARY_PATH is setup in .bash_profile
# os.environ["LD_LIBRARY_PATH"] += ":%s/external/mkl/mklml_lnx_2017.0.2.20170110/lib" % os.environ["CAFFE_ROOT"]
# os.environ["LD_LIBRARY_PATH"] += ":/opt/intel/mlsl_2017.0.006/intel64/lib"

In [7]:
# Make the `python` folder of the selected Caffe distribution importable
sys.path.append("{}/python".format(os.environ["CAFFE_ROOT"]))

In [8]:
# List the model folders shipped under <CAFFE_ROOT>/models/intel_optimized_models
!ls "$CAFFE_ROOT"/models/intel_optimized_models

alexnet  googlenet  googlenet_v2  resnet_50


## Setup trainval datasets

In [9]:
from data_utils import get_annotations
from PIL import Image

In [10]:
# Class name -> integer label id written into the txt detection annotations
# (NOTE(review): presumably has to agree with the label map prototxt -- confirm)
labels_dict = dict(os=0, cervix=1)

# Annotation file (the variable name suggests it was made with `sloth` -- confirm)
sloth_annotations_filename = os.path.join(os.pardir, 'resources', 'cervix_os.json')

In [11]:
def write_images_labels(data_path, annotations, labels_dict, output_path):
    """
    Create image files (sym links) and plain txt detection annotations, and write a listfile.

    For every annotation the following is created under `output_path`:

    - `images/<parent folder>_<name><ext>`: symbolic link to the source image
    - `labels/<parent folder>_<name>.txt`: one line per annotated object
    - one line "<image path> <label path>" (relative to `output_path`) in `listfile.txt`

    - plain txt detection annotation: label_id, xmin, ymin, xmax, ymax
      (every coordinate is clamped to [0, image dimension - 1])

    :param data_path: folder the annotation filenames are relative to
    :param annotations: iterable of dicts with keys 'filename' and 'annotations';
        each object of 'annotations' has keys 'class', 'x', 'y', 'width', 'height'
    :param labels_dict: mapping class name -> label id
    :param output_path: output folder; its `images` and `labels` subfolders must not
        exist yet (os.makedirs raises OSError otherwise)
    :return: tuple (output_images_folder, output_labels_folder, listfile)
    """
    output_images_folder = os.path.join(output_path, "images")
    os.makedirs(output_images_folder)

    output_labels_folder = os.path.join(output_path, "labels")
    os.makedirs(output_labels_folder)

    def _clamp(x, dim):
        # Keep a pixel coordinate inside [0, dim - 1]
        return min(max(x, 0), dim-1)

    listfile = os.path.join(output_path, "listfile.txt")
    with open(listfile, 'w') as listfile_writer:
        for annotation in annotations:
            img_filename = annotation['filename']
            basename, ext = os.path.splitext(os.path.basename(img_filename))
            # Prefix with the parent folder name (presumably so that images with the
            # same name in different folders do not collide)
            basename = os.path.split(os.path.dirname(img_filename))[1] + '_' + basename
            src_image_filename = os.path.abspath(os.path.join(data_path, img_filename))
            dst_image_filename = "%s%s" % (basename, ext)
            dst_label_filename = "%s.txt" % basename
            listfile_writer.write("%s %s\n" % (os.path.join("images", dst_image_filename),
                                               os.path.join("labels", dst_label_filename)))

            dst_image_filename = os.path.abspath(os.path.join(output_images_folder, dst_image_filename))
            dst_label_filename = os.path.abspath(os.path.join(output_labels_folder, dst_label_filename))
            os.symlink(src_image_filename, dst_image_filename)

            # Read the size from the very file that was just linked. Opening the raw
            # annotation filename would resolve it against the current working
            # directory instead of `data_path`.
            image_width, image_height = Image.open(src_image_filename).size

            with open(dst_label_filename, 'w') as writer:
                for obj in annotation['annotations']:
                    # format : label_id bbox_left bbox_top bbox_right bbox_bottom
                    l, t, w, h = int(obj['x']), int(obj['y']), int(obj['width']), int(obj['height'])
                    r = l + w
                    b = t + h
                    l = _clamp(l, image_width)
                    t = _clamp(t, image_height)
                    r = _clamp(r, image_width)
                    b = _clamp(b, image_height)
                    line = "{label_id} {l} {t} {r} {b}\n".format(
                        label_id=labels_dict[obj['class']],
                        l=l, t=t, r=r, b=b
                    )
                    writer.write(line)

    return output_images_folder, output_labels_folder, listfile

In [12]:
import numpy as np
np.random.seed(2017)

train_test_split = 0.75

annotations = get_annotations(sloth_annotations_filename)
# Create data split
num_labels = len(annotations)
indices = np.random.permutation(num_labels)
split_index = int(num_labels * train_test_split)
train_annotations = np.array(annotations)[indices[:split_index]]
test_annotations = np.array(annotations)[indices[split_index:]]

print "Total : %s, Train : %s, Val : %s" % (num_labels, len(train_annotations), len(test_annotations))

Total : 208, Train : 156, Val : 52


Create LMDB for training and validation datasets

In [13]:
# Absolute project folders, resolved against the notebook's working directory
INPUT_DATA = os.path.abspath(os.path.join(os.pardir, "input"))
RESOURCES_PATH = os.path.abspath(os.path.join(os.pardir, "resources"))
# INPUT_DATA is already absolute and normalized, so a plain join is enough here
GENERATED_DATA = os.path.join(INPUT_DATA, "generated")

In [14]:
# LabelMap protobuf text file handed to convert_annoset through --label_map_file
label_map_filename = os.path.join(RESOURCES_PATH,"labelmap_ccs.prototxt")

In [None]:
import shutil

for _dataset, _annotations in zip(['train', 'val'], [train_annotations, test_annotations]):
    
    _dataset_path = os.path.join(GENERATED_DATA, _dataset)

#     if os.path.exists(os.path.join(_dataset_path, _dataset + ".lmdb")):
#         print("Found existing file: ", os.path.join(_dataset_path, _dataset + ".lmdb"))
#         continue
    
    if os.path.isdir(_dataset_path):
        shutil.rmtree(_dataset_path)        
    ret = write_images_labels(RESOURCES_PATH, _annotations, labels_dict, _dataset_path)
    output_images_folder, output_labels_folder, listfile = ret
    
    args = [
        # See the code source : https://github.com/intel/caffe/blob/master/tools/convert_annoset.cpp 
        "--anno_type=detection", # The type of annotation {classification, detection}.
        "--label_type=txt", # The type of label file format for detection {xml, json, txt}.
        "--backend=lmdb",
        "--label_map_file=" + label_map_filename, # A file with LabelMap protobuf message.
        "--check_label=True", # Check that there is no duplicated name/label. 
        "--resize_height=512", # Height images are resized to.
        "--resize_width=512", # Width images are resized to.
        "--encode_type=png", 
        "--encoded=True",
        "--shuffle=True", # Randomly shuffle the order of images and their labels.
        os.path.join(GENERATED_DATA, _dataset_path) + '/', # root, The root directory which contains the images and annotations.
        os.path.join(RESOURCES_PATH, listfile), # listfile, The file which contains image paths and annotation info paths.
        os.path.join(_dataset_path, _dataset + ".lmdb"), # outdir, The output directory which stores the database file.        
    ]
    
    args_str = " ".join(args)
    qsub_script = "\"#PBS -V -I -x -N convert_annoset\nconvert_annoset %s\"" % args_str
    !echo {qsub_script} > qsub_script
    !chmod +x qsub_script
    !qsub $PWD/qsub_script    
    
    !rm $PWD/qsub_script

qsub: waiting for job 4235.c001 to start
qsub: job 4235.c001 ready


  ########################################################################
  # Colfax Cluster - https://colfaxresearch.com/
  #      Date:           Thu Mar 23 09:01:37 PDT 2017
  #    Job ID:           4235.c001
  #      User:           u2459
  # Resources:           neednodes=1,nodes=1,walltime=24:00:00
  ########################################################################
  
I0323 09:01:38.870036 190317 convert_annoset.cpp:156] Shuffling data
I0323 09:01:38.873212 190317 convert_annoset.cpp:159] A total of 156 images.
I0323 09:01:38.941279 190317 db_lmdb.cpp:72] Opened lmdb /home/u2459/Intel_MobileODT/input/generated/train/train.lmdb


In [16]:
# Show what was generated in the train and val dataset folders
!echo Training : $(ls {GENERATED_DATA}/train)
!echo Validation : $(ls {GENERATED_DATA}/val)

Training : images labels listfile.txt train.lmdb
Validation : detection_out images labels listfile.txt val.lmdb


## Setup network and solver

Following https://github.com/intel/caffe/wiki/SSD:-Single-Shot-MultiBox-Detector

In [17]:
ssd300_path = os.path.join(RESOURCES_PATH, "SSD_300x300")
weights_filename = os.path.join(ssd300_path, "VGG_ILSVRC_16_layers_fc_reduced.caffemodel")

if not os.path.exists(weights_filename):
    !cd {RESOURCES_PATH}/SSD_300x300 && wget http://cs.unc.edu/~wliu/projects/ParseNet/VGG_ILSVRC_16_layers_fc_reduced.caffemodel -o {weights_filename}

In [18]:
batch_size = 32
n_epochs = 100
data_augmentation_factor = 7
train_dataset_size = !cat {GENERATED_DATA}/train/listfile.txt | wc -l
train_dataset_size = int(train_dataset_size[0])
val_dataset_size = !cat {GENERATED_DATA}/val/listfile.txt | wc -l
val_dataset_size = int(val_dataset_size[0])

train_dataset_size, val_dataset_size

n_train_iterations = int(data_augmentation_factor * train_dataset_size * n_epochs * 1.0 / batch_size + 0.5)
n_train_iterations_per_epoch = int(data_augmentation_factor * train_dataset_size * 1.0 / batch_size + 0.5)
n_val_iterations = int(val_dataset_size * 1.0 / batch_size + 0.5)

n_iterations = n_train_iterations + n_val_iterations


print("max_iter", n_train_iterations)
print("test_iter", n_val_iterations)
print("test_interval", n_train_iterations_per_epoch)


('max_iter', 3413)
('test_iter', 2)
('test_interval', 34)


In [19]:
#train_dataset_filename = os.path.join(GENERATED_DATA, "train", "train.lmdb")
#val_dataset_filename = os.path.join(GENERATED_DATA, "train", "val.lmdb")

## Train network

In [20]:
# NOTE: datetime is only referenced by the disabled per-run folder code below
from datetime import datetime

# Root folder for jobs, created under GENERATED_DATA if it does not exist yet
JOBS_PATH = os.path.join(GENERATED_DATA, 'jobs')
if not os.path.exists(JOBS_PATH):
    os.makedirs(JOBS_PATH)

# Disabled: create a timestamped folder (train_YYYY-MM-DD-HH-MM) for the current job
#now = datetime.now()
#current_job_path = os.path.join(JOBS_PATH, "train_%s" % str(now.strftime("%Y-%m-%d-%H-%M")))
#os.makedirs(current_job_path)

In [21]:
# Inspect the generated data folder and the jobs folder
!ls {GENERATED_DATA}
!ls {JOBS_PATH}

jobs  train  val


In [22]:
from qsub_utils import submit_job
from qsub_utils import setup_configuration
from qsub_utils import PBS_CONFIGURATION

In [23]:
# Node specification for the jobs submitted below; it appears verbatim in the
# "Resources:" line of the job banner. Presumably 4 nodes with the knl7210 and
# ram96gb properties -- confirm against qsub_utils.
setup_configuration(nodes='4:knl7210:ram96gb')

In [24]:
# Print the solver definition stored in the SSD_300x300 resources folder
!cat {os.path.join(ssd300_path, "solver.prototxt")}

###### Networks ######
train_net: "train.prototxt" 
test_net: "val.prototxt"
###### Training parameters ######
base_lr: 0.0001
max_iter: 3413
lr_policy: "exp"
gamma: 0.975
weight_decay: 7e-07
###### Validation parameters ######
test_iter: 2
test_interval: 34
test_initialization: false
###### Optimizer ######
type: "AdaDelta"
momentum: 0.9
###### Other ######
display: 5
solver_mode: CPU
snapshot: 34
snapshot_prefix: "snapshot_VGG_VOC0712_SSD_300x300"
debug_info: false
snapshot_after_train: true
average_loss: 34
iter_size: 1
eval_type: "detection"
ap_version: "11point"


In [25]:
caffe_train_cmd = [
    "caffe",
    "train",
    "-solver", os.path.join(ssd300_path, "solver.prototxt"),
    #"-weights", weights_filename
]

process, job_info = submit_job(caffe_train_cmd, name='caffe_train_ssd300', cwd=ssd300_path)

try:
    while True:
        out = process.stdout.readline()    
        if len(out) > 0:        
            print out

        if process.poll() is not None and len(out) == 0:
            break
except KeyboardInterupt:
    !qdel {job_info['id']}
    !qstat

qsub: job 4232.c001 ready





  ########################################################################

  # Colfax Cluster - https://colfaxresearch.com/

  #      Date:           Thu Mar 23 08:37:56 PDT 2017

  #    Job ID:           4232.c001

  #      User:           u2459

  # Resources:           neednodes=4:knl7210:ram96gb,nodes=4:knl7210:ram96gb,walltime=24:00:00

  ########################################################################

  

I0323 08:37:58.363988 189992 caffe.cpp:274] Use CPU.

I0323 08:37:58.365396 189992 solver.cpp:108] Initializing solver from parameters: 

train_net: "train.prototxt"

test_net: "val.prototxt"

test_iter: 2

test_interval: 34

base_lr: 0.0001

display: 5

max_iter: 3413

lr_policy: "exp"

gamma: 0.975

momentum: 0.9

weight_decay: 7e-07

snapshot: 34

snapshot_prefix: "snapshot_VGG_VOC0712_SSD_300x300"

solver_mode: CPU

debug_info: false

train_state {

  level: 0

  stage: ""

}

snapshot_after_train: true

test_initialization: false

av