In [1]:
%matplotlib inline
#!pip install fastai -q --upgrade
#!pip install pycocotools
#!pip install -U albumentations

In [2]:
#imports
import math

#from fastai import *
#from fastai.vision import *
from fastai.vision.all import * 

import matplotlib.pyplot as plt
import matplotlib.image as mpim
import matplotlib.patches as patches

from pathlib import Path
import pandas as pd
import json
import os

import torch 
import torch.nn as nn
import torch.nn.functional as F 
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
import torchvision.transforms as trans

os.chdir("includes")
from engine import train_one_epoch, evaluate
import utils
os.chdir("..")

import albumentations as A
import cv2

In [3]:
#gather resources: folders holding the training/validation images and the test images
path=Path('images')
testpath=Path('test')

#collect the image files of both folders
images=get_image_files(path)
testimages=get_image_files(testpath)

#read the annotations from csv and discard the bookkeeping columns that are not used
annotations=pd.read_csv(path/'wappen.csv').drop(
    columns=['file_size','file_attributes','region_count','region_id'])

In [4]:
#mpl helpers
def printImg(image,ax=None,size=None):
    """Show ``image`` on a matplotlib axes with both axis rulers hidden.

    image: anything ``ax.imshow`` accepts.
    ax:    axes to draw on; when omitted a new figure with one axes is created.
    size:  figure size passed to ``plt.subplots`` (only used when ``ax`` is omitted).

    Returns the axes that was drawn on.
    """
    #identity check: "== None" would invoke a custom __eq__ (e.g. on an array of axes)
    if ax is None:
        _,ax=plt.subplots(figsize=size)

    ax.get_xaxis().set_visible(False)
    ax.get_yaxis().set_visible(False)
    ax.imshow(image)
    return ax
        
def drawBB(ax, bl, tr,col):
    """Add an unfilled rectangle outline to ``ax``.

    bl:  (x, y) corner used as the rectangle's anchor point.
    tr:  (x, y) of the opposite corner; width and height are the differences to ``bl``.
    col: edge colour.
    """
    box_w=tr[0]-bl[0]
    box_h=tr[1]-bl[1]
    ax.add_patch(patches.Rectangle(bl,box_w,box_h,fill=False,color=col,lw=2.0))
    
def drawText(ax,x,y,text,col):
    """Write ``text`` at position (x, y) on ``ax`` in colour ``col`` with font size 14."""
    style={"color":col,"fontsize":14}
    ax.text(x,y,text,**style)
    
def printImages(images,annotation_dict,max_n=42):
    """Plot up to ``max_n`` images, each with its annotated boxes drawn on top.

    images:          indexable collection of image paths.
    annotation_dict: maps an image's file name to an iterable of (box, label) pairs,
                     box being indexable as [x1, y1, x2, y2].
    Box colours are looked up per label in the notebook-level ``colordict``.
    """
    for idx in range(min(len(images),max_n)):
        img_path=images[idx]
        ax=printImg(mpim.imread(img_path),size=[14,14])
        #the dictionary is keyed by the bare file name
        key=os.path.normpath(img_path).split(os.sep)[-1]
        for bb,label in annotation_dict[key]:
            drawBB(ax,(bb[0],bb[1]),(bb[2],bb[3]),colordict[label])


In [5]:
#some global lookup stuff
#drawing colour per class name
colordict={
    "wappen":'magenta',
    "text":'cyan',
    "#na#":'green',
    "objekt":'red',
}
#class name -> numeric id handed to the detector ("objekt" shares id 1 with "wappen")
clsToId={
    "wappen":1,
    "text":2,
    "objekt":1,
}
#numeric id -> class name for the two-class model
IdToClsSeperated={
    1:"wappen",
    2:"text",
}
#numeric id -> class name for the single-class model
IdToClsMerged={
    1:"objekt",
}

In [6]:
#global structures

#transforms
#training pipeline: occasional brightness/contrast jitter, then a fixed 1024x1024 resize;
#boxes are passed in pascal_voc layout together with a 'labels' field
mytransforms = A.Compose(
    [
        A.RandomBrightnessContrast(p=0.2),
        #A.Rotate(limit=5),
        A.Resize(1024,1024),
    ],
    bbox_params=A.BboxParams(
        format='pascal_voc',
        min_visibility=0.1,
        label_fields=['labels'],
    ),
)

#inference pipeline: resize to 1024x1024 and convert to a tensor
testTransforms=trans.Compose([
    trans.Resize((1024,1024)),
    trans.ToTensor(),
])

In [7]:
#dataset creation helpers
def selectClosestText(filename,bbox,maxYdown=100,maxYup=800,maxXdiff=300):
    """Pick the 'text' annotation of ``filename`` that lies closest to the upper edge of ``bbox``.

    The candidates are read from the notebook-level ``imtoann`` mapping
    (file name -> iterable of (box, class) pairs), which has to exist before calling.

    A text box is considered only if
      * its vertical centre is at most ``maxYdown`` below and at most ``maxYup`` above
        the smaller y value of ``bbox``, and
      * its left edge or its right edge is within ``maxXdiff`` of the matching edge of ``bbox``.

    Returns the (box, class) pair with the smallest squared distance between the text
    centre and the horizontal centre of that upper edge, or None when nothing qualifies.
    """
    top_edge=min(bbox[1],bbox[3])
    center_x=(bbox[0]+bbox[2])/2
    best=None
    best_dist=0
    for tbox,cls in imtoann[filename]:
        if cls!='text':
            continue
        text_y=(tbox[1]+tbox[3])/2
        #vertical window around the upper edge
        if (text_y-top_edge)>maxYdown or (top_edge-text_y)>maxYup:
            continue
        #rejected only when both edges are too far away horizontally
        if abs((bbox[0]-tbox[0]))>maxXdiff and abs((bbox[2]-tbox[2]))>maxXdiff:
            continue
        text_x=(tbox[0]+tbox[2])/2
        dist=(top_edge-text_y)**2 +(center_x-text_x)**2
        if best is None or dist<best_dist:
            best=(tbox,cls)
            best_dist=dist

    return best

#transforms a given dictionary to a dictionary that is needed to create torch datasets
def getDatasetDictionary(inDict):
    """Split every image's (box, label) pairs into two parallel lists.

    inDict: maps an image key to an iterable of (box, label) pairs.
    Returns a dict with the same keys whose values are ``(boxes, labels)`` tuples of lists.
    """
    converted={}
    for image in inDict:
        pairs=list(inDict[image])
        only_boxes=[bb for bb,_ in pairs]
        only_labels=[lbl for _,lbl in pairs]
        converted[image]=(only_boxes,only_labels)
    return converted

def createTorchDataset(dataset,val_pct=0.2):
    """Randomly split ``dataset`` into a training and a validation subset.

    val_pct: share of the samples (rounded down) that goes into the validation subset.
    Returns ``(trainingSet, validationSet)`` as ``torch.utils.data.Subset`` objects.
    The permutation comes from ``torch.randperm``, i.e. from torch's global RNG.
    """
    total=len(dataset)
    shuffled=torch.randperm(total).tolist()

    #everything before split_at is used for training, the rest for validation
    split_at=total-int(total*val_pct)

    return (torch.utils.data.Subset(dataset, shuffled[:split_at]),
            torch.utils.data.Subset(dataset, shuffled[split_at:]))

def summarizeData(inDict):
    """Print per-class statistics of a dataset dictionary.

    inDict: maps an image key to ``(boxes, labels)``; only the labels are evaluated.

    Printed are the number of images and, for every label, the total number of
    occurrences plus the largest and smallest number of occurrences within one image.
    Images in which a label does not occur at all are not taken into account for
    that label's minimum.
    """
    from collections import Counter

    numImgs=len(inDict)
    numDict=Counter()
    minDict={}
    maxDict={}
    for key in inDict:
        #count the labels of this image completely before touching min/max;
        #updating them while still counting made the minimum always come out as 1
        curDict=Counter(inDict[key][1])
        numDict.update(curDict)
        for label,count in curDict.items():
            minDict[label]=min(count,minDict.get(label,count))
            maxDict[label]=max(count,maxDict.get(label,count))

    print("Dataset comprises "+str(numImgs)+" images")
    for key in numDict:
        print("Number of "+str(key)+": "+str(numDict[key]))
        print("Maximum number of "+str(key)+": "+str(maxDict[key]))
        print("Minimum number of "+str(key)+": "+str(minDict[key]))
    

In [8]:
#definition of the dataset class
class EmblemTextSet(torch.utils.data.Dataset):
    """Detection dataset that yields ``(image_tensor, target)`` pairs.

    images:          indexable collection of image paths.
    annotation_dict: maps an image's file name to ``(boxes, label_names)``; the label
                     names are translated to ids with the notebook-level ``clsToId``.
    transforms:      optional callable invoked as
                     ``transforms(image=..., bboxes=..., labels=...)`` that returns a
                     mapping with the same three keys (albumentations convention).

    ``target`` holds the keys boxes, labels, image_id, area and iscrowd.
    """
    def __init__(self,images,annotation_dict,transforms=None):
        self.transforms=transforms
        self.images=images
        self.dict=annotation_dict

    def __getitem__(self, idx):
        #read from the paths given to this instance, not from the notebook-global ``images``
        imgpath=self.images[idx]
        img = Image.open(imgpath).convert("RGB")
        imgkey=os.path.normpath(imgpath).split(os.sep)[-1]
        entry=self.dict[imgkey]
        boxes=[list(box) for box in entry[0]]
        labels=[clsToId[label] for label in entry[1]]

        pixels=np.array(img)
        if self.transforms is not None:
            transformed=self.transforms(image=pixels,bboxes=boxes,labels=labels)
            pixels=transformed['image']
            boxes=transformed['bboxes']
            #the transform may drop boxes (e.g. min_visibility); take the labels it
            #returns so that boxes and labels stay aligned
            labels=transformed['labels']

        imgt=trans.ToTensor()(pixels)
        target = {}
        #reshape keeps the (N,4) layout even when no box is left
        target["boxes"] = torch.tensor(boxes,dtype=torch.float32).reshape(-1,4)
        target["labels"] = torch.tensor([int(label) for label in labels],dtype=torch.int64)
        target["image_id"] = torch.tensor([idx])
        target["area"]=(target["boxes"][:,3]-target["boxes"][:,1])*(target["boxes"][:,2]-target["boxes"][:,0])
        target["iscrowd"]=torch.zeros(len(target["labels"]), dtype=torch.int64)

        return imgt, target

    def __len__(self):
         return len(self.images)

In [9]:
#train functions

#train function that stops training process when the precision on the validation set decreases or stagnates
def train_model_prevent_overfit(model,datasets,bs,epochs,filename="./stored_models/currentModelEpochFinder",optimizer=None,lr=0.005,
                               max_decrease=0.3,max_worsening_epochs=10):
    """Train ``model`` and stop early once the validation precision degrades or stagnates.

    datasets: ``(train, valid)`` pair of datasets.
    bs:       batch size of both loaders.
    epochs:   upper bound for the number of epochs.
    filename: the final weights are stored here; the weights of the best epoch are
              stored under ``filename + "-bestmap"``.
    optimizer: optional optimizer; SGD with momentum 0.9 and weight decay 0.0005 at
              learning rate ``lr`` is created when omitted.
    max_decrease: training is abandoned when the score falls below
              ``best * (1 - max_decrease)`` (a fraction, not a percentage).
    max_worsening_epochs: training is abandoned after this many consecutive epochs
              without a new best score.

    The score of an epoch is the mean of the first three entries of the evaluator's
    ``stats``; if the evaluator reports several entries in ``coco_eval`` the last one wins.

    Returns the 1-based number of the epoch with the best score, or 0 if no epoch ran.
    """
    device="cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    #create loaders
    train,valid=datasets
    train_loader=torch.utils.data.DataLoader(train, batch_size=bs, shuffle=True,collate_fn=utils.collate_fn)
    validation_loader=torch.utils.data.DataLoader(valid, batch_size=bs, shuffle=True,collate_fn=utils.collate_fn)

    params = [p for p in model.parameters() if p.requires_grad]

    if optimizer is None:
        optimizer = torch.optim.SGD(params, lr=lr,momentum=0.9,weight_decay=0.0005)

    lr_scheduler =torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=4, T_mult=2)

    bestmAp=0.0
    mAphistory=[]
    strikes=0

    #training loop
    for epoch in range(epochs):
        #train on training set
        train_one_epoch(model, optimizer, train_loader, device, epoch,
                   print_freq=10)
        #adjust lr
        lr_scheduler.step()

        #check valid set
        coco_evaluator=evaluate(model, validation_loader, device=device)

        mAp=0
        for _,evaluator in coco_evaluator.coco_eval.items():
            mAp=sum(evaluator.stats[0:3])/3

        mAphistory.append(mAp)

        if(mAp<bestmAp*(1-max_decrease)):
            #max_decrease is a fraction, so scale it before calling it a percentage
            print("precision decreased by more than "+"{:.1f}".format(max_decrease*100)+" percent from best value at epoch: "+str(epoch)+" abandoning!")
            break

        if(mAp>bestmAp):
            strikes=0
            bestmAp=mAp
            torch.save(model.state_dict(),filename+"-bestmap")
        else:
            strikes+=1

        if(strikes>=max_worsening_epochs):
            print("precision on validation has not increased in "+str(max_worsening_epochs)+" epochs in epoch "+str(epoch)+" abandoning!")
            break

    #store trained parameters
    torch.save(model.state_dict(),filename)
    print(mAphistory)
    #without a single finished epoch there is no best epoch to report
    if not mAphistory:
        return 0
    #return the epoch number with the best precision on the validation set
    return mAphistory.index(max(mAphistory))+1
    


#default train function
def train_model(model,datasets,bs,epochs,filename="./stored_models/currentModel",optimizer=None,lr=0.005):
    """Train ``model`` for a fixed number of epochs and store its weights in ``filename``.

    datasets:  ``(train, valid)`` pair of datasets.
    bs:        batch size of both loaders.
    optimizer: optional optimizer; SGD with momentum 0.9 and weight decay 0.0005 at
               learning rate ``lr`` is created when omitted.

    After every epoch the learning rate schedule advances and the model is evaluated
    on the validation loader.
    """
    device="cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    #both loaders share the same settings
    trainset,validset=datasets
    loader_args=dict(batch_size=bs, shuffle=True,collate_fn=utils.collate_fn)
    train_loader=torch.utils.data.DataLoader(trainset,**loader_args)
    validation_loader=torch.utils.data.DataLoader(validset,**loader_args)

    if optimizer is None:
        trainable=[p for p in model.parameters() if p.requires_grad]
        optimizer = torch.optim.SGD(trainable, lr=lr,momentum=0.9,weight_decay=0.0005)

    schedule=torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=4, T_mult=2)

    for epoch in range(epochs):
        #one pass over the training set, then advance the learning rate schedule
        train_one_epoch(model, optimizer, train_loader, device, epoch,
                   print_freq=10)
        schedule.step()

        #report metrics on the validation set
        evaluate(model, validation_loader, device=device)

    #store trained parameters
    torch.save(model.state_dict(),filename)

In [10]:
#shameless copy of train_one_epoch from torchvision engine, with slight adjustments 
#such that loss and learning rate values are stored in respective lists

def train_one_epoch_LRfind(model, optimizer, data_loader, device, epoch, 
                           lr_updater,print_freq,lrlist,losslist,stopFactor,lrfactor):
    """Run one training epoch and record learning rate and smoothed loss after every batch.

    lr_updater: scheduler that is stepped once per batch.
    lrlist:     list that receives the learning rate of the first parameter group,
                read before the schedulers are stepped.
    losslist:   list that receives the loss as an exponential moving average
                (weight 0.05 for the new value); the very first entry is the raw loss.
    stopFactor, lrfactor: accepted but not used in this function.

    Returns None. The epoch is left early, without any message, as soon as a
    non-finite loss is encountered.
    """
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    # NOTE(review): in epoch 0 this warm-up scheduler and lr_updater are both stepped
    # on the same optimizer - confirm that the resulting learning rates are intended
    lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, len(data_loader) - 1)

        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)
    
    #this loops all minibatches
    for images, targets in metric_logger.log_every(data_loader, print_freq, header):
        
        
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        #in training mode the model is called with targets and its result is used as a dict of losses
        loss_dict = model(images, targets)

        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())

        loss_value = losses_reduced.item()

        #a diverged loss ends the epoch silently instead of terminating the process
        if not math.isfinite(loss_value):
            #print("Loss is {}, stopping training".format(loss_value))
            #print(loss_dict_reduced)
            #sys.exit(1)
            return

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()
        
        #append (smoothed) current loss to losslist
        
        curloss=float(losses.cpu().detach().numpy())
        if len(losslist)>0:
            losslist.append(0.05  * curloss + (1 - 0.05) * losslist[-1])
        else:
            losslist.append(curloss)

        #learning rate that was in effect for the step just taken
        curlr=optimizer.state_dict()["param_groups"][0]["lr"] 
        lrlist.append(curlr)
        
        if lr_scheduler is not None:
            lr_scheduler.step()

        lr_updater.step()
            
        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])
        
#plots learning rate vs loss
def plotLR(lrates,losses,skipf=1,skipl=1,minimal=None,steepest=None):
    """Plot the smoothed loss against the learning rate on a logarithmic x axis.

    skipf, skipl: number of points left out at the start and at the end of the curve.
    minimal, steepest: optional (learning rate, loss) points that are marked on the plot.
    """
    fig, ax = plt.subplots(figsize=(12,6))
    #explicit end indices: a plain "-skipl" slice would plot nothing for skipl=0
    lr_stop=max(len(lrates)-skipl,0)
    loss_stop=max(len(losses)-skipl,0)
    ax.plot(lrates[skipf:lr_stop],losses[skipf:loss_stop])
    ax.set_xscale("log")
    ax.set_xlabel("learning rate")
    ax.set_ylabel("smoothened loss")
    if minimal is not None:
        ax.scatter([minimal[0]],[minimal[1]])
    if steepest is not None:
        ax.scatter([steepest[0]],[steepest[1]])
    plt.show()

#learning rate finder, if optimizer is passed, it is expected to have lowerBound as learning rate
def findStartingLR(data,model,bs,lowerBound=1e-7,upperBound=0.1,stopFactor=10,optimizer=None,steps=100,plot=True):
    """Learning rate range test: train while raising the learning rate exponentially.

    data:       dataset pair; only ``data[0]`` (the training set) is used.
    lowerBound, upperBound: the learning rate is multiplied by a constant factor per
                batch such that it grows from lowerBound to upperBound in ``steps`` batches.
    optimizer:  optional optimizer, expected to be set up with ``lowerBound`` as its
                learning rate; plain SGD is created when omitted.
    stopFactor: only forwarded to train_one_epoch_LRfind.
    plot:       show the loss curve with the two suggested points marked.

    Whole epochs are run, so more than ``steps`` batches may be processed and the
    learning rate may end above ``upperBound``. Note that the weights of ``model``
    are changed by this search.

    Returns ``(lr at the steepest loss descent, lr at the minimal loss)``.
    Raises ValueError when fewer than three batches were recorded (for example because
    the loss became non-finite right away), as no descent can be computed then.
    """
    #load model to device
    device="cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    train_loader=torch.utils.data.DataLoader(data[0], batch_size=bs, shuffle=True,collate_fn=utils.collate_fn)

    lrates=[]
    losses=[]

    #set optimizer with initial lr at lower bound
    params = [p for p in model.parameters() if p.requires_grad]

    if optimizer is None:
        optimizer=torch.optim.SGD(params, lr=lowerBound)

    #set lr scheduler to a lambda function such that lr gets increased exponentially from lower bound to upper bound
    totalfactor=upperBound/lowerBound
    factor=totalfactor**(1.0/steps)
    multiplyLR= lambda x: factor**x
    multiplyLR_sched=torch.optim.lr_scheduler.LambdaLR(optimizer, multiplyLR)

    #set number of epochs to include at least steps number of steps
    batches_per_epoch=math.ceil(len(data[0])/bs)
    num_epochs=math.ceil(steps/batches_per_epoch)

    for i in range(num_epochs):
        train_one_epoch_LRfind(model, optimizer, train_loader, device, i, lr_updater=multiplyLR_sched, print_freq=10,
                               lrlist=lrates,losslist=losses,stopFactor=stopFactor,lrfactor=factor)

    #the first point is dropped below, and a descent needs two remaining points
    if len(losses)<3:
        raise ValueError("learning rate search recorded only "+str(len(losses))+" step(s); at least 3 are needed")

    #the first recorded value is the raw, unsmoothed loss
    losses.pop(0)
    lrates.pop(0)

    #find min point
    mind=losses.index(min(losses))
    minimal=(lrates[mind],losses[mind])

    #loss decrease between consecutive steps, attributed to the earlier learning rate
    descents=[]
    lrates_descents=[]
    for i in range(1,len(losses)):
        descents.append(losses[i-1]-losses[i])
        lrates_descents.append(lrates[i-1])

    steepestind=descents.index(max(descents))
    steepest=(lrates_descents[steepestind],0.5*(losses[steepestind]+losses[steepestind+1]))

    #plot LR vs losses
    if plot:
        plotLR(lrates,losses,minimal=minimal,steepest=steepest)

    #return minimal and steepest point
    return (steepest[0],minimal[0])
    




In [11]:
#trains a frcnn model with resnet50 backbone on the given data
#if a state dict is given, the model parameters will be initialized according to it
#otherwise a pretrained model on CoCo is used
def singleTrainingSession(data,num_classes,bs,stateDict=None,filename="./stored_models/currentModel"):
    """Run one complete session: learning rate search followed by training with early stopping.

    data:        ``(train, valid)`` dataset pair.
    num_classes: number of outputs of the replaced box predictor.
    stateDict:   optional path of stored weights that are loaded before the search.
    filename:    where the trained weights are written.

    Returns the trained model, or None when the search yields a steepest-descent
    learning rate above the minimal-loss learning rate.
    """
    def _newDetector():
        #pretrained Faster R-CNN whose box predictor is swapped for num_classes outputs
        net=torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
        width=net.roi_heads.box_predictor.cls_score.in_features
        net.roi_heads.box_predictor=FastRCNNPredictor(width,num_classes)
        return net

    #the model that is trained, and a throw-away copy that only serves the lr search
    model=_newDetector()
    lrfindmodel=_newDetector()

    #load previously trained parameters
    if stateDict is not None:
        for net in (model,lrfindmodel):
            net.load_state_dict(torch.load(stateDict))

    #use lrfindmodel to get a decent learning rate for sgd
    lrsuggestions=findStartingLR(data,lrfindmodel,bs=bs,lowerBound=1e-7,upperBound=1,steps=200,plot=False)
    print(lrsuggestions)
    steepestLR,minimalLR=lrsuggestions[0],lrsuggestions[1]

    #a steepest descent beyond the minimal loss means no reasonable learning rate was found
    if(steepestLR>minimalLR):
        print("No reasonable learning rate could be discovered")
        return

    #close to the steepest descent, shifted slightly towards the minimal loss
    lr=(995/1000*steepestLR+5/1000*minimalLR)

    bestNepochs=train_model_prevent_overfit(model,data,bs,epochs=200,filename=filename,lr=lr,max_decrease=0.1,max_worsening_epochs=10)
    print("best results after "+str(bestNepochs)+" epochs")

    #return the trained model
    return model
    




In [12]:
#filters a list of annotations based on their score, by looking at a models confidence in the predictions
def filerConfidenceGap(boxes,labels,confidences,always_include_above=0.9,never_include_below=0.2):
    """Keep the predictions whose confidence lies above the largest gap between neighbours.

    The cut-off is the midpoint of the two neighbouring confidences that are furthest
    apart, clamped to ``[never_include_below, always_include_above]``. Predictions with
    a confidence strictly above the cut-off are kept.

    confidences: expected to be ordered by score (ascending or descending) so that
                 neighbours in the sequence are neighbours in value.

    Returns ``(boxes, labels, confidences)`` of the kept predictions; boxes are lists of
    four floats, labels are ints, confidences are the original entries.
    """
    scores=[float(c) for c in confidences]

    #the gap is measured as an absolute difference: detector scores usually arrive in
    #descending order, where a signed "current minus previous" is never positive
    largest_gap=0
    gap_confidence=0
    for previous,current in zip(scores,scores[1:]):
        gap=abs(current-previous)
        if gap>largest_gap:
            largest_gap=gap
            gap_confidence=0.5*(current+previous)

    #clamp the cut-off between the lower and the upper limit
    gap_confidence=min(max(gap_confidence,never_include_below),always_include_above)

    fboxes=[]
    flabels=[]
    fconfidences=[]
    for i,score in enumerate(scores):
        if score>gap_confidence:
            fboxes.append([float(boxes[i][0]),float(boxes[i][1]),float(boxes[i][2]),float(boxes[i][3])])
            flabels.append(int(labels[i]))
            fconfidences.append(confidences[i])

    return (fboxes,flabels,fconfidences)

def evaluateAndPrintData(model,dataset,labelDict,max_num=4,filename="./stored_models/currentModel",score_threshold=0.5):
    """Plot ground truth and model prediction for the first ``max_num`` dataset samples.

    For every sample two figures are produced: the image with its annotated boxes and
    the image with the predicted boxes that pass ``filerConfidenceGap``
    (``never_include_below=score_threshold``, ``always_include_above=0.9``).

    dataset:   yields ``(image_tensor, target)`` with target keys 'boxes' and 'labels'.
    labelDict: maps a numeric label to its class name (used to pick the colour).
    filename:  stored weights that are loaded into ``model`` before predicting.
    """
    #predict on whatever device the model lives on; the samples come from the CPU
    device=next(model.parameters()).device
    model.load_state_dict(torch.load(filename,map_location=device))
    #put model in eval mode
    model.eval()

    for i in range(min(max_num,len(dataset))):
        image,gtannotations=dataset[i]

        img=Image.fromarray(image.mul(255).permute(1, 2,0).byte().numpy())

        #figure 1: ground truth
        ax=printImg(np.asarray(img),size=[14,14])
        for bb,cls in zip(gtannotations['boxes'],gtannotations['labels']):
            label=labelDict[int(cls)]
            drawBB(ax,(bb[0],bb[1]),(bb[2],bb[3]),colordict[label])

        #no gradients are needed for inference
        with torch.no_grad():
            prediction=model([image.to(device)])

        boxes=prediction[0]["boxes"].cpu()
        labels=prediction[0]["labels"].cpu()
        scores=prediction[0]["scores"].cpu()

        #filter out low scoring predictions
        fboxes,flabels,_=filerConfidenceGap(boxes,labels,scores,always_include_above=0.9,never_include_below=score_threshold)

        #figure 2: prediction
        ax=printImg(np.asarray(img),size=[14,14])
        for bb,cls in zip(fboxes,flabels):
            label=labelDict[int(cls)]
            drawBB(ax,(bb[0],bb[1]),(bb[2],bb[3]),colordict[label])

#evaluate unseen image with model and print the resulting predicted boxes
def evaluateAndPrintUnseenData(model,images,transforms,labelDict,max_num=4,filename="./stored_models/currentModel",score_threshold=0.5):
    """Plot the model's predictions for the first ``max_num`` image files.

    images:     indexable collection of image paths.
    transforms: callable that turns a PIL image into the tensor handed to the model.
    labelDict:  maps a numeric label to its class name (used to pick the colour).
    filename:   stored weights that are loaded into ``model`` before predicting.
    Only predictions with a score strictly above ``score_threshold`` are drawn.
    """
    #predict on whatever device the model lives on; the image tensor is created on the CPU
    device=next(model.parameters()).device
    model.load_state_dict(torch.load(filename,map_location=device))
    #put model in eval mode
    model.eval()

    for i in range(min(max_num,len(images))):
        img=Image.open(images[i]).convert("RGB")
        imgTensor=transforms(img)

        #no gradients are needed for inference
        with torch.no_grad():
            prediction=model([imgTensor.to(device)])

        #show the transformed image, i.e. exactly what the model has seen
        image=Image.fromarray(imgTensor.mul(255).permute(1, 2,0).byte().numpy())
        ax=printImg(np.asarray(image),size=[14,14])

        boxes=prediction[0]["boxes"].cpu()
        labels=prediction[0]["labels"].cpu()
        scores=prediction[0]["scores"].cpu()

        #draw every prediction that scores above the threshold
        for j in range(len(boxes)):
            if float(scores[j])>score_threshold:
                bb=[float(v) for v in boxes[j]]
                label=labelDict[int(labels[j])]
                drawBB(ax,(bb[0],bb[1]),(bb[2],bb[3]),colordict[label])

#prints evaluations on seen and unseen data
def summary(model,images,data,transforms,labelDict,max_n=4,filename="./stored_models/currentModel",score_threshold=0.5):
    """Show predictions on dataset samples (``data``) first and on raw image files (``images``) second."""
    shared=dict(max_num=max_n,filename=filename,score_threshold=score_threshold)
    evaluateAndPrintData(model,data,labelDict,**shared)
    evaluateAndPrintUnseenData(model,images,transforms,labelDict,**shared)

In [13]:
#more functions to display images 
def area(box1,box2):
    """Area of the intersection of two boxes given as (x1, y1, x2, y2).

    The corners of a box may be given in either order. Returns 0 when the boxes do not overlap.
    """
    ax1,ay1,ax2,ay2=box1
    bx1,by1,bx2,by2=box2

    #extent of the overlap along each axis: smaller upper bound minus larger lower bound
    overlap_w=min(max(ax1,ax2),max(bx1,bx2))-max(min(ax1,ax2),min(bx1,bx2))
    overlap_h=min(max(ay1,ay2),max(by1,by2))-max(min(ay1,ay2),min(by1,by2))

    return 0 if (overlap_w<0 or overlap_h<0) else overlap_w*overlap_h

def IoU(box1,box2):
    """Intersection over union of two boxes given as (x1, y1, x2, y2).

    Returns 0. when the union has no area.
    """
    def _extent(a,b):
        #length of the interval spanned by a and b, whichever is larger
        return max(a,b)-min(a,b)

    ax1,ay1,ax2,ay2=box1
    bx1,by1,bx2,by2=box2

    size1=_extent(ax1,ax2)*_extent(ay1,ay2)
    size2=_extent(bx1,bx2)*_extent(by1,by2)

    shared=area(box1,box2)
    union=size1+size2-shared

    if(union>0):
        return shared/union
    return 0.

#check whether the first box is completely surrounded by the second box
def fullSurround(box1,box2):
    """Tell whether ``box1`` lies strictly inside ``box2`` (boxes given as (x1, y1, x2, y2)).

    Touching edges do not count as surrounded.
    """
    ix1,iy1,ix2,iy2=box1
    ox1,oy1,ox2,oy2=box2
    return bool(ox1<ix1 and oy1<iy1 and ox2>ix2 and oy2>iy2)
    

#filter boxes of the same class that overlap too much
def filterOverlappingBoxes(boxes,label,scores,threshold=0.9):
    """Drop boxes that are dominated by a better scoring box of the same class.

    Box i is dropped when another box j with the same label, which has not been
    dropped itself, either overlaps it with an IoU of at least ``threshold`` or fully
    surrounds it, and has a higher score than box i.

    Returns ``(boxes, labels)`` of the surviving boxes in their original order.
    """
    keptBoxes=[]
    keptLabels=[]
    dropped=[]

    def _dominates(j,i):
        #True when box j is a reason to drop box i
        if j==i or label[i]!=label[j] or j in dropped:
            return False
        if IoU(boxes[i],boxes[j])<threshold and not fullSurround(boxes[i],boxes[j]):
            return False
        return scores[i]<scores[j]

    for i in range(len(boxes)):
        if any(_dominates(j,i) for j in range(len(boxes))):
            dropped.append(i)
        else:
            keptBoxes.append(boxes[i])
            keptLabels.append(label[i])

    return (keptBoxes,keptLabels)



#convert bounding box predictions trained on a 1024x1024 to original image size
def convertBoxesToOriginalSize(boxes,image,transformedSize=(1024,1024)):
    """Scale boxes from the resized image back to the size of the image file.

    boxes:           iterable of [x1, y1, x2, y2] in coordinates of the resized image.
    image:           path of the image file; it is opened to read its size.
    transformedSize: (width, height) the boxes currently refer to.

    Returns a list of [x1, y1, x2, y2] lists.
    """
    origW,origH=Image.open(image).convert("RGB").size
    scaleX=origW/transformedSize[0]
    scaleY=origH/transformedSize[1]
    return [[scaleX*box[0],scaleY*box[1],scaleX*box[2],scaleY*box[3]] for box in boxes]


#print predictions from two different models next to each other for comparison
def displayComparison(images,models,labelDicts,size=(6,4),annotations=None,gtLabels=None,score_min=0.3,score_max=0.9,threshold=0.9):
    """Show the predictions of several models side by side, one row per image.

    images:      sequence of image paths.
    models:      one model per column.
    labelDicts:  per model, a mapping from numeric label to class name.
    size:        (width, height) of a single subplot.
    annotations: optional mapping from file name to (box, label) pairs; when given, the
                 last column shows this ground truth instead of the last model's prediction.
    gtLabels:    maps a ground truth label to its drawing colour.
    score_min, score_max: limits handed to filerConfidenceGap.
    threshold:   IoU threshold handed to filterOverlappingBoxes.

    Ten or more images are split in halves that are drawn as separate figures.
    """
    for model in models:
        model.eval()
    #get number of rows/columns for subplots
    height=len(images)
    cols=len(models)

    #nothing to draw; plt.subplots rejects a grid without rows or columns
    if height==0 or cols==0:
        return

    if(height>=10):
        half=len(images)//2
        displayComparison(images[:half],models,labelDicts,size,annotations,gtLabels,score_min,score_max,threshold)
        displayComparison(images[half:],models,labelDicts,size,annotations,gtLabels,score_min,score_max,threshold)
        return

    #squeeze=False keeps axarr two-dimensional, so axarr[row,col] also works for a
    #single image or a single model
    fig,axarr=plt.subplots(height,cols,figsize=(size[0]*cols,size[1]*height),squeeze=False)

    #loop through all subplots
    for row in range(height):
        image=images[row]
        img=Image.open(image).convert("RGB")
        for col in range(cols):
            index=row*cols+col+1

            #print annotated ground truth image
            if(annotations is not None and col==cols-1):
                #NOTE(review): showImg is not defined in the visible part of the notebook - confirm it exists
                ax=showImg(mpim.imread(image),fig,height,cols,index,size)
                imgname=os.path.normpath(image).split(os.sep)[-1]
                annList=annotations[imgname]
                for bb,label in annList:
                    color=gtLabels[label]
                    drawBB(ax,(bb[0],bb[1]),(bb[2],bb[3]),color)

            #display predicted image
            else:
                model=models[col]
                #transform
                imgTensor=testTransforms(img)
                #predict
                prediction=model([imgTensor])
                #filter by confidence, then remove overlapping boxes of the same class
                fboxes,flabels,fconfidences=filerConfidenceGap(prediction[0]["boxes"],prediction[0]["labels"],prediction[0]["scores"],always_include_above=score_max,never_include_below=score_min)
                fboxes,flabels=filterOverlappingBoxes(fboxes,flabels,fconfidences,threshold=threshold)

                #the boxes refer to the 1024x1024 input; scale them back to the image file
                fboxes=convertBoxesToOriginalSize(fboxes,image,transformedSize=(1024,1024))
                annList=(torch.tensor(fboxes),torch.tensor(flabels))

                ax=axarr[row,col]
                ax.get_xaxis().set_visible(False)
                ax.get_yaxis().set_visible(False)
                ax.imshow(img)
                for i in range(len(annList[0])):
                    bb=annList[0][i]
                    label=annList[1][i]
                    labeltxt=labelDicts[col][int(label)]
                    color=colordict[labeltxt]
                    drawBB(ax,(bb[0],bb[1]),(bb[2],bb[3]),color)

                    
#slight adaptation of the dataset variant (selectClosestText) that works on plain box and label lists instead of the annotation dictionary
def selectClosestText2(boxes,label,index,maxYdown=100,maxYup=800,maxXdiff=300):
    """Pick the text box (label 2) that lies closest to the upper edge of ``boxes[index]``.

    boxes: sequence of boxes indexable as [x1, y1, x2, y2].
    label: numeric labels, parallel to ``boxes``.

    A text box is considered only if
      * its vertical centre is at most ``maxYdown`` below and at most ``maxYup`` above
        the smaller y value of the reference box, and
      * its left edge or its right edge is within ``maxXdiff`` of the matching edge.

    Returns the box with the smallest squared distance between its centre and the
    horizontal centre of that upper edge, or None when nothing qualifies.
    """
    bbox=boxes[index]

    maxY=min(bbox[1],bbox[3])
    medX=(bbox[0]+bbox[2])/2
    closestBox=None
    closestDist=0
    for i in range(0,len(boxes)):
        tbox=boxes[i]
        cls=label[i]
        if i==index:
            continue
        if cls!=2:
            continue
        mdlY=(tbox[1]+tbox[3])/2
        if (mdlY-maxY)>maxYdown:
            continue
        if (maxY-mdlY)>maxYup:
            continue
        #rejected only when both edges are too far away horizontally
        if abs((bbox[0]-tbox[0]))>maxXdiff and abs((bbox[2]-tbox[2]))>maxXdiff:
            continue
        mdlX=(tbox[0]+tbox[2])/2
        dist=(maxY-mdlY)**2 +(medX-mdlX)**2
        #identity check: "== None" compares element-wise for array or tensor boxes
        #and then fails when used as a condition
        if closestBox is None or dist<closestDist:
            closestBox=tbox
            closestDist=dist

    return closestBox

#return the list of merged boxes (each emblem box joined with its closest text box) together with their labels
def mergeBoxes(boxes,labels,maxYdown=100,maxYup=800,maxXdiff=300,mergedList=None):
    """Join every box of label 1 with its closest text box into one enclosing box.

    boxes, labels: parallel sequences; boxes are [x1, y1, x2, y2].
    maxYdown, maxYup, maxXdiff: search limits forwarded to selectClosestText2.
    mergedList: optional list that receives one ``(box, textbox_or_None)`` tuple per
                label-1 box, in output order.

    Boxes with any other label produce no output entry of their own.
    Returns ``(mergedBoxes, mergedLabels)``; every merged label is 1.
    """
    mergedBoxes=[]
    mergedLabels=[]
    for i in range(len(boxes)):
        x1,y1,x2,y2=boxes[i]
        if labels[i]==1:
            textbox=selectClosestText2(boxes,labels,i,maxYdown,maxYup,maxXdiff)
            #identity check: "== None" compares element-wise for array or tensor boxes
            if textbox is None:
                mergedBoxes.append(boxes[i])
            else:
                x3,y3,x4,y4=textbox
                mergedBox=[min(x1,x3),min(y1,y3),max(x2,x4),max(y2,y4)]
                mergedBoxes.append(mergedBox)
            if mergedList is not None:
                mergedList.append((boxes[i],textbox))
            #new structure will always have the label 'object'
            mergedLabels.append(1)

    return (mergedBoxes,mergedLabels)


#prints several steps next to each other from the post processing of seperated images
def summarySeperated(model,images,labelDict,size=(12,10)):
    """Show the three post-processing stages of the two-class model, one row per image.

    Columns: (1) predictions after confidence filtering, (2) after removing overlapping
    boxes of the same class, (3) after merging emblem boxes with their text box.

    images:    sequence of image paths.
    labelDict: maps a numeric label to its class name (used to pick the colour); the
               third column is always drawn in the colour of "objekt".
    size:      (width, height) of a single subplot.

    More than ten images are split in halves that are drawn as separate figures.
    """
    n_steps=3 #3-steps prediction-overlapping box pruning-wappen,text merging
    height=len(images)

    #nothing to draw; plt.subplots rejects a grid without rows
    if height==0:
        return

    if(height>10):
        half=height//2
        fhimages=images[:half]
        lhimages=images[half:]
        summarySeperated(model,fhimages,labelDict,size)
        summarySeperated(model,lhimages,labelDict,size)
        return

    #squeeze=False keeps axarr two-dimensional, so axarr[row,col] also works for one image
    fig,axarr=plt.subplots(height,n_steps,figsize=(size[0]*n_steps,size[1]*height),squeeze=False)

    for row in range(height):
        #get predictions and postprocess
        image=images[row]
        img=Image.open(image).convert("RGB")
        imgTensor=testTransforms(img)
        prediction=model([imgTensor])
        fboxes,flabels,fconfidences=filerConfidenceGap(prediction[0]["boxes"],prediction[0]["labels"],prediction[0]["scores"])
        fboxes=convertBoxesToOriginalSize(fboxes,image,transformedSize=(1024,1024))
        annList1=(torch.tensor(fboxes),torch.tensor(flabels))
        fboxes,flabels=filterOverlappingBoxes(fboxes,flabels,fconfidences,threshold=0.2)
        annList2=(torch.tensor(fboxes),torch.tensor(flabels))
        fboxes,flabels=mergeBoxes(fboxes,flabels,maxYdown=100,maxYup=800,maxXdiff=300)
        annList3=(torch.tensor(fboxes),torch.tensor(flabels))
        annLists=[annList1,annList2,annList3]
        for col in range(n_steps):
            ax=axarr[row,col]
            ax.get_xaxis().set_visible(False)
            ax.get_yaxis().set_visible(False)
            ax.imshow(img)
            annList=annLists[col]
            for i in range(len(annList[0])):
                bb=annList[0][i]
                label=annList[1][i]
                labeltxt=labelDict[int(label)]
                #the merged stage only contains combined objects
                if(col==2):
                    labeltxt="objekt"
                color=colordict[labeltxt]
                drawBB(ax,(bb[0],bb[1]),(bb[2],bb[3]),color)
    

In [14]:
#functions for outputting predicted image data to json
def isIn(box,boxlist):
    """Check whether ``box`` equals one of the boxes stored in ``boxlist``.

    Args:
        box: four coordinates (x1, y1, x2, y2).
        boxlist: iterable of ``(box1, box2)`` pairs of four-coordinate boxes;
            ``box2`` may be None.

    Returns:
        True if all four coordinates of ``box`` match ``box1`` or ``box2`` of any
        pair, False otherwise.
    """
    x1,y1,x2,y2=box

    def sameBox(other):
        x3,y3,x4,y4=other
        return bool(x1==x3 and x2==x4 and y1==y3 and y2==y4)

    for box1,box2 in boxlist:
        if sameBox(box1):
            return True
        #identity test on purpose: "== None" is delegated to the box type's __eq__,
        #which array-like types evaluate element-wise
        if box2 is not None and sameBox(box2):
            return True

    return False

#create a single json file holding all detected regions of image
def createJson(image,model,fname):
    """Run the detector on one image and write all resulting regions to one json file.

    The written object has the form ``{"filename": <image name>, "regions": [...]}``.
    Every merged box yields a region of class "merged" with its "box", the "symbolbox"
    it originates from and, if one exists, the "textbox". Boxes that are left after
    overlap filtering but are not part of any merged pair are appended afterwards as
    class "symbol" (label 1) or "text" (label 2).

    Args:
        image: path of the image file.
        model: detector called as ``model([imgTensor])``; element 0 of its result must
            provide the entries "boxes", "labels" and "scores".
        fname: path of the json file to write.
    """
    imgname=os.path.normpath(image).split(os.sep)[-1]
    data={"filename":imgname,"regions":[]}

    img=Image.open(image).convert("RGB")
    imgTensor=testTransforms(img)
    prediction=model([imgTensor])

    fboxes,flabels,fconfidences=filerConfidenceGap(prediction[0]["boxes"],prediction[0]["labels"],prediction[0]["scores"])
    fboxes=convertBoxesToOriginalSize(fboxes,image,transformedSize=(1024,1024))
    fboxes,flabels=filterOverlappingBoxes(fboxes,flabels,fconfidences,threshold=0.2)
    annList1=(torch.tensor(fboxes),torch.tensor(flabels))
    #mergeBoxes is expected to fill mergedList with one (symbol box, text box or None)
    #pair per merged box, in the same order as the boxes it returns
    mergedList=[]
    fboxes,flabels=mergeBoxes(fboxes,flabels,maxYdown=100,maxYup=800,maxXdiff=300,mergedList=mergedList)
    annList2=(torch.tensor(fboxes),torch.tensor(flabels))

    def asFloats(b):
        #plain floats, tensor elements are not json serializable
        return [float(b[0]),float(b[1]),float(b[2]),float(b[3])]

    for i in range(len(annList2[0])):
        symbol=mergedList[i][0]
        text=mergedList[i][1]
        curdict={"class":"merged","box":asFloats(annList2[0][i])}
        if text is not None:
            curdict["textbox"]=asFloats(text)
        curdict["symbolbox"]=asFloats(symbol)
        data["regions"].append(curdict)

    #take care of the artifacts that were not merged
    #(index with the loop variable j - the former code reused the stale index i of the loop above)
    for j in range(len(annList1[0])):
        box=annList1[0][j]
        cls=annList1[1][j]
        if not isIn(box,mergedList):
            curdict={}
            if(cls==1):
                curdict["class"]="symbol"
            elif(cls==2):
                curdict["class"]="text"
            curdict["box"]=asFloats(box)
            data["regions"].append(curdict)

    #open the file only once the data is complete and make sure it gets closed,
    #so a failing prediction does not leave an empty or unflushed file behind
    with open(fname,"w") as op:
        json.dump(data,op)
    
    
#create a distinct json file for each region of interest in image
def createIndividualJsons(image,model,fname):
    """Run the detector on one image and write one json file per region of interest.

    For merged box number i the file ``<fname>-merged-<i>.json`` is written. Its
    "region" list holds the merged box, the symbol box it originates from and, if one
    exists, the text box. Every box that is left after overlap filtering but is not part
    of a merged pair goes to its own file ``<fname>-fragment-<k>.json``, with k counting
    only those boxes.

    Args:
        image: path of the image file.
        model: detector called as ``model([imgTensor])``; element 0 of its result must
            provide the entries "boxes", "labels" and "scores".
        fname: common prefix (path without extension) of the files to write.
    """
    imgname=os.path.normpath(image).split(os.sep)[-1]

    img=Image.open(image).convert("RGB")
    imgTensor=testTransforms(img)
    prediction=model([imgTensor])

    fboxes,flabels,fconfidences=filerConfidenceGap(prediction[0]["boxes"],prediction[0]["labels"],prediction[0]["scores"])
    fboxes=convertBoxesToOriginalSize(fboxes,image,transformedSize=(1024,1024))
    fboxes,flabels=filterOverlappingBoxes(fboxes,flabels,fconfidences,threshold=0.2)
    annList1=(torch.tensor(fboxes),torch.tensor(flabels))
    #mergeBoxes is expected to fill mergedList with one (symbol box, text box or None)
    #pair per merged box, in the same order as the boxes it returns
    mergedList=[]
    fboxes,flabels=mergeBoxes(fboxes,flabels,maxYdown=100,maxYup=800,maxXdiff=300,mergedList=mergedList)
    annList2=(torch.tensor(fboxes),torch.tensor(flabels))

    def asFloats(b):
        #plain floats, tensor elements are not json serializable
        return [float(b[0]),float(b[1]),float(b[2]),float(b[3])]

    def writeJson(outfile,data):
        #the with-block closes (and thereby flushes) every file that is written
        with open(outfile,"w") as op:
            json.dump(data,op)

    #loop through every merged box
    for i in range(len(annList2[0])):
        originSymbol=mergedList[i][0]
        originText=mergedList[i][1]
        regions=[
            {"class":"merged","bbox":asFloats(annList2[0][i])},
            {"class":"symbol","bbox":asFloats(originSymbol)},
        ]
        if originText is not None:
            regions.append({"class":"text","bbox":asFloats(originText)})
        writeJson(fname+"-merged-"+str(i)+".json",{"filename":imgname,"region":regions})

    idx=0
    for i in range(len(annList1[0])):
        box=annList1[0][i]
        cls=annList1[1][i]
        #search if box was already covered as part of the merged boxes
        if not isIn(box,mergedList):
            curdict={}
            if(cls==1):
                curdict["class"]="symbol"
            elif(cls==2):
                curdict["class"]="text"
            curdict["bbox"]=asFloats(box)
            writeJson(fname+"-fragment-"+str(idx)+".json",{"filename":imgname,"region":[curdict]})
            idx+=1


#create json files for each image in images
def createJsonsFor(model,images,paramfile="./stored_models/currentModel",splitOnSymbols=False):
    """Load stored weights into ``model`` and write json output for every image.

    Args:
        model: detector whose parameters are replaced by the state dict in ``paramfile``;
            it is switched to eval mode before predicting.
        images: iterable of image paths.
        paramfile: path of the stored state dict, read with ``torch.load``.
        splitOnSymbols: if True write one file per region (``createIndividualJsons``),
            otherwise one file per image (``createJson``).

    The files are written to ``./outputfiles/`` and named after the image file without
    its extension.
    """
    model.load_state_dict(torch.load(paramfile))
    model.eval()
    #the writers only open files, so the target directory has to exist beforehand
    os.makedirs("./outputfiles",exist_ok=True)
    for image in images:
        imgname=os.path.normpath(image).split(os.sep)[-1]
        #strip only the extension: cutting at the first dot would map
        #"scan.1.jpg" and "scan.2.jpg" to the same output file
        fname="./outputfiles/"+os.path.splitext(imgname)[0]
        if splitOnSymbols:
            createIndividualJsons(image,model,fname)
        else:
            createJson(image,model,fname+".json")
       
        
        
    

In [15]:
#build a lookup: image filename -> list of unique ([x1,y1,x2,y2],label) annotations
imtoann={}
for index,row in annotations.iterrows():
    imgname=row['filename']
    #both attribute columns hold json encoded objects
    bbdata=json.loads(row['region_shape_attributes'])
    labeldata=json.loads(row['region_attributes'])
    #the csv stores a box as corner plus extent, convert it to two corners
    x1,y1=int(bbdata['x']),int(bbdata['y'])
    x2,y2=x1+int(bbdata['width']),y1+int(bbdata['height'])
    bb=[x1,y1,x2,y2]
    label=labeldata['region']
    annotation=(bb,label)
    #start an empty list for a new image, skip annotations that are already stored
    known=imtoann.setdefault(imgname,[])
    if annotation not in known:
        known.append(annotation)

imtoannlist=getDatasetDictionary(imtoann)

In [16]:
#build a lookup: image filename -> list of unique 'objekt' boxes, one per 'wappen' annotation
imToMergedBB={}
for filename,fileAnnotations in imtoann.items():
    for bbox,cls in fileAnnotations:
        #only the emblems start a merged box
        if cls!='wappen':
            continue
        matchingText=selectClosestText(filename,bbox)

        if matchingText is None:
            #no text found for this emblem, the emblem box is used as it is
            annotation=(bbox,'objekt')
        else:
            #smallest box that encloses both the emblem and the selected text
            txtbox=matchingText[0]
            xs=(bbox[0],bbox[2],txtbox[0],txtbox[2])
            ys=(bbox[1],bbox[3],txtbox[1],txtbox[3])
            annotation=([min(xs),min(ys),max(xs),max(ys)],'objekt')

        merged=imToMergedBB.setdefault(filename,[])
        if annotation not in merged:
            merged.append(annotation)

imToAnnMerged=getDatasetDictionary(imToMergedBB)

In [17]:
#build the two dataset variants from the same images and transforms
#variant 1: emblem and text boxes are kept as seperate annotations
datasetSeperated = EmblemTextSet(images, imtoannlist, mytransforms)
dataSeperated = createTorchDataset(datasetSeperated, 0.2)
#variant 2: emblem and text are merged into a single box
datasetMerged = EmblemTextSet(images, imToAnnMerged, mytransforms)
dataMerged = createTorchDataset(datasetMerged, 0.2)

In [18]:
#findLRSeperated
#pretrained Faster R-CNN whose box predictor is replaced by one with 3 outputs
testmodel = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
in_features = testmodel.roi_heads.box_predictor.cls_score.in_features
testmodel.roi_heads.box_predictor = FastRCNNPredictor(in_features, 3)

#search for a starting learning rate on the seperated dataset
findStartingLR(
    dataSeperated,
    testmodel,
    bs=4,
    lowerBound=1e-7,
    upperBound=1,
    stopFactor=10,
    optimizer=None,
    steps=200,
    plot=True,
)

KeyboardInterrupt: 

In [23]:
#findLRMerged
#pretrained Faster R-CNN whose box predictor is replaced by one with 2 outputs
testmodel = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
in_features = testmodel.roi_heads.box_predictor.cls_score.in_features
testmodel.roi_heads.box_predictor = FastRCNNPredictor(in_features, 2)

#search for a starting learning rate on the merged dataset
findStartingLR(
    dataMerged,
    testmodel,
    bs=4,
    lowerBound=1e-7,
    upperBound=1,
    stopFactor=10,
    optimizer=None,
    steps=200,
    plot=True,
)

In [18]:
#train on the seperated dataset with batch size 4; the 3 is presumably the number of
#predictor outputs (it matches the FastRCNNPredictor size of the findLRSeperated cell) - TODO confirm
trainedModelSeperated=singleTrainingSession(dataSeperated,3,bs=4)

  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


Epoch: [0]  [ 0/16]  eta: 0:01:24  lr: 0.000000  loss: 8.9704 (8.9704)  loss_classifier: 1.1083 (1.1083)  loss_box_reg: 0.7429 (0.7429)  loss_objectness: 6.3698 (6.3698)  loss_rpn_box_reg: 0.7494 (0.7494)  time: 5.2624  data: 0.8625  max mem: 3164
Epoch: [0]  [10/16]  eta: 0:00:11  lr: 0.000000  loss: 9.3045 (9.3152)  loss_classifier: 1.0722 (1.0732)  loss_box_reg: 0.8345 (0.8253)  loss_objectness: 6.5739 (6.7230)  loss_rpn_box_reg: 0.7055 (0.6936)  time: 1.8453  data: 0.8534  max mem: 3164
Epoch: [0]  [15/16]  eta: 0:00:01  lr: 0.000000  loss: 9.3045 (9.4190)  loss_classifier: 1.0722 (1.0753)  loss_box_reg: 0.8249 (0.8131)  loss_objectness: 6.6307 (6.8234)  loss_rpn_box_reg: 0.7047 (0.7072)  time: 1.7822  data: 0.8185  max mem: 3164
Epoch: [0] Total time: 0:00:28 (1.7824 s / it)
Epoch: [1]  [ 0/16]  eta: 0:00:24  lr: 0.000000  loss: 9.2799 (9.2799)  loss_classifier: 1.0730 (1.0730)  loss_box_reg: 0.9233 (0.9233)  loss_objectness: 6.6525 (6.6525)  loss_rpn_box_reg: 0.6310 (0.6310)  tim

(0.00010232929922807567, 0.05495408738576271)
Epoch: [0]  [ 0/16]  eta: 0:00:30  lr: 0.000025  loss: 9.7896 (9.7896)  loss_classifier: 1.3266 (1.3266)  loss_box_reg: 0.6623 (0.6623)  loss_objectness: 7.0648 (7.0648)  loss_rpn_box_reg: 0.7359 (0.7359)  time: 1.9250  data: 0.8157  max mem: 3164
Epoch: [0]  [10/16]  eta: 0:00:09  lr: 0.000276  loss: 4.4667 (5.4870)  loss_classifier: 1.0243 (1.0379)  loss_box_reg: 0.7289 (0.7283)  loss_objectness: 2.2829 (3.1450)  loss_rpn_box_reg: 0.5962 (0.5757)  time: 1.5586  data: 0.8261  max mem: 3227
Epoch: [0]  [15/16]  eta: 0:00:01  lr: 0.000377  loss: 2.1718 (4.3936)  loss_classifier: 0.7931 (0.9222)  loss_box_reg: 0.7543 (0.7333)  loss_objectness: 0.3129 (2.2446)  loss_rpn_box_reg: 0.4401 (0.4934)  time: 1.5160  data: 0.8103  max mem: 3227
Epoch: [0] Total time: 0:00:24 (1.5162 s / it)
creating index...
index created!
Test:  [0/4]  eta: 0:00:05  model_time: 0.3273 (0.3273)  evaluator_time: 0.1702 (0.1702)  time: 1.3002  data: 0.7887  max mem: 322

Epoch: [4]  [ 0/16]  eta: 0:00:23  lr: 0.000377  loss: 1.4214 (1.4214)  loss_classifier: 0.4517 (0.4517)  loss_box_reg: 0.7454 (0.7454)  loss_objectness: 0.0451 (0.0451)  loss_rpn_box_reg: 0.1792 (0.1792)  time: 1.4944  data: 0.8398  max mem: 3227
Epoch: [4]  [10/16]  eta: 0:00:08  lr: 0.000377  loss: 1.3518 (1.3568)  loss_classifier: 0.4170 (0.4097)  loss_box_reg: 0.7018 (0.7047)  loss_objectness: 0.0634 (0.0630)  loss_rpn_box_reg: 0.1792 (0.1794)  time: 1.4793  data: 0.8243  max mem: 3227
Epoch: [4]  [15/16]  eta: 0:00:01  lr: 0.000377  loss: 1.3179 (1.3129)  loss_classifier: 0.3957 (0.3852)  loss_box_reg: 0.6867 (0.6832)  loss_objectness: 0.0565 (0.0603)  loss_rpn_box_reg: 0.1792 (0.1842)  time: 1.4294  data: 0.7960  max mem: 3227
Epoch: [4] Total time: 0:00:22 (1.4296 s / it)
creating index...
index created!
Test:  [0/4]  eta: 0:00:05  model_time: 0.2773 (0.2773)  evaluator_time: 0.1551 (0.1551)  time: 1.3002  data: 0.8548  max mem: 3227
Test:  [3/4]  eta: 0:00:01  model_time: 0.26

Epoch: [8]  [ 0/16]  eta: 0:00:24  lr: 0.000188  loss: 0.7119 (0.7119)  loss_classifier: 0.1957 (0.1957)  loss_box_reg: 0.3955 (0.3955)  loss_objectness: 0.0163 (0.0163)  loss_rpn_box_reg: 0.1045 (0.1045)  time: 1.5033  data: 0.8153  max mem: 3227
Epoch: [8]  [10/16]  eta: 0:00:09  lr: 0.000188  loss: 0.7251 (0.7654)  loss_classifier: 0.1980 (0.2035)  loss_box_reg: 0.3855 (0.3842)  loss_objectness: 0.0338 (0.0361)  loss_rpn_box_reg: 0.1376 (0.1416)  time: 1.5178  data: 0.8291  max mem: 3227
Epoch: [8]  [15/16]  eta: 0:00:01  lr: 0.000188  loss: 0.7119 (0.7395)  loss_classifier: 0.1918 (0.1976)  loss_box_reg: 0.3654 (0.3714)  loss_objectness: 0.0327 (0.0356)  loss_rpn_box_reg: 0.1252 (0.1349)  time: 1.4661  data: 0.7991  max mem: 3227
Epoch: [8] Total time: 0:00:23 (1.4664 s / it)
creating index...
index created!
Test:  [0/4]  eta: 0:00:04  model_time: 0.3243 (0.3243)  evaluator_time: 0.0801 (0.0801)  time: 1.1845  data: 0.7631  max mem: 3227
Test:  [3/4]  eta: 0:00:01  model_time: 0.27

Epoch: [12]  [ 0/16]  eta: 0:00:24  lr: 0.000377  loss: 0.7456 (0.7456)  loss_classifier: 0.1964 (0.1964)  loss_box_reg: 0.3793 (0.3793)  loss_objectness: 0.0380 (0.0380)  loss_rpn_box_reg: 0.1319 (0.1319)  time: 1.5202  data: 0.8381  max mem: 3227
Epoch: [12]  [10/16]  eta: 0:00:08  lr: 0.000377  loss: 0.6801 (0.6840)  loss_classifier: 0.1833 (0.1835)  loss_box_reg: 0.3414 (0.3322)  loss_objectness: 0.0309 (0.0359)  loss_rpn_box_reg: 0.1294 (0.1324)  time: 1.4819  data: 0.8210  max mem: 3227
Epoch: [12]  [15/16]  eta: 0:00:01  lr: 0.000377  loss: 0.6673 (0.6750)  loss_classifier: 0.1773 (0.1800)  loss_box_reg: 0.3339 (0.3312)  loss_objectness: 0.0309 (0.0351)  loss_rpn_box_reg: 0.1233 (0.1288)  time: 1.4320  data: 0.7938  max mem: 3227
Epoch: [12] Total time: 0:00:22 (1.4322 s / it)
creating index...
index created!
Test:  [0/4]  eta: 0:00:05  model_time: 0.3043 (0.3043)  evaluator_time: 0.1902 (0.1902)  time: 1.3622  data: 0.8538  max mem: 3227
Test:  [3/4]  eta: 0:00:01  model_time: 

Epoch: [16]  [ 0/16]  eta: 0:00:23  lr: 0.000321  loss: 0.5779 (0.5779)  loss_classifier: 0.1541 (0.1541)  loss_box_reg: 0.2983 (0.2983)  loss_objectness: 0.0217 (0.0217)  loss_rpn_box_reg: 0.1038 (0.1038)  time: 1.4604  data: 0.8075  max mem: 3227
Epoch: [16]  [10/16]  eta: 0:00:08  lr: 0.000321  loss: 0.5305 (0.5504)  loss_classifier: 0.1508 (0.1522)  loss_box_reg: 0.2527 (0.2602)  loss_objectness: 0.0273 (0.0290)  loss_rpn_box_reg: 0.1038 (0.1091)  time: 1.4869  data: 0.8314  max mem: 3227
Epoch: [16]  [15/16]  eta: 0:00:01  lr: 0.000321  loss: 0.5305 (0.5423)  loss_classifier: 0.1487 (0.1489)  loss_box_reg: 0.2470 (0.2533)  loss_objectness: 0.0273 (0.0302)  loss_rpn_box_reg: 0.1038 (0.1099)  time: 1.4487  data: 0.8079  max mem: 3227
Epoch: [16] Total time: 0:00:23 (1.4489 s / it)
creating index...
index created!
Test:  [0/4]  eta: 0:00:04  model_time: 0.2793 (0.2793)  evaluator_time: 0.0551 (0.0551)  time: 1.1721  data: 0.8187  max mem: 3227
Test:  [3/4]  eta: 0:00:01  model_time: 

Epoch: [20]  [ 0/16]  eta: 0:00:26  lr: 0.000188  loss: 0.4421 (0.4421)  loss_classifier: 0.1228 (0.1228)  loss_box_reg: 0.2011 (0.2011)  loss_objectness: 0.0345 (0.0345)  loss_rpn_box_reg: 0.0837 (0.0837)  time: 1.6322  data: 0.9245  max mem: 3227
Epoch: [20]  [10/16]  eta: 0:00:09  lr: 0.000188  loss: 0.4609 (0.4734)  loss_classifier: 0.1329 (0.1328)  loss_box_reg: 0.2083 (0.2194)  loss_objectness: 0.0262 (0.0255)  loss_rpn_box_reg: 0.0898 (0.0957)  time: 1.5799  data: 0.8728  max mem: 3227
Epoch: [20]  [15/16]  eta: 0:00:01  lr: 0.000188  loss: 0.4730 (0.4759)  loss_classifier: 0.1317 (0.1328)  loss_box_reg: 0.2091 (0.2186)  loss_objectness: 0.0217 (0.0263)  loss_rpn_box_reg: 0.0919 (0.0981)  time: 1.5260  data: 0.8432  max mem: 3227
Epoch: [20] Total time: 0:00:24 (1.5262 s / it)
creating index...
index created!
Test:  [0/4]  eta: 0:00:05  model_time: 0.3504 (0.3504)  evaluator_time: 0.0791 (0.0791)  time: 1.2543  data: 0.8081  max mem: 3227
Test:  [3/4]  eta: 0:00:01  model_time: 

Epoch: [24]  [ 0/16]  eta: 0:00:26  lr: 0.000055  loss: 0.4417 (0.4417)  loss_classifier: 0.1162 (0.1162)  loss_box_reg: 0.1962 (0.1962)  loss_objectness: 0.0208 (0.0208)  loss_rpn_box_reg: 0.1085 (0.1085)  time: 1.6521  data: 0.9575  max mem: 3227
Epoch: [24]  [10/16]  eta: 0:00:09  lr: 0.000055  loss: 0.4734 (0.4636)  loss_classifier: 0.1220 (0.1279)  loss_box_reg: 0.2097 (0.2158)  loss_objectness: 0.0192 (0.0211)  loss_rpn_box_reg: 0.1006 (0.0988)  time: 1.5602  data: 0.8702  max mem: 3227
Epoch: [24]  [15/16]  eta: 0:00:01  lr: 0.000055  loss: 0.4520 (0.4568)  loss_classifier: 0.1220 (0.1278)  loss_box_reg: 0.2044 (0.2114)  loss_objectness: 0.0198 (0.0225)  loss_rpn_box_reg: 0.0931 (0.0951)  time: 1.5026  data: 0.8346  max mem: 3227
Epoch: [24] Total time: 0:00:24 (1.5027 s / it)
creating index...
index created!
Test:  [0/4]  eta: 0:00:05  model_time: 0.3593 (0.3593)  evaluator_time: 0.0821 (0.0821)  time: 1.3152  data: 0.8588  max mem: 3227
Test:  [3/4]  eta: 0:00:01  model_time: 

Epoch: [28]  [ 0/16]  eta: 0:00:25  lr: 0.000377  loss: 0.4467 (0.4467)  loss_classifier: 0.1208 (0.1208)  loss_box_reg: 0.2040 (0.2040)  loss_objectness: 0.0272 (0.0272)  loss_rpn_box_reg: 0.0946 (0.0946)  time: 1.5698  data: 0.8423  max mem: 3227
Epoch: [28]  [10/16]  eta: 0:00:09  lr: 0.000377  loss: 0.4485 (0.4567)  loss_classifier: 0.1255 (0.1290)  loss_box_reg: 0.2082 (0.2051)  loss_objectness: 0.0227 (0.0216)  loss_rpn_box_reg: 0.0946 (0.1010)  time: 1.5637  data: 0.8479  max mem: 3227
Epoch: [28]  [15/16]  eta: 0:00:01  lr: 0.000377  loss: 0.4331 (0.4394)  loss_classifier: 0.1203 (0.1237)  loss_box_reg: 0.1923 (0.2016)  loss_objectness: 0.0187 (0.0209)  loss_rpn_box_reg: 0.0877 (0.0934)  time: 1.5022  data: 0.8170  max mem: 3227
Epoch: [28] Total time: 0:00:24 (1.5024 s / it)
creating index...
index created!
Test:  [0/4]  eta: 0:00:04  model_time: 0.3233 (0.3233)  evaluator_time: 0.0480 (0.0480)  time: 1.1657  data: 0.7804  max mem: 3227
Test:  [3/4]  eta: 0:00:01  model_time: 

Epoch: [32]  [ 0/16]  eta: 0:00:24  lr: 0.000362  loss: 0.4666 (0.4666)  loss_classifier: 0.1250 (0.1250)  loss_box_reg: 0.2113 (0.2113)  loss_objectness: 0.0186 (0.0186)  loss_rpn_box_reg: 0.1117 (0.1117)  time: 1.5364  data: 0.8077  max mem: 3227
Epoch: [32]  [10/16]  eta: 0:00:09  lr: 0.000362  loss: 0.4075 (0.4079)  loss_classifier: 0.1195 (0.1185)  loss_box_reg: 0.1904 (0.1885)  loss_objectness: 0.0181 (0.0173)  loss_rpn_box_reg: 0.0769 (0.0836)  time: 1.5653  data: 0.8618  max mem: 3227
Epoch: [32]  [15/16]  eta: 0:00:01  lr: 0.000362  loss: 0.3960 (0.4030)  loss_classifier: 0.1181 (0.1139)  loss_box_reg: 0.1842 (0.1851)  loss_objectness: 0.0181 (0.0184)  loss_rpn_box_reg: 0.0777 (0.0856)  time: 1.4988  data: 0.8216  max mem: 3227
Epoch: [32] Total time: 0:00:23 (1.4989 s / it)
creating index...
index created!
Test:  [0/4]  eta: 0:00:04  model_time: 0.3338 (0.3338)  evaluator_time: 0.0671 (0.0671)  time: 1.2376  data: 0.8237  max mem: 3227
Test:  [3/4]  eta: 0:00:01  model_time: 

Epoch: [36]  [ 0/16]  eta: 0:00:24  lr: 0.000321  loss: 0.3620 (0.3620)  loss_classifier: 0.1043 (0.1043)  loss_box_reg: 0.1763 (0.1763)  loss_objectness: 0.0176 (0.0176)  loss_rpn_box_reg: 0.0638 (0.0638)  time: 1.5074  data: 0.8238  max mem: 3227
Epoch: [36]  [10/16]  eta: 0:00:09  lr: 0.000321  loss: 0.3620 (0.3628)  loss_classifier: 0.1030 (0.1031)  loss_box_reg: 0.1683 (0.1690)  loss_objectness: 0.0176 (0.0184)  loss_rpn_box_reg: 0.0691 (0.0722)  time: 1.5177  data: 0.8292  max mem: 3227
Epoch: [36]  [15/16]  eta: 0:00:01  lr: 0.000321  loss: 0.3822 (0.3777)  loss_classifier: 0.1030 (0.1054)  loss_box_reg: 0.1763 (0.1746)  loss_objectness: 0.0178 (0.0209)  loss_rpn_box_reg: 0.0805 (0.0768)  time: 1.4737  data: 0.8066  max mem: 3227
Epoch: [36] Total time: 0:00:23 (1.4737 s / it)
creating index...
index created!
Test:  [0/4]  eta: 0:00:04  model_time: 0.3137 (0.3137)  evaluator_time: 0.0460 (0.0460)  time: 1.1434  data: 0.7697  max mem: 3227
Test:  [3/4]  eta: 0:00:01  model_time: 

Epoch: [40]  [ 0/16]  eta: 0:00:24  lr: 0.000260  loss: 0.3835 (0.3835)  loss_classifier: 0.1130 (0.1130)  loss_box_reg: 0.1732 (0.1732)  loss_objectness: 0.0111 (0.0111)  loss_rpn_box_reg: 0.0861 (0.0861)  time: 1.5292  data: 0.8524  max mem: 3227
Epoch: [40]  [10/16]  eta: 0:00:09  lr: 0.000260  loss: 0.3623 (0.3557)  loss_classifier: 0.0962 (0.0977)  loss_box_reg: 0.1701 (0.1654)  loss_objectness: 0.0199 (0.0189)  loss_rpn_box_reg: 0.0721 (0.0738)  time: 1.5157  data: 0.8270  max mem: 3227
Epoch: [40]  [15/16]  eta: 0:00:01  lr: 0.000260  loss: 0.3560 (0.3543)  loss_classifier: 0.0962 (0.0991)  loss_box_reg: 0.1616 (0.1638)  loss_objectness: 0.0194 (0.0181)  loss_rpn_box_reg: 0.0721 (0.0734)  time: 1.4986  data: 0.8240  max mem: 3227
Epoch: [40] Total time: 0:00:23 (1.4987 s / it)
creating index...
index created!
Test:  [0/4]  eta: 0:00:05  model_time: 0.3373 (0.3373)  evaluator_time: 0.0651 (0.0651)  time: 1.2802  data: 0.8628  max mem: 3227
Test:  [3/4]  eta: 0:00:01  model_time: 

Epoch: [44]  [ 0/16]  eta: 0:00:25  lr: 0.000188  loss: 0.3427 (0.3427)  loss_classifier: 0.0980 (0.0980)  loss_box_reg: 0.1791 (0.1791)  loss_objectness: 0.0091 (0.0091)  loss_rpn_box_reg: 0.0565 (0.0565)  time: 1.6020  data: 0.9168  max mem: 3227
Epoch: [44]  [10/16]  eta: 0:00:09  lr: 0.000188  loss: 0.3239 (0.3230)  loss_classifier: 0.0963 (0.0904)  loss_box_reg: 0.1542 (0.1521)  loss_objectness: 0.0120 (0.0156)  loss_rpn_box_reg: 0.0635 (0.0650)  time: 1.5362  data: 0.8472  max mem: 3227
Epoch: [44]  [15/16]  eta: 0:00:01  lr: 0.000188  loss: 0.3357 (0.3372)  loss_classifier: 0.0963 (0.0963)  loss_box_reg: 0.1590 (0.1587)  loss_objectness: 0.0120 (0.0160)  loss_rpn_box_reg: 0.0635 (0.0662)  time: 1.4827  data: 0.8156  max mem: 3227
Epoch: [44] Total time: 0:00:23 (1.4830 s / it)
creating index...
index created!
Test:  [0/4]  eta: 0:00:04  model_time: 0.3323 (0.3323)  evaluator_time: 0.0450 (0.0450)  time: 1.1691  data: 0.7727  max mem: 3227
Test:  [3/4]  eta: 0:00:01  model_time: 

Epoch: [48]  [ 0/16]  eta: 0:00:24  lr: 0.000116  loss: 0.3690 (0.3690)  loss_classifier: 0.1129 (0.1129)  loss_box_reg: 0.1606 (0.1606)  loss_objectness: 0.0181 (0.0181)  loss_rpn_box_reg: 0.0775 (0.0775)  time: 1.5054  data: 0.8288  max mem: 3227
Epoch: [48]  [10/16]  eta: 0:00:09  lr: 0.000116  loss: 0.3423 (0.3305)  loss_classifier: 0.0943 (0.0961)  loss_box_reg: 0.1589 (0.1534)  loss_objectness: 0.0181 (0.0177)  loss_rpn_box_reg: 0.0567 (0.0633)  time: 1.5041  data: 0.8337  max mem: 3227
Epoch: [48]  [15/16]  eta: 0:00:01  lr: 0.000116  loss: 0.3423 (0.3339)  loss_classifier: 0.0943 (0.0953)  loss_box_reg: 0.1606 (0.1554)  loss_objectness: 0.0155 (0.0169)  loss_rpn_box_reg: 0.0583 (0.0663)  time: 1.4688  data: 0.8104  max mem: 3227
Epoch: [48] Total time: 0:00:23 (1.4690 s / it)
creating index...
index created!
Test:  [0/4]  eta: 0:00:04  model_time: 0.3303 (0.3303)  evaluator_time: 0.0571 (0.0571)  time: 1.2134  data: 0.8020  max mem: 3227
Test:  [3/4]  eta: 0:00:01  model_time: 

Epoch: [52]  [ 0/16]  eta: 0:00:26  lr: 0.000055  loss: 0.3345 (0.3345)  loss_classifier: 0.0929 (0.0929)  loss_box_reg: 0.1512 (0.1512)  loss_objectness: 0.0199 (0.0199)  loss_rpn_box_reg: 0.0705 (0.0705)  time: 1.6334  data: 0.9056  max mem: 3227
Epoch: [52]  [10/16]  eta: 0:00:09  lr: 0.000055  loss: 0.3071 (0.3167)  loss_classifier: 0.0861 (0.0890)  loss_box_reg: 0.1512 (0.1492)  loss_objectness: 0.0146 (0.0169)  loss_rpn_box_reg: 0.0541 (0.0616)  time: 1.5272  data: 0.8354  max mem: 3227
Epoch: [52]  [15/16]  eta: 0:00:01  lr: 0.000055  loss: 0.3250 (0.3235)  loss_classifier: 0.0922 (0.0917)  loss_box_reg: 0.1512 (0.1520)  loss_objectness: 0.0120 (0.0163)  loss_rpn_box_reg: 0.0583 (0.0635)  time: 1.4810  data: 0.8126  max mem: 3227
Epoch: [52] Total time: 0:00:23 (1.4812 s / it)
creating index...
index created!
Test:  [0/4]  eta: 0:00:05  model_time: 0.3343 (0.3343)  evaluator_time: 0.0741 (0.0741)  time: 1.2942  data: 0.8718  max mem: 3227
Test:  [3/4]  eta: 0:00:01  model_time: 

Epoch: [56]  [ 0/16]  eta: 0:00:23  lr: 0.000014  loss: 0.3219 (0.3219)  loss_classifier: 0.0927 (0.0927)  loss_box_reg: 0.1598 (0.1598)  loss_objectness: 0.0199 (0.0199)  loss_rpn_box_reg: 0.0495 (0.0495)  time: 1.4531  data: 0.8018  max mem: 3227
Epoch: [56]  [10/16]  eta: 0:00:08  lr: 0.000014  loss: 0.3268 (0.3272)  loss_classifier: 0.0918 (0.0940)  loss_box_reg: 0.1480 (0.1518)  loss_objectness: 0.0183 (0.0193)  loss_rpn_box_reg: 0.0561 (0.0622)  time: 1.4866  data: 0.8278  max mem: 3227
Epoch: [56]  [15/16]  eta: 0:00:01  lr: 0.000014  loss: 0.3219 (0.3251)  loss_classifier: 0.0893 (0.0919)  loss_box_reg: 0.1474 (0.1493)  loss_objectness: 0.0153 (0.0173)  loss_rpn_box_reg: 0.0563 (0.0667)  time: 1.4423  data: 0.8019  max mem: 3227
Epoch: [56] Total time: 0:00:23 (1.4425 s / it)
creating index...
index created!
Test:  [0/4]  eta: 0:00:04  model_time: 0.3113 (0.3113)  evaluator_time: 0.0280 (0.0280)  time: 1.1260  data: 0.7737  max mem: 3227
Test:  [3/4]  eta: 0:00:01  model_time: 

In [21]:
#train on the merged dataset with batch size 4; the 2 is presumably the number of
#predictor outputs (it matches the FastRCNNPredictor size of the findLRMerged cell) - TODO confirm
trainedModelMerged=singleTrainingSession(dataMerged,2,bs=4)

Epoch: [0]  [ 0/16]  eta: 0:00:23  lr: 0.000000  loss: 2.3742 (2.3742)  loss_classifier: 0.7349 (0.7349)  loss_box_reg: 0.7642 (0.7642)  loss_objectness: 0.7361 (0.7361)  loss_rpn_box_reg: 0.1391 (0.1391)  time: 1.4684  data: 0.8063  max mem: 3227
Epoch: [0]  [10/16]  eta: 0:00:08  lr: 0.000000  loss: 2.1672 (2.1338)  loss_classifier: 0.6869 (0.6890)  loss_box_reg: 0.6555 (0.6866)  loss_objectness: 0.6875 (0.6488)  loss_rpn_box_reg: 0.1122 (0.1094)  time: 1.4530  data: 0.8121  max mem: 3227
Epoch: [0]  [15/16]  eta: 0:00:01  lr: 0.000000  loss: 1.9365 (2.0534)  loss_classifier: 0.6879 (0.6922)  loss_box_reg: 0.6555 (0.6912)  loss_objectness: 0.4547 (0.5713)  loss_rpn_box_reg: 0.0905 (0.0987)  time: 1.3977  data: 0.7781  max mem: 3227
Epoch: [0] Total time: 0:00:22 (1.3978 s / it)
Epoch: [1]  [ 0/16]  eta: 0:00:22  lr: 0.000000  loss: 2.2493 (2.2493)  loss_classifier: 0.6863 (0.6863)  loss_box_reg: 0.9130 (0.9130)  loss_objectness: 0.5452 (0.5452)  loss_rpn_box_reg: 0.1048 (0.1048)  tim

Epoch: [10]  [15/16]  eta: 0:00:01  lr: 0.144544  loss: 0.7309 (0.8317)  loss_classifier: 0.1802 (0.1887)  loss_box_reg: 0.5107 (0.5665)  loss_objectness: 0.0094 (0.0157)  loss_rpn_box_reg: 0.0397 (0.0609)  time: 1.4580  data: 0.7800  max mem: 3227
Epoch: [10] Total time: 0:00:23 (1.4582 s / it)
Epoch: [11]  [ 0/16]  eta: 0:00:25  lr: 0.156675  loss: 1.9696 (1.9696)  loss_classifier: 0.4615 (0.4615)  loss_box_reg: 1.2912 (1.2912)  loss_objectness: 0.0362 (0.0362)  loss_rpn_box_reg: 0.1808 (0.1808)  time: 1.5760  data: 0.8445  max mem: 3227
(1.1748975549395294e-07, 0.08222426499470752)
Epoch: [0]  [ 0/16]  eta: 0:00:25  lr: 0.000028  loss: 2.3232 (2.3232)  loss_classifier: 0.5750 (0.5750)  loss_box_reg: 0.7137 (0.7137)  loss_objectness: 0.8860 (0.8860)  loss_rpn_box_reg: 0.1484 (0.1484)  time: 1.5924  data: 0.8578  max mem: 3285
Epoch: [0]  [10/16]  eta: 0:00:09  lr: 0.000302  loss: 1.6014 (1.7080)  loss_classifier: 0.5516 (0.5407)  loss_box_reg: 0.8282 (0.7748)  loss_objectness: 0.1145

Test:  [3/4]  eta: 0:00:01  model_time: 0.2763 (0.2695)  evaluator_time: 0.0831 (0.0871)  time: 1.1465  data: 0.7789  max mem: 3553
Test: Total time: 0:00:04 (1.1467 s / it)
Averaged stats: model_time: 0.2763 (0.2695)  evaluator_time: 0.0831 (0.0871)
Accumulating evaluation results...
DONE (t=0.01s).
IoU metric: bbox
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.683
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.959
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.813
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = -1.000
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = -1.000
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.683
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.068
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.645
 Average Recall     (AR) @[ IoU=0.50:0.

Epoch: [7]  [10/16]  eta: 0:00:08  lr: 0.000284  loss: 0.2769 (0.2914)  loss_classifier: 0.0786 (0.0841)  loss_box_reg: 0.1925 (0.1967)  loss_objectness: 0.0009 (0.0020)  loss_rpn_box_reg: 0.0088 (0.0086)  time: 1.4972  data: 0.8127  max mem: 3553
Epoch: [7]  [15/16]  eta: 0:00:01  lr: 0.000284  loss: 0.2867 (0.2951)  loss_classifier: 0.0808 (0.0857)  loss_box_reg: 0.1890 (0.1960)  loss_objectness: 0.0013 (0.0045)  loss_rpn_box_reg: 0.0088 (0.0090)  time: 1.4511  data: 0.7879  max mem: 3553
Epoch: [7] Total time: 0:00:23 (1.4513 s / it)
creating index...
index created!
Test:  [0/4]  eta: 0:00:04  model_time: 0.3090 (0.3090)  evaluator_time: 0.0330 (0.0330)  time: 1.1661  data: 0.8021  max mem: 3553
Test:  [3/4]  eta: 0:00:01  model_time: 0.2722 (0.2662)  evaluator_time: 0.0330 (0.0355)  time: 1.0620  data: 0.7482  max mem: 3553
Test: Total time: 0:00:04 (1.0625 s / it)
Averaged stats: model_time: 0.2722 (0.2662)  evaluator_time: 0.0330 (0.0355)
Accumulating evaluation results...
DONE (

Epoch: [11]  [ 0/16]  eta: 0:00:24  lr: 0.000016  loss: 0.2850 (0.2850)  loss_classifier: 0.0793 (0.0793)  loss_box_reg: 0.1937 (0.1937)  loss_objectness: 0.0003 (0.0003)  loss_rpn_box_reg: 0.0115 (0.0115)  time: 1.5184  data: 0.8318  max mem: 3553
Epoch: [11]  [10/16]  eta: 0:00:09  lr: 0.000016  loss: 0.2430 (0.2505)  loss_classifier: 0.0785 (0.0772)  loss_box_reg: 0.1530 (0.1639)  loss_objectness: 0.0012 (0.0018)  loss_rpn_box_reg: 0.0075 (0.0075)  time: 1.5018  data: 0.8149  max mem: 3553
Epoch: [11]  [15/16]  eta: 0:00:01  lr: 0.000016  loss: 0.2469 (0.2487)  loss_classifier: 0.0785 (0.0773)  loss_box_reg: 0.1543 (0.1608)  loss_objectness: 0.0012 (0.0031)  loss_rpn_box_reg: 0.0067 (0.0075)  time: 1.4497  data: 0.7848  max mem: 3553
Epoch: [11] Total time: 0:00:23 (1.4499 s / it)
creating index...
index created!
Test:  [0/4]  eta: 0:00:04  model_time: 0.2973 (0.2973)  evaluator_time: 0.0400 (0.0400)  time: 1.1506  data: 0.8003  max mem: 3553
Test:  [3/4]  eta: 0:00:01  model_time: 

Epoch: [15]  [ 0/16]  eta: 0:00:24  lr: 0.000377  loss: 0.2996 (0.2996)  loss_classifier: 0.0870 (0.0870)  loss_box_reg: 0.2036 (0.2036)  loss_objectness: 0.0018 (0.0018)  loss_rpn_box_reg: 0.0073 (0.0073)  time: 1.5060  data: 0.8238  max mem: 3553
Epoch: [15]  [10/16]  eta: 0:00:09  lr: 0.000377  loss: 0.2044 (0.2092)  loss_classifier: 0.0686 (0.0698)  loss_box_reg: 0.1262 (0.1305)  loss_objectness: 0.0020 (0.0030)  loss_rpn_box_reg: 0.0054 (0.0059)  time: 1.5417  data: 0.8465  max mem: 3553
Epoch: [15]  [15/16]  eta: 0:00:01  lr: 0.000377  loss: 0.1934 (0.2075)  loss_classifier: 0.0651 (0.0689)  loss_box_reg: 0.1211 (0.1300)  loss_objectness: 0.0018 (0.0027)  loss_rpn_box_reg: 0.0056 (0.0058)  time: 1.4907  data: 0.8220  max mem: 3553
Epoch: [15] Total time: 0:00:23 (1.4909 s / it)
creating index...
index created!
Test:  [0/4]  eta: 0:00:04  model_time: 0.2844 (0.2844)  evaluator_time: 0.0381 (0.0381)  time: 1.1296  data: 0.7921  max mem: 3553
Test:  [3/4]  eta: 0:00:01  model_time: 

Epoch: [19]  [ 0/16]  eta: 0:00:24  lr: 0.000246  loss: 0.2076 (0.2076)  loss_classifier: 0.0684 (0.0684)  loss_box_reg: 0.1300 (0.1300)  loss_objectness: 0.0041 (0.0041)  loss_rpn_box_reg: 0.0052 (0.0052)  time: 1.5298  data: 0.8348  max mem: 3553
Epoch: [19]  [10/16]  eta: 0:00:09  lr: 0.000246  loss: 0.1717 (0.1786)  loss_classifier: 0.0622 (0.0625)  loss_box_reg: 0.1005 (0.1097)  loss_objectness: 0.0009 (0.0013)  loss_rpn_box_reg: 0.0052 (0.0050)  time: 1.5225  data: 0.8245  max mem: 3553
Epoch: [19]  [15/16]  eta: 0:00:01  lr: 0.000246  loss: 0.1717 (0.1777)  loss_classifier: 0.0622 (0.0625)  loss_box_reg: 0.1005 (0.1083)  loss_objectness: 0.0009 (0.0021)  loss_rpn_box_reg: 0.0048 (0.0049)  time: 1.4787  data: 0.7997  max mem: 3553
Epoch: [19] Total time: 0:00:23 (1.4789 s / it)
creating index...
index created!
Test:  [0/4]  eta: 0:00:04  model_time: 0.3393 (0.3393)  evaluator_time: 0.0150 (0.0150)  time: 1.1681  data: 0.7987  max mem: 3553
Test:  [3/4]  eta: 0:00:01  model_time: 

Epoch: [23]  [ 0/16]  eta: 0:00:24  lr: 0.000091  loss: 0.1654 (0.1654)  loss_classifier: 0.0612 (0.0612)  loss_box_reg: 0.0965 (0.0965)  loss_objectness: 0.0035 (0.0035)  loss_rpn_box_reg: 0.0041 (0.0041)  time: 1.5394  data: 0.8528  max mem: 3553
Epoch: [23]  [10/16]  eta: 0:00:09  lr: 0.000091  loss: 0.1544 (0.1604)  loss_classifier: 0.0612 (0.0588)  loss_box_reg: 0.0929 (0.0940)  loss_objectness: 0.0019 (0.0031)  loss_rpn_box_reg: 0.0042 (0.0045)  time: 1.5047  data: 0.8165  max mem: 3553
Epoch: [23]  [15/16]  eta: 0:00:01  lr: 0.000091  loss: 0.1544 (0.1674)  loss_classifier: 0.0612 (0.0615)  loss_box_reg: 0.0929 (0.0990)  loss_objectness: 0.0014 (0.0024)  loss_rpn_box_reg: 0.0042 (0.0045)  time: 1.4725  data: 0.8025  max mem: 3553
Epoch: [23] Total time: 0:00:23 (1.4726 s / it)
creating index...
index created!
Test:  [0/4]  eta: 0:00:04  model_time: 0.3013 (0.3013)  evaluator_time: 0.0260 (0.0260)  time: 1.1270  data: 0.7857  max mem: 3553
Test:  [3/4]  eta: 0:00:01  model_time: 

Epoch: [27]  [ 0/16]  eta: 0:00:25  lr: 0.000004  loss: 0.1458 (0.1458)  loss_classifier: 0.0560 (0.0560)  loss_box_reg: 0.0858 (0.0858)  loss_objectness: 0.0002 (0.0002)  loss_rpn_box_reg: 0.0038 (0.0038)  time: 1.5661  data: 0.8472  max mem: 3553
Epoch: [27]  [10/16]  eta: 0:00:09  lr: 0.000004  loss: 0.1519 (0.1581)  loss_classifier: 0.0565 (0.0585)  loss_box_reg: 0.0900 (0.0934)  loss_objectness: 0.0013 (0.0018)  loss_rpn_box_reg: 0.0044 (0.0044)  time: 1.5622  data: 0.8490  max mem: 3553
Epoch: [27]  [15/16]  eta: 0:00:01  lr: 0.000004  loss: 0.1519 (0.1609)  loss_classifier: 0.0563 (0.0590)  loss_box_reg: 0.0900 (0.0959)  loss_objectness: 0.0010 (0.0016)  loss_rpn_box_reg: 0.0044 (0.0045)  time: 1.5135  data: 0.8177  max mem: 3553
Epoch: [27] Total time: 0:00:24 (1.5136 s / it)
creating index...
index created!
Test:  [0/4]  eta: 0:00:04  model_time: 0.3173 (0.3173)  evaluator_time: 0.0340 (0.0340)  time: 1.1859  data: 0.8216  max mem: 3553
Test:  [3/4]  eta: 0:00:01  model_time: 

Epoch: [31]  [ 0/16]  eta: 0:00:23  lr: 0.000402  loss: 0.1575 (0.1575)  loss_classifier: 0.0634 (0.0634)  loss_box_reg: 0.0895 (0.0895)  loss_objectness: 0.0002 (0.0002)  loss_rpn_box_reg: 0.0044 (0.0044)  time: 1.4885  data: 0.8159  max mem: 3553
Epoch: [31]  [10/16]  eta: 0:00:08  lr: 0.000402  loss: 0.1334 (0.1385)  loss_classifier: 0.0486 (0.0522)  loss_box_reg: 0.0757 (0.0820)  loss_objectness: 0.0006 (0.0008)  loss_rpn_box_reg: 0.0033 (0.0034)  time: 1.4681  data: 0.7957  max mem: 3553
Epoch: [31]  [15/16]  eta: 0:00:01  lr: 0.000402  loss: 0.1380 (0.1456)  loss_classifier: 0.0492 (0.0548)  loss_box_reg: 0.0763 (0.0857)  loss_objectness: 0.0006 (0.0012)  loss_rpn_box_reg: 0.0035 (0.0039)  time: 1.4265  data: 0.7749  max mem: 3553
Epoch: [31] Total time: 0:00:22 (1.4266 s / it)
creating index...
index created!
Test:  [0/4]  eta: 0:00:04  model_time: 0.2983 (0.2983)  evaluator_time: 0.0190 (0.0190)  time: 1.1500  data: 0.8187  max mem: 3553
Test:  [3/4]  eta: 0:00:01  model_time: 

Epoch: [35]  [ 0/16]  eta: 0:00:24  lr: 0.000365  loss: 0.1187 (0.1187)  loss_classifier: 0.0480 (0.0480)  loss_box_reg: 0.0650 (0.0650)  loss_objectness: 0.0023 (0.0023)  loss_rpn_box_reg: 0.0035 (0.0035)  time: 1.5194  data: 0.8358  max mem: 3553
Epoch: [35]  [10/16]  eta: 0:00:09  lr: 0.000365  loss: 0.1305 (0.1311)  loss_classifier: 0.0496 (0.0493)  loss_box_reg: 0.0735 (0.0770)  loss_objectness: 0.0005 (0.0013)  loss_rpn_box_reg: 0.0035 (0.0035)  time: 1.5196  data: 0.8317  max mem: 3553
Epoch: [35]  [15/16]  eta: 0:00:01  lr: 0.000365  loss: 0.1285 (0.1281)  loss_classifier: 0.0490 (0.0489)  loss_box_reg: 0.0724 (0.0741)  loss_objectness: 0.0005 (0.0017)  loss_rpn_box_reg: 0.0035 (0.0034)  time: 1.4592  data: 0.8013  max mem: 3553
Epoch: [35] Total time: 0:00:23 (1.4593 s / it)
creating index...
index created!
Test:  [0/4]  eta: 0:00:04  model_time: 0.3023 (0.3023)  evaluator_time: 0.0180 (0.0180)  time: 1.1761  data: 0.8388  max mem: 3553
Test:  [3/4]  eta: 0:00:01  model_time: 

Epoch: [39]  [ 0/16]  eta: 0:00:24  lr: 0.000303  loss: 0.1013 (0.1013)  loss_classifier: 0.0405 (0.0405)  loss_box_reg: 0.0583 (0.0583)  loss_objectness: 0.0002 (0.0002)  loss_rpn_box_reg: 0.0023 (0.0023)  time: 1.5511  data: 0.8660  max mem: 3553
Epoch: [39]  [10/16]  eta: 0:00:09  lr: 0.000303  loss: 0.1013 (0.1130)  loss_classifier: 0.0406 (0.0450)  loss_box_reg: 0.0599 (0.0645)  loss_objectness: 0.0003 (0.0006)  loss_rpn_box_reg: 0.0027 (0.0029)  time: 1.5079  data: 0.8122  max mem: 3553
Epoch: [39]  [15/16]  eta: 0:00:01  lr: 0.000303  loss: 0.1145 (0.1178)  loss_classifier: 0.0482 (0.0469)  loss_box_reg: 0.0628 (0.0668)  loss_objectness: 0.0004 (0.0009)  loss_rpn_box_reg: 0.0031 (0.0032)  time: 1.4588  data: 0.7836  max mem: 3553
Epoch: [39] Total time: 0:00:23 (1.4589 s / it)
creating index...
index created!
Test:  [0/4]  eta: 0:00:05  model_time: 0.3193 (0.3193)  evaluator_time: 0.0260 (0.0260)  time: 1.2782  data: 0.9128  max mem: 3553
Test:  [3/4]  eta: 0:00:01  model_time: 

Epoch: [43]  [ 0/16]  eta: 0:00:25  lr: 0.000226  loss: 0.1039 (0.1039)  loss_classifier: 0.0440 (0.0440)  loss_box_reg: 0.0573 (0.0573)  loss_objectness: 0.0004 (0.0004)  loss_rpn_box_reg: 0.0023 (0.0023)  time: 1.5930  data: 0.8698  max mem: 3553
Epoch: [43]  [10/16]  eta: 0:00:09  lr: 0.000226  loss: 0.1086 (0.1138)  loss_classifier: 0.0470 (0.0465)  loss_box_reg: 0.0608 (0.0634)  loss_objectness: 0.0003 (0.0010)  loss_rpn_box_reg: 0.0027 (0.0028)  time: 1.5431  data: 0.8504  max mem: 3553
Epoch: [43]  [15/16]  eta: 0:00:01  lr: 0.000226  loss: 0.1077 (0.1090)  loss_classifier: 0.0460 (0.0447)  loss_box_reg: 0.0577 (0.0605)  loss_objectness: 0.0002 (0.0008)  loss_rpn_box_reg: 0.0028 (0.0030)  time: 1.5006  data: 0.8247  max mem: 3553
Epoch: [43] Total time: 0:00:24 (1.5007 s / it)
creating index...
index created!
Test:  [0/4]  eta: 0:00:04  model_time: 0.2963 (0.2963)  evaluator_time: 0.0250 (0.0250)  time: 1.1550  data: 0.8167  max mem: 3553
Test:  [3/4]  eta: 0:00:01  model_time: 

Epoch: [47]  [ 0/16]  eta: 0:00:23  lr: 0.000146  loss: 0.0855 (0.0855)  loss_classifier: 0.0387 (0.0387)  loss_box_reg: 0.0439 (0.0439)  loss_objectness: 0.0003 (0.0003)  loss_rpn_box_reg: 0.0026 (0.0026)  time: 1.4924  data: 0.8097  max mem: 3553
Epoch: [47]  [10/16]  eta: 0:00:09  lr: 0.000146  loss: 0.0991 (0.1019)  loss_classifier: 0.0399 (0.0415)  loss_box_reg: 0.0578 (0.0559)  loss_objectness: 0.0004 (0.0016)  loss_rpn_box_reg: 0.0028 (0.0030)  time: 1.5046  data: 0.8208  max mem: 3553
Epoch: [47]  [15/16]  eta: 0:00:01  lr: 0.000146  loss: 0.1014 (0.1037)  loss_classifier: 0.0421 (0.0430)  loss_box_reg: 0.0577 (0.0566)  loss_objectness: 0.0004 (0.0014)  loss_rpn_box_reg: 0.0026 (0.0028)  time: 1.4762  data: 0.8081  max mem: 3553
Epoch: [47] Total time: 0:00:23 (1.4763 s / it)
creating index...
index created!
Test:  [0/4]  eta: 0:00:04  model_time: 0.3252 (0.3252)  evaluator_time: 0.0180 (0.0180)  time: 1.1668  data: 0.8096  max mem: 3553
Test:  [3/4]  eta: 0:00:01  model_time: 

Epoch: [51]  [ 0/16]  eta: 0:00:24  lr: 0.000075  loss: 0.1097 (0.1097)  loss_classifier: 0.0435 (0.0435)  loss_box_reg: 0.0627 (0.0627)  loss_objectness: 0.0009 (0.0009)  loss_rpn_box_reg: 0.0025 (0.0025)  time: 1.5174  data: 0.8368  max mem: 3553
Epoch: [51]  [10/16]  eta: 0:00:09  lr: 0.000075  loss: 0.1034 (0.0967)  loss_classifier: 0.0442 (0.0406)  loss_box_reg: 0.0571 (0.0530)  loss_objectness: 0.0002 (0.0004)  loss_rpn_box_reg: 0.0026 (0.0026)  time: 1.5227  data: 0.8385  max mem: 3553
Epoch: [51]  [15/16]  eta: 0:00:01  lr: 0.000075  loss: 0.1034 (0.1005)  loss_classifier: 0.0447 (0.0420)  loss_box_reg: 0.0549 (0.0551)  loss_objectness: 0.0003 (0.0005)  loss_rpn_box_reg: 0.0026 (0.0028)  time: 1.4801  data: 0.8162  max mem: 3553
Epoch: [51] Total time: 0:00:23 (1.4802 s / it)
creating index...
index created!
Test:  [0/4]  eta: 0:00:04  model_time: 0.3143 (0.3143)  evaluator_time: 0.0140 (0.0140)  time: 1.1831  data: 0.8298  max mem: 3553
Test:  [3/4]  eta: 0:00:01  model_time: 

[0.2561407796509985, 0.7170538347538868, 0.8022168945422985, 0.8182364292491296, 0.8189169486123484, 0.8665823712117383, 0.885649782917282, 0.8939673061761461, 0.9043052484806778, 0.9084504717089198, 0.908494443084369, 0.9062862278818025, 0.9213251879032199, 0.919884203622462, 0.9219203862017579, 0.9308516278235226, 0.9251075817236568, 0.9295478097226045, 0.9391501453604807, 0.9395376542771996, 0.9392005225137753, 0.9410197648772791, 0.9349451083157652, 0.9420245140008422, 0.9404114479516249, 0.9430141969079106, 0.9419283581601378, 0.9352670059697532, 0.939702651855829, 0.9419898130053648, 0.9444801523644456, 0.9432977745065704, 0.9422691674937763, 0.9415713152038018, 0.9445847237522988, 0.9414651966486002, 0.9444496500503944, 0.9465949506275756, 0.9395696011389569, 0.9401587749013797, 0.9432559516519325, 0.9405038435567574, 0.9425249347153525, 0.9385194996693542, 0.9494524765932226, 0.9405192210940944, 0.9449961200211119, 0.9445291590714616, 0.9433865712505178, 0.9446675926436532, 0.9

In [21]:
#summarize separated
#build a Faster R-CNN (ResNet-50 FPN backbone) and swap its box predictor for one with 3 outputs
#NOTE(review): 3 presumably means background + the two annotated classes - confirm against IdToClsSeperated
testmodel = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
in_features = testmodel.roi_heads.box_predictor.cls_score.in_features
testmodel.roi_heads.box_predictor = FastRCNNPredictor(in_features,3)
#summary() is defined in an earlier cell; presumably it loads the parameters stored under `filename` - TODO confirm
summary(testmodel,testimages,datasetSeperated,testTransforms,IdToClsSeperated,max_n=10,filename="./stored_models/modelSeperated",score_threshold=0.8)

In [22]:
#summarize merged
#same architecture as the separated model, but the replaced box predictor has only 2 outputs
#NOTE(review): 2 presumably means background + one merged class - confirm against IdToClsMerged
testmodel = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
in_features = testmodel.roi_heads.box_predictor.cls_score.in_features
testmodel.roi_heads.box_predictor = FastRCNNPredictor(in_features,2)
#summary() is defined in an earlier cell; presumably it loads the parameters stored under `filename` - TODO confirm
summary(testmodel,testimages,datasetMerged,testTransforms,IdToClsMerged,max_n=10,filename="./stored_models/modelMerged",score_threshold=0.8)

In [18]:
#rebuild both detectors and restore their trained parameters from ./stored_models
#the predictor head has to be replaced BEFORE load_state_dict so that the head shapes match the stored weights
modelSeperated = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
in_features = modelSeperated.roi_heads.box_predictor.cls_score.in_features
modelSeperated.roi_heads.box_predictor = FastRCNNPredictor(in_features,3)
modelSeperated.load_state_dict(torch.load("./stored_models/modelSeperated"))

modelMerged = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
in_features = modelMerged.roi_heads.box_predictor.cls_score.in_features
modelMerged.roi_heads.box_predictor = FastRCNNPredictor(in_features,2)
modelMerged.load_state_dict(torch.load("./stored_models/modelMerged"))

#optional side-by-side visualisation of both models (displayComparison is defined in an earlier cell)
#displayComparison(testimages,[modelSeperated,modelMerged],[IdToClsSeperated,IdToClsMerged],size=(14,10),annotations=None,gtLabels=None,threshold=0.2)

<All keys matched successfully>

In [24]:
#run the separated model over the test images; summarySeperated is defined in an earlier cell
#NOTE(review): presumably draws the predicted boxes per image, like summarySeperated2 but without OCR text - confirm
summarySeperated(modelSeperated,testimages,IdToClsSeperated,size=(12,10))

In [32]:
#write the prediction json files for all test images (createJsonsFor is defined in an earlier cell)
createJsonsFor(modelSeperated,testimages,paramfile="./stored_models/modelSeperated",splitOnSymbols=False)#for one file per image
#NOTE(review): the alternative below originally also passed splitOnSymbols=False, which made both variants identical
#createJsonsFor(modelSeperated,testimages,paramfile="./stored_models/modelSeperated",splitOnSymbols=True) #for one file per symbol

  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


In [18]:
#print dataset statistics for the annotations with separate classes
#(image count plus total/max/min number of boxes per class, see the output below)
summarizeData(imtoannlist)

Dataset comprises 77 images
Number of wappen: 728
Maximum number of wappen: 16
Minimum number of wappen: 1
Number of text: 970
Maximum number of text: 19
Minimum number of text: 1


In [25]:
#rebuild the 3-output detector and display its module structure as the cell result
#NOTE(review): this rebinds modelSeperated to a model WITHOUT the stored weights loaded;
#re-run the loading cell before predicting with it
modelSeperated = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
in_features = modelSeperated.roi_heads.box_predictor.cls_score.in_features
modelSeperated.roi_heads.box_predictor = FastRCNNPredictor(in_features,3)
modelSeperated

In [19]:
#print dataset statistics for the annotations with the merged class
#(image count plus total/max/min number of 'objekt' boxes, see the output below)
summarizeData(imToAnnMerged)

Dataset comprises 77 images
Number of objekt: 728
Maximum number of objekt: 16
Minimum number of objekt: 1


In [26]:
#show the first 4 training images with their annotated bounding boxes drawn in the class colours
printImages(images,imtoann,max_n=4)

In [19]:
#this concludes the object detection part, now do ocr to recover texts
#imports
#!pip install pytesseract

import pytesseract
import math
import re


In [20]:
#create greyscale copies of all images as required for ocr
def greyImages(outdir,images,greyimages):
    """Write a greyscale copy of every image in ``images`` into ``outdir``.

    outdir:     target directory, created if it does not exist yet
    images:     iterable of image paths (str or path-like)
    greyimages: list that receives the path of every written copy, in the
                same order as ``images`` (callers index both lists in parallel)

    Raises IOError if an image cannot be read or its copy cannot be written.
    """
    #cv2.imwrite does not create missing directories, it only returns False
    if outdir:
        os.makedirs(outdir,exist_ok=True)

    for image in images:
        imgname=os.path.basename(os.path.normpath(image))
        outname=outdir+"/"+imgname
        img=cv2.imread(str(image))
        #cv2.imread signals failure by returning None instead of raising
        if img is None:
            raise IOError("could not read image: "+str(image))
        grey=cv2.cvtColor(img,cv2.COLOR_BGR2GRAY)

        if not cv2.imwrite(outname,grey):
            raise IOError("could not write greyscale image: "+outname)
        greyimages.append(outname)
        
#creates several images for a given greyscale image and a list of textboxes with the format [minx,miny,maxx,maxy]
#returns their filenames as a list
def createSlices(greyimage,textboxes,outdir="slices"):
    """Cut every text box out of ``greyimage`` and store it as its own jpg.

    greyimage: path of the image to cut from
    textboxes: iterable of boxes [minx,miny,maxx,maxy] in pixel coordinates
    outdir:    directory for the slice files, created if it does not exist yet

    Returns the slice file names; entry i belongs to textboxes[i].
    Raises IOError if ``greyimage`` cannot be read.
    """
    imgname=os.path.basename(os.path.normpath(greyimage))
    img=cv2.imread(greyimage)
    if img is None:
        raise IOError("could not read image: "+str(greyimage))
    if outdir:
        os.makedirs(outdir,exist_ok=True)

    height,width=img.shape[:2]
    slices=[]
    for index,box in enumerate(textboxes):
        minX,minY,maxX,maxY=box
        #clamp to the image: a negative start index would wrap around in numpy
        #slicing and yield an empty or wrong region
        minX=max(math.floor(minX),0)
        minY=max(math.floor(minY),0)
        maxX=min(math.ceil(maxX),width)
        maxY=min(math.ceil(maxY),height)
        outname=outdir+"/"+imgname.replace(".jpg","")+"-"+str(index)+".jpg"
        #numpy images are indexed [row,column], i.e. [y,x]
        sl=img[minY:maxY, minX:maxX]
        cv2.imwrite(outname,sl)
        slices.append(outname)
    return slices

#finds boundaries for text boxes from annotation dict
def textBoxesFromDict(image,annotations):
    """Return the bounding boxes of all annotations of ``image`` whose class is 'text'.

    ``annotations`` is looked up with the bare file name of ``image`` (directory
    part dropped); each entry is iterated as (bbox, class) pairs.
    """
    fileName=os.path.normpath(image).split(os.sep)[-1]
    return [bbox for bbox,cls in annotations[fileName] if cls=='text']

def filterBoxes(fboxes,flabels):
    """Return the boxes of ``fboxes`` whose label at the same position maps to 'text'.

    The label ids are translated through the notebook-level IdToClsSeperated mapping.
    """
    return [bbox for pos,bbox in enumerate(fboxes) if IdToClsSeperated[flabels[pos]]=='text']
    


#finds boundaries for text boxes from model prediction
def textBoxesFromModel(image,model):
    """Predict the text boxes of ``image`` with ``model``.

    The image is converted to RGB, transformed with testTransforms and passed
    to the model in eval mode. The raw prediction is post-processed by
    filerConfidenceGap, convertBoxesToOriginalSize and filterOverlappingBoxes
    (all defined elsewhere in the notebook) and finally reduced to the boxes
    labelled 'text'.

    Returns the list of text boxes as produced by filterBoxes.
    """
    model.eval()

    #convert() returns an independent copy, so the file handle can be released right away
    with Image.open(image) as rawImg:
        img=rawImg.convert("RGB")
    imgTensor=testTransforms(img)
    #pure inference: do not record an autograd graph for the forward pass
    with torch.no_grad():
        prediction=model([imgTensor])

    fboxes,flabels,fconfidences=filerConfidenceGap(prediction[0]["boxes"],prediction[0]["labels"],prediction[0]["scores"])
    #NOTE(review): (1024,1024) presumably is the size testTransforms resizes to - confirm
    fboxes=convertBoxesToOriginalSize(fboxes,image,transformedSize=(1024,1024))
    fboxes,flabels=filterOverlappingBoxes(fboxes,flabels,fconfidences,threshold=0.2)

    tboxes=filterBoxes(fboxes,flabels)

    return tboxes

#creates slice images for all text boxes of given image list from annotation dictionary
def slicesFromImages(images,annotations,outdir="greyimages"):
    """Grey all ``images`` into ``outdir`` and slice out their annotated text boxes.

    Returns the file names of all created slices, image after image.
    """
    greyPaths=[]
    greyImages(outdir,images,greyPaths)

    collected=[]
    for greyPath in greyPaths:
        textBoxes=textBoxesFromDict(greyPath,annotations)
        collected.extend(createSlices(greyPath,textBoxes))
    return collected

#creates slice images for all text boxes of given image list from model predictions
def slicesFromImagesAndModel(images,model,outdir="greyimages"):
    """Grey all ``images`` into ``outdir`` and slice out the text boxes predicted by ``model``.

    Returns the file names of all created slices, image after image.
    """
    greyPaths=[]
    greyImages(outdir,images,greyPaths)

    collected=[]
    for greyPath in greyPaths:
        textBoxes=textBoxesFromModel(greyPath,model)
        collected.extend(createSlices(greyPath,textBoxes))
    return collected
        

In [110]:
#functionalities that allow for a prediction with the use of tesseract ocr

#takes an image slice as input and rescales it to a better format for text detection
def rescaleImg(image):
    """Rescale a slice with rescaleHeight and rescaleWidth, then pad it.

    The padding is 5% of the rescaled height (top/bottom) and 5% of the
    rescaled width (left/right), rounded up, filled by replicating the edge pixels.
    """
    resized=rescaleWidth(rescaleHeight(image))
    rows,cols=resized.shape[0:2]
    padY=math.ceil(0.05*rows)
    padX=math.ceil(0.05*cols)
    return cv2.copyMakeBorder(resized,top=padY,bottom=padY,left=padX,right=padX,borderType=cv2.BORDER_REPLICATE)

#rescales height such that single characters should always be visible
def rescaleHeight(image,optimalHeight=300.0):
    """Scale ``image`` so that its height reaches ``optimalHeight``, keeping the aspect ratio.

    Images that are already taller than ``optimalHeight`` are returned unchanged
    (the very same object).
    """
    rows,cols=image.shape[0:2]
    if rows>optimalHeight:
        return image

    factor=optimalHeight/rows
    #cv2.resize expects the target size as (width,height)
    target=(math.ceil(cols*factor),math.ceil(rows*factor))
    #NOTE(review): INTER_AREA is usually recommended for shrinking; confirm it is intended for this enlargement
    return cv2.resize(image, target, interpolation = cv2.INTER_AREA)

#scales an image of text such that at least a minimum textlength is achieved
def rescaleWidth(image, minimalWidth=600.0):
    """Scale ``image`` so that its width reaches ``minimalWidth``, keeping the aspect ratio.

    Images that are already wider than ``minimalWidth`` are returned unchanged
    (the very same object).
    """
    rows,cols=image.shape[0:2]
    if cols>minimalWidth:
        return image

    factor=minimalWidth/cols
    #cv2.resize expects the target size as (width,height)
    target=(math.ceil(cols*factor),math.ceil(rows*factor))
    #NOTE(review): INTER_AREA is usually recommended for shrinking; confirm it is intended for this enlargement
    return cv2.resize(image, target, interpolation = cv2.INTER_AREA)

#deskewing function, takes a binarized cv2 image as input
#returns a version of this image with an affine rotation transformation applied to it
def deskewImg(img):
    """Rotate ``img`` about its centre by an angle derived from the minimum-area box of its foreground.

    Foreground here means every pixel that is non-zero after inverting the image.
    The result has the same width and height as the input; areas uncovered by the
    rotation are filled by replicating the border pixels.

    NOTE(review): the angle range returned by cv2.minAreaRect depends on the OpenCV
    version, so the ``angle<-45`` correction should be checked against the installed
    version - TODO confirm.
    """
    #create inverse for further processing
    inverse=cv2.bitwise_not(img.copy())
    #get all foreground pixel
    #np.where yields (row,column) indices, so every point is stored as (y,x)
    foreground=np.column_stack(np.where(inverse>0))
    #find rotated box that contains all foreground pixel with a minimal area
    #the last element of the minAreaRect result is the rotation angle of that box
    #NOTE(review): presumably fails for a slice without any foreground pixel - verify
    angle=cv2.minAreaRect(foreground)[-1]
    #adjust angle to account for desired rotation
    #print(angle)
    angle=-angle
    if angle<-45:
        angle=-angle-90
    
    #print(angle)
    
    #find center
    (height,width)=img.shape[:2]
    centre=(width//2,height//2)
    #rotate
    #scale factor 1.0: pure rotation around the image centre
    rotMat=cv2.getRotationMatrix2D(centre,angle,1.0)
    rotatedOriginal=cv2.warpAffine(img,rotMat,(width,height),flags=cv2.INTER_CUBIC,borderMode=cv2.BORDER_REPLICATE)
    
    return rotatedOriginal

#adds additional pre processing steps to the given slices
def preprocessSlices(slices,binarization=True,removeNoise=True,deskew=True,dilate=True,erode=True,rescale=True):
    """Run the enabled pre-processing steps on every slice file and overwrite it in place.

    slices: iterable of image file names
    The steps always run in this fixed order, each one only if its flag is set:
    rescale, binarization, removeNoise, dilate, erode, deskew.
    """
    noiseKernel=np.ones((1,1),np.uint8)
    strokeKernel=np.ones((2,2),np.uint8)

    for slicePath in slices:
        #flag 0 loads the slice as a single-channel greyscale image
        page=cv2.imread(slicePath,0)

        if rescale:
            page=rescaleImg(page)

        if binarization:
            #block size 197 with constant 23; an earlier experiment used 141 and 13
            page=cv2.adaptiveThreshold(page, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 197, 23)

        if removeNoise:
            #NOTE(review): with a 1x1 kernel the three morphological calls presumably
            #leave the image unchanged, so only the bilateral filter smooths - confirm
            page=cv2.dilate(page,noiseKernel,iterations=1)
            page=cv2.erode(page,noiseKernel,iterations=1)
            page=cv2.morphologyEx(page,cv2.MORPH_CLOSE,noiseKernel)
            page=cv2.bilateralFilter(page,11, 90, 90,cv2.BORDER_DEFAULT)

        if dilate:
            #the operation is applied to the inverted image, then inverted back
            page=cv2.bitwise_not(cv2.dilate(cv2.bitwise_not(page),strokeKernel,iterations=1))

        if erode:
            #the operation is applied to the inverted image, then inverted back
            page=cv2.bitwise_not(cv2.erode(cv2.bitwise_not(page),strokeKernel,iterations=1))

        if deskew:
            page=deskewImg(page)

        cv2.imwrite(slicePath,page)
    

#actual prediction function
def textForImage(greyimage,model=None,annotations=None,boxes=[],preproc=[True,True,True,True,True,True]):
    """Run tesseract on every text box of ``greyimage``.

    greyimage:   path of the greyscale image
    boxes:       text boxes [minx,miny,maxx,maxy]; if empty they are taken from
                 ``model`` (preferred) or from the ``annotations`` dict
    preproc:     six flags handed positionally to preprocessSlices:
                 binarization, removeNoise, deskew, dilate, erode, rescale

    Returns a dict mapping str(box), with the coordinates rounded to 4 digits,
    to the raw string returned by tesseract. An empty dict is returned when no
    boxes, model or annotations are given.
    """
    box2text={}

    #the default lists are only read, never mutated, so sharing them between calls is harmless
    if len(boxes)==0:
        if model is not None:
            boxes=textBoxesFromModel(greyimage,model)
        elif annotations is not None:
            boxes=textBoxesFromDict(greyimage,annotations)
        else:
            print("no model or annotation dict given")
            return box2text

    slices=createSlices(greyimage,boxes)

    #configuration
    config="--psm 6 --dpi 450"

    preprocessSlices(slices,preproc[0],preproc[1],preproc[2],preproc[3],preproc[4],preproc[5])

    #createSlices returns one slice per box, in box order
    for slicePath,rawBox in zip(slices,boxes):
        box=[round(rawBox[k],4) for k in range(4)]
        #close the slice file again as soon as tesseract is done with it
        with Image.open(slicePath) as img:
            text=pytesseract.image_to_string(img,config=config,lang="deu")
        box2text[str(box)]=text

    return box2text

#function to perform ocr and dump data into a dataframe along some metadata
def dataForImage(greyimage,model=None,annotations=None,boxes=[],preproc=[True,True,True,True,True,True]):
    """Run tesseract on every text box of ``greyimage`` and keep a confidence per box.

    greyimage:   path of the greyscale image
    boxes:       text boxes [minx,miny,maxx,maxy]; if empty they are taken from
                 ``model`` (preferred) or from the ``annotations`` dict
    preproc:     six flags handed positionally to preprocessSlices:
                 binarization, removeNoise, deskew, dilate, erode, rescale

    Returns a dict mapping str(box), with the coordinates rounded to 4 digits,
    to a tuple (text, confidence). Text and confidence are taken from block 1
    of the tesseract result; a box without such a block yields ("", 0.).
    An empty dict is returned when no boxes, model or annotations are given.
    """
    box2text={}

    #the default lists are only read, never mutated, so sharing them between calls is harmless
    if len(boxes)==0:
        if model is not None:
            boxes=textBoxesFromModel(greyimage,model)
        elif annotations is not None:
            boxes=textBoxesFromDict(greyimage,annotations)
        else:
            print("no model or annotation dict given")
            return box2text

    slices=createSlices(greyimage,boxes)

    #configuration
    config="--psm 6 --dpi 450"

    preprocessSlices(slices,preproc[0],preproc[1],preproc[2],preproc[3],preproc[4],preproc[5])

    #createSlices returns one slice per box, in box order
    for slicePath,rawBox in zip(slices,boxes):
        box=[round(rawBox[k],4) for k in range(4)]
        #close the slice file again as soon as tesseract is done with it
        with Image.open(slicePath) as img:
            df = pytesseract.image_to_data(img,config=config,lang="deu",output_type='data.frame')

        #break down dataframe to a text and a confidence value
        #NOTE(review): rows with conf -1 presumably are layout entries rather than recognised words - confirm
        df=df[df.conf>-1.]
        #select block 1 by value; indexing a groupby result with [1] raised KeyError when that block was missing
        block=df[df.block_num==1]

        text=""
        confidence=0.
        if len(block)!=0:
            #the text column can hold NaN or numbers after parsing, which str.join rejects
            tokens=[]
            for token in block['text']:
                if pd.isna(token):
                    continue
                if isinstance(token,float) and token.is_integer():
                    #a purely numeric word parsed as float: drop the artificial ".0"
                    token=int(token)
                tokens.append(str(token))
            text=" ".join(tokens)
            confidence=block['conf'].mean()
            if(confidence==0.):
                #TODO this is a temporary decision to prevent div/0 but still have to find out why so many good predictions are zero
                confidence=1.0

        box2text[str(box)]=(text,confidence)

    return box2text
    

def getConsensusText(greyimage,model=None,annotations=None,boxes=[],configurations=[[True,True,True,True,True,True],[True,False,True,False,False,True],[True,True,True,False,True,True],[True,True,True,True,False,True]],technique="MAX"):
    """Run dataForImage once per pre-processing configuration and pick one text per box.

    boxes:          text boxes; if empty they are taken from ``model`` (preferred)
                    or from the ``annotations`` dict
    configurations: list of flag lists, each passed to dataForImage as ``preproc``
    technique:      only "MAX" is implemented: the text with the highest
                    confidence above 0 wins, the first one on ties; for any
                    other value the returned dict stays empty

    Returns a dict mapping the box key (as produced by dataForImage) to the chosen text.
    """
    if(len(boxes)==0):
        if(model!=None):
            boxes=textBoxesFromModel(greyimage,model)
        elif(annotations!=None):
            boxes=textBoxesFromDict(greyimage,annotations)
        else:
            print("no model or annotation dict given")
            return {}

    #get predictions for every configuration
    perConfig=[dataForImage(greyimage,boxes=boxes,preproc=flags) for flags in configurations]

    #attempt to find a consensus prediction
    consensus={}
    for boxKey in perConfig[0]:
        candidates=[result[boxKey] for result in perConfig]
        if technique!="MAX":
            continue

        bestConf=0.
        bestText=""
        for candidateText,candidateConf in candidates:
            if candidateConf>bestConf:
                bestConf=candidateConf
                bestText=candidateText
        consensus[boxKey]=bestText

    return consensus

In [54]:
#some methods for displaying images with annotated text

def findLinkingText(textdict,box,threshold=1.0):
    """Return the entry of ``textdict`` whose key describes (almost) the same box as ``box``.

    The keys are strings of the form '[minx, miny, maxx, maxy]'. A key matches if
    the summed absolute difference of its four coordinates to ``box`` is below
    ``threshold``; the first matching key in dict order wins. Returns None if no
    key matches.
    """
    for key,value in textdict.items():
        coords=[float(part) for part in key.strip('[').strip(']').split(',')]
        distance=abs(box[0]-coords[0])
        for pos in (1,2,3):
            distance=distance+abs(box[pos]-coords[pos])
        if distance<threshold:
            return value

    return None


def summarySeperated2(model,images,labelDict,size=(12,10),max_n=4):
    """Show the first ``max_n`` images with predicted boxes and the recognised text.

    model:     detection model, switched to eval mode here
    images:    image paths; greyscale copies of all of them are written to "greyimages"
    labelDict: maps a label id to its class name (a key of the notebook-level colordict)
    size:      figure size handed to printImg

    For every box labelled "text" the consensus OCR result is drawn in red,
    provided a text could be linked to the box.
    """
    model.eval()
    greyimages=[]
    greyImages("greyimages",images,greyimages)

    for row in range(min(len(images),max_n)):
        #get predictions and postprocess
        image=images[row]
        #convert() returns an independent copy, so the file handle can be released right away
        with Image.open(image) as rawImg:
            img=rawImg.convert("RGB")
        imgTensor=testTransforms(img)
        #pure inference: do not record an autograd graph for the forward pass
        with torch.no_grad():
            prediction=model([imgTensor])

        fboxes,flabels,fconfidences=filerConfidenceGap(prediction[0]["boxes"],prediction[0]["labels"],prediction[0]["scores"])

        fboxes=convertBoxesToOriginalSize(fboxes,image,transformedSize=(1024,1024))
        fboxes,flabels=filterOverlappingBoxes(fboxes,flabels,fconfidences,threshold=0.2)
        annList=(torch.tensor(fboxes),torch.tensor(flabels))

        #get text predictions
        textdict=getConsensusText(greyimages[row],boxes=filterBoxes(fboxes,flabels))
        textdict=filterUnexpectedCharacters(textdict)

        #the image is displayed in its original mode, not the RGB copy used for prediction
        with Image.open(image) as shown:
            ax=printImg(shown,ax=None,size=size)

        for i in range(len(annList[0])):
            bb=annList[0][i]
            label=annList[1][i]
            labeltxt=labelDict[int(label)]
            color=colordict[labeltxt]
            drawBB(ax,(bb[0],bb[1]),(bb[2],bb[3]),color)
            if labeltxt=="text":
                txt=findLinkingText(textdict,bb)
                #findLinkingText returns None if no text belongs to this box
                if txt is not None:
                    drawText(ax,bb[0],bb[1]+100,txt,"red")
            

In [55]:
#some methods for post processing
def filterUnexpectedCharacters(textdict):
    """Clean every OCR text in ``textdict`` in place and return the same dict.

    For every value:
      * all characters of the ``unexpectedChars`` class are removed,
      * one leading "-" or "|" is stripped from every word,
      * a word is kept only if it contains a digit or an upper-case letter
        (including the umlauts) or if it is exactly "v.",
      * the kept words of a line are joined with single spaces and the lines
        with newlines.

    The separators are placed by position. Earlier they were decided by comparing
    values with the first word / last line, which glued repeated words together
    ("DE LA DE" became "DE LADE"), left a leading space when the first word was
    dropped and lost the newline after a line equal to the last one.
    """
    unexpectedChars='[!§$%&/?<>~;:|@°^®_%=©}{]'

    chars = set('0123456789QWERTZUIOPÜASDFGHJKLÖÄYXCVBNM')

    for key in textdict:
        #filter unexpected characters
        filteredEntry=re.sub(unexpectedChars,'',textdict[key])

        #make sure every word does at least contain one character
        cleanedLines=[]
        for line in filteredEntry.split("\n"):
            keptWords=[]
            for word in line.split(" "):
                if len(word)==0:
                    continue
                if word[0]=="-" or word[0]=="|":
                    word=word[1:]
                if any((c in chars) for c in word) or word=="v.":
                    keptWords.append(word)
            cleanedLines.append(" ".join(keptWords))

        #assigning to an existing key does not disturb the iteration over the dict
        textdict[key]="\n".join(cleanedLines)
    return textdict
        

In [56]:
#adjusted methods to create the output json files 

#create a json file for image and model with annotated text
def createJson2(image,model,fname,greyimage):
    """Predict boxes and texts for ``image`` and dump them into the json file ``fname``.

    image:     path of the original image (used for prediction)
    model:     detection model; it is used as passed in, the mode is not changed here
    fname:     path of the json file to write
    greyimage: path of the greyscale copy of ``image`` (used for OCR)

    The json holds "filename" and a list "regions": first one entry of class
    "merged" per box returned by mergeBoxes, then one entry per predicted box
    that isIn() does not find in the merged list.
    """
    def asFloats(coords):
        #plain floats are needed because tensor elements cannot be dumped to json
        return [float(coords[0]),float(coords[1]),float(coords[2]),float(coords[3])]

    imgname=os.path.normpath(image).split(os.sep)[-1]
    data={}
    data["filename"]=imgname
    data["regions"]=[]

    #convert() returns an independent copy, so the file handle can be released right away
    with Image.open(image) as rawImg:
        img=rawImg.convert("RGB")
    imgTensor=testTransforms(img)
    #pure inference: do not record an autograd graph for the forward pass
    with torch.no_grad():
        prediction=model([imgTensor])

    fboxes,flabels,fconfidences=filerConfidenceGap(prediction[0]["boxes"],prediction[0]["labels"],prediction[0]["scores"])
    fboxes=convertBoxesToOriginalSize(fboxes,image,transformedSize=(1024,1024))
    fboxes,flabels=filterOverlappingBoxes(fboxes,flabels,fconfidences,threshold=0.2)
    annList1=(torch.tensor(fboxes),torch.tensor(flabels))

    #recover text
    textdict=textForImage(greyimage,boxes=filterBoxes(fboxes,flabels))
    textdict=filterUnexpectedCharacters(textdict)

    #mergeBoxes fills mergedList; entry i is read below as (symbol box, text box or None) of merged box i
    mergedList=[]
    fboxes,flabels=mergeBoxes(fboxes,flabels,maxYdown=100,maxYup=800,maxXdiff=300,mergedList=mergedList)
    annList2=(torch.tensor(fboxes),torch.tensor(flabels))

    for i in range (len(annList2[0])):
        curdict={}
        curdict["class"]="merged"
        box=annList2[0][i]
        text=mergedList[i][1]
        symbol=mergedList[i][0]
        curdict["box"]=asFloats(box)
        if text is not None:
            curdict["textbox"]=asFloats(text)
            curdict["text"]=findLinkingText(textdict,asFloats(text))

        curdict["symbolbox"]=asFloats(symbol)
        data["regions"].append(curdict)

    #take care of the artifacts that were not merged
    for j in range(len(annList1[0])):
        #index with the loop variable of THIS loop; using the stale index of the
        #loop above made every iteration look at the same box
        box=annList1[0][j]
        cls=annList1[1][j]
        if not isIn(box,mergedList):
            curdict={}
            if(cls==1):
                curdict["class"]="symbol"
            if(cls==2):
                curdict["class"]="text"
                curdict["text"]=findLinkingText(textdict,asFloats(box))
            curdict["box"]=asFloats(box)
            data["regions"].append(curdict)

    #open the file only once the data is complete and make sure it is flushed and closed
    with open(fname,"w+") as op:
        json.dump(data,op)
    
    
#create a distinct json file for each region of interest in image
def createIndividualJsons2(image,model,fname,greyimage):
    """Predict boxes and texts for ``image`` and write one json file per region.

    image:     path of the original image (used for prediction)
    model:     detection model; it is used as passed in, the mode is not changed here
    fname:     common prefix of the output files
    greyimage: path of the greyscale copy of ``image`` (used for OCR)

    Written files:
      fname-merged-<i>.json    merged box i with its symbol box and, if present,
                               its text box and text
      fname-fragment-<n>.json  a predicted box that isIn() does not find in the
                               merged list
    """
    def asFloats(coords):
        #plain floats are needed because tensor elements cannot be dumped to json
        return [float(coords[0]),float(coords[1]),float(coords[2]),float(coords[3])]

    imgname=os.path.normpath(image).split(os.sep)[-1]

    #convert() returns an independent copy, so the file handle can be released right away
    with Image.open(image) as rawImg:
        img=rawImg.convert("RGB")
    imgTensor=testTransforms(img)
    #pure inference: do not record an autograd graph for the forward pass
    with torch.no_grad():
        prediction=model([imgTensor])

    fboxes,flabels,fconfidences=filerConfidenceGap(prediction[0]["boxes"],prediction[0]["labels"],prediction[0]["scores"])
    fboxes=convertBoxesToOriginalSize(fboxes,image,transformedSize=(1024,1024))
    fboxes,flabels=filterOverlappingBoxes(fboxes,flabels,fconfidences,threshold=0.2)
    annList1=(torch.tensor(fboxes),torch.tensor(flabels))

    #recover text
    textdict=textForImage(greyimage,boxes=filterBoxes(fboxes,flabels))
    textdict=filterUnexpectedCharacters(textdict)

    #mergeBoxes fills mergedList; entry i is read below as (symbol box, text box or None) of merged box i
    mergedList=[]
    fboxes,flabels=mergeBoxes(fboxes,flabels,maxYdown=100,maxYup=800,maxXdiff=300,mergedList=mergedList)
    annList2=(torch.tensor(fboxes),torch.tensor(flabels))

    #loop through every merged box
    for i in range (len(annList2[0])):
        bbox=annList2[0][i]
        data={}
        outfile=fname+"-merged-"+str(i)+".json"
        data["filename"]=imgname
        data["region"]=[]
        curdict={}
        curdict["class"]="merged"
        curdict["bbox"]=asFloats(bbox)
        data["region"].append(curdict)
        originSymbol=mergedList[i][0]
        curdict={}
        curdict["class"]="symbol"
        curdict["bbox"]=asFloats(originSymbol)
        data["region"].append(curdict)
        originText=mergedList[i][1]
        if originText is not None:
            curdict={}
            curdict["class"]="text"
            curdict["bbox"]=asFloats(originText)
            curdict["text"]=findLinkingText(textdict,asFloats(originText))
            data["region"].append(curdict)
        #the with block flushes and closes every file; the handles used to be left open
        with open(outfile,"w+") as op:
            json.dump(data,op)

    idx=0
    for i in range (len(annList1[0])):
        box=annList1[0][i]
        cls=annList1[1][i]
        #search if box was already covered as part of the merged boxes
        if not isIn(box,mergedList):
            data={}
            outfile=fname+"-fragment-"+str(idx)+".json"
            idx+=1
            data["filename"]=imgname
            data["region"]=[]
            curdict={}
            if(cls==1):
                curdict["class"]="symbol"
            if(cls==2):
                curdict["class"]="text"
                curdict["text"]=findLinkingText(textdict,asFloats(box))
            curdict["bbox"]=asFloats(box)
            data["region"].append(curdict)
            with open(outfile,"w+") as op:
                json.dump(data,op)


#create json files for each image in images
def createJsonsFor2(model,images,paramfile="./stored_models/currentModel",splitOnSymbols=False):
    """Load the parameters from ``paramfile`` into ``model`` and write the json output for all ``images``.

    Greyscale copies of all images are written to "greyimages" first. The output
    goes to "./outputfiles/", named after the part of the image file name before
    its first dot: one "<name>.json" per image, or one file per region when
    ``splitOnSymbols`` is set.
    """
    greyPaths=[]
    greyImages("greyimages",images,greyPaths)

    model.load_state_dict(torch.load(paramfile))
    model.eval()

    for idx,image in enumerate(images):
        fileName=os.path.normpath(image).split(os.sep)[-1]
        target="./outputfiles/"+fileName.split('.')[0]
        if not splitOnSymbols:
            createJson2(image,model,target+".json",greyPaths[idx])
            continue
        createIndividualJsons2(image,model,target,greyPaths[idx])

In [25]:
# Hand an empty list to greyImages together with the folder name "greyimages"
# and the training/validation image list. Later cells index into the list
# (greyimages[21]), so the helper presumably fills it in place, one entry per
# image — TODO confirm against the greyImages definition.
greyimages=[]
greyImages("greyimages",images,greyimages)

In [101]:
# Run textForImage on entry 21 of greyimages with modelSeperated and display
# the result. The displayed value is a dict keyed by a bounding-box string
# "[x0, y0, x1, y1]"; every value is a text string that still carries a
# trailing '\n\x0c'.
tdict=textForImage(greyimages[21],model=modelSeperated)
tdict

{'[2162.9091, 1228.1789, 2474.511, 1293.9924]': 'BACOURT.\n\x0c',
 '[1222.8478, 2151.8998, 1588.344, 2213.0397]': 'BECRINGEN _\n\x0c',
 '[369.5807, 139.1463, 636.8027, 209.6558]': 'Dd. HA\n\x0c',
 '[919.1738, 4.4321, 2142.8652, 141.4809]': 'L0THRINGER ADEL.\n\x0c',
 '[2382.4354, 160.4886, 2565.6081, 229.6756]': 'Taf 6\n\x0c',
 '[1207.6285, 3095.6102, 1625.9805, 3162.5756]': 'BETTSTEIN.\n\x0c',
 '[2022.824, 3109.6707, 2486.6248, 3171.3666]': 'Mas .v. BETT STEIN\n\x0c',
 '[291.4516, 1212.0064, 931.1501, 1280.4587]': '_ASPERMONT-LYN DENIT _\n\x0c',
 '[1237.0659, 255.549, 1707.9508, 335.9361]': '„ASPERMONT.L\n\x0c',
 '[504.3212, 259.5691, 814.3831, 325.2813]': 'ASMENTZ\n\x0c',
 '[1085.2256, 1214.5183, 1758.4462, 1289.4415]': '_RECKHEIM-ASPERMONI_\n\x0c',
 '[2021.8765, 273.6957, 2648.0978, 343.6568]': 'ASPERMONT-IYNDEN I\n\x0c',
 '[462.5565, 3093.4079, 804.4015, 3158.0508]': '„BER G ‚18 Sf.\n\x0c',
 '[2197.4563, 2166.4618, 2437.9559, 2229.3268]': 'BER Gr. a\n\x0c',
 '[486.8146, 2145.1065, 7

In [113]:
# Same image (entry 21 of greyimages) through getConsensusText. The displayed
# dict uses bounding-box string keys like the textForImage cell; the values no
# longer end in '\n\x0c' and a few readings differ.
# NOTE(review): despite the name `df`, the displayed value is a plain dict,
# not a DataFrame.
df=getConsensusText(greyimages[21],model=modelSeperated)
df

{'[2162.9091, 1228.1789, 2474.511, 1293.9924]': 'BACOURT.',
 '[1222.8478, 2151.8998, 1588.344, 2213.0397]': 'BECRINGEN _',
 '[369.5807, 139.1463, 636.8027, 209.6558]': 'Dd. HA',
 '[919.1738, 4.4321, 2142.8652, 141.4809]': 'L0THRINGER ADEL.-',
 '[2382.4354, 160.4886, 2565.6081, 229.6756]': 'Taf 6',
 '[1207.6285, 3095.6102, 1625.9805, 3162.5756]': 'BETTSTEIN.',
 '[2022.824, 3109.6707, 2486.6248, 3171.3666]': 'Mqs .y.BETTSTEIN',
 '[291.4516, 1212.0064, 931.1501, 1280.4587]': '_ASPERMONT-LYN DENIT _',
 '[1237.0659, 255.549, 1707.9508, 335.9361]': 'ASPHRMONT.L.',
 '[504.3212, 259.5691, 814.3831, 325.2813]': 'ASMENTZ',
 '[1085.2256, 1214.5183, 1758.4462, 1289.4415]': 'ZRECKHEIM-ASPERMONT_',
 '[2021.8765, 273.6957, 2648.0978, 343.6568]': 'ASPERMONT-IYNDEN I',
 '[462.5565, 3093.4079, 804.4015, 3158.0508]': '„BERG ‚188%.',
 '[2197.4563, 2166.4618, 2437.9559, 2229.3268]': 'BERG. _',
 '[486.8146, 2145.1065, 709.7056, 2207.7127]': 'HAYON.'}

In [115]:
# Call summarySeperated2 with modelSeperated on test images 20..39 and the
# IdToClsSeperated mapping. size and max_n are passed through as given; their
# exact meaning is defined by summarySeperated2 — presumably figure size and
# the maximum number of images shown, TODO confirm.
summarySeperated2(modelSeperated,testimages[20:40],IdToClsSeperated,size=(12,10),max_n=10)

In [237]:
# Export test images 20..39: createJsonsFor2 loads the weights stored in
# ./stored_models/modelSeperated into modelSeperated and, because
# splitOnSymbols=False, calls createJson2 once per image with a target of the
# form ./outputfiles/<image name>.json.
createJsonsFor2(modelSeperated,testimages[20:40],paramfile="./stored_models/modelSeperated",splitOnSymbols=False)