# Find Sumerian (OCR Part Only)

### This notebook is an exemplar which demonstrates transferring zip files between a bDrive folder and Savio scratch to run OCR on images using Tesseract (inside a Singularity container)

( tested with boxsdk (2.0.0a2) on python 3.5 kernel)
pip install -Iv boxsdk==2.0.0a2 

_This software is available under the terms of the Educational Community License, Version 2.0 (ECL 2.0). This software is Copyright 2016 The Regents of the University of California, Berkeley ("Berkeley")._

The text of the ECL license is reproduced below.

Educational Community License, Version 2.0
*************************************
Copyright 2017 The Regents of the University of California, Berkeley ("Berkeley")

Educational Community License, Version 2.0, April 2007

The Educational Community License version 2.0 ("ECL") consists of the
Apache 2.0 license, modified to change the scope of the patent grant in
section 3 to be specific to the needs of the education communities using
this license. The original Apache 2.0 license can be found at:[http://www.apache.org/licenses/LICENSE-2.0]

## This notebook is incomplete.

### Notebook configuration section
Target and source directories, script file names, and other settings used as parameters in the processing below.

In [None]:
# Modify variables in this cell
# These are the per-user / per-run settings read by the cells below.
username = 'nicolaschan' # Put your savio username here
project_name = 'aanderson' # Name of directory in your scratch directory for data
number_to_process = 1 # Number of PDFs to process
ocr_time = '03:00:00' # String for time for OCR job; e.g., '03:00:00' = 3 hours
# Text file listing the PDFs, one per line; the text after the last ';' on each line is used as the PDF name.
all_files_list = '/global/scratch/groups/dh/aanderson/all_files.txt'
# Directory that the resulting .hocr files are copied into by the last cell.
ocr_output_dir = '/global/scratch/groups/dh/aanderson/results/'
# Directory holding the cached PDFs; '<name>.pdf' is copied from here into the run's data directory.
all_pdfs = '/global/scratch/groups/dh/aanderson/all_pdfs/'
# NOTE(review): this flag is not read by any cell visible in this notebook -- confirm whether it is still needed.
process_incomplete = False # ONLY ONE notebook should run with this at a time

In [None]:
# Should not need modification unless tesseract changes
# Per-user working area on scratch; the "Identify Range to Process" cell adds a 'range_<start>-<end>/' subfolder to it.
run_folder = '/global/scratch/{}/{}/'.format(username, project_name)
# Path under which the run's data directory is bind-mounted inside the container (the `-B {}:/scratch/` part of SINGULARITYCMD).
tesseract_scratch_data_dir = '/scratch/'
# Container image handed to `singularity exec` for the Ghostscript and Tesseract commands.
tesseract_img = '/global/scratch/groups/dh/tesseract2_3.img'

In [None]:
#Make Directories if they do not exist
import os
import sys
import errno

def dir_create(path):
    """Create the directory `path` (including missing parents).

    If the directory already exists, a notice is printed and nothing else
    happens.

    Args:
        path: Directory path to create.

    Raises:
        OSError: If the directory cannot be created, including the case where
            `path` already exists but is not a directory.
    """
    try:
        os.makedirs(path)
    except FileExistsError:
        # EEXIST is also reported when a regular file occupies the path;
        # that is a real error, not an "already created" situation.
        if not os.path.isdir(path):
            raise
        print('Folder at: ' + path + ' already exists. Skipping...')

### Identify Range to Process

In [None]:
import os
import re

# TODO: Set start_index based on the last completed ones
start_index = 0
end_index = start_index + number_to_process - 1

# Strip any 'range_<a>-<b>/' suffix left by a previous execution of this cell
# before appending the current one, so re-running the cell does not produce
# nested '.../range_0-0/range_0-0/' folders.
run_root = re.sub(r'range_-?\d+--?\d+/$', '', run_folder)
run_folder = run_root + 'range_' + str(start_index) + '-' + str(end_index) + '/'
scratch_data_dir = run_folder + 'data/'

# Script files generated by later cells; all of them live in the run folder.
gs_command_script = run_folder + 'gsCommandScript.sh'
t4_command_script = run_folder + 't4CommandScipt.sh'
slurm_script = run_folder + 'slurmscript.sh'

# Pure-Python equivalent of `mkdir -p`; also safe for paths containing spaces.
os.makedirs(scratch_data_dir, exist_ok=True)
scratch_data_dir

In [None]:
#Henry Ang 10/13/2017
from time import localtime, strftime
logMsg = "{2} Start processing index {0} to index {1}".format(start_index, end_index, strftime("%Y-%m-%d %H:%M:%S", localtime())
)
print(logMsg)
!echo $logMsg >> /global/scratch/groups/dh/aanderson/process_log.txt

### Copy Cached PDFs

In [None]:
def range_to_pdfs(start, end, all_files_list):
    """Return the PDF names listed on lines `start`..`end` of a listing file.

    Lines are counted from 0 and both bounds are inclusive. For each selected
    line, the text after the last ';' (with the newline removed) is taken as
    the PDF name.

    Args:
        start: Index of the first line to include.
        end: Index of the last line to include.
        all_files_list: Path of the UTF-8 text file to read.

    Returns:
        List of PDF names, in file order.
    """
    pdfs = []
    with open(all_files_list, 'rt', encoding='utf-8') as f:
        for current_line, line in enumerate(f):
            if current_line > end:
                # Past the requested range; no need to read the rest of the file.
                break
            if current_line >= start:
                pdfs.append(line.split(';')[-1].strip('\n'))
    return pdfs

# Names of the PDFs for the configured index range, read from the listing file.
pdfs = range_to_pdfs(start_index, end_index, all_files_list)
print(len(pdfs), 'PDFs to process')

In [None]:
def copy_pdf(pdf):
    """Copy '<pdf>.pdf' from the cached-PDF directory into the run's data directory.

    Uses `all_pdfs` as the source directory and `scratch_data_dir` as the
    destination. The copy is done with shutil rather than a shell `cp`, so
    names containing spaces or shell metacharacters are handled correctly and
    failure is detected from the actual error instead of from command output.

    Args:
        pdf: PDF name without the '.pdf' extension.

    Raises:
        Exception: If the file could not be copied.
    """
    import os
    import shutil

    pdf_path = os.path.join(all_pdfs, '{}.pdf'.format(pdf))
    try:
        shutil.copy(pdf_path, scratch_data_dir)
    except OSError as copy_error:
        raise Exception('PDF "{}" not copied! (Perhaps not downloaded yet?)'.format(pdf)) from copy_error
    print('Copied', pdf)

for pdf in pdfs:
    copy_pdf(pdf)

### Utility Functions

__function to return all files in directory tree.__

In [None]:
import os
def scantreeForFiles(path):
    """Recursively yield the path (a string) of every non-directory entry under `path`.

    Directories are descended into; symlinks are not followed when deciding
    whether an entry is a directory.
    """
    for item in os.scandir(path):
        if not item.is_dir(follow_symlinks=False):
            yield item.path
            continue
        for nested_path in scantreeForFiles(item.path):
            yield nested_path

__function to return list of all folders in directory tree.__

In [None]:
import os
def scandirForFolders(path, dirlist):
    """Append the path of every directory found under `path` to `dirlist`.

    The tree is walked recursively; each directory is appended before its own
    subdirectories. Symlinks are not followed. Nothing is returned -- the
    result is accumulated in the caller's list.
    """
    subfolders = (
        item.path
        for item in os.scandir(path)
        if item.is_dir(follow_symlinks=False)
    )
    for folder in subfolders:
        dirlist.append(folder)
        scandirForFolders(folder, dirlist)

__Validate all the task log files produced by ht_helper__

In [None]:
def validateTaskResults(fileroot, totalTasks):
    """Return the indices of tasks whose log file does not end with exit code 0.

    For every task index i in 0..totalTasks-1 the log file '<fileroot>.<i>' is
    inspected. The task commands written by this notebook end with `echo $?`,
    so the last line of a log is compared against '0'. A task whose log file
    is missing or empty is reported as failed.

    Args:
        fileroot: Common prefix of the task log files (task index is appended
            after a '.').
        totalTasks: Number of tasks that were submitted.

    Returns:
        List of failed task indices, in ascending order.
    """
    errorList = []
    # range(totalTasks) covers every task, including the last one.
    for i in range(totalTasks):
        fn = fileroot + '.' + str(i)
        if not os.path.exists(fn):
            print ('warning: log file not available: ', fn)
            # Without a log there is no evidence that the task succeeded.
            errorList.append(i)
            continue

        # Keep only the final line (equivalent of `tail -1`), without a shell.
        retval = ''
        with open(fn, 'r', errors='replace') as log:
            for line in log:
                retval = line

        if retval.strip() != '0':
            errorList.append(i)
    return errorList

__SLURM job script__ normal

In [None]:
# batch script
# SLURM submission script template. The job name is filled in from
# project_name here; the two remaining '{}' placeholders are filled in later
# with .format(<wall clock limit>, <task script path>).
batchtemplate = (
    '#!/bin/bash -l  \n'
    '# Job name: \n'
    '#SBATCH --job-name=' + project_name + '\n'
    '# \n'
    '# Account: \n'
    '#SBATCH --account=ac_scsguest \n'
    '# \n'
    '# Partition: \n'
    '#SBATCH --partition=savio2 \n'
    '# \n'
    '## Scale by increasing the number of nodes \n'
    '#SBATCH --nodes=5  \n'
    '## DO NOT change ntasks-per-node setting as T4 also distributes across cores \n'
    '#SBATCH --ntasks-per-node=6 \n'
    '#SBATCH --qos=savio_normal \n'
    '# \n'
    '# Wall clock limit: \n'
    '#SBATCH --time={} \n'
    '# \n'
    '## Command(s) to run: \n'
    'module load gcc openmpi  \n'
    '/global/home/groups/allhands/bin/ht_helper.sh  -t {} -n1 -s1 -vL \n'
)

### Create script to convert all pdf files in working directory to images

In [None]:
import glob, os
import shutil 

# Ghostscript executable is inside the container.
# TEMPLATE: gs -dBATCH -dNOPAUSE -dQUIET -sDEVICE=png16m -sOutputFile=/scratch/test/output/test-%d.png \
#   -r300 /scratch/test/germanocr.pdf
SINGULARITYCMD = 'singularity exec -B {}:/scratch/ {} ' 
GHOSTSCRIPTCMD = 'gs -dBATCH -dNOPAUSE -dQUIET -sDEVICE=png16m -sOutputFile=\"{}-%d.png\" -r300 \"{}\" ;  echo $?'

os.chdir(scratch_data_dir)
print ('current working directory: ', os.getcwd())

# Container invocation that prefixes every task line.
scmd = SINGULARITYCMD.format(scratch_data_dir, tesseract_img)

# Paths inside the container are the host paths with the data-dir prefix
# swapped for the container mount point.
host_prefix_len = len(scratch_data_dir)

# total number of ghostscript tasks
gsCommandTotal = 0

# One task line per PDF found under the data directory.
with open(gs_command_script, 'w') as task_script:
    for pdf_path in scantreeForFiles(scratch_data_dir):
        if not pdf_path.endswith('.pdf'):
            continue
        pdf_stem = os.path.splitext(pdf_path)[0]
        container_output_base = tesseract_scratch_data_dir + pdf_stem[host_prefix_len:]
        container_pdf = tesseract_scratch_data_dir + pdf_path[host_prefix_len:]
        task_line = scmd + GHOSTSCRIPTCMD.format(container_output_base, container_pdf)
        task_script.write(task_line + '\n')
        gsCommandTotal += 1

#set time limit for this batch run
outputbatchscript = batchtemplate.format('00:30:00',  gs_command_script)
with open(slurm_script, 'w') as batch_file:
    batch_file.write(outputbatchscript)

__Execute the task script with ht_helper__

In [None]:
os.chdir(run_folder)
print ('current working directory: ', os.getcwd())

out = !sbatch slurmscript.sh   
    
print('Execute ghostscript output: ', out ) 
job_id = out[0].split()[3]
print(job_id)

In [None]:
#Henry Ang 10/13/2017
logMsg = "{1} Start converting PDF to PNG, job ID:{0}".format(job_id, strftime("%Y-%m-%d %H:%M:%S", localtime()))
print(logMsg)
!echo $logMsg >> /global/scratch/groups/dh/aanderson/process_log.txt

In [None]:
import time
# print the users queue and the job status by id
!squeue -u $username #possibly do not need
print('--------------------------------')
print('Savio Job has been submitted. This cell will notify you when the job is done.')
jobState = False
while not jobState:
    out = !scontrol show job $job_id
    if any("COMPLETED" in s for s in out):
        print('\n******Savio Job finished******')
        jobState = True
    else:
       print('.', end='')
       time.sleep(10) #Can tweak this so that people can see it moving

__Check all task log files for bad exit code__  
task numbers align with lines in the task script  
check the log file of tasks in the returned array of failures 

In [None]:
import glob, os
# Only inspect the task logs once the polling cell has seen the job finish.
if jobState:
    print ('current working directory: ', os.getcwd())
    # Log files are named '<job name>.<job id>.log.<task number>'.
    fileroot = '{}.{}.log'.format(project_name, job_id)
    tasklist = validateTaskResults(fileroot, gsCommandTotal)
    print ('these tasks in task script failed: ', tasklist)
else:
    print("WARNING: Your SLURM Job has not finished processing! Please wait for the cell above to complete.")

__Remove task logs after any errors have been resolved__

In [None]:
# Delete every task log that belongs to the job validated above.
# The pattern is kept in `log_pattern` rather than `filter` so the builtin
# filter() is not shadowed for the rest of the kernel session.
log_pattern = fileroot + '*'
print('filter: ', log_pattern)
for log_path in glob.glob(log_pattern):
    os.remove(log_path)

### Create script to ocr all png files in working directory to text

In [None]:
import glob, os
os.chdir(scratch_data_dir)
print ('current working directory: ', os.getcwd())
# template: tesseract --tessdata-dir /opt/tessdata /scratch/germanocr_Page_01.png  germanout  -l deu
#TCMD = ' sh -c \'OMP_NUM_THREADS=1 tesseract --tessdata-dir /opt/tessdata \"{}\" \"{}\" \'  -l deu+eng+tur+fra -c tessedit_create_hocr=1;  echo $?'
TCMD = ' sh -c \'OMP_NUM_THREADS=1 tesseract --tessdata-dir /opt/tessdata  -l deu+eng+tur+fra -c tessedit_create_hocr=1 \"{}\" \"{}\" \';  echo $?'

# Container invocation that prefixes every task line.
scmd = SINGULARITYCMD.format(scratch_data_dir, tesseract_img)

# Paths inside the container are the host paths with the data-dir prefix
# swapped for the container mount point.
host_prefix_len = len(scratch_data_dir)

# total number of tesseract tasks
t4CommandTotal = 0

# One task line per PNG found under the data directory.
with open(t4_command_script, 'w') as task_script:
    for png_path in scantreeForFiles(scratch_data_dir):
        if not png_path.endswith('.png'):
            continue
        png_stem = os.path.splitext(png_path)[0]
        container_png = tesseract_scratch_data_dir + png_path[host_prefix_len:]
        container_output_base = tesseract_scratch_data_dir + png_stem[host_prefix_len:]
        task_line = scmd + TCMD.format(container_png, container_output_base)
        #print(task_line)
        task_script.write(task_line + '\n')
        t4CommandTotal += 1

#set time limit for this batch run
outputbatchscript = batchtemplate.format(ocr_time,  t4_command_script)
with open(slurm_script, 'w') as batch_file:
    batch_file.write(outputbatchscript)

__Execute the task script with ht_helper__

In [None]:
os.chdir(run_folder)
print ('current working directory: ', os.getcwd())

out = !sbatch slurmscript.sh   
    
print('Execute tesseract4 output: ', out ) 
job_id = out[0].split()[3]
print(job_id)

In [None]:
#Henry Ang 10/13/2017
logMsg = "{1} Start converting PNG to text, job ID:{0}".format(job_id, strftime("%Y-%m-%d %H:%M:%S", localtime()))
print(logMsg)
!echo $logMsg >> /global/scratch/groups/dh/aanderson/process_log.txt

In [None]:
import time
# print the users queue and the job status by id
!squeue -u $username #possibly do not need
print('--------------------------------')
print('Savio Job has been submitted. This cell will notify you when the job is done.')
jobState = False
while not jobState:
    out = !scontrol show job $job_id
    if any("COMPLETED" in s for s in out):
        print('\n******Savio Job finished******')
        jobState = True
    else:
       print('.', end='')
       time.sleep(10) #Can tweak this so that people can see it moving

In [None]:
# The task logs are looked up relative to the run folder.
os.chdir(run_folder)
print ('current working directory: ', os.getcwd())

# Log files are named '<job name>.<job id>.log.<task number>'.
fileroot = '{}.{}.log'.format(project_name, job_id)
#tasklist = validateTaskResults(fileroot, 10) first check a small subset
tasklist = validateTaskResults(fileroot, t4CommandTotal)
print ('these tasks in task script failed: ', tasklist)

# Remove task logs
#filter = fileroot + '*'
#for f in glob.glob(filter):
#    os.remove(f)

### Copy all .hocr files to results directory

In [None]:
chmod 775 $scratch_data_dir/*.hocr
!cp $scratch_data_dir/*.hocr $ocr_output_dir