### This notebook is an exemplar which demonstrates transferring files between a Box folder and Savio scratch to run OCR on images using Tesseract (inside a Singularity container)

(Tested with boxsdk 2.0.0a2 on a Python 3.5 kernel.)
Install with: `pip install -Iv boxsdk==2.0.0a2`


_This software is available under the terms of the Educational Community License, Version 2.0 (ECL 2.0). This software is Copyright 2016 The Regents of the University of California, Berkeley ("Berkeley")._

The text of the ECL license is reproduced below.

Educational Community License, Version 2.0
*************************************
Copyright 2016 The Regents of the University of California, Berkeley ("Berkeley")

Educational Community License, Version 2.0, April 2007

The Educational Community License version 2.0 ("ECL") consists of the
Apache 2.0 license, modified to change the scope of the patent grant in
section 3 to be specific to the needs of the education communities using
this license. The original Apache 2.0 license can be found at:[http://www.apache.org/licenses/LICENSE-2.0]

### Notebook configuration section
Target and source directories, script file names, and other values used as parameters in the processing below.

In [None]:
# Notebook home directory on Savio.
runFolder = '/global/home/users/mmanning/'

# Singularity image that is executed for every containerised command.
tesseractimage = '/global/scratch/mmanning/tesseract2.img'
# Tesseract language-data directory (a path inside the container).
tesseractdatadir = '/opt/tessdata/'
# Base names (no extension) of the PDFs to process; filled in by a later cell.
pdfnamelist = []

# Name of the Box folder that holds the PDFs and receives the merged text.
boxProjectFolder = 'TesseractNotebook'
# Working directory on the host ...
scratchDataDirectory = '/global/scratch/mmanning/tesseractnotebook/'
# ... and the path it is bind-mounted to inside the container.
tesseractScratchDataDirectory = '/scratch/'

# Command prefix for running a program inside the container. It is derived from
# the values above so the bind mount and image path cannot drift out of sync
# with them. The bind specification must not carry the trailing slashes.
SINGULARITYCMD = 'singularity exec -B {}:{}  {}'.format(
    scratchDataDirectory.rstrip('/'),
    tesseractScratchDataDirectory.rstrip('/'),
    tesseractimage,
)

### Box Authorization

Function to store the OAuth2 refresh token in a local file. This can be modified to use a keychain or another secure store as required.

In [None]:
def store_tokens(access_token, refresh_token):
    """Callback for storing refresh tokens. (For now we ignore access tokens.)

    The refresh token is written, stripped of surrounding whitespace, to
    ``apptoken.cfg`` in the current working directory, replacing any previous
    content.

    Args:
        access_token: Ignored.
        refresh_token: Refresh token string to persist. When it is ``None`` or
            blank the file is left untouched, so a token saved earlier is not
            wiped out.
    """
    # Check before opening: mode 'w' truncates the file immediately, which
    # would destroy the stored token even if the write never happens.
    if refresh_token is None or not refresh_token.strip():
        return

    with open('apptoken.cfg', 'w') as f:
        f.write(refresh_token.strip())

Oauth2 information is read from a local file with three lines, one line per parameter. 
The client id and client secret are defined in the Box application created for this notebook.  Create the application at the Box Developers site: https://berkeley.app.box.com/developers/services/edit/

The redirect uri can be any site that requires validation. Run the bootstrap notebook to create initial 
tokens that are then continually refreshed

In [None]:
import os

CLIENT_ID = None
CLIENT_SECRET = None
REDIRECT_URI = None

# Work from the configured run folder (instead of repeating its path here);
# app.cfg and apptoken.cfg are opened relative to it.
os.chdir(runFolder)

# app.cfg holds one value per line: client id, client secret, redirect URI.
with open('app.cfg', 'r') as app_cfg:
    CLIENT_ID = app_cfg.readline().strip()
    CLIENT_SECRET = app_cfg.readline().strip()
    REDIRECT_URI = app_cfg.readline().strip()

# readline() returns '' past the end of the file, so a short or empty app.cfg
# would otherwise only surface later as an unexplained authorization failure.
if not (CLIENT_ID and CLIENT_SECRET):
    raise ValueError(
        "app.cfg must contain the client id on line 1 and the client secret on line 2"
    )

The refresh token is read from a local file. This token was created by running the bootstrap notebook which requires the user to validate with CalNet Authentication Service credentials, then stores the returned auth and refresh tokens in the same config files.

In [None]:
REFRESH_TOKEN = None

# Read the refresh token (first line of apptoken.cfg, the file store_tokens writes).
with open('apptoken.cfg', 'r') as apptoken_cfg:
    REFRESH_TOKEN = apptoken_cfg.readline().strip()

# An empty file would otherwise hand an empty refresh token to the OAuth2 setup.
if not REFRESH_TOKEN:
    raise ValueError(
        "apptoken.cfg is empty; run the bootstrap notebook to create a refresh token"
    )

__Perform authentication__ 
then create the Box client.
Verify that the client is working by retrieving the name of the user's root folder in Box.

In [None]:
from boxsdk import OAuth2
from boxsdk import Client

# Set up OAuth2 from the values read from app.cfg / apptoken.cfg.
# store_tokens is passed in as the token-storage callback.
oauth = OAuth2(
    client_id=CLIENT_ID.strip(),
    client_secret=CLIENT_SECRET.strip(),
    refresh_token=REFRESH_TOKEN.strip(),
    store_tokens=store_tokens,
)

client = Client(oauth)

# Smoke test: folder id '0' is requested and its name printed.
root_folder = client.folder(folder_id='0').get()
print("folder name: ", root_folder['name'])

# First 100 entries of the same folder.
items = client.folder(folder_id='0').get_items(limit=100, offset=0)
#print ("items: ", items )

### Utility functions

__Function to find a folder id by folder name.__
The current SDK does not have a 'find by name' function, so this uses the search API and returns an id only when the match is unambiguous.

In [None]:
def find_folder_id(folder_name):
    """Look up the id of a Box folder by name using the notebook-level ``client``.

    Args:
        folder_name: Name of the folder to search for.

    Returns:
        The folder's id when the search is unambiguous, otherwise ``0`` (after
        printing the reason).
    """
    folderlist = client.search(query=folder_name, result_type='folder', limit=10, offset=0)

    # The search can return several folders for one query, so hits whose name
    # is identical to the requested one take precedence. When there is no
    # identical hit, fall back to all results so that a single result is
    # still accepted, as before.
    exact_matches = [folder for folder in folderlist if folder['name'] == folder_name]
    candidates = exact_matches if exact_matches else folderlist

    if len(candidates) == 1:
        return candidates[0]['id']

    if len(candidates) == 0:
        print('folder not found: ', folder_name)
    else:
        print('folder name is ambiguous: ', folder_name)
    return 0

### Retrieve the pdfs from the Box folder.
Currently the Box SDK does not have an option for finding a folder by name, so if you are looking for a specific folder you need to loop through all the items in the list below and do a name match. Once you find the folder and retrieve the id, you can save that id for subsequent runs. Another option is to get the id from the URL in the web client, but the approach below is more flexible for now.

In [None]:
import os
import shutil 

print('current working directory: ', os.getcwd())
# Switch to the scratch data directory for the rest of the run.
os.chdir(scratchDataDirectory)

# None marks "project folder not found". Starting from '' made the later
# `is not None` check always true, so a missing folder led to a request for a
# folder with an empty id.
targetfolderId = None

# List the first entries of the root folder and look for the project folder by name.
items = client.folder(folder_id='0').get_items(limit=20, offset=0)
if type(items) is list:
    print('number of files in top folder: ', len(items))

    for item in items:
        if item['type'] == 'folder':
            print('folder name: ', item['name'])
            if item['name'] == boxProjectFolder:
                targetfolderId = item['id']
                print('targetfolderId: ', targetfolderId)

    if targetfolderId is None:
        # Stop here with a clear message; nothing below can work without the folder.
        raise RuntimeError(
            "Box folder '{}' was not found among the first {} items of the root folder".format(
                boxProjectFolder, len(items)
            )
        )

    tgtitems = client.folder(folder_id=targetfolderId).get_items(limit=200, offset=0)
    if type(tgtitems) is list:
        print('number of files in target folder: ', len(tgtitems))

    # Walk all pdf files in the project folder. The download itself is
    # currently disabled (commented out), so only the names are printed.
    for tgtitem in tgtitems:
        if tgtitem['type'] != 'folder' and tgtitem['name'].endswith('.pdf'):
            print('downloading: ', tgtitem['name'])
            #pdfcontent = client.file(file_id=tgtitem['id']).content()
            #newfile = open(scratchDataDirectory + tgtitem['name'], 'wb')
            #newfile.write(pdfcontent)
            #newfile.close()

__clean up filenames to reduce issues__

In [None]:
# Shell one-liner: for every entry in the current directory whose name contains
# a space, rename it so that each character other than A-Z, a-z, 0-9, '.', '_'
# and '-' is replaced by '_'.
# NOTE(review): ${f//.../_} is bash-style substitution — confirm the shell used by `!` supports it.
# NOTE(review): presumably prints an mv error when no name contains a space (unmatched glob) — TODO confirm.
!for f in *\ *; do mv "$f" "${f//[^A-Za-z0-9._-]/_}"; done

### Convert all pdf files in working directory to images

In [None]:

def runGhostscript(pdfFile):
    """Convert one PDF to PNG page images with Ghostscript inside the container.

    The PDF is read from, and the images ``<name>-<page>.png`` are written to,
    ``tesseractScratchDataDirectory`` (the scratch directory as seen from inside
    the container). The command is prefixed with ``SINGULARITYCMD``.

    Args:
        pdfFile: File name of the PDF, relative to the scratch directory.

    Returns:
        The command's combined stdout/stderr as a list of lines, or ``None``
        when ``pdfFile`` does not end in ".pdf" and nothing was run.

    Raises:
        OSError: If the executable named in ``SINGULARITYCMD`` cannot be started.
    """
    import shlex
    import subprocess

    print("filename: ", pdfFile)
    # Test the argument itself. The previous check read the notebook-level
    # variable `filename`, so the outcome depended on whatever loop ran last.
    if not pdfFile.endswith(".pdf"):
        return None

    name = os.path.splitext(pdfFile)[0]

    # Ghostscript executable is inside the container.
    # TEMPLATE: gs -dBATCH -dNOPAUSE -dQUIET -sDEVICE=png16m -sOutputFile=/scratch/test/output/test-%d.png -r300 /scratch/test/germanocr.pdf
    # Built as an argument list (no shell involved), so file names containing
    # spaces or shell metacharacters are passed through unchanged.
    gsargs = [
        'gs', '-dBATCH', '-dNOPAUSE', '-dQUIET', '-sDEVICE=png16m',
        '-sOutputFile={}{}-%d.png'.format(tesseractScratchDataDirectory, name),
        '-r300',
        '{}{}'.format(tesseractScratchDataDirectory, pdfFile),
    ]

    #
    # convert pdf to png
    #
    print("singularity cmd: ", SINGULARITYCMD)
    print("gs cmd: ", ' '.join(gsargs))
    completed = subprocess.run(
        shlex.split(SINGULARITYCMD) + gsargs,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        universal_newlines=True,
    )
    result = completed.stdout.splitlines()
    if completed.returncode != 0:
        print("gs exit code: ", completed.returncode)
    print("gs result: ", result)
    return result

In [None]:
# Both lists are rebuilt from scratch so that re-running this cell does not
# append duplicate entries to the pdfnamelist created in the configuration cell.
pdffileList = []
pdfnamelist = []

# Sorted so the processing order is the same on every run.
for filename in sorted(os.listdir(scratchDataDirectory)):
    print("filename: ", filename)
    if filename.endswith(".pdf"):
        pdffileList.append(filename)

        # Base name without the extension.
        name, extension = os.path.splitext(filename)
        pdfnamelist.append(name)

print("pdffileList: ", pdffileList)
print("filenameList: ", pdfnamelist)


#
# multiprocess the pdf to png work
#
#pool0 = Pool(20)
#pool0.map(runGhostscript, pdffileList)
#pool0.close()
#pool0.join()

### Run tesseract on all image files in the working directory

In [None]:
def runTesseract(imagefile, lang='eng'):
    """Run Tesseract OCR on one image inside the container.

    The image is read from ``tesseractScratchDataDirectory``, and the output
    base passed to tesseract is the image path without its extension. The
    command is prefixed with ``SINGULARITYCMD``.

    Args:
        imagefile: File name of the image, relative to the scratch directory.
        lang: Language code passed to tesseract's ``-l`` option.

    Returns:
        The command's combined stdout/stderr as a list of lines.

    Raises:
        OSError: If the executable named in ``SINGULARITYCMD`` cannot be started.
    """
    import shlex
    import subprocess

    print("imagefile : ", imagefile)

    basename = os.path.splitext(imagefile)[0]

    # template: tesseract --tessdata-dir /opt/tessdata /scratch/germanocr_Page_01.png  germanout  -l deu
    # Built as an argument list (no shell involved), so file names are passed
    # through unchanged. The tessdata path comes from the configuration cell,
    # without its trailing slash.
    targs = [
        'tesseract',
        '--tessdata-dir', tesseractdatadir.rstrip('/'),
        '{}{}'.format(tesseractScratchDataDirectory, imagefile),
        '{}{}'.format(tesseractScratchDataDirectory, basename),
        '-l', lang,
    ]
    print("tesseract cmd: ", ' '.join(targs))

    #
    # ocr the png
    #
    completed = subprocess.run(
        shlex.split(SINGULARITYCMD) + targs,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        universal_newlines=True,
    )
    result = completed.stdout.splitlines()
    if completed.returncode != 0:
        print("tesseract exit code: ", completed.returncode)
    print("tesseract result: ", result)
    return result


In [None]:
from multiprocessing import Pool

# Names of all entries in the scratch data directory that end in ".png".
imageList = [
    entry
    for entry in os.listdir(scratchDataDirectory)
    if entry.endswith(".png")
]

#
# multiprocess the ocr work
#
#pool = Pool()
#pool.map(runTesseract, imageList)
#pool.close()
#pool.join()

### Merge text files and upload to Box

In [None]:
import re

def natural_sort_key(s, _nsre=re.compile('([0-9]+)')):
    """Sort key that orders embedded numbers by value ("p-2" before "p-10").

    The string is split into alternating text and digit runs. Digit runs become
    ints and text runs are lower-cased, so comparison is numeric where the
    names contain numbers and case-insensitive elsewhere.

    Args:
        s: String to build the key for.
        _nsre: Compiled pattern used for splitting; its capture group keeps the
            digit runs in the result.

    Returns:
        List of ``str`` and ``int`` pieces.
    """
    # Only runs made purely of ASCII 0-9 are converted. str.isdigit() also
    # accepts characters such as superscript digits, for which int() raises.
    # strip() leaves '' only when every character of the piece is in the set.
    return [int(piece) if piece and piece.strip('0123456789') == '' else piece.lower()
            for piece in re.split(_nsre, s)]

In [None]:
print("pdfnamelist: ", pdfnamelist)

# The Box destination folder is the same for every PDF, so it is fetched on
# first use and reused instead of being requested again on each iteration.
upload_folder = None

for name in pdfnamelist:
    # Select only this PDF's page files, "<name>-<page>.txt". A bare prefix
    # test would also pick up files of another PDF whose name starts with this
    # one and, on a re-run, the merged "<name>ALL.txt" file itself.
    page_file = re.compile(re.escape(name) + r'-[0-9]+\.txt$')
    mergeList = [entry for entry in os.listdir(scratchDataDirectory)
                 if page_file.match(entry)]

    # Natural order keeps page 10 after page 9 rather than after page 1.
    sortedList = sorted(mergeList, key=natural_sort_key)

    # Concatenate the page files into "<name>ALL.txt" in the scratch directory.
    alltextfilename = ''.join([scratchDataDirectory, name, 'ALL.txt'])
    with open(alltextfilename, 'w', encoding="utf-8") as outfile:
        for fname in sortedList:
            with open(''.join([scratchDataDirectory, fname]), encoding="utf-8") as infile:
                for line in infile:
                    outfile.write(line)

    if upload_folder is None:
        upload_folder = client.folder(folder_id=targetfolderId).get()
    # upload the complete text file
    textfileinbox = upload_folder.upload(alltextfilename)
    print("text file id: ", textfileinbox['id'])
    