### This notebook is an exemplar which demonstrates transferring zip files between a bDrive folder and Savio scratch to run OCR on images using Tesseract (inside a Singularity container)

(Tested with boxsdk 2.0.0a2 on a Python 3.5 kernel.)
`pip install -Iv boxsdk==2.0.0a2`


_This software is available under the terms of the Educational Community License, Version 2.0 (ECL 2.0). This software is Copyright 2016 The Regents of the University of California, Berkeley ("Berkeley")._

The text of the ECL license is reproduced below.

Educational Community License, Version 2.0
*************************************
Copyright 2017 The Regents of the University of California, Berkeley ("Berkeley")

Educational Community License, Version 2.0, April 2007

The Educational Community License version 2.0 ("ECL") consists of the
Apache 2.0 license, modified to change the scope of the patent grant in
section 3 to be specific to the needs of the education communities using
this license. The original Apache 2.0 license can be found at: <http://www.apache.org/licenses/LICENSE-2.0>

### Notebook configuration section
Target and source directories, script file names, and other values used as parameters in the processing below.

In [50]:
# User-editable settings. The values below are combined into scratch paths and
# Drive folder lookups by the cells that follow.
savioUsername = 'mmanning' # Your Savio username; used to build the /global/scratch/<user>/ paths
bDriveProjectFolder = 'CShapreau' # Name of the Drive folder holding the input data; it is matched by name only, so it should NOT be nested
bDriveResultsFolder = 'CShapreau' # Name of the Drive folder that receives the result files (may be the same as the input folder)
projectname = 'cstest1'         # Sub-folder of your scratch folder used for this run; also used as the SLURM job name


In [51]:
# Root working directory of this run on Savio scratch; scripts and task logs live here.
runFolder = '/global/scratch/' + savioUsername + '/' + projectname + '/'

# Singularity image used to run tesseract.
tesseractimage = '/global/scratch/groups/dh/tesseract4.img'
# NOTE(review): tesseractdatadir and pdfnamelist are not referenced by the other
# visible cells (the tesseract command hard-codes /opt/tessdata) - confirm before removing.
tesseractdatadir = '/opt/tessdata/'
pdfnamelist = []

# Host directory that receives the downloaded images and the OCR/translation output.
scratchDataDirectory = '/global/scratch/' + savioUsername + '/' + projectname + '/data/'
# Path prefix prepended to file names when the tesseract commands are generated;
# it matches the ':/scratch/' bind target used in the singularity command.
tesseractScratchDataDirectory = '/scratch/'

# NOTE(review): a later cell reassigns SINGULARITYCMD to a '{}' template that binds
# the data directory instead of runFolder, so this value is superseded there.
SINGULARITYCMD = 'singularity exec -B ' + runFolder + ':/scratch/  ' + tesseractimage

In [52]:
#Make Directories if they do not exist
import os
import sys
import errno

def dir_create(path):
    """Create directory ``path`` (including missing parents).

    An already existing directory is tolerated and reported with a message.

    Args:
        path: Directory to create.

    Raises:
        OSError: if creation fails for any reason other than the directory
            already existing. This includes the case where ``path`` exists
            but is not a directory (e.g. a regular file with that name).
    """
    try:
        os.makedirs(path)
    except OSError as exception:
        # EEXIST is also raised when a regular file occupies the path; only an
        # existing *directory* is safe to skip, anything else must surface.
        if exception.errno != errno.EEXIST or not os.path.isdir(path):
            raise
        print('Folder at: ' + path + ' already exists. Skipping...')
# Make sure the run folder and its data sub-folder are present before any
# later cell writes into them (parent first, then the nested data directory).
for required_dir in (runFolder, scratchDataDirectory):
    if not os.path.isdir(required_dir):
        dir_create(required_dir)

In [54]:

def find_folder_id(folder_name):
    """Look up a folder by name and return its id.

    Returns the ``'id'`` entry of the single match. If the search yields zero
    or more than one result, a message is printed and ``0`` is returned.

    NOTE(review): relies on a global ``client`` exposing ``.search(...)``;
    presumably a leftover from the boxsdk version of this notebook, since the
    name ``client`` is bound to other objects elsewhere in the notebook -
    confirm whether this helper is still needed.
    """
    matches = client.search(query=folder_name, result_type='folder', limit=50, offset=0)

    # Exactly one hit is the only unambiguous outcome.
    if len(matches) != 1:
        print('folder not found: ', folder_name)
        return 0
    return matches[0]['id']

### bDrive Authorization

In [55]:
import codecs
import httplib2
from googleapiclient.http import MediaIoBaseDownload
from googleapiclient.http import MediaFileUpload
from apiclient import discovery, errors
from oauth2client import client
from oauth2client import tools
from oauth2client.file import Storage

In [56]:
# OAuth settings consumed by get_credentials().
# Scope string passed to the client-secrets flow.
SCOPES = 'https://www.googleapis.com/auth/drive'
# Relative path, so it is resolved against the kernel's working directory at
# the time get_credentials() runs.
CLIENT_SECRET_FILE = 'client_secret.json'
# Assigned to the flow's user_agent.
APPLICATION_NAME = 'gDriveConnect'

In [57]:
import argparse
# Build the flags object that get_credentials() hands to tools.run_flow().
parser = argparse.ArgumentParser(parents=[tools.argparser])
# Accept (and hide from help) a '-f' option.
# NOTE(review): presumably this absorbs the '-f <connection file>' argument the
# Jupyter kernel is started with - confirm.
parser.add_argument('-f', help=argparse.SUPPRESS)

# parse_known_args() returns (namespace, leftovers); unknown arguments are ignored.
flags = parser.parse_known_args()[0]
# NOTE(review): presumably selects the copy/paste authorization flow instead of a
# local web server, which would not be reachable from a cluster node - confirm.
flags.noauth_local_webserver = True

In [58]:
def get_credentials():
    """Return OAuth2 credentials for the Drive API, authorizing if necessary.

    Cached credentials are read from ``~/.credentials/gDriveConnect.json``.
    When nothing is stored there, or the stored credentials are flagged
    invalid, an authorization flow is built from ``CLIENT_SECRET_FILE`` and
    ``SCOPES`` and run with the same store and the module-level ``flags``.

    Returns:
        The credentials object read from the store or produced by the flow.
    """
    credential_dir = os.path.join(os.path.expanduser('~'), '.credentials')
    # exist_ok avoids the check-then-create race of exists() followed by makedirs().
    os.makedirs(credential_dir, exist_ok=True)
    credential_path = os.path.join(credential_dir, 'gDriveConnect.json')

    store = Storage(credential_path)
    credentials = store.get()

    if not credentials or credentials.invalid:
        flow = client.flow_from_clientsecrets(CLIENT_SECRET_FILE, SCOPES)
        flow.user_agent = APPLICATION_NAME
        if flags:
            credentials = tools.run_flow(flow, store, flags)
        else:  # Needed only for compatibility with Python 2.6
            credentials = tools.run(flow, store)
        print('Storing credentials to ' + credential_path)

    return credentials

In [59]:
# Obtain credentials and build the Drive v3 service object used by every
# Drive call in the cells below.
credentials = get_credentials()
# NOTE(review): despite the 'credential_path:' label, this prints the credentials
# object itself (see the recorded output), not a file path.
print('credential_path:', credentials)
# Wrap an httplib2 client so that requests carry the OAuth authorization.
http = credentials.authorize(httplib2.Http())
service = discovery.build('drive', 'v3', http=http)

credential_path: <oauth2client.client.OAuth2Credentials object at 0x2aad8d48a860>


In [83]:
# Resolve the Drive ids of the download (target) folder and the results folder.
# The folder listing is paginated, so every page is walked; reading only the
# first page would miss folders on accounts with many folders.
targetFolderId = ''
resultsFolderId = ''
page_token = None
while True:
    response = service.files().list(q="mimeType='application/vnd.google-apps.folder'",
                                    spaces='drive',
                                    fields='nextPageToken, files(id, name)',
                                    pageToken=page_token).execute()
    for file in response.get('files', []):
        # Folders are matched by exact name; if several share a name the last one listed wins.
        if file.get('name') == bDriveProjectFolder:
            targetFolderId = file.get('id')
        if file.get('name') == bDriveResultsFolder:
            resultsFolderId = file.get('id')

    page_token = response.get('nextPageToken', None)
    if page_token is None:
        break

print('target folder id:' + targetFolderId)
print('results folder id:' + resultsFolderId)

# An empty id would only surface later as a confusing query/upload problem.
if not targetFolderId:
    print('warning: Drive folder not found: ' + bDriveProjectFolder)
if not resultsFolderId:
    print('warning: Drive folder not found: ' + bDriveResultsFolder)




target folder id:1F7u35MJvMBkImGUuc1mX_9zj7kEi9x_N
results folder id:1F7u35MJvMBkImGUuc1mX_9zj7kEi9x_N


In [61]:

# Get names of all the folders
def get_drive_contents(folder_id):
    """List the non-trashed items directly inside a Drive folder.

    Every result page is fetched. Each item found is printed, and the result
    maps item id -> item name.

    Args:
        folder_id: Drive id of the parent folder.

    Returns:
        dict mapping each child's id to its name.
    """
    query = "'" + folder_id + "' in parents and trashed=false"
    id_to_name = {}

    next_token = None
    more_pages = True
    while more_pages:
        listing = service.files().list(
            q=query,
            spaces='drive',
            fields='nextPageToken, files(id, name)',
            pageToken=next_token,
        ).execute()

        for item in listing.get('files', []):
            item_name = item.get('name')
            item_id = item.get('id')
            print('Found file: %s (%s)' % (item_name, item_id))
            id_to_name[item_id] = item_name

        # A missing nextPageToken marks the last page.
        next_token = listing.get('nextPageToken', None)
        more_pages = next_token is not None

    return id_to_name


In [62]:

# Paths of the generated scripts, all placed in the run folder.
# NOTE(review): gsCommandScript is not used by any other visible cell - confirm it is still needed.
gsCommandScript = runFolder + 'gsCommandScript.sh'
# Task file: one tesseract command line per image.
t4CommandScript = runFolder + 't4CommandScript.sh'
# SLURM batch script rendered from the batch template.
slurmScript = runFolder + 'slurmscript.sh'


In [63]:
# Added by Nicolas Chan based on previous code, 10/12/2017
import io
def download_file(google_id, destination):
    """Download one Drive file and write its content to ``destination``.

    The content is fetched chunk by chunk into an in-memory buffer (one '.' is
    written to stdout per chunk) and written to disk only after the last chunk
    arrived. If a chunk request raises ``HttpError`` the error is reported and
    nothing is written, so no truncated file is left behind.

    Args:
        google_id: Drive file id passed to ``files().get_media``.
        destination: Local path; opened in binary write mode.
    """
    request = service.files().get_media(fileId=google_id)

    with io.BytesIO() as buffer:
        downloader = MediaIoBaseDownload(buffer, request)

        done = False
        while not done:
            try:
                _, done = downloader.next_chunk()
                sys.stdout.write('.')
            except errors.HttpError as error:
                # Report using this function's own arguments; the previous
                # handler referenced names that are not defined here.
                print('Error file:', destination, '   id:', google_id)
                print('An error occurred pulling the next chunk:', error)
                return

        print('\nwriting:', destination)
        with open(destination, 'wb') as f2:
            f2.write(buffer.getvalue())

In [65]:

# Map of Drive id -> file name for everything in the target folder.
filemap = get_drive_contents(targetFolderId)

print('files_to_download   ' + str(filemap) )
print('scratchDataDirectory   ' + scratchDataDirectory)

# Download every listed file into the scratch data directory.
# The local name is '<drive id>.jpg', not the Drive file name, and the '.jpg'
# extension is appended unconditionally.
# NOTE(review): every item in the folder is assumed to be a JPEG image - confirm.
for key, val in filemap.items():
    print('file   ' + val)
    download_file(key, scratchDataDirectory + key + '.jpg')

Found file: shapreau scan5.jpg (149YlCGjWAySaAPWVqz24mYz24KMAf890)
Found file: shapreau scan2.jpg (1eo1-nowSdK_ppWTsj18Yw2yfwGO11u-5)
Found file: shapreau scan3.jpg (10tJPlTHs2HgDVnOpad66jVJcMRaCBi8P)
Found file: shapreau scan6.jpg (120uJh1a_zfgCcSvODL7cYWnRPynmBzjT)
Found file: shapreau scan7.jpg (1ktfVDmMQ6Z8CWTxuKxVsr36ld9Rpd1k3)
Found file: shapreau scan1.jpg (1t0wLN71g-mZiSNSvQuad49ppCQdPgBIb)
Found file: shapreau scan4.jpg (17Ll7A5qMRz_U5ntIxp-9lBg6aAzLkB0a)
files_to_download   {'120uJh1a_zfgCcSvODL7cYWnRPynmBzjT': 'shapreau scan6.jpg', '1t0wLN71g-mZiSNSvQuad49ppCQdPgBIb': 'shapreau scan1.jpg', '10tJPlTHs2HgDVnOpad66jVJcMRaCBi8P': 'shapreau scan3.jpg', '149YlCGjWAySaAPWVqz24mYz24KMAf890': 'shapreau scan5.jpg', '1ktfVDmMQ6Z8CWTxuKxVsr36ld9Rpd1k3': 'shapreau scan7.jpg', '1eo1-nowSdK_ppWTsj18Yw2yfwGO11u-5': 'shapreau scan2.jpg', '17Ll7A5qMRz_U5ntIxp-9lBg6aAzLkB0a': 'shapreau scan4.jpg'}
scratchDataDirectory   /global/scratch/mmanning/cstest1/data/
file   shapreau scan6.jpg
..
writin

### Utility functions

__Validate all the task log files produced by ht_helper__

In [66]:
def validateTaskResults(fileroot, totalTasks):
    """Return the task numbers whose log does not end with exit code '0'.

    Each generated task command finishes with ``echo $?``, so the last line of
    a task log is that command's exit status.

    Args:
        fileroot: Common log-file prefix; the log of task ``i`` is read from
            ``fileroot + '.' + str(i)``.
        totalTasks: Number of tasks; tasks ``0 .. totalTasks - 1`` are checked.

    Returns:
        List of failed task numbers. A task counts as failed when its log file
        is missing or empty, or when its last line is not ``'0'``.
    """
    errorList = []

    # range(totalTasks) covers every task, including the last one.
    for i in range(totalTasks):
        fn = fileroot + '.' + str(i)

        if not os.path.exists(fn):
            # Without a log the task's outcome is unknown: report it rather
            # than reusing the previous task's result.
            print ('warning: log file not available: ', fn)
            errorList.append(i)
            continue

        # errors='replace': logs may contain bytes that are not valid text.
        with open(fn, 'r', errors='replace') as logfile:
            lines = logfile.read().splitlines()
        retval = lines[-1].strip() if lines else ''

        if retval != '0':
            errorList.append(i)

    return errorList


__SLURM job script__ normal

In [67]:
# Template of the SLURM batch script.
# - The job name is taken from projectname when this cell runs.
# - The two '{}' placeholders are filled later with str.format(): first the
#   wall-clock limit (--time), then the path substituted after ht_helper.sh's
#   -t option (the generated task file).
# - Account, partition and QoS are hard-coded in the text below; edit them here
#   if you submit under a different allocation.
batchtemplate = '#!/bin/bash -l  \n\
# Job name: \n\
#SBATCH --job-name=' + projectname + '\n\
# \n\
# Account: \n\
#SBATCH --account=ac_scsguest \n\
# \n\
# Partition: \n\
#SBATCH --partition=savio2 \n\
# \n\
## Scale by increasing the number of nodes \n\
#SBATCH --nodes=1  \n\
## DO NOT change ntasks-per-node setting as T4 also distributes across cores \n\
#SBATCH --ntasks-per-node=6 \n\
#SBATCH --qos=savio_normal \n\
# \n\
# Wall clock limit: \n\
#SBATCH --time={} \n\
# \n\
## Command(s) to run: \n\
module load gcc openmpi  \n\
/global/home/groups/allhands/bin/ht_helper.sh  -t {} -n1 -s1 -vL \n' 


### Create script to OCR all JPG files in the working directory to text

In [71]:
import glob, os
# Work inside the data directory so the glob below yields bare file names.
os.chdir(scratchDataDirectory)
print ('current working directory: ', os.getcwd())
# Command prefix template: '{}' receives the host directory that is bound to
# /scratch/ inside the container. This replaces any earlier SINGULARITYCMD value.
SINGULARITYCMD = 'singularity exec -B {}:/scratch/ /global/scratch/groups/dh/tesseract4.img ' 

# template: tesseract --tessdata-dir /opt/tessdata /scratch/germanocr_Page_01.png  germanout  -l deu
# Earlier variants kept for reference (multi-language, hOCR output):
#TCMD = ' sh -c \'OMP_NUM_THREADS=1 tesseract --tessdata-dir /opt/tessdata \"{}\" \"{}\" \'  -l deu+eng+tur+fra -c tessedit_create_hocr=1;  echo $?'
#TCMD = ' sh -c \'OMP_NUM_THREADS=1 tesseract --tessdata-dir /opt/tessdata  -l deu+eng+tur+fra -c tessedit_create_hocr=1 \"{}\" \"{}\" \';  echo $?'
# Active command: tesseract with '-l deu' and OMP_NUM_THREADS=1. The two '{}'
# are the input image and the output base name. The trailing 'echo $?' sits
# outside the quoted 'sh -c' string, so the exit status of the whole
# singularity command becomes the last line of the task's output.
TCMD = ' sh -c \'OMP_NUM_THREADS=1 tesseract --tessdata-dir /opt/tessdata  -l deu \"{}\" \"{}\" \';  echo $?'

#

# Bind the data directory into the container.
scmd = SINGULARITYCMD.format(scratchDataDirectory)
# total number of tesseract tasks
t4CommandTotal = 0

# Write one command line per *.jpg file into the task script.
with open(t4CommandScript, 'w') as f:

    for entry in glob.glob('*.jpg'):
        print ('entry: ', entry)
        filename, file_extension = os.path.splitext(entry)
        # Input is the image under /scratch/; output base is the same name without its extension.
        tcmd = TCMD.format(tesseractScratchDataDirectory+entry, tesseractScratchDataDirectory+filename )
        print(scmd + tcmd)
        f.write(scmd + tcmd + '\n')
        t4CommandTotal += 1
    
    
#set time limit for this batch run
# First placeholder: wall-clock limit; second: the task script written above.
outputbatchscript = batchtemplate.format('00:15:00',  t4CommandScript)
with open(slurmScript, 'w') as f:  
    f.write(outputbatchscript)

current working directory:  /global/scratch/mmanning/cstest1/data
entry:  1eo1-nowSdK_ppWTsj18Yw2yfwGO11u-5.jpg
singularity exec -B /global/scratch/mmanning/cstest1/data/:/scratch/ /global/scratch/groups/dh/tesseract4.img  sh -c 'OMP_NUM_THREADS=1 tesseract --tessdata-dir /opt/tessdata  -l deu "/scratch/1eo1-nowSdK_ppWTsj18Yw2yfwGO11u-5.jpg" "/scratch/1eo1-nowSdK_ppWTsj18Yw2yfwGO11u-5" ';  echo $?
entry:  149YlCGjWAySaAPWVqz24mYz24KMAf890.jpg
singularity exec -B /global/scratch/mmanning/cstest1/data/:/scratch/ /global/scratch/groups/dh/tesseract4.img  sh -c 'OMP_NUM_THREADS=1 tesseract --tessdata-dir /opt/tessdata  -l deu "/scratch/149YlCGjWAySaAPWVqz24mYz24KMAf890.jpg" "/scratch/149YlCGjWAySaAPWVqz24mYz24KMAf890" ';  echo $?
entry:  120uJh1a_zfgCcSvODL7cYWnRPynmBzjT.jpg
singularity exec -B /global/scratch/mmanning/cstest1/data/:/scratch/ /global/scratch/groups/dh/tesseract4.img  sh -c 'OMP_NUM_THREADS=1 tesseract --tessdata-dir /opt/tessdata  -l deu "/scratch/120uJh1a_zfgCcSvODL7cYWnR

__Execute the task script with ht_helper__

In [72]:
# Submit from the run folder, where slurmscript.sh was written.
os.chdir(runFolder)
print ('current working directory: ', os.getcwd())

# IPython shell capture: 'out' holds sbatch's output lines.
out = !sbatch slurmscript.sh   
    
print ('Execute tesseract4 output: ', out ) 
# The first output line has the form 'Submitted batch job <id>' (see the recorded
# output), so the fourth whitespace-separated token is the job id.
jobId =  out[0].split()[3]
print (jobId)

current working directory:  /global/scratch/mmanning/cstest1
Execute tesseract4 output:  ['Submitted batch job 2006473']
2006473


In [73]:
#Henry Ang 10/13/2017
from time import localtime, strftime
# Build a timestamped message ({1} = local time, {0} = job id) and append it to
# the shared process log.
# NOTE(review): the message says PNG, but the generated tasks process *.jpg files.
logMsg = "{1} Start converting PNG to text, job ID:{0}".format(jobId, strftime("%Y-%m-%d %H:%M:%S", localtime()))
print(logMsg)
# IPython expands $logMsg before handing the line to the shell.
!echo $logMsg >> /global/scratch/groups/dh/process_log.txt

2017-12-05 14:23:50 Start converting PNG to text, job ID:2006473


In [75]:
import time
# print the users queue and the job status by id
!squeue -u $savioUsername #possibly do not need
print('--------------------------------')
print('Savio Job has been submitted. This cell will notify you when the job is done.')
jobState = False
while not jobState:
    out = !scontrol show job $jobId
    if any("COMPLETED" in s for s in out):
        print('\n******Savio Job finished******')
        jobState = True
    else:
       print('.', end='')
       time.sleep(10) #Can tweak this so that people can see it moving
        

             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
--------------------------------
Savio Job has been submitted. This cell will notify you when the job is done.

******Savio Job finished******


__Check all task log files for bad exit code__

In [76]:

# The log prefix below is relative, so the task logs are looked up in the run folder.
os.chdir(runFolder)
print ('current working directory: ', os.getcwd())

# Log prefix '<projectname>.<jobId>.log'; validateTaskResults appends '.<task number>'.
fileroot = projectname + '.' + jobId + '.log'
#tasklist = validateTaskResults(fileroot, 10) first check a small subset
# Check as many tasks as command lines were written to the task script.
tasklist = validateTaskResults(fileroot, t4CommandTotal)
print ('these tasks in task script failed: ', tasklist)

# Remove task logs
#filter = fileroot + '*'
#for f in glob.glob(filter):
#    os.remove(f)

current working directory:  /global/scratch/mmanning/cstest1
these tasks in task script failed:  []


### Send output text file to Google Translate

In [82]:
from google.cloud import translate
# Translate every OCR text file in the data directory and write the result next
# to it as '<name>-<target_language>.txt'.
from html import unescape

os.chdir(scratchDataDirectory)
target_language = 'en'
translated_suffix = '-' + target_language

# Service-account key file, expected in the configured user's home directory.
serviceAccountJson = '/global/home/users/' + savioUsername + '/gdrive_connect_service_account.json'

# create client from service account json
# Named translate_client (not 'client') so the oauth2client 'client' module that
# get_credentials() relies on is not shadowed when cells are re-run.
translate_client = translate.Client.from_service_account_json(serviceAccountJson)

for entry in glob.glob('*.txt'):
    filename, file_extension = os.path.splitext(entry)
    print ('filename: ', filename, 'file_extension: ', file_extension)

    # Outputs of an earlier run also match '*.txt'; translating them again would
    # produce '<name>-en-en.txt' files.
    if filename.endswith(translated_suffix):
        continue

    print ('translate: ', filename)

    # Read input file contents
    with open(entry, encoding='utf8') as infile:
        input_contents = infile.read()

    # Translate file contents
    translation = translate_client.translate(input_contents, target_language=target_language)

    # Translation contains escaped HTML characters such as '&#39;' for an apostrophe.
    # To fix this, unescape HTML
    translation_text = unescape(translation['translatedText'])

    # Write output to output file; 'with' closes it even if the write fails.
    with open(filename + translated_suffix + '.txt', 'w', encoding='utf8') as outfile:
        outfile.write(translation_text)
        
        
        

filename:  10tJPlTHs2HgDVnOpad66jVJcMRaCBi8P file_extension:  .txt
translate:  10tJPlTHs2HgDVnOpad66jVJcMRaCBi8P
filename:  149YlCGjWAySaAPWVqz24mYz24KMAf890 file_extension:  .txt
translate:  149YlCGjWAySaAPWVqz24mYz24KMAf890
filename:  1t0wLN71g-mZiSNSvQuad49ppCQdPgBIb file_extension:  .txt
translate:  1t0wLN71g-mZiSNSvQuad49ppCQdPgBIb
filename:  17Ll7A5qMRz_U5ntIxp-9lBg6aAzLkB0a file_extension:  .txt
translate:  17Ll7A5qMRz_U5ntIxp-9lBg6aAzLkB0a
filename:  1ktfVDmMQ6Z8CWTxuKxVsr36ld9Rpd1k3 file_extension:  .txt
translate:  1ktfVDmMQ6Z8CWTxuKxVsr36ld9Rpd1k3
filename:  120uJh1a_zfgCcSvODL7cYWnRPynmBzjT file_extension:  .txt
translate:  120uJh1a_zfgCcSvODL7cYWnRPynmBzjT
filename:  1eo1-nowSdK_ppWTsj18Yw2yfwGO11u-5 file_extension:  .txt
translate:  1eo1-nowSdK_ppWTsj18Yw2yfwGO11u-5


#### Move the resulting files to bDrive.

In [84]:
def upload_txt_file(name, path, destination_folder=None):
    """Create a plain-text file on Drive from a local file.

    Args:
        name: Name the file gets on Drive.
        path: Local path of the file to upload.
        destination_folder: Optional Drive folder id; when given (and truthy)
            it is set as the new file's only parent.

    Prints the name and the id of the created file.
    """
    if destination_folder:
        metadata = {'name': name, 'parents': [destination_folder]}
    else:
        metadata = {'name': name}

    payload = MediaFileUpload(path, mimetype='text/plain')
    create_request = service.files().create(body=metadata, media_body=payload, fields='id')
    created = create_request.execute()

    print('Uploaded', name, '; ID:', created.get('id'))


In [86]:

# Upload every translated file ('*-<target_language>.txt') from the data
# directory to the results folder on Drive.
os.chdir(scratchDataDirectory)
for entry in glob.glob('*-' + target_language + '.txt'):
    print ("entry: ", entry )
    filename, file_extension = os.path.splitext(entry)
    # The glob pattern already restricts names to this suffix.
    if ( filename.endswith('-' + target_language )):
        # The local file name is reused as the Drive file name.
        upload_txt_file( entry, entry, resultsFolderId) 

        
        

entry:  149YlCGjWAySaAPWVqz24mYz24KMAf890-en.txt
Uploaded 149YlCGjWAySaAPWVqz24mYz24KMAf890-en.txt ; ID: 1CwvpPPXTamR4cqGqRZXHYkhTncF-8kGT
entry:  120uJh1a_zfgCcSvODL7cYWnRPynmBzjT-en.txt
Uploaded 120uJh1a_zfgCcSvODL7cYWnRPynmBzjT-en.txt ; ID: 1PlAaK0RSiqnOJk2AkzZ3LeApIyoga_13
entry:  1t0wLN71g-mZiSNSvQuad49ppCQdPgBIb-en.txt
Uploaded 1t0wLN71g-mZiSNSvQuad49ppCQdPgBIb-en.txt ; ID: 17UM0l3qbnsB6-PJTV3kgpYaSP8yuQxEl
entry:  1ktfVDmMQ6Z8CWTxuKxVsr36ld9Rpd1k3-en.txt
Uploaded 1ktfVDmMQ6Z8CWTxuKxVsr36ld9Rpd1k3-en.txt ; ID: 1ylD_Au_T5qEI2DIHl1Iq1M7Luf2hBoce
entry:  17Ll7A5qMRz_U5ntIxp-9lBg6aAzLkB0a-en.txt
Uploaded 17Ll7A5qMRz_U5ntIxp-9lBg6aAzLkB0a-en.txt ; ID: 1je9v-Lyce3_FQjmIKTEFIcUBAgXW06A9
entry:  10tJPlTHs2HgDVnOpad66jVJcMRaCBi8P-en.txt
Uploaded 10tJPlTHs2HgDVnOpad66jVJcMRaCBi8P-en.txt ; ID: 1x23PHQ8MDoaSvj9CAJmI8hGPaTi8PkFa
entry:  1eo1-nowSdK_ppWTsj18Yw2yfwGO11u-5-en.txt
Uploaded 1eo1-nowSdK_ppWTsj18Yw2yfwGO11u-5-en.txt ; ID: 1dDExQq-NgUcndWuv2WPg6fcHJBoI5L3D
