### This notebook is an exemplar which demonstrates transferring PDF files between a bDrive folder and Savio scratch, converting them to images, and running OCR on those images using Tesseract (inside a Singularity container)

( tested with boxsdk (2.0.0a2) on python 3.5 kernel)
pip install -Iv boxsdk==2.0.0a2 


_This software is available under the terms of the Educational Community License, Version 2.0 (ECL 2.0). This software is Copyright 2016 The Regents of the University of California, Berkeley ("Berkeley")._

The text of the ECL license is reproduced below.

Educational Community License, Version 2.0
*************************************
Copyright 2017 The Regents of the University of California, Berkeley ("Berkeley")

Educational Community License, Version 2.0, April 2007

The Educational Community License version 2.0 ("ECL") consists of the
Apache 2.0 license, modified to change the scope of the patent grant in
section 3 to be specific to the needs of the education communities using
this license. The original Apache 2.0 license can be found at:[http://www.apache.org/licenses/LICENSE-2.0]

### Notebook configuration section
The target and source directories, script file names, and other values used as parameters in the processing below.

In [1]:
# Savio account name; used below to build the /global/scratch/<user>/ paths and in the squeue call.
savioUsername = 'sahilhasan' #Put your savio username here
# Name of the bDrive (Google Drive) folder holding the input PDFs; the folder lookup matches on name only.
boxProjectFolder = 'aatest' #Put the name of your Google Drive Folder with data here, ensure that it is NOT nested
# NOTE(review): boxResultsFolder is not referenced by any later cell visible in this notebook.
boxResultsFolder = 'aatest' #Put the name of the Drive Folder where you would like results placed. 
# Sub-folder of the user's scratch space used as the run folder; also inserted as the SLURM job name.
projectname = 'test'         #Put the name of the folder in your scratch folder you would like data stored in



In [2]:

# Working area on Savio scratch for this project, and the data/ folder beneath it
# that receives the downloaded PDFs and everything derived from them.
runFolder = '/global/scratch/{}/{}/'.format(savioUsername, projectname)
scratchDataDirectory = '/global/scratch/{}/{}/data/'.format(savioUsername, projectname)

# Singularity image providing tesseract/ghostscript, and paths as seen from inside it.
tesseractimage = '/global/scratch/groups/dh/tesseract2_3.img'
tesseractdatadir = '/opt/tessdata/'
tesseractScratchDataDirectory = '/scratch/'
pdfnamelist = []

# Container invocation with the run folder bind-mounted at /scratch/.
SINGULARITYCMD = 'singularity exec -B {}:/scratch/  {}'.format(runFolder, tesseractimage)

# Generated task scripts and the SLURM batch script all live in the run folder.
gsCommandScript, t4CommandScript, slurmScript = (
    runFolder + scriptName
    for scriptName in ('gsCommandScript.sh', 't4CommandScript.sh', 'slurmscript.sh')
)



In [3]:
#Make Directories if they do not exist
import os
import sys
import errno

def dir_create(path):
    """Create ``path`` (including missing parents), tolerating an existing entry.

    Prints a notice and returns when ``os.makedirs`` fails with EEXIST;
    any other OSError is re-raised unchanged.
    """
    try:
        os.makedirs(path)
    except OSError as exception:
        if exception.errno == errno.EEXIST:
            print('Folder at: ' + path + ' already exists. Skipping...')
            return
        raise
# Make sure the run folder and its data/ sub-folder exist before anything is written to them.
dir_create(runFolder)
dir_create(scratchDataDirectory)

### bDrive Authorization

In [4]:
import codecs
import httplib2
from googleapiclient.http import MediaIoBaseDownload
from googleapiclient.http import MediaFileUpload
from apiclient import discovery, errors
from oauth2client import client
from oauth2client import tools
from oauth2client.file import Storage

In [5]:
# OAuth scope string requested when building the authorization flow.
SCOPES = 'https://www.googleapis.com/auth/drive'
# Client-secrets file handed to flow_from_clientsecrets; a relative path, so it is
# resolved against the kernel's working directory at the time get_credentials() runs.
CLIENT_SECRET_FILE = 'client_secret.json'
# Assigned to the flow's user_agent.
APPLICATION_NAME = 'gDriveConnect'

In [6]:
def get_credentials():
    """Return OAuth2 credentials for Drive, running the consent flow when needed.

    Cached credentials are read from ``~/.credentials/gDriveConnect.json``
    (the directory is created if missing). When nothing is cached, or the
    cached credentials are invalid, a flow is built from CLIENT_SECRET_FILE
    and SCOPES, and the resulting credentials are stored at that path.

    Returns:
        The credentials object from the store or from the flow.
    """
    home_dir = os.path.expanduser('~')
    credential_dir = os.path.join(home_dir, '.credentials')
    if not os.path.exists(credential_dir):
        os.makedirs(credential_dir)
    credential_path = os.path.join(credential_dir, 'gDriveConnect.json')

    store = Storage(credential_path)
    credentials = store.get()

    if not credentials or credentials.invalid:
        flow = client.flow_from_clientsecrets(CLIENT_SECRET_FILE, SCOPES)
        flow.user_agent = APPLICATION_NAME

        # The notebook never defines `flags`, so the original lookup raised
        # NameError on first authorization. Honour a user-supplied global if
        # one exists; otherwise build the default flags with an explicit empty
        # argv so the kernel's own command line is not parsed.
        try:
            run_flags = flags
        except NameError:
            run_flags = tools.argparser.parse_args(args=[])

        if run_flags:
            credentials = tools.run_flow(flow, store, run_flags)
        else: # Needed only for compatibility with Python 2.6
            credentials = tools.run(flow, store)
        print('Storing credentials to ' + credential_path)

    return credentials

In [7]:
# Load cached OAuth credentials, or run the consent flow if none are usable.
credentials = get_credentials()
# NOTE(review): the label says 'credential_path' but the value printed is the credentials object itself.
print('credential_path:', credentials)
# Authorize an httplib2 client with the credentials and build the Drive v3 service used by later cells.
http = credentials.authorize(httplib2.Http())
service = discovery.build('drive', 'v3', http=http)

credential_path: <oauth2client.client.OAuth2Credentials object at 0x2b2f485e5d30>


In [8]:
# Look up the id of the Drive folder whose name equals boxProjectFolder.
# Every result page is followed via nextPageToken; reading only the first page
# would miss the folder on accounts with many folders.
# If several folders share the name, the last one listed wins.
targetFolderId = ''
page_token = None
while True:
    response = service.files().list(q="mimeType='application/vnd.google-apps.folder'",
                                    spaces='drive',
                                    fields='nextPageToken, files(id, name)',
                                    pageToken=page_token).execute()
    for file in response.get('files', []):
        if file.get('name') == boxProjectFolder:
            targetFolderId = file.get('id')
    page_token = response.get('nextPageToken', None)
    if page_token is None:
        break

if not targetFolderId:
    print('WARNING: no Drive folder named ' + boxProjectFolder + ' was found')
print('target folder id:' + targetFolderId)

target folder id:0B3QqxeoUcqoAaHV4X1hPYmZNUVE


In [9]:

# Map of Drive file id -> file name for every PDF directly inside the target folder.
downloadMap = {}

# Use the folder id resolved by the folder lookup rather than a hard-coded id,
# so the cell follows whatever boxProjectFolder is configured.
if not targetFolderId:
    raise ValueError('targetFolderId is empty; the Drive folder lookup did not find ' + boxProjectFolder)
pdf_query = "mimeType='application/pdf' and '{}' in parents".format(targetFolderId)

page_token = None
while True:
    response = service.files().list(q=pdf_query,
                                    spaces='drive',
                                    fields='nextPageToken, files(id, name)',
                                    pageToken=page_token).execute()
    for file in response.get('files', []):
        print('Found file: %s (%s)' % (file.get('name'), file.get('id')) )
        downloadMap[file.get('id')] = file.get('name')
    # Keep paging until the API stops returning a continuation token.
    page_token = response.get('nextPageToken', None)
    if page_token is None:
        break

Found file: CNIP 38.pdf (0B3QqxeoUcqoAOFdncU5jTUlyeFE)
Found file: AKT 7a.pdf (0B3QqxeoUcqoAdmc5cF9VU0dWSFU)


In [10]:
# Keep the complete id -> name mapping before pruning: the merge step looks
# original file names up in `totalMap`, which no other cell defines.
# update() rather than plain assignment keeps names already recorded if this
# cell is re-run after downloadMap has been pruned.
try:
    totalMap.update(downloadMap)
except NameError:
    totalMap = dict(downloadMap)

os.chdir(scratchDataDirectory)

# Downloads are stored as '<file id>.pdf' directly in the data directory, so any
# such file already present means that id does not need to be fetched again.
for existingName in os.listdir(scratchDataDirectory):
    if existingName.endswith('.pdf'):
        downloadMap.pop(existingName[:-4], None)
print(str(len(downloadMap)) + " more items to download")
    


2 more items to download


In [11]:
import io

# Download every pending PDF into the data directory as '<file id>.pdf'.
for key, value in downloadMap.items():
    print('downloading:', value)
    request = service.files().get_media(fileId=key)

    # The context manager releases the in-memory buffer on every path,
    # including the early `continue` taken after a failed download.
    with io.BytesIO() as fh:
        downloader = MediaIoBaseDownload(fh, request)
        done = False
        while done is False:
            try:
                status, done = downloader.next_chunk()
                sys.stdout.write('.')
            except errors.HttpError as error :
                print('Error file:', value, '   id:', key)
                print('An error occurred pulling the next chunk:', error)
                break

        # failed during download?
        if done is False:
            continue

        outfile = scratchDataDirectory + key + '.pdf'
        partfile = outfile + '.part'

        print('\nwriting:', outfile)
        # Write to a temporary name and rename afterwards, so an interrupted
        # write never leaves a truncated '<id>.pdf' that would later be taken
        # for a completed download.
        with open(partfile, 'wb') as f2:
            f2.write(fh.getvalue())
        os.replace(partfile, outfile)
print( 'downloads completed.')


downloading: AKT 7a.pdf
..............................................................................................................
writing: /global/scratch/sahilhasan/test/data/0B3QqxeoUcqoAdmc5cF9VU0dWSFU.pdf
downloading: CNIP 38.pdf
.................................................................................
writing: /global/scratch/sahilhasan/test/data/0B3QqxeoUcqoAOFdncU5jTUlyeFE.pdf
downloads completed.


### Utility functions

In [12]:
import re

def natural_sort_key(s, _nsre=re.compile('([0-9]+)')):
    """Build a sort key that orders embedded numbers numerically.

    ``s`` is split on runs of ASCII digits; digit chunks become ints and
    every other chunk is lower-cased, so 'page10' sorts after 'page9'.
    """
    key = []
    for chunk in re.split(_nsre, s):
        if chunk.isdigit():
            key.append(int(chunk))
        else:
            key.append(chunk.lower())
    return key

__function to return all files in directory tree.__

In [13]:
import os
def scantreeForFiles(path):
    """Recursively yield the path string of every non-directory entry under ``path``.

    Directories are descended into; symlinks to directories are not followed
    and are therefore yielded like ordinary entries.
    """
    for entry in os.scandir(path):
        if not entry.is_dir(follow_symlinks=False):
            yield entry.path
            continue
        yield from scantreeForFiles(entry.path)


__function to return list of all folders in directory tree.__

In [14]:
import os
def scandirForFolders(path, dirlist):
    """Append the path of every sub-directory beneath ``path`` to ``dirlist``.

    The list is filled in place (a parent is added before its children) and
    nothing is returned. ``path`` itself is not added, and symlinks to
    directories are not followed.
    """
    subfolders = (entry for entry in os.scandir(path)
                  if entry.is_dir(follow_symlinks=False))
    for folder in subfolders:
        dirlist.append(folder.path)
        scandirForFolders(folder.path, dirlist)


__Validate all the task log files produced by ht_helper__

In [15]:
def validateTaskResults(fileroot, totalTasks):
    """Return the task numbers whose log does not end with exit code '0'.

    Each task's log is expected at ``fileroot + '.' + <task number>``, with
    tasks numbered from 0 to ``totalTasks - 1``. The generated task lines end
    in ``echo $?``, so the last line of a successful task's log is '0'.

    Args:
        fileroot: log path without the trailing task number.
        totalTasks: number of task lines written to the task script.

    Returns:
        List of task numbers that failed: the last log line is not exactly
        '0', the log is empty, or the log file is missing.
    """
    errorList = []

    # range(totalTasks) covers every task; the earlier 'totalTasks-1' bound
    # silently skipped the last one.
    for i in range(totalTasks):
        fn = fileroot + '.' + str(i)
        if not os.path.exists(fn):
            # A missing log cannot prove success, so report it as failed
            # instead of reusing the previous task's result.
            print ('warning: log file not available: ', fn)
            errorList.append(i)
            continue

        with open(fn, 'r', errors='replace') as logFile:
            logText = logFile.read()
        # Same result as `tail -1`: drop one final newline, keep the last line.
        if logText.endswith('\n'):
            logText = logText[:-1]
        retval = logText.rsplit('\n', 1)[-1]

        if ( retval != '0' ):
            errorList.append(i)

    return errorList


__SLURM job script__ normal

In [16]:
# SLURM batch script template.
# projectname is spliced in as the job name when this cell runs. The two '{}'
# placeholders are filled later with str.format(): first the wall-clock limit
# (--time), then the task file handed to ht_helper.sh through -t.
batchtemplate = '#!/bin/bash -l  \n\
# Job name: \n\
#SBATCH --job-name=' + projectname + '\n\
# \n\
# Account: \n\
#SBATCH --account=ac_scsguest \n\
# \n\
# Partition: \n\
#SBATCH --partition=savio2 \n\
# \n\
## Scale by increasing the number of nodes \n\
#SBATCH --nodes=5  \n\
## DO NOT change ntasks-per-node setting as T4 also distributes across cores \n\
#SBATCH --ntasks-per-node=6 \n\
#SBATCH --qos=savio_normal \n\
# \n\
# Wall clock limit: \n\
#SBATCH --time={} \n\
# \n\
## Command(s) to run: \n\
module load gcc openmpi  \n\
/global/home/groups/allhands/bin/ht_helper.sh  -t {} -n1 -s1 -vL \n' 


### Create script to convert all pdf files in working directory to images


In [17]:
import glob, os
import shutil 

# Ghostscript runs inside the Singularity image, with the data directory
# bind-mounted at /scratch/ in the container.
# TEMPLATE: gs -dBATCH -dNOPAUSE -dQUIET -sDEVICE=png16m -sOutputFile=/scratch/test/output/test-%d.png -r300 /scratch/test/germanocr.pdf
SINGULARITYCMD = 'singularity exec -B {}:/scratch/ /global/scratch/groups/dh/tesseract2_3.img '
GHOSTSCRIPTCMD = 'gs -dBATCH -dNOPAUSE -dQUIET -sDEVICE=png16m -sOutputFile="{}-%d.png" -r300 "{}" ;  echo $?'

os.chdir(scratchDataDirectory)
print ('current working directory: ', os.getcwd())

scmd = SINGULARITYCMD.format(scratchDataDirectory)
hostPrefixLength = len(scratchDataDirectory)

# total number of ghostscript tasks
gsCommandTotal = 0

# One task line per PDF: render its pages to <name>-<page>.png next to the PDF
# and echo the exit status as the last line of the task's log.
with open(gsCommandScript, 'w') as f:
    for entry in scantreeForFiles(scratchDataDirectory):
        if not entry.endswith('.pdf'):
            continue
        filename, file_extension = os.path.splitext(entry)
        # Translate host paths to the paths seen inside the container.
        containerPdf = tesseractScratchDataDirectory + entry[hostPrefixLength:]
        containerStem = tesseractScratchDataDirectory + filename[hostPrefixLength:]
        f.write(scmd + GHOSTSCRIPTCMD.format(containerStem, containerPdf) + '\n')
        gsCommandTotal += 1

# Batch script for this run: 30 minute wall-clock limit, ghostscript task file.
outputbatchscript = batchtemplate.format('00:30:00',  gsCommandScript)
with open(slurmScript, 'w') as f:
    f.write(outputbatchscript)

current working directory:  /global/scratch/sahilhasan/test/data


__Execute the task script with ht_helper__

In [18]:
# Submit from the run folder so the relative script name resolves there.
os.chdir(runFolder)
print ('current working directory: ', os.getcwd())

# IPython shell capture: `out` receives the lines printed by sbatch.
out = !sbatch slurmscript.sh   
    
print ('Execute ghostscript output: ', out ) 
# The job id is taken as the fourth whitespace-separated token of the first
# output line (e.g. 'Submitted batch job 1552824').
jobId =  out[0].split()[3]
print (jobId)

current working directory:  /global/scratch/sahilhasan/test
Execute ghostscript output:  ['Submitted batch job 1552824']
1552824


In [19]:
import time
# print the users queue and the job status by id
!squeue -u $savioUsername #possibly do not need
print('--------------------------------')
print('Savio Job has been submitted. This cell will notify you when the job is done.')
jobState = False
while not jobState:
    out = !scontrol show job $jobId
    if any("COMPLETED" in s for s in out):
        print('\n******Savio Job finished******')
        jobState = True
    else:
       print('.', end='')
       time.sleep(30) #Can tweak this so that people can see it moving
        


             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
           1552824    savio2     test sahilhas PD       0:00      5 (Priority)
--------------------------------
Savio Job has been submitted. This cell will notify you when the job is done.
................
******Savio Job finished******


__Check all task log files for bad exit code__  
task numbers align with lines in the task script  
check the log file of tasks in the returned array of failures  

In [20]:
import glob, os
# Only inspect the task logs once the polling cell has seen the job complete.
if jobState:
    print ('current working directory: ', os.getcwd())
    # Log file prefix without the task number: <job name>.<job id>.log
    fileroot = '.'.join([projectname, jobId, 'log'])
    tasklist = validateTaskResults(fileroot, gsCommandTotal)
    print ('these tasks in task script failed: ', tasklist)
else:
    print("WARNING: Your SLURM Job has not finished processing! Please wait for the cell above to complete.")


current working directory:  /global/scratch/sahilhasan/test
these tasks in task script failed:  []


__Remove task logs after any errors have been resolved__

In [21]:
 
# Remove this job's task logs. The pattern is named logPattern so the builtin
# filter() is not shadowed for the rest of the kernel session.
logPattern = fileroot + '*'
print ('filter: ', logPattern)
# The job is submitted from the run folder, so anchor the glob there rather
# than relying on whatever the current working directory happens to be.
for logPath in glob.glob(os.path.join(runFolder, logPattern)):
    os.remove(logPath)

filter:  test.1552824.log*


### Create script to ocr all png files in working directory to text

In [22]:
import glob, os
os.chdir(scratchDataDirectory)
print ('current working directory: ', os.getcwd())
# template: tesseract --tessdata-dir /opt/tessdata /scratch/germanocr_Page_01.png  germanout  -l deu
# OMP_NUM_THREADS=1 is set in the command, and the trailing echo writes the
# exit status as the last line of the task's log.
TCMD = ' sh -c \'OMP_NUM_THREADS=1 tesseract --tessdata-dir /opt/tessdata  -l deu+eng+tur+fra -c tessedit_create_hocr=1 "{}" "{}" \';  echo $?'

scmd = SINGULARITYCMD.format(scratchDataDirectory)
hostPrefixLength = len(scratchDataDirectory)

# total number of tesseract tasks
t4CommandTotal = 0

# One task line per PNG: OCR it to an hOCR file with the same base name.
with open(t4CommandScript, 'w') as f:
    for entry in scantreeForFiles(scratchDataDirectory):
        if not entry.endswith('.png'):
            continue
        filename, file_extension = os.path.splitext(entry)
        # Translate host paths to the paths seen inside the container.
        containerPng = tesseractScratchDataDirectory + entry[hostPrefixLength:]
        containerStem = tesseractScratchDataDirectory + filename[hostPrefixLength:]
        f.write(scmd + TCMD.format(containerPng, containerStem) + '\n')
        t4CommandTotal += 1

# Batch script for this run: 3 hour wall-clock limit, tesseract task file.
outputbatchscript = batchtemplate.format('03:00:00',  t4CommandScript)
with open(slurmScript, 'w') as f:
    f.write(outputbatchscript)

current working directory:  /global/scratch/sahilhasan/test/data


__Execute the task script with ht_helper__

In [23]:
# Submit from the run folder so the relative script name resolves there.
os.chdir(runFolder)
print ('current working directory: ', os.getcwd())

# IPython shell capture: `out` receives the lines printed by sbatch.
out = !sbatch slurmscript.sh   
    
print ('Execute tesseract4 output: ', out ) 
# The job id is taken as the fourth whitespace-separated token of the first
# output line (e.g. 'Submitted batch job 1552829').
jobId =  out[0].split()[3]
print (jobId)

current working directory:  /global/scratch/sahilhasan/test
Execute tesseract4 output:  ['Submitted batch job 1552829']
1552829


In [24]:
import time
# print the users queue and the job status by id
!squeue -u $savioUsername #possibly do not need
print('--------------------------------')
print('Savio Job has been submitted. This cell will notify you when the job is done.')
jobState = False
while not jobState:
    out = !scontrol show job $jobId
    if any("COMPLETED" in s for s in out):
        print('\n******Savio Job finished******')
        jobState = True
    else:
       print('.', end='')
       time.sleep(30) #Can tweak this so that people can see it moving
        

             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
           1552829    savio2     test sahilhas PD       0:00      5 (Priority)
--------------------------------
Savio Job has been submitted. This cell will notify you when the job is done.
...................................
******Savio Job finished******


__Check all task log files for bad exit code__

In [25]:

# fileroot is a relative path, so move to the run folder before reading the task logs.
os.chdir(runFolder)
print ('current working directory: ', os.getcwd())

# Log file prefix without the task number: <job name>.<job id>.log
fileroot = projectname + '.' + jobId + '.log'
#tasklist = validateTaskResults(fileroot, 10) first check a small subset
tasklist = validateTaskResults(fileroot, t4CommandTotal)
print ('these tasks in task script failed: ', tasklist)

# Log removal is left disabled here; uncomment once any failures have been looked at.
# Remove task logs
#filter = fileroot + '*'
#for f in glob.glob(filter):
#    os.remove(f)

current working directory:  /global/scratch/sahilhasan/test
these tasks in task script failed:  []


### Merge text files and upload to Box

In [26]:
from scandir import scandir
# Collect every sub-directory beneath the data directory.
# The data directory itself is not added to the list by scandirForFolders.
dirlist = []

scandirForFolders(scratchDataDirectory, dirlist)

print("num dirs: ", len(dirlist) ) 


num dirs:  0


__check that for every .png there is a .hocr in each directory__

In [27]:
# Every .png must have a matching .hocr next to it.
# dirlist holds only sub-directories, so the data directory itself is added
# here; without it nothing is checked when all images sit directly in
# scratchDataDirectory.
missingResultList = []
for currentdir in [scratchDataDirectory] + dirlist:
    # Absolute paths are used throughout, so the working directory is left alone.
    for filename in os.listdir(currentdir):
        pngPath = os.path.join(currentdir, filename)
        if os.path.isfile(pngPath) and filename.endswith('.png'):
            fn, fe = os.path.splitext(pngPath)
            if not os.path.exists(fn + '.hocr'):
                missingResultList.append(pngPath)
                print ('missing result: ', pngPath )
print("missingResultList size: ", len(missingResultList) ) 

missingResultList size:  0


__process hocr files__

Current scoring approach:
- To be counted, the confidence score for a word must be strictly between 25 and 70; this removes some garbage characters at the low end.
- If a line contains between 7 and 9 scoring words it registers as a small hit; if it contains 10 or more, the line registers as a big hit.
- If the total score for the last three lines (a rolling window) is > 15, that is a small hit; > 25 is a big hit.
- If a "paragraph" (currently the div tag in the hOCR XML) has > 10 scoring words, that is a small hit; > 20 is a big hit.
- Line and paragraph hits are only kept when more than two thirds of their scoring words contain at least two hyphens/dashes (see `vetHitList`).
This scoring approach seems to be doing a good job of finding target text. However, it also includes a number of false positives that I 
have not been able to reduce significantly. Table and figures are usually tagged as hits. The Teissier_Sealings doc has a number of tables, 
rotated to landscape, which show up in the big hits list. I have not found a way to identify a table or figure from the xml results. 
There are a couple papers online with complex detection algorithms but nothing I could implement without significant development time.

In [28]:
import sys

# Matches any string containing at least two hyphen / em-dash characters.
# NOTE(review): the trailing class '[.-—]' is a character RANGE from '.' to
# '—', not a three-character set; because it is followed by '*' it never
# changes whether search() succeeds, so it is left untouched.
srch = re.compile(r'.*[-—]+.*[-—]+[.-—]*')

def vetHitList(hitlist):
    """Tell whether more than two thirds of the words look hyphenated.

    A word counts when ``srch`` finds at least two hyphen/dash characters in it.

    Args:
        hitlist: sequence of word strings.

    Returns:
        True when the fraction of matching words exceeds .666, else False.
        An empty ``hitlist`` returns False instead of dividing by zero.
    """
    totalHits = len(hitlist)
    if totalHits == 0:
        return False

    count = sum(1 for hit in hitlist if srch.search(hit))
    return (count / totalHits) > .666


In [29]:
import collections
from bs4 import BeautifulSoup
import logging
import lxml
import lxml.etree
import bs4.builder._lxml
import bs4.builder._html5lib



def parseHocrFiles(filenameroot, fileList):
    """Score the hOCR pages of one document and write its big/small hit files.

    Words whose x_wconf confidence is strictly between 25 and 70 are counted
    per ocr_line. Hits are recorded per line, per rolling three-line window
    and per div, then written one record per page number.

    Args:
        filenameroot: identifier stored as the first field of every hit and
            used as the prefix of the two output file names.
        fileList: iterable of file paths; entries not ending in '.hocr' are
            skipped. The page number is the second-to-last token obtained by
            splitting the path on '-' and '.'.

    Side effects:
        Writes scratchDataDirectory + filenameroot + '_bighits.txt' and
        '_smallhits.txt' (UTF-8), logs through the logging module, and prints
        the hits.

    Returns:
        None.
    """

    # NOTE(review): basicConfig has no effect once the root logger already has handlers,
    # so presumably only the first call in a kernel session configures 'ocrparse.log' — confirm.
    logging.basicConfig(handlers=[logging.FileHandler('ocrparse.log', 'w', 'utf-8')], level=logging.INFO, format='%(message)s',  datefmt='%Y-%m-%d %H:%M:%S')

    # rolling window over the counts / words of the last three lines
    last_three_lines = collections.deque(3*[0], 3)
    last_three_line_words = collections.deque(3*[''], 3)
    div_count=0
    # accumulators for the line currently being read; flushed when the next ocr_line starts
    avg_low_score = 0
    low_score_ctr = 0
    low_score_words = []
    div_words = []
    line_id = ''
    bighits = []
    smallhits = []

    for filename in fileList :

        if not filename.endswith(".hocr"):
            continue

        #print (' filename: ', filename)

        # split out the file name and the page (image) number
        splittokens = re.split(r"-|\.", filename)
        tot = len(splittokens)
        image_number = splittokens[tot - 2]
        image_number_decimal = int(image_number.strip())
        #doc_name = splittokens[0]
        #print (' doc name: ', filenameroot)
        #print (' image number: ', image_number)
        # NOTE(review): the file object opened here is never explicitly closed.
        soup = BeautifulSoup(open(filename, encoding='utf-8'), "lxml")
        #print ('==========>', filename )
        logging.info("==========> %s", filename )

        # the rolling window restarts for every file; the per-line and per-div
        # accumulators are NOT reset here and carry over from the previous file
        last_three_lines.clear()
        last_three_line_words.clear()

        for div_tag in soup.find_all('div'):


            # NOTE(review): subscript access presumably raises KeyError when a div has no id,
            # which would make the None branch below unreachable — confirm.
            div_id = div_tag['id']
            if div_id is None:
                div_id = 'None'
            else:
                div_id = div_id.strip()


            # div_words holds the words gathered since the previous div was reached,
            # so this check evaluates the words collected before the current div
            div_count = len(div_words)
            #check words in hit list for hypens
            #logging.info("div words: %s ", div_words)

            if div_count > 0 :
                gooddivwords = vetHitList( div_words )
            else:
                gooddivwords = False

            if gooddivwords :
                logging.info("good div words: %s ", div_words)

            # more than 20 vetted words -> big hit; more than 10 -> small hit
            if div_count > 20 and gooddivwords:
                bighits.append([filenameroot, image_number_decimal, "div count: " + str(div_count), div_words] )
                logging.info("file: %s  div count: %d tag: %s ", filename, div_count, div_id )
            elif div_count > 10 and gooddivwords:
                smallhits.append([filenameroot, image_number_decimal, "div count: " + str(div_count), div_words] )
                logging.info("file: %s  div count: %d tag: %s ", filename, div_count, div_id )

            div_count = 0
            div_words = []


            #print 'tag initial: ', tag
            #print ('tag class: ', div_tag['class'] )
            # only divs carrying the ocr_page class have their spans scanned
            if 'ocr_page' in div_tag['class']:
                #logging.info("ocr_page: %s" % tag['title'])
                #print 'tag filtered: ', tag
                for span_tag in div_tag.find_all('span'):
                    #print spantag

                    # a new ocr_line span closes the previous line: score it, then reset the counters
                    if 'ocr_line' in span_tag['class']:
                        line_id = span_tag['id'].strip().encode('utf-8')
                        #print ('new line :', line_id,  ' process prev set then reset counters')

                        #check words in hit list for hypens
                        if len(low_score_words) > 0:
                            goodwords = vetHitList( low_score_words )
                        else:
                            goodwords = False
                        #print("goodwords: ", goodwords)
                        # lsw is the bytes-encoded copy that is printed and stored in the line hits
                        lsw = [x.encode('utf-8') for x in low_score_words]
                        # 7 to 9 scoring words on the line -> small hit
                        if low_score_ctr > 6 and low_score_ctr <= 9 and goodwords :
                            
                            print ('mid range hit: ',  lsw  )
                            logging.info("line:  %s   score: %d   avg low score: %f  words:  %s",  line_id, low_score_ctr, (avg_low_score/low_score_ctr) , low_score_words  )
                            #smallhits.append( [filenameroot, image_number_decimal, low_score_ctr, low_score_words] )
                            smallhits.append( [filenameroot, image_number_decimal, low_score_ctr, lsw] )

                        # 10 or more scoring words on the line -> big hit
                        if low_score_ctr >= 10 and goodwords :
                            
                            print ('high range hit', lsw  )
                            logging.info("line:  %s   score: %d    avg low score: %f  words:  %s",  line_id, low_score_ctr, (avg_low_score/low_score_ctr),   low_score_words )
                            #bighits.append( [filenameroot, image_number_decimal, low_score_ctr, low_score_words] )
                            bighits.append( [filenameroot, image_number_decimal, low_score_ctr, lsw] )

                        div_words.extend(low_score_words)


                        # add to the counter of the last three lines and if total is over the threshold then log
                        # (these window hits are recorded without the hyphen vetting)
                        last_three_lines.appendleft(low_score_ctr)
                        last_three_line_words.appendleft(low_score_words)
                        total_last_three_lines = sum(last_three_lines)
                        if total_last_three_lines > 25 :
                            logging.info("line:  %s   last three lines:  %s",  line_id, last_three_lines )
                            bighits.append( [filenameroot, image_number_decimal, "three line total:" + str(total_last_three_lines) , list(last_three_line_words) ] )
                        elif total_last_three_lines > 15 :
                            logging.info("line:  %s   last three lines:  %s",  line_id, last_three_lines )
                            smallhits.append( [filenameroot, image_number_decimal, "three line total:" + str(total_last_three_lines), list(last_three_line_words) ] )


                        low_score_words = []
                        avg_low_score = 0
                        low_score_ctr = 0

                        # that is all the processing when a new line is reached
                        continue

                    # spans without a single direct string are skipped
                    if span_tag.string is None:
                        continue

                    spantagword = span_tag.string.strip()
                    #print ('span tag: ', spantagword.encode("utf-8")  )
                    # the title attribute is a ';'-separated list; the x_wconf element carries the score
                    span_title_split = span_tag['title'].split(';')
                    for span_title_element in span_title_split:
                        if 'x_wconf' in span_title_element:
                            #label, score = title_element.split(' ')
                            score = span_title_element.replace('x_wconf', '').strip()
                            #print( 'word: ', spantagword.encode("utf-8"), 'score: ', int(score.strip()) )

                            # count only words whose confidence is strictly between 25 and 70
                            if int(score.strip())  < 70 and int(score.strip()) > 25 :
                                #logging.info('word:  %s score: %s ',  spantagconverted, score.strip() )
                                low_score_ctr = low_score_ctr + 1
                                low_score_words.append( spantagword )
                                avg_low_score = avg_low_score + int(score.strip())




    # files to hold the totals
    #print("create results files for: ", filenameroot)
    bighitssorted = open( scratchDataDirectory + filenameroot + '_bighits.txt', 'w', encoding="utf-8")
    smallhitssorted = open( scratchDataDirectory + filenameroot + '_smallhits.txt', 'w', encoding="utf-8")

    # order hits by page number
    bigsortedlist =  sorted(bighits, key=lambda row: row[1], reverse=False)
    logging.info('bigsortedlist: %s ', bigsortedlist)
    smallsortedlist =  sorted(smallhits, key=lambda row: row[1], reverse=False)
    logging.info('smallsortedlist: %s ', smallsortedlist)

    # only the first hit per page number is written; `unique` is shared by both loops,
    # so a page already written as a big hit is not written again as a small hit
    unique = []
    for hit in bigsortedlist:
        if hit[1] not in unique :
            unique.append( hit[1] )
            print("big hit:", hit)
            #bighitssorted.write(hit[0] + ';' + hit[1] + ';' + hit[3]  + "\n"  )
            thehit = str(hit[3])
            #bighitssorted.write( str(hit[3])   )
            bighitssorted.write(str(hit[0]) + ';' + str(hit[1]) + ';' + str(hit[3])  + "\n" )
    bighitssorted.close()

    for hit in smallsortedlist:
        if hit[1] not in unique :
            unique.append( hit[1] )
            print("small hit:", hit)
            smallhitssorted.write(str(hit[0]) + ';' + str(hit[1]) + ';' + str(hit[3])  + "\n"  )
            #smallhitssorted.write(str(hit[0]).encode('utf-8') + ';' + str(hit[1]).encode('utf-8') + ';' + str(hit[3]).encode('utf-8') + "\n"  )
    smallhitssorted.close()



In [30]:
filenameList = []

import fnmatch

pattern = '*.hocr'

# Each downloaded document is stored under its Drive file id, so the id doubles
# as the prefix of that document's per-page .hocr files.
for key, value in downloadMap.items():
    hocrfileList = [
        scratchDataDirectory + hocrname
        for hocrname in os.listdir(scratchDataDirectory)
        if fnmatch.fnmatch(hocrname, pattern) and hocrname.startswith(key)
    ]
    parseHocrFiles(key, hocrfileList)

print('\n\nFinished Processing OCR Files.')

big hit: ['0B3QqxeoUcqoAdmc5cF9VU0dWSFU', 101, 'div count: 28', ['l—di—Sü-enö', 'Puzum-A-ğur', 'Sü-enö-na-da', 'Êu-da-ad', 'Puzum-A—ğur', 'l—di-Sü-enâ', 'sa', '1/3', 'i-sé-er', 'Puzum-A-ğur', 'Sü-enÖ—na-da', 'Êu-da-ad', 'A—ğur-SIPA', 'i-ğu-ü', 'is‘-li:', "ha—muğ-lı'm", 'sa', "Êu-A-nl'm", 'Kur-ub-lğlar', '61-116!', "ha-am-ğa-lı'm", 'i-ğa—qü-lu', 'sıt—ma', 'u4-mi-ğu—nu', 'iğ-qü—lu', 'ğdl-mi—ğu-nu', 'sa', 'Puzum-A-ğur']]
big hit: ['0B3QqxeoUcqoAdmc5cF9VU0dWSFU', 202, 'div count: 24', ["Su-Ku-bı'4-ı'm", 'Ğu-Sü-eng', 'En-um-A-ğur', 'i-sé-er', 'A-§ur-SIPA', 'i—§u', 'iğ-liı', "ha—muğ-lı'm", 'sa', 'U-Şü-ur-ğa-A—ğur', 'Àb—ÿa-ra-ni', 'ğa-na-al', 'i-ğa-qal', "ğa-ni-lı'm", 'sel—lim', 'i-ğa-qal', "ğa-li-ı'ğ-lı'm", 'sel—lim', 'i-ğa-qal', 'sıt—ma', 'u4-mi-ğu', 'iğ-qü-ul', '61-716!', 'sa']]
big hit: ['0B3QqxeoUcqoAdmc5cF9VU0dWSFU', 344, 'div count: 21', ['Ld-ma-Ëa', 'Kur—ub—lğlar', "Ğu-Be-lı'm", 'sa', 'A-ğur-SIPA', 'um—me-a-ni-ğu', "i-kc'ı-Şü-ru-ğu-nı'", "iğ-lı'", 'um—me-a-ni-ğu', '_lup-pu-ğu', 'sıt—m

__merge all the big hit and small hit result files__

In [31]:
big_hit_list = runFolder + 'bighitlist.txt'
small_hit_list = runFolder + 'smallhitlist.txt'
completedSet = set()

# totalMap (file id -> original file name) is not defined by any cell visible in
# this notebook; fall back to downloadMap so the lookup cannot raise NameError.
try:
    idToName = totalMap
except NameError:
    idToName = downloadMap


def _mergeHitFiles(suffix, mergedPath, tag):
    """Append every '<file id><suffix>' hit file in the data directory to mergedPath.

    The original file name is appended to each line as a final ';' field
    (empty when the id is unknown). Every id seen is added to completedSet,
    and each merged line is echoed to stdout prefixed with ``tag``.
    """
    # Append mode is kept from the original cell, so results accumulate across
    # runs; re-running the cell therefore repeats lines already merged.
    with open(mergedPath, 'a', encoding='utf-8') as merged:
        # The hit files are written into scratchDataDirectory, so look there
        # explicitly instead of in the current working directory.
        for hitPath in sorted(glob.glob(os.path.join(scratchDataDirectory, '*' + suffix))):
            # Strip the known suffix rather than splitting on '_', because a
            # Drive file id may itself contain underscores.
            hitid = os.path.basename(hitPath)[:-len(suffix)]
            completedSet.add(hitid)
            idname = idToName.get(hitid, '')

            # The hit files are written as UTF-8, so read them back the same way.
            with open(hitPath, encoding='utf-8') as hitFile:
                for line in hitFile:
                    out = line.strip('\n') + ';' + idname + '\n'
                    sys.stdout.write(tag + out)
                    merged.write(out)


_mergeHitFiles('_smallhits.txt', small_hit_list, 'out1: ')
_mergeHitFiles('_bighits.txt', big_hit_list, 'out2: ')

__cleanup__

In [32]:
print("num dirs: ", len(dirlist) ) 

# NOTE(review): dirlist holds only sub-directories of the data directory, so files
# stored directly in scratchDataDirectory are not touched by this loop — confirm intent.
for currentdir in dirlist:
    os.chdir(currentdir)
    print ('current working directory: ', os.getcwd())
    
    # removes every regular file whose name does not end in 'hits.txt'
    # (not just pdf and png files)
    for currentFile in os.listdir(os.getcwd()):
        if os.path.isfile(currentFile) and not currentFile.endswith('hits.txt'):
                os.remove(os.path.join(currentdir, currentFile))
    

num dirs:  0


#### Upload the resulting hit list file(s) to bDrive.

In [None]:
# Upload the merged hit lists from the run folder to Drive.
# The authorized Drive client built in this notebook is named `service`;
# `drive_service` is never defined. 'hitlist.txt' is kept in the list for
# backward compatibility, but the merge step writes bighitlist.txt and
# smallhitlist.txt, so those are uploaded too.
# NOTE(review): files are created without a parent folder, so boxResultsFolder
# is not honoured here — confirm where the results should land.
for hitListName in ('hitlist.txt', 'bighitlist.txt', 'smallhitlist.txt'):
    hitListPath = runFolder + hitListName
    if not os.path.exists(hitListPath):
        print('Skipping upload, file not found: ' + hitListPath)
        continue

    file_metadata = { 'name' : hitListName }
    media = MediaFileUpload(hitListPath,
                            mimetype='text/plain')
    file = service.files().create(body=file_metadata,
                                  media_body=media,
                                  fields='id').execute()
    print ('File ID: ' +  str(file.get('id')))