In [56]:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
import os
import re
from io import StringIO
import PyPDF2
import pandas as pd
from datetime import datetime 
import datetime as dt
from timeit import default_timer as timer
import logging
import csv

### User-defined functions (UDFs)

In [47]:
def convertPdfToTxt(path):
    """Extract the text of every page of a PDF file with pdfminer.

    Each page returned by ``PDFPage.get_pages`` is run through a
    ``PDFPageInterpreter`` that feeds a ``TextConverter`` writing into an
    in-memory ``StringIO`` buffer; the buffer's contents are returned.

    Parameters
    ----------
    path : str
        Location of the PDF file; it is opened in binary mode.

    Returns
    -------
    str
        The text accumulated in the buffer after all pages were processed.

    Raises
    ------
    Exception
        Errors from ``open`` or from pdfminer propagate to the caller; the
        file, buffer and converter are released before they do.
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    # The context managers close the file and the buffer even when a page
    # fails to parse, so a bad PDF no longer leaks an open file handle.
    with open(path, 'rb') as fp, StringIO() as retstr:
        device = TextConverter(rsrcmgr, retstr, laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # Empty page set and maxpages=0 are passed exactly as before.
            pages = PDFPage.get_pages(fp, set(), maxpages=0, caching=True,
                                      check_extractable=True)
            for page in pages:
                interpreter.process_page(page)
            # Read the buffer before anything is closed.
            return retstr.getvalue()
        finally:
            device.close()

def extractText(desc, startStr, endStr):
    """Return the part of ``desc`` located between two marker strings.

    Parameters
    ----------
    desc : str
        Text to search.
    startStr : str
        Marker that precedes the wanted text. When empty, everything before
        the first ``endStr`` is returned.
    endStr : str
        Marker that follows the wanted text. When empty (and ``startStr`` is
        not), everything after ``startStr`` is returned.

    Returns
    -------
    str
        When ``startStr`` or ``endStr`` is empty: the extracted text only.
    tuple of (str, str)
        When both markers are non-empty: ``(result, remainDesc)`` where
        ``remainDesc`` is the rest of ``desc`` starting at ``endStr``.
    None
        When a marker cannot be found (or the arguments are not strings);
        "Failed to extract text" is printed in that case.

    Notes
    -----
    ``endStr`` is searched for *after* the end of ``startStr``. Searching from
    the beginning of ``desc`` would pick up an earlier occurrence of the end
    marker and yield an empty result with the whole text as the remainder.
    """
    try:
        startStrIndex = desc.index(startStr)
        contentStart = startStrIndex + len(startStr)
        if startStr == '':
            # No start marker: take everything up to the first end marker.
            return desc[:desc.index(endStr)]
        if endStr == '':
            # No end marker: take everything after the start marker.
            return desc[contentStart:]
        # Only an end marker that follows the start marker is meaningful.
        endStrIndex = desc.index(endStr, contentStart)
        return desc[contentStart:endStrIndex], desc[endStrIndex:]
    except (ValueError, TypeError, AttributeError):
        # ValueError: marker not found; TypeError/AttributeError: non-string
        # arguments. Kept best-effort: report and hand back None.
        print("Failed to extract text")
        return None

def returnTableList(textCopy, no):
    """Split statement text into one table string per page.

    For each page ``i`` in ``1..no`` the text between the marker
    ``'Page i of no'`` and the footer line
    ``'Penyata ini dicetak melalui komputer.'`` is extracted with
    ``extractText``, the column-header string is removed from it, and the
    search continues in the text that follows the footer.

    Parameters
    ----------
    textCopy : str
        Full text of the statement.
    no : int
        Number of pages to extract.

    Returns
    -------
    list of str
        One table string per page, in page order.

    Raises
    ------
    ValueError
        When the page marker or the footer cannot be found for a page
        (``extractText`` returned ``None``).
    """
    footer = 'Penyata ini dicetak melalui komputer.'
    columnHeader = "TARIKHURUS NIAGADEBITKREDITBAKIDATETRANSACTIONDEBITCREDITBALANCE"
    tempList = []
    for i in range(1, no + 1):
        pageMarker = 'Page %s of %s' % (i, no)
        extracted = extractText(textCopy, pageMarker, footer)
        if extracted is None:
            # Unpacking None would raise an uninformative TypeError; name the
            # page so the log entry written by the caller is actionable.
            raise ValueError("Could not extract table for '%s'" % pageMarker)
        table, remain = extracted
        # Continue after this page's footer so the next page marker is
        # searched for in the remaining text only.
        textCopy = remain[remain.index(footer) + len(footer):]
        # The header contains no regex metacharacters, so a plain literal
        # replacement is equivalent to the previous re.sub.
        tempList.append(table.replace(columnHeader, ''))
    return tempList

### Define parameters

In [49]:
# Folder that holds the PDF files to process.
folder = "Test"
# One date stamp shared by both output names.
today = datetime.now().strftime("%Y-%m-%d")
# CSV that receives the names of files that could not be processed.
failedFileCsv = 'Log/FailedFileList_{}.csv'.format(today)
#log file
logFilePath = 'Log/ProcessingRecord'

# Build the name with str.format only, so the path prefix is never interpreted
# as a strftime pattern.
logFileName = '{}__{}.log'.format(logFilePath, today)

# Neither logging's file handler nor open() creates missing parent
# directories, so make sure the output folder exists on a fresh checkout.
os.makedirs(os.path.dirname(failedFileCsv), exist_ok=True)

# Drop handlers left over from an earlier run of this cell; basicConfig does
# nothing when the root logger already has handlers.
for handler in logging.root.handlers[:]:
    logging.root.removeHandler(handler)
logging.basicConfig(filename = logFileName, filemode = 'a',
                    level = logging.DEBUG, format = '%(levelname)s:%(message)s')

### Store metadata

In [58]:
#store metadata - (filename, date, previous date, excel filename)
def _buildMetaData(filename, dateFormat):
    """Derive the metadata tuple for one file name.

    The first space-separated token of ``filename`` (dots replaced by
    slashes) is parsed with ``dateFormat``; the text after the first '-' is
    combined with the date to form the Excel file name.

    Returns ``(filename, date, previousDate, excelFile)`` with both dates
    formatted as ``dd/mm``. Raises ``ValueError`` when the token does not
    match ``dateFormat`` and ``IndexError`` when the name contains no '-'.
    """
    rawDate = filename.split(' ')[0].replace('.', '/')
    # NOTE: no year is present, so strptime assumes 1900 (not a leap year);
    # a token such as "29.02" therefore fails to parse in either format.
    parsedDate = datetime.strptime(rawDate, dateFormat)
    date = parsedDate.strftime('%d/%m')
    previousDate = (parsedDate - dt.timedelta(days=1)).strftime('%d/%m')
    # Raw string: '\.' in a plain literal is an invalid escape sequence.
    excelFile = re.sub(r'/|\.', '_', date + ' ' + filename.split('-')[1]) + '.xlsx'
    return (filename, date, previousDate, excelFile)


metaDataList = []

fileList = os.listdir(folder)

# First pass: read the leading token as day/month.
for filename in fileList:
    infoMsg = "Storing metadata - %s"%filename
    logging.info(infoMsg)

    infoMsg = "Extracting date (day/month) - %s \n"%filename
    logging.info(infoMsg)

    try:
        metaDataList.append(_buildMetaData(filename, "%d/%m"))
    except Exception as e:
        logging.error(e)

#checking if any files failed the first pass
parsedNames = {meta[0] for meta in metaDataList}
failedFileList = [i for i in fileList if i not in parsedNames]

# Second pass: retry the failures reading the token as month/day. Iterate
# over a copy because successfully parsed names are removed from the list.
for filename in list(failedFileList):
    infoMsg = "Extracting date (month/date) - %s \n"%filename
    logging.info(infoMsg)
    try:
        metaDataList.append(_buildMetaData(filename, "%m/%d"))
        #update failedfileList
        failedFileList.remove(filename)
    except Exception as e:
        logging.error(e)

#shut down logging
logging.shutdown()

### Reading Text

In [57]:
startTime = timer()

#{filename: (full text, tableList)}
textDict = {}

# Column-header string printed once per statement page; counting it gives
# the number of pages that contain a table.
pageHeader = "NIAGADEBITKREDITBAKIDATETRANSACTIONDEBITCREDITBALANCE"

for info in metaDataList:
    filename = info[0]
    path = folder + '/' + filename

    infoMsg = "Extracting text/table - %s \n"%filename
    logging.info(infoMsg)

    try:
        #extract text
        text = convertPdfToTxt(path)

        #extract no pages (literal, non-overlapping count)
        noPage = text.count(pageHeader)

        #extract table
        tableList = returnTableList(text, noPage)

        #storing text, table
        textDict[filename] = [text, tableList]

    except Exception as e:
        #store file which has failed extraction
        # NOTE: re-running this cell without re-running the metadata cell
        # appends the same names again.
        failedFileList.append(filename)
        logging.error(e)


#save failedFileList into csv
# newline='' is required by the csv module; without it every row is followed
# by a blank line on platforms that translate '\n' to '\r\n'.
with open(failedFileCsv, 'w', newline='') as writeFile:
    writer = csv.writer(writeFile)
    #extraction failure
    writer.writerow(['Extraction Failed'])
    for failedName in failedFileList:
        writer.writerow([failedName])

endTime = timer()
print("Total time: %0.4fs" % (endTime - startTime))
#shut down logging
logging.shutdown()

Total time: 0.3414s


In [None]:
# One (filename, accountInfoDict) entry per successfully processed file.
firstPageInfoList = []

# Labels in the order they are searched for; the value of each label is the
# text up to the next label, so the last label only acts as the end marker.
accountCol = ["Nombor Akaun / Account Number", 'Jenis Akaun / Account Type',
             'Tarikh Penyata / Statement Date', 'RINGKASAN / SUMMARY']

# textDict maps filename -> [full text, tableList]; iterate it directly so
# each text is paired with its own file name.
for filename, extracted in textDict.items():
    text = extracted[0]
    path = folder + '/' + filename

    infoMsg = "Extracting info from first page - %s \n"%filename
    logging.info(infoMsg)

    accountInfoDict = {}
    try:
        for desc, nextDesc in zip(accountCol, accountCol[1:]):
            valueStart = text.index(desc) + len(desc)
            accountInfoDict[desc] = text[valueStart:text.index(nextDesc)]
    except ValueError as e:
        # A label is missing from this file's text: log it and move on
        # instead of aborting the whole cell.
        logging.error(e)
        continue
    firstPageInfoList.append((filename, accountInfoDict))
firstPageInfoList

In [None]:
# NOTE(review): `text` is whatever the previous cell's loop left behind (the
# last file processed) - confirm this cell is meant to handle a single file.
summaryDict = {}
summaryCol = ['Baki Penutup / Closing Balance',
              'Jumlah Debit / Total Debits', 'Bil. Debit / No. of Debits',
              'Jumlah Kredit / Total Credits', 'Bil. Kredit / No. of Credits']
#extract series of number
# Text between the last summary label and 'Muka Surat 1'.
numberSeries = extractText(text, summaryCol[4], 'Muka Surat 1')[0]

# Each extracted value is removed from the series with a literal,
# first-occurrence replacement. Using the value as a regex pattern would let
# characters such as '.' match anything and would delete every occurrence,
# including a later figure that happens to be equal.
# NOTE(review): assumes extractAmount returns a substring of its input -
# confirm against its definition.

#extract closing balance amount
closingBalanceAmount = extractAmount(numberSeries, greedy = False)
summaryDict[summaryCol[0]] = closingBalanceAmount
numberSeries = numberSeries.replace(closingBalanceAmount, '', 1)

#extract debit amount
debitAmount = extractAmount(numberSeries, greedy = False)
summaryDict[summaryCol[1]] = debitAmount
numberSeries = numberSeries.replace(debitAmount, '', 1)

#credit no credit
# What is left after removing the next amount is stored under the
# 'No. of Credits' label.
noCredit = numberSeries.replace(extractAmount(numberSeries, greedy = False), '', 1)
summaryDict[summaryCol[4]] = noCredit
numberSeries = numberSeries.replace(noCredit, '', 1)