In [1]:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
import os
import re
from io import StringIO
import PyPDF2
import pandas as pd
from datetime import datetime 
import datetime as dt
from timeit import default_timer as timer

In [178]:
# Statement PDF to parse. The cells below assume a "<folder>/<DD.MM> <rest>.pdf"
# layout (one folder level, date token first) -- adjust them if that changes.
path = "Nov2020/07.11 3207971832_2020-November_eStatement.pdf"
# Output workbook name: the file-name part before the first '-', with every '.'
# turned into '_'. A plain str.replace is used because '.' is a literal here,
# which avoids the invalid '\.' escape in a non-raw string.
excelFile = path.split('/')[1].split('-')[0].replace('.', '_') + '.xlsx'

### Extract date

In [3]:
# The statement date is the "DD.MM" token between the folder separator and the
# FIRST space of the file name (non-greedy, so extra spaces later in the name
# cannot leak into the date).
date = re.search(r"/(.*?) ", path).group(1).replace('.', '/')

# Use the 4-digit year embedded in the file name ("..._2020-November...") when
# present so the previous day is computed in the right calendar year; without a
# year strptime assumes 1900, which gets 29 Feb wrong. Falls back to 1900 (the
# old implicit behaviour) when no year is found.
yearMatch = re.search(r"_(\d{4})-", path)
statementYear = yearMatch.group(1) if yearMatch else '1900'
statementDay = datetime.strptime('%s/%s' % (date, statementYear), "%d/%m/%Y")
previousDate = (statementDay - dt.timedelta(days=1)).strftime('%d/%m')
date, previousDate

('07/11', '06/11')

### Read PDF to text

In [4]:
def convertPdfToTxt(path):
    """Extract the text of every page of a PDF with pdfminer.

    Parameters
    ----------
    path : str
        Location of the PDF file to read.

    Returns
    -------
    str
        Everything the TextConverter wrote while the pages were processed.

    Side effects
    ------------
    Prints the elapsed wall-clock time.
    """
    startTime = timer()
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # `with` guarantees the file handle is released even if a page fails to parse.
        with open(path, 'rb') as fp:
            # Same arguments as before: empty page set, maxpages=0, caching on.
            for page in PDFPage.get_pages(fp, set(), maxpages=0, caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        # Read the buffer before it is closed in the finally block.
        text = retstr.getvalue()
    finally:
        device.close()
        retstr.close()
    print("Total time: %0.4fs" % (timer() - startTime))
    return text

In [5]:
# Convert the statement once; all later cells slice this single string.
text = convertPdfToTxt(path)

# Page count = how many times the fused table-header string occurs in the text
# (assumes the header is printed once per page -- TODO confirm).
noPage = text.count("NIAGADEBITKREDITBAKIDATETRANSACTIONDEBITCREDITBALANCE")

Total time: 0.0649s


In [7]:
noPage

2

### Extract account information

In [8]:
# Header labels in the order they are looked up; each value is the text that
# sits between one label and the next, so the last label only acts as an end marker.
accountCol = ["Nombor Akaun / Account Number", 'Jenis Akaun / Account Type',
             'Tarikh Penyata / Statement Date', 'RINGKASAN / SUMMARY']
accountInfoDict = {}
for label, nextLabel in zip(accountCol, accountCol[1:]):
    valueStart = text.index(label) + len(label)
    accountInfoDict[label] = text[valueStart:text.index(nextLabel)]
accountInfoDict

{'Nombor Akaun / Account Number': '3207971832',
 'Jenis Akaun / Account Type': 'RM Plus Current Account',
 'Tarikh Penyata / Statement Date': '07 Nov 2020'}

### Extract summary information

In [10]:
def extractText(desc, startStr, endStr):
    """Slice ``desc`` using two marker substrings.

    Parameters
    ----------
    desc : str
        Text to search.
    startStr : str
        Marker that precedes the wanted text; '' means "from the beginning".
    endStr : str
        Marker that follows the wanted text; '' means "to the end".

    Returns
    -------
    str
        When ``startStr`` is '': the text before ``endStr``.
        When ``endStr`` is '' (and ``startStr`` is not): the text after ``startStr``.
    tuple of (str, str)
        When both markers are non-empty: the text between them, and the
        remainder of ``desc`` starting at ``endStr``.
    None
        When a marker is missing or an argument is not a string; the message
        "Failed to extract text" is printed. Callers in this notebook rely on
        this None (e.g. when no further amount exists), so it is kept.
    """
    try:
        startStrIndex = desc.index(startStr)
        contentStart = startStrIndex + len(startStr)
        # Look for the end marker only AFTER the start marker, so an earlier
        # occurrence of endStr cannot produce an empty or inverted slice.
        endStrIndex = desc.index(endStr, contentStart)
    except (AttributeError, TypeError, ValueError):
        print("Failed to extract text")
        return None

    if startStr == '':
        return desc[:endStrIndex]
    if endStr != '':
        return desc[contentStart:endStrIndex], desc[endStrIndex:]
    return desc[contentStart:]

In [13]:
def extractAmount(desc, greedy = True):
    """Return the first amount-like run of characters found in ``desc``.

    Parameters
    ----------
    desc : str
        Text to search.
    greedy : bool, default True
        True  -> take the longest run of digits/commas/dots that still ends in
                 ".dd"; several amounts printed back to back come out fused.
        False -> stop at the first ".dd", i.e. only the first amount of a run.

    Returns
    -------
    str or None
        The matched text, or None (after printing "Wrong number pattern") when
        nothing matches or ``desc`` is not a string.
    """
    # Raw strings: '\d' in a normal string literal is an invalid escape sequence.
    greedyPattern = r'([\d\.\,]+\.\d{2})'
    notGreedyPattern = r'([\d\.\,]+?\.\d{2}?)'
    pattern = greedyPattern if greedy else notGreedyPattern
    try:
        match = re.search(pattern, desc)
    except TypeError:
        # desc was None or another non-string value
        match = None
    if match is None:
        print("Wrong number pattern")
        return None
    return match.group(1)

In [11]:
# Summary figures are collected here, keyed by the labels in summaryCol.
summaryDict = {}
summaryCol = ['Baki Penutup / Closing Balance',
              'Jumlah Debit / Total Debits', 'Bil. Debit / No. of Debits',
              'Jumlah Kredit / Total Credits', 'Bil. Kredit / No. of Credits']
# Take the text between the last summary label and 'Muka Surat 1'. With two
# non-empty markers extractText returns (between, remainder); [0] keeps the
# part in between, which holds the summary numbers run together (see the
# output below).
numberSeries = extractText(text, summaryCol[4], 'Muka Surat 1')[0]
numberSeries

'19,694,282.29107,566.81626,282.691'

In [14]:
#extract closing balance amount
# The first ".dd"-terminated run in the fused series is the closing balance.
closingBalanceAmount = extractAmount(numberSeries, greedy = False)
summaryDict[summaryCol[0]] = closingBalanceAmount
# Drop only that leading amount. str.replace with count=1 treats it as literal
# text; the old re.sub used it as a regex ('.' matched any character) and
# removed every occurrence.
numberSeries = numberSeries.replace(closingBalanceAmount, '', 1)
numberSeries

'107,566.81626,282.691'

In [15]:
#extract debit amount
# After the closing balance is removed, the next ".dd"-terminated run is the
# total debit amount.
debitAmount = extractAmount(numberSeries, greedy = False)
summaryDict[summaryCol[1]] = debitAmount
# Remove only the first literal occurrence (the old re.sub treated the amount
# as a regex and removed all matches).
numberSeries = numberSeries.replace(debitAmount, '', 1)
numberSeries

'626,282.691'

In [16]:
#credit no credit
# What is left is "<debit count><credit amount><credit count>" fused together.
# The non-greedy match covers everything up to the first ".dd", so whatever
# follows it is the credit count.
creditSegment = extractAmount(numberSeries, greedy = False)
noCredit = numberSeries.replace(creditSegment, '', 1)
summaryDict[summaryCol[4]] = noCredit
# Keep only the "<debit count><credit amount>" part. The old
# re.sub(noCredit, '', numberSeries) deleted EVERY occurrence of the count's
# digits, corrupting any amount that happened to contain them.
numberSeries = creditSegment
numberSeries

'626,282.69'

In [17]:
# numberSeries is now "<debit count><credit amount>" with no separator, so the
# split point is ambiguous. Build two candidate readings: the amount has either
# two or one digit(s) before its comma, and whatever remains is the debit count.
# NOTE(review): both patterns need exactly one comma-group before the decimals;
# amounts without a comma make re.search return None here -- confirm that the
# statements in scope always fit.
tempNumberSeries = numberSeries
creditAmount1 = re.search(r'(\d{2}\,\d+\.\d{2})', numberSeries).group(1)
# Literal, single removal (the old re.sub treated the amount as a regex).
noDebit1 = tempNumberSeries.replace(creditAmount1, '', 1)
creditAmount2 = re.search(r'(\d{1}\,\d+\.\d{2})', numberSeries).group(1)
noDebit2 = tempNumberSeries.replace(creditAmount2, '', 1)

In [18]:
# Keep the two candidate readings side by side: position i in noDebitList
# belongs with position i in creditAmountList. A later cell looks up the real
# debit count in noDebitList and takes the credit amount at the same position.
noDebitList = [noDebit1, noDebit2]
creditAmountList = [creditAmount1, creditAmount2]
noDebitList, creditAmountList

(['6', '62'], ['26,282.69', '6,282.69'])

### extract all tables and process content separately

In [19]:
def returnTableList(no, sourceText=None):
    """Cut the statement text into one transaction-table string per page.

    For each page i in 1..no, the table is the text between the marker
    "Page i of no" and the next footer line
    'Penyata ini dicetak melalui komputer.', with the fused column-header
    string removed.

    Parameters
    ----------
    no : int
        Total number of pages in the statement.
    sourceText : str, optional
        Text to split. Defaults to the notebook-level ``text`` so existing
        calls ``returnTableList(noPage)`` keep working.

    Returns
    -------
    list of str
        One entry per page, in page order.

    Raises
    ------
    ValueError
        If a page marker or its footer cannot be found.
    """
    if sourceText is None:
        sourceText = text
    footer = 'Penyata ini dicetak melalui komputer.'
    header = "TARIKHURUS NIAGADEBITKREDITBAKIDATETRANSACTIONDEBITCREDITBALANCE"

    tables = []
    remaining = sourceText
    # Iterate over every page; the previous hard-coded range(1, 3) only worked
    # for two-page statements.
    for pageNo in range(1, no + 1):
        marker = 'Page %s of %s' % (pageNo, no)
        markerIndex = remaining.find(marker)
        if markerIndex == -1:
            raise ValueError("Page marker %r not found in statement text" % marker)
        bodyStart = markerIndex + len(marker)
        footerIndex = remaining.find(footer, bodyStart)
        if footerIndex == -1:
            raise ValueError("Footer not found after page marker %r" % marker)
        tables.append(remaining[bodyStart:footerIndex].replace(header, ''))
        # Continue the search after this page's footer.
        remaining = remaining[footerIndex + len(footer):]
    return tables

In [20]:
# One table string per page, using the page count derived earlier.
tableList = returnTableList(noPage)
tableList

['06/11Balance From Last Statement19,775,566.4107/1126,282.6919,801,849.10SI CR FROM 3999XXXXXX8,688.7819,793,160.32DR-ECP 000001 3207971832PR07112002SALARY OCT 202032,443.3719,760,716.95DR-ECP 000001 3207971832PR07112003SALARY OCT 202066,430.8619,694,286.09DR-ECP 000001 3207971832PR07112001SALARY OCT 20200.4019,694,285.69HANDLING CHRG 000006FEE 3207971832PR07112002SALARY OCT 2020Balance C/F19,694,285.69',
 "07/11Balance B/F19,694,285.690.5019,694,285.19HANDLING CHRG 000007FEE 3207971832PR07112003SALARY OCT 20202.9019,694,282.29HANDLING CHRG 000031FEE 3207971832PR07112001SALARY OCT 2020Closing Balance In This Statement19,694,282.29Baki Harian Dan Penutup Meliputi Semua Cek Yang Telah Didepositkan, Dijelaskan Dan Yang Belum Dijelaskan.Terima Kasih Kerana Berurus Niaga Dengan Public Bank.  Kecemerlangan Adalah Iltizam Kami.Kemusykilan anda mengenai perbankan DIJAWAB! Untuk maklumat lanjut, sila layari www.bankinginfo.com.my.Daily And Closing Balances Include All Cheques Deposited, Cleare

In [168]:
# First page's table text, shown for inspection only; the next cell performs
# the same assignment again before using it.
table = tableList[0]
table

'06/11Balance From Last Statement19,775,566.4107/1126,282.6919,801,849.10SI CR FROM 3999XXXXXX8,688.7819,793,160.32DR-ECP 000001 3207971832PR07112002SALARY OCT 202032,443.3719,760,716.95DR-ECP 000001 3207971832PR07112003SALARY OCT 202066,430.8619,694,286.09DR-ECP 000001 3207971832PR07112001SALARY OCT 20200.4019,694,285.69HANDLING CHRG 000006FEE 3207971832PR07112002SALARY OCT 2020Balance C/F19,694,285.69'

In [196]:
# Seed the result rows with the opening balance taken from the first page.

# Row layout used by every entry (it must line up with colList further down):
# trn. date, cheque/ref no., transaction description, debit amount,
# credit amount, reference 1, reference 2, reference 3, balance
resultList = []

# First page's table text
table = tableList[0]

# The opening balance is the text between "Balance From Last Statement" and the
# statement date; tempText is the remainder of the table starting at that date
# and becomes the input of the parsing loop below.
balanceAmount, tempText = extractText(table, "Balance From Last Statement", date)

# Opening-balance row, dated the day before the statement; the label is stored
# in the "reference 1" position and the balance is still the raw string.
resultList.append([previousDate, '','', '','', "Balance From Last Statement", '', '', balanceAmount])

In [182]:
tempText

'07/1126,282.6919,801,849.10SI CR FROM 3999XXXXXX8,688.7819,793,160.32DR-ECP 000001 3207971832PR07112002SALARY OCT 202032,443.3719,760,716.95DR-ECP 000001 3207971832PR07112003SALARY OCT 202066,430.8619,694,286.09DR-ECP 000001 3207971832PR07112001SALARY OCT 20200.4019,694,285.69HANDLING CHRG 000006FEE 3207971832PR07112002SALARY OCT 2020Balance C/F19,694,285.69'

In [42]:
# NOTE(review): this cell re-defines extractText, which is already defined in
# an earlier cell of this notebook; one of the two definitions can be removed.
def extractText(desc, startStr, endStr):
    """Slice ``desc`` using two marker substrings.

    Parameters
    ----------
    desc : str
        Text to search.
    startStr : str
        Marker that precedes the wanted text; '' means "from the beginning".
    endStr : str
        Marker that follows the wanted text; '' means "to the end".

    Returns
    -------
    str
        When ``startStr`` is '': the text before ``endStr``.
        When ``endStr`` is '' (and ``startStr`` is not): the text after ``startStr``.
    tuple of (str, str)
        When both markers are non-empty: the text between them, and the
        remainder of ``desc`` starting at ``endStr``.
    None
        When a marker is missing or an argument is not a string; the message
        "Failed to extract text" is printed. Callers in this notebook rely on
        this None (e.g. when no further amount exists), so it is kept.
    """
    try:
        startStrIndex = desc.index(startStr)
        contentStart = startStrIndex + len(startStr)
        # Look for the end marker only AFTER the start marker, so an earlier
        # occurrence of endStr cannot produce an empty or inverted slice.
        endStrIndex = desc.index(endStr, contentStart)
    except (AttributeError, TypeError, ValueError):
        print("Failed to extract text")
        return None

    if startStr == '':
        return desc[:endStrIndex]
    if endStr != '':
        return desc[contentStart:endStrIndex], desc[endStrIndex:]
    return desc[contentStart:]

In [186]:
# Walk through every page's table text and peel off, in order:
#   amountList - fused "amount + balance" strings (split in a later cell)
#   descList   - the description text that follows each of them
# Per iteration:
#   1. take the first (greedy) numeric run -> amountList
#   2. remove it and find the next numeric run
#   3. the text in front of that next run is the description -> descList
# This repeats until a page's "Balance C/F" line or the statement's
# "Closing Balance In This Statement" line is reached.
#
# NOTE(review): page 0 starts from the tempText produced by the earlier
# "Balance From Last Statement" cell and this loop overwrites tempText, so
# re-running this cell alone starts from a different state.
# NOTE(review): amounts, descriptions and the date are passed to re.sub /
# re.search as regex patterns without re.escape; '.' acts as a wildcard and a
# description containing regex metacharacters could raise or over-match.

amountList = []
descList = []
# Never changed below (the assignments are commented out), so the while loop
# is only ever left through `break`.
closingLine = 0

for no, table in enumerate(tableList):
    if no == 0:
        # no-op: keeps the tempText left over from the opening-balance cell
        tempText = tempText
    else:
        tempText = table
    while closingLine == 0:
        # strip every occurrence of the statement date from the working text
        if re.search(date, tempText):
            tempText = re.sub(date, '', tempText)
        # first numeric run (greedy, so amount and balance come out fused)
        tempAmount1 = extractAmount(tempText)
        # store it now; the "Balance B/F" branch below corrects it afterwards
        amountList.append(tempAmount1)
        # remove that run so the next numeric run can be located
        tempSubText = re.sub(tempAmount1, '', tempText)
        tempAmount2 = extractAmount(tempSubText)
        # text in front of the next run; None when tempAmount2 is None
        # (hence the str(...) wrappers below)
        tempDesc = extractText(tempSubText, '', tempAmount2)
        
        # "Balance C/F": last line of this page -> store the description in
        # front of it, the carried-forward amount and the label, then go on to
        # the next page
        if re.search(r"Balance C/F", str(tempDesc)):
            tempDesc = re.sub(r"Balance C/F", "", str(tempDesc))
            amountList.append(tempAmount2)
            descList.append(tempDesc)
            descList.append("Balance C/F")
            break
            
        # "Balance B/F": first line of a follow-up page. The greedy run stored
        # above fused the brought-forward balance with the next transaction, so
        # replace it by the first amount only and continue right after it.
        elif re.search(r"Balance B/F", str(tempDesc)):
            tempDesc = re.sub(r"Balance B/F", "", str(tempDesc))
            descList.append("Balance B/F")
            tempAmount1 = extractAmount(re.sub(date, '', tempText), greedy = False)
            amountList[-1] = tempAmount1
            tempAmount1Index = tempText.index(tempAmount1)
            tempAmount1Len = len(tempAmount1)
            tempText = tempText[tempAmount1Index+tempAmount1Len:]
            
        # last line of the whole statement
        elif re.search("Closing Balance In This Statement", str(tempDesc)):
            closingAmount = tempAmount2
            closingAmountIndex = tempSubText.index(closingAmount)
            lastDesc = tempSubText[:closingAmountIndex]
            # drop the closing-balance label from the last description
            lastDesc = re.sub("Closing Balance In This Statement", '', lastDesc)
            descList.append(lastDesc)

            # store the closing line: amount and label
            amountList.append(closingAmount)
            descList.append("Closing Balance In This Statement")
            break
        # no further amount found: nothing left to extract on this page
        elif not tempAmount2:
            break
        else:
            # ordinary transaction: drop its description from the working text
            # and continue with the next amount
            tempText = re.sub(tempDesc, '', tempSubText)
            descList.append(tempDesc)

In [187]:
amountList, descList

(['26,282.6919,801,849.10',
  '8,688.7819,793,160.32',
  '202032,443.3719,760,716.95',
  '202066,430.8619,694,286.09',
  '20200.4019,694,285.69',
  '19,694,285.69',
  '19,694,285.69',
  '0.5019,694,285.19',
  '20202.9019,694,282.29',
  '19,694,282.29'],
 ['SI CR FROM 3999XXXXXX',
  'DR-ECP 000001 3207971832PR07112002SALARY OCT ',
  'DR-ECP 000001 3207971832PR07112003SALARY OCT ',
  'DR-ECP 000001 3207971832PR07112001SALARY OCT ',
  'HANDLING CHRG 000006FEE 3207971832PR07112002SALARY OCT 2020',
  'Balance C/F',
  'Balance B/F',
  'HANDLING CHRG 000007FEE 3207971832PR07112003SALARY OCT ',
  'HANDLING CHRG 000031FEE 3207971832PR07112001SALARY OCT 2020',
  'Closing Balance In This Statement'])

In [130]:
tableList[1]

"07/11Balance B/F19,694,285.690.5019,694,285.19HANDLING CHRG 000007FEE 3207971832PR07112003SALARY OCT 20202.9019,694,282.29HANDLING CHRG 000031FEE 3207971832PR07112001SALARY OCT 2020Closing Balance In This Statement19,694,282.29Baki Harian Dan Penutup Meliputi Semua Cek Yang Telah Didepositkan, Dijelaskan Dan Yang Belum Dijelaskan.Terima Kasih Kerana Berurus Niaga Dengan Public Bank.  Kecemerlangan Adalah Iltizam Kami.Kemusykilan anda mengenai perbankan DIJAWAB! Untuk maklumat lanjut, sila layari www.bankinginfo.com.my.Daily And Closing Balances Include All Cheques Deposited, Cleared And Uncleared.Thank You For Banking With Public Bank.  Excellence Is Our Commitment.Your banking questions ANSWERED! For more info, log on to www.bankinginfo.com.my.Anda boleh melihat Notis Privasi Public Bank di laman web www.pbebank.com.You may view Public Bank's Privacy Notice at www.pbebank.com."

### split amount in amountList and desc in descList

In [46]:
def splitAmount(desc):
    """Split a fused "amount + balance" string at the first ".dd".

    Parameters
    ----------
    desc : str
        Text such as '26,282.6919,801,849.10'.

    Returns
    -------
    tuple of (str, str) or None
        (first amount, remaining text); the remainder is '' when ``desc``
        holds a single amount. None when no ".dd" is found or ``desc`` is not
        a string (the error and "Wrong amount pattern" are printed).
    """
    amountPattern = r'(.*?\.\d{2}?)'
    try:
        match = re.search(amountPattern, desc)
        result = match.group(1)
    except (AttributeError, TypeError) as e:
        print(e)
        print("Wrong amount pattern")
        return None
    # Cut out exactly the matched span. The old re.sub(result, '', desc) used
    # the amount as a regex and removed every match, so a balance equal to the
    # amount (e.g. '100.00100.00') was wiped as well.
    desc2 = desc[:match.start(1)] + desc[match.end(1):]
    return result, desc2

In [190]:
# Split every fused entry into an (amount, balance) pair, keeping the order.
splitAmountList = list(map(splitAmount, amountList))
splitAmountList

[('26,282.69', '19,801,849.10'),
 ('8,688.78', '19,793,160.32'),
 ('202032,443.37', '19,760,716.95'),
 ('202066,430.86', '19,694,286.09'),
 ('20200.40', '19,694,285.69'),
 ('19,694,285.69', ''),
 ('19,694,285.69', ''),
 ('0.50', '19,694,285.19'),
 ('20202.90', '19,694,282.29'),
 ('19,694,282.29', '')]

In [152]:
def checkCreditDebit(amountTuple, previousBalance):
    """Classify one transaction as debit or credit from the balance movement.

    Parameters
    ----------
    amountTuple : tuple of (str, str)
        (amount, balance after the transaction) as comma-formatted text. The
        balance is '' for balance-only lines, where the first element is the
        balance itself.
    previousBalance : str
        Balance before this line, with or without thousands separators.

    Returns
    -------
    tuple
        ((debit, credit, balance), str(balance)). The unused one of
        debit/credit is ''; both are '' for balance-only lines and for lines
        that leave the balance unchanged. The second element is meant to be
        passed as ``previousBalance`` of the next call.
    """
    amount1 = float(amountTuple[0].replace(',', ''))
    if amountTuple[1] == '':
        # balance-only line: the single number is the balance
        return ('', '', amount1), str(amount1)

    currentBalance = float(amountTuple[1].replace(',', ''))
    previousBalance = float(previousBalance.replace(',', ''))

    # The balance movement is authoritative: the parsed amount can carry digits
    # fused from the preceding description. Rounding to cents also removes
    # float noise from the subtraction.
    movement = round(currentBalance - previousBalance, 2)
    if movement > 0:
        result = ('', movement, currentBalance)
    elif movement < 0:
        result = (-movement, '', currentBalance)
    elif amount1 == 0:
        # zero-value line: reported as a 0.0 credit, as before
        result = ('', round(amount1, 2), currentBalance)
    else:
        # Balance unchanged although an amount was parsed: previously no branch
        # matched and the function crashed with IndexError.
        result = ('', '', currentBalance)
    return result, str(result[2])

In [191]:
# Classify each (amount, balance) pair as debit or credit. The running balance
# starts at the opening balance and is then fed from one call into the next.
splitAmountList2 = []
previousBalanceAmount = balanceAmount
for amountPair in splitAmountList:
    resultTuple, previousBalanceAmount = checkCreditDebit(amountPair, previousBalanceAmount)
    splitAmountList2.append(resultTuple)
splitAmountList2

[('', 26282.69, 19801849.1),
 (8688.78, '', 19793160.32),
 (32443.37, '', 19760716.95),
 (66430.86, '', 19694286.09),
 (0.4, '', 19694285.69),
 ('', '', 19694285.69),
 ('', '', 19694285.69),
 (0.5, '', 19694285.19),
 (2.9, '', 19694282.29),
 ('', '', 19694282.29)]

In [156]:
# Regex per known transaction type. Each pattern captures three groups:
# (transaction type, 6-digit number, remaining reference text).
# Raw strings avoid invalid '\s' / '\d' escapes in normal string literals.
transTypePatternDict = {'INSTANT TRSF CR': r'(.*CR)\s*(\d{6})(.+)',
                       'TSFR FUND DR-ATM/EFT': r'(TSFR FUND DR-ATM/EFT)\s*(\d{6})(.+)'}

def splitStr(desc):
    """Split a transaction description into five output fields.

    The patterns in ``transTypePatternDict`` are tried in order and the first
    one that matches wins.

    Parameters
    ----------
    desc : str
        Raw description text of one transaction.

    Returns
    -------
    list of str
        ``[chequeNo, transType, ref1, '', '']`` when a pattern matches,
        otherwise ``['', '', desc, '', '']`` (the whole text goes into the
        third field).
    """
    for pattern in transTypePatternDict.values():
        # search once and reuse the match instead of re-running it per group
        match = re.search(pattern, desc)
        if match:
            transType, chequeNo, ref1 = match.groups()
            return [chequeNo, transType, ref1] + ['']*2
    return ['']*2 + [desc] + ['']*2

In [454]:
# Worked examples for the two description patterns: each block prints the
# fields taken straight from the regex and then what splitStr returns for the
# same text. Raw strings are used for the regexes and each search runs once.

#INSTANT TRSF CR pattern
pattern = r'(.*CR)\s*(\d{6})(.+)'
example = 'INSTANT TRSF CR 019141BEATRICE JOHNNY ESHOP'
transType, chequeNo, ref1 = re.search(pattern, example).groups()
print([chequeNo, transType, ref1])
print(splitStr(example))
print('\n')

#TSFR FUND DR-ATM/EFT pattern
pattern = r'(TSFR FUND DR-ATM/EFT)\s*(\d{6})(.+)'
example = 'TSFR FUND DR-ATM/EFT 7621783812XXXXXX DATO LIN SOW YINGCONSULTANT FEE'
transType, chequeNo, ref1 = re.search(pattern, example).groups()
print([chequeNo, transType, ref1])
print(splitStr(example))

['019141', 'INSTANT TRSF CR', 'BEATRICE JOHNNY ESHOP']
['019141', 'INSTANT TRSF CR', 'BEATRICE JOHNNY ESHOP', '', '']


['762178', 'TSFR FUND DR-ATM/EFT', '3812XXXXXX DATO LIN SOW YINGCONSULTANT FEE']
['762178', 'TSFR FUND DR-ATM/EFT', '3812XXXXXX DATO LIN SOW YINGCONSULTANT FEE', '', '']


In [192]:
# Turn every description into its five-field list, keeping the order of descList.
splitStrList = list(map(splitStr, descList))
splitStrList

[['', '', 'SI CR FROM 3999XXXXXX', '', ''],
 ['', '', 'DR-ECP 000001 3207971832PR07112002SALARY OCT ', '', ''],
 ['', '', 'DR-ECP 000001 3207971832PR07112003SALARY OCT ', '', ''],
 ['', '', 'DR-ECP 000001 3207971832PR07112001SALARY OCT ', '', ''],
 ['',
  '',
  'HANDLING CHRG 000006FEE 3207971832PR07112002SALARY OCT 2020',
  '',
  ''],
 ['', '', 'Balance C/F', '', ''],
 ['', '', 'Balance B/F', '', ''],
 ['', '', 'HANDLING CHRG 000007FEE 3207971832PR07112003SALARY OCT ', '', ''],
 ['',
  '',
  'HANDLING CHRG 000031FEE 3207971832PR07112001SALARY OCT 2020',
  '',
  ''],
 ['', '', 'Closing Balance In This Statement', '', '']]

### Consolidate the results and build the table

In [160]:
#column names
colList = ['Trn. Date', 'Cheque No/Ref No', 'Transaction Description',
           'Debit Amount', 'Credit Amount', 
           'Reference 1', 'Reference 2', 'Reference 3', 'Balance']

# Descriptions and amounts were extracted in lockstep, so they must pair up 1:1.
assert len(splitStrList) == len(splitAmountList2), "description/amount lists differ in length"

# Build the transaction rows in a fresh list and attach them after the opening
# "Balance From Last Statement" row (resultList[0]). Appending to resultList in
# place added another copy of every row each time this cell was re-run.
transactionRows = []
for finalStrList, amountTuple in zip(splitStrList, splitAmountList2):
    # [date, cheque no, description] + [debit, credit] + [ref 1..3] + [balance]
    transactionRows.append([date] + list(finalStrList[:2])
                           + list(amountTuple[:2])
                           + list(finalStrList[2:])
                           + [amountTuple[-1]])
resultList = resultList[:1] + transactionRows

In [198]:
# Final transaction table: one row per entry of resultList, labelled by colList.
# NOTE(review): the opening-balance row stores its balance as the raw string
# returned by extractText, while the other rows hold floats from
# checkCreditDebit, so the 'Balance' column has mixed types.
df = pd.DataFrame(resultList, columns = colList)
df

Unnamed: 0,Trn. Date,Cheque No/Ref No,Transaction Description,Debit Amount,Credit Amount,Reference 1,Reference 2,Reference 3,Balance
0,06/11,,,,,Balance From Last Statement,,,19775566.41
1,07/11,,,,26282.7,SI CR FROM 3999XXXXXX,,,19801800.0
2,07/11,,,8688.78,,DR-ECP 000001 3207971832PR07112002SALARY OCT,,,19793200.0
3,07/11,,,32443.4,,DR-ECP 000001 3207971832PR07112003SALARY OCT,,,19760700.0
4,07/11,,,66430.9,,DR-ECP 000001 3207971832PR07112001SALARY OCT,,,19694300.0
5,07/11,,,0.4,,HANDLING CHRG 000006FEE 3207971832PR07112002SA...,,,19694300.0
6,07/11,,,,,Balance C/F,,,19694300.0
7,07/11,,,,,Balance B/F,,,19694300.0
8,07/11,,,0.5,,HANDLING CHRG 000007FEE 3207971832PR07112003SA...,,,19694300.0
9,07/11,,,2.9,,HANDLING CHRG 000031FEE 3207971832PR07112001SA...,,,19694300.0


### Validate the number of debits and the credit total

In [199]:
# Actual number of debit lines = rows whose 'Debit Amount' is not the empty string.
trueNoDebit = len(df[df['Debit Amount'] != ''])
trueNoDebit

6

In [200]:
# Resolve the ambiguous "<debit count><credit amount>" split: pick the candidate
# whose debit count equals the number of debit rows actually found in the table.
# list.index raises ValueError when neither candidate matches.
noDebitIndex = noDebitList.index(str(trueNoDebit))
# The credit amount at the same position belongs to that candidate.
creditAmount = creditAmountList[noDebitIndex]
summaryDict[summaryCol[2]] = trueNoDebit
summaryDict[summaryCol[3]] = creditAmount

In [201]:
summaryDict

{'Baki Penutup / Closing Balance': '19,694,282.29',
 'Jumlah Debit / Total Debits': '107,566.81',
 'Bil. Kredit / No. of Credits': '1',
 'Bil. Debit / No. of Debits': 6,
 'Jumlah Kredit / Total Credits': '26,282.69'}

### Write to excel file

In [202]:
# Write everything to one sheet ("Sheet1") of excelFile:
#   rows 0..n-1 : "<label>:" / value pairs - account info first, then summary
#   row  n      : left blank
#   row  n+1... : the transaction table with its header row
#
# Both parts go through a single pandas ExcelWriter used as a context manager.
# The previous flow (xlsxwriter file, then re-opening it via `writer.book = ...`
# and `writer.save()`) relies on ExcelWriter members that current pandas
# releases no longer support, and it had to re-read the file just to find the
# start row.
summaryRows = [[label + ':', value] for label, value in accountInfoDict.items()]
summaryRows += [[label + ':', value] for label, value in summaryDict.items()]

with pd.ExcelWriter(excelFile, engine='openpyxl') as writer:
    # label/value block, without header or index so it starts at cell A1
    pd.DataFrame(summaryRows).to_excel(writer, sheet_name='Sheet1',
                                       header=False, index=False)
    # transaction table, one blank row below the label/value block
    df.to_excel(writer, sheet_name='Sheet1', startrow=len(summaryRows) + 1,
                header=True, index=False)