In [1]:
import pandas as pd
import re
import openpyxl 
import numpy as np
from timeit import default_timer as timer
from datetime import datetime
import os

pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

### Define parameter

In [344]:
#predefine parameters
filterTransDesc = "HANDLING CHRG"

#name col to remove rows with missing na or to match bs
payoutNameCol = 'Member Name'
merchantNameCol = 'Merchant'
merchantBankNameCol = 'Bank Acc Name'
billPaymentNameCol = "Name"

#col of payout to match amount (payout, merchant and bill payment must have)
poAmountCol = "Amount"
#col of final output as name
poNameCol = "Member Name"
bsName = "BS Name"

#cols to match in bank statement
# bsColMatchList = ['Reference 1', 'Reference 2', 'Reference 3', 'Reference 4']
bsMatchCol = "Reference 2"
bsAmountCol = "Debit Amount"

#index col to track rows
euIndex = "EU_Index"
copIndex = "COP_Index"
manageCOPIndex = "ManageCOP_Index"
merchantIndex = "Merchant_Index"
billPaymentIndex = "BP_Index"
bsIndex = "BS_Index"

#ruleDict, which file type to use which set of rules
ruleDict = {"General":[1, 2, 3, 4, 6],
           "Merchant":[1, 2, 3, 4],
           "BillPayment":[2, 5]}

# payoutExcelFile = "Quinton_Payout_202011/PAYOUT - 19.11.2020 .xlsx"
payoutExcelFile = "Quinton_Payout_202012/PAYOUT - 31.12.2020.xlsx"

#get date
date = re.search('^.*?(\d+\.\d+)',payoutExcelFile.split('-')[1]).group(1)
date = datetime.strptime(date, '%d.%m')
date

datetime.datetime(1900, 12, 31, 0, 0)

In [205]:
os.listdir("Quinton_Payout_202012")

['PAYOUT - 01.12.2020.xlsx',
 'PAYOUT - 02.12.2020.xlsx',
 'PAYOUT - 03.12.2020.xlsx',
 'PAYOUT - 04.12.2020.xlsx',
 'PAYOUT - 07.12.2020.xlsx',
 'PAYOUT - 08.12.2020.xlsx',
 'PAYOUT - 09.12.2020.xlsx',
 'PAYOUT - 10.12.2020.xlsx',
 'PAYOUT - 11.12.2020.xlsx',
 'PAYOUT - 14.12.2020.xlsx',
 'PAYOUT - 15.12.2020.xlsx',
 'PAYOUT - 16.12.2020.xlsx',
 'PAYOUT - 17.12.2020.xlsx',
 'PAYOUT - 18.12.2020.xlsx',
 'PAYOUT - 21.12.2020.xlsx',
 'PAYOUT - 22.12.2020.xlsx',
 'PAYOUT - 23.12.2020.xlsx',
 'PAYOUT - 24.12.2020.xlsx',
 'PAYOUT - 28.12.2020.xlsx',
 'PAYOUT - 29.12.2020.xlsx',
 'PAYOUT - 30.12.2020.xlsx',
 'PAYOUT - 31.12.2020.xlsx']

### Read Payout

In [345]:
#get worksheet names
wb = openpyxl.load_workbook(payoutExcelFile)
wsList = wb.sheetnames

EU_ws = [i for i in wsList if re.search('EU\s*$', i)][0]
#preceding cannot be manage
COP_ws = [i for i in wsList if re.search('(?<!MANAGE[\s\-])COP\s*$', i)][0]
ManageCOP_ws = [i for i in wsList if re.search('MANAGE COP\s*$', i)][0]
Merchant_ws = [i for i in wsList if re.search('MERCHANT[^-]*', i)][0]
BillPayment_ws = [i for i in wsList if re.search('BILL PAYMENT[^-]*\s*$', i)][0]
print("EU WS - %s"%EU_ws)
print("COP WS - %s"%COP_ws)
print("ManageCOP WS - %s"%ManageCOP_ws)
print("Merchant WS - %s"%Merchant_ws)
print("BillPayment WS - %s"%BillPayment_ws)

#read file

#read payout files - EU, COP and ManageCOP
dfEU = pd.read_excel(payoutExcelFile, sheet_name = EU_ws)
#remove empty member name
dfEU = dfEU[pd.notnull(dfEU[payoutNameCol])]
dfEU['DataType'] = "EU" 

dfCOP = pd.read_excel(payoutExcelFile, sheet_name = COP_ws)
#remove empty member name
dfCOP = dfCOP[pd.notnull(dfCOP[payoutNameCol])]
dfCOP['DataType'] = "COP" 

dfManageCOP = pd.read_excel(payoutExcelFile, sheet_name = ManageCOP_ws)
#remove empty member name
dfManageCOP = dfManageCOP[pd.notnull(dfManageCOP[payoutNameCol])]
dfManageCOP['DataType'] = "ManageCOP" 

#merchant file
dfMerchant = pd.read_excel(payoutExcelFile, sheet_name = Merchant_ws)
#remove empty merchant col
dfMerchant = dfMerchant[pd.notnull(dfMerchant[merchantNameCol])]
dfMerchant['DataType'] = "Merchant" 

dfBillPayment = pd.read_excel(payoutExcelFile, sheet_name = BillPayment_ws)
#remove empty name
dfBillPayment = dfBillPayment[pd.notnull(dfBillPayment[billPaymentNameCol])]
dfBillPayment['DataType'] = "BillPayment" 

#create index
dfEU[euIndex] = np.arange(dfEU.shape[0])
dfCOP[copIndex] = np.arange(dfCOP.shape[0])
dfManageCOP[manageCOPIndex] = np.arange(dfManageCOP.shape[0])
dfMerchant[merchantIndex] = np.arange(dfMerchant.shape[0])
dfBillPayment[billPaymentIndex] = np.arange(dfBillPayment.shape[0])

EU WS - 23.12-EU
COP WS - 28.12-COP
ManageCOP WS - 28.12-MANAGE COP
Merchant WS - 28.12-MERCHANT 
BillPayment WS - 29.12-BILL PAYMENT 


In [346]:
wsList

['23.12-EU-ALL',
 '23.12-EU',
 '28.12-COP-ALL',
 '28.12-COP',
 '28.12-MANAGE COP',
 '28.12-MERCHANT ',
 '29.12-BILL PAYMENT ']

### Read Bank statement

In [347]:
if date.month == 11:
    bsFolder = 'Nov2020_ManipulationResult'
    outputFolder = "Nov2020_MatchingResult"
elif date.month == 12:
    bsFolder = "Dec2020_ManipulationResult"
    outputFolder = "Dec2020_MatchingResult"
bsDatePattern = date.strftime('%d_%m')
bsExcelFile = [i for i in os.listdir(bsFolder) if re.search(bsDatePattern, i)][0]
print("Output filename: %s"%bsExcelFile)

dfBankStatement = pd.read_excel(bsFolder + '/' + bsExcelFile, skiprows = 9)
# dfBankStatement = pd.read_excel("03_11 3207971832_2020_addRef.xlsx", skiprows = 9)

#filter out ref1 gas 
filterTransDesc = "HANDLING CHRG"
dfBankStatement2 = dfBankStatement[dfBankStatement[bsMatchCol].map(lambda x:
                                                    False if re.search(filterTransDesc, str(x))
                                                           else True)].copy()

#create bsIndex
dfBankStatement2[bsIndex] = np.arange(dfBankStatement2.shape[0])
#keep track bsIndex list
bsIndexList = dfBankStatement2[bsIndex].unique()

Output filename: 31_12 3207971832_2020.xlsx


### Exact Matching

In [348]:
def ExactMatch(df, indexCol, nameCol, dfBS, dtype):
    startTime = timer()
    nameList = [str(i).lower() for i in df[nameCol].values]
    amountList = df[poAmountCol].values
    indexList = df[indexCol].values
    
    bsIndexCheckList = []
    matchResult = []
    for no, name in enumerate(nameList):
        POindex = indexList[no]
        POname = name
        POname = re.sub("\‘|\’", "'", str(POname))
        POamount = amountList[no]
        dfMatch = dfBS[(dfBS[bsMatchCol].map(lambda x:POname in x.lower() and x.lower()!='nan'))
                        & (dfBS[bsAmountCol] == POamount)].copy()
        if dfMatch.shape[0] == 1:
            bsAmount = dfMatch[bsAmountCol].values[0]
            BSindex = dfMatch[bsIndex].values[0]
            bsName = dfMatch[bsMatchCol].values[0]
            #[POindex, POname, POamount, BSindex, BSname, BSamount]
            matchResult.append([POindex, POname, POamount,
                               BSindex, bsName, bsAmount])
            if BSindex not in bsIndexCheckList:
                bsIndexCheckList.append(BSindex)
            
        elif dfMatch.shape[0] > 1:
            for no, name in enumerate(dfMatch[bsMatchCol].values):
                #check if same bs is used for matching
                BSindex = dfMatch[bsIndex].values[no]
                if BSindex not in bsIndexCheckList:
                    BSname = dfMatch[bsMatchCol].values[no]
                    bsAmount = dfMatch[bsAmountCol].values[no]
                    NoRow = list(dfBS[bsIndex]).index(BSindex)
                    #nid to remove duplicate after consolidation
                    matchResult.append([POindex, POname, POamount, 
                                        BSindex, BSname, bsAmount])
                    bsIndexCheckList.append(BSindex)
                    break
            
    #form table
    colList = [indexCol, nameCol, poAmountCol, 
               bsIndex, bsMatchCol, bsAmountCol]
    dfResult = pd.DataFrame(matchResult, columns = colList).copy()
    dfResult['MatchCategory'] = "ExactMatch"
    dfResult['DataType'] = dtype
    dfResult.rename(columns = {nameCol: payoutNameCol}, inplace = True)
    endTime = timer()
    print("Total time: %0.4fs" % (endTime - startTime))
    return dfResult

In [349]:
#EU
#3363
dfBankStatementTemp = dfBankStatement2[dfBankStatement2[bsIndex].isin(bsIndexList)].copy()
# euList = [215]
# dfEU2 = dfEU.query("EU_Index in @euList")
dfExactMatchEU = ExactMatch(dfEU, euIndex, payoutNameCol, 
                            dfBankStatementTemp, "EU")
#find no match list
noMatchEUList = set(dfEU[euIndex].values)\
                    .difference(set(dfExactMatchEU[euIndex].values))
#remove bs duplicates
dfExactMatchEU.drop_duplicates(bsIndex, inplace = True)

print("Total row - BS before match: %s"%len(bsIndexList))
#update bsIndexList
bsIndexList = [i for i in bsIndexList if i not in dfExactMatchEU['BS_Index'].unique()]
print("Total row - BS after match: %s"%len(bsIndexList))
print("Total row - EU: %s"%dfEU.shape[0])
print("Exact Matches: %s"%dfExactMatchEU.shape[0])
print("No Matches: %s"%len(noMatchEUList))

Total time: 14.0927s
Total row - BS before match: 6362
Total row - BS after match: 4875
Total row - EU: 1900
Exact Matches: 1487
No Matches: 413


In [350]:
#COP
#294
dfBankStatementTemp = dfBankStatement2[dfBankStatement2[bsIndex].isin(bsIndexList)].copy()
dfExactMatchCOP = ExactMatch(dfCOP, copIndex, payoutNameCol,
                             dfBankStatementTemp, "COP")
#find no match list
noMatchCOPList = set(dfCOP[copIndex].values)\
                    .difference(set(dfExactMatchCOP[copIndex].values))
#remove bs duplicates
dfExactMatchCOP.drop_duplicates(bsIndex, inplace = True)

print("Total row - BS before match: %s"%len(bsIndexList))
#update bsIndexList
bsIndexList = [i for i in bsIndexList if i not in dfExactMatchCOP['BS_Index'].unique()]
print("Total row - BS after match: %s"%len(bsIndexList))
print("Total row: %s"%dfCOP.shape[0])
print("Exact Matches: %s"%dfExactMatchCOP.shape[0])
print("No Matches: %s"%len(noMatchCOPList))

Total time: 10.3209s
Total row - BS before match: 4875
Total row - BS after match: 3538
Total row: 1665
Exact Matches: 1337
No Matches: 328


In [351]:
#Manage COP
#24
dfBankStatementTemp = dfBankStatement2[dfBankStatement2[bsIndex].isin(bsIndexList)].copy()
dfExactMatchManageCOP = ExactMatch(dfManageCOP, manageCOPIndex, payoutNameCol, 
                                   dfBankStatementTemp, 'ManageCOP')
#find no match list
noMatchManageCOPList = set(dfManageCOP[manageCOPIndex].values)\
                       .difference(set(dfExactMatchManageCOP[manageCOPIndex].values))
#remove bs duplicates
dfExactMatchManageCOP.drop_duplicates(bsIndex, inplace = True)

print("Total row - BS before match: %s"%len(bsIndexList))
#update bsIndexList
bsIndexList = [i for i in bsIndexList if i not in dfExactMatchManageCOP['BS_Index'].unique()]
print("Total row - BS after match: %s"%len(bsIndexList))
print("Total row: %s"%dfManageCOP.shape[0])
print("Exact Matches: %s"%dfExactMatchManageCOP.shape[0])
print("No Matches: %s"%len(noMatchManageCOPList))

Total time: 0.0835s
Total row - BS before match: 3538
Total row - BS after match: 3529
Total row: 12
Exact Matches: 9
No Matches: 3


In [352]:
#861

#exact matching for merchant file
dfBankStatementTemp = dfBankStatement2[dfBankStatement2[bsIndex].isin(bsIndexList)].copy()
exactMatchMerchantList = []
for colName in [merchantNameCol, merchantBankNameCol]:
    dfExactMatchMerchant = ExactMatch(dfMerchant, merchantIndex, colName,
                                      dfBankStatementTemp, "Merchant")
    exactMatchMerchantList.append(dfExactMatchMerchant)

dfExactMatchMerchant = pd.concat(exactMatchMerchantList).copy()
dfExactMatchMerchant.drop_duplicates(merchantIndex, inplace = True)
dfExactMatchMerchant.drop_duplicates(bsIndex, inplace = True)

#harmonize no match list
noMatchMerchantList = set(dfMerchant[merchantIndex].values)\
                        .difference(set(dfExactMatchMerchant[merchantIndex].values))

print("Total row - BS before match: %s"%len(bsIndexList))
#update bsIndexList
bsIndexList = [i for i in bsIndexList if i not in dfExactMatchMerchant['BS_Index'].unique()]
print("Total row - BS after match: %s"%len(bsIndexList))
print("Total row: %s"%dfMerchant.shape[0])
print("Exact Matches: %s"%dfExactMatchMerchant.shape[0])
print("No Matches: %s"%len(noMatchMerchantList))

Total time: 6.1156s
Total time: 6.1771s
Total row - BS before match: 3529
Total row - BS after match: 2938
Total row: 1225
Exact Matches: 591
No Matches: 634


In [353]:
#exact matching for bill Payment file
#1
dfBankStatementTemp = dfBankStatement2[dfBankStatement2[bsIndex].isin(bsIndexList)].copy()
exactMatchBPList = []
for colName in [billPaymentNameCol, 'Acc Holder']:
    dfExactMatchBP = ExactMatch(dfBillPayment, billPaymentIndex, colName,
                                dfBankStatementTemp, "BillPayment")
    exactMatchBPList.append(dfExactMatchBP)

dfExactMatchBP = pd.concat(exactMatchBPList).copy()
dfExactMatchBP.drop_duplicates(billPaymentIndex, inplace = True)
dfExactMatchBP.drop_duplicates(bsIndex, inplace = True)

#find no match list
noMatchBPList = set(dfBillPayment[billPaymentIndex].values)\
                    .difference(set(dfExactMatchBP[billPaymentIndex].values))

print("Total row - BS before match: %s"%len(bsIndexList))
#update bsIndexList
bsIndexList = [i for i in bsIndexList if i not in dfExactMatchBP['BS_Index'].unique()]
print("Total row - BS after match: %s"%len(bsIndexList))
print("Total row: %s"%dfBillPayment.shape[0])
print("Exact Matches: %s"%dfExactMatchBP.shape[0])
print("No Matches: %s"%len(noMatchBPList))

Total time: 1.3497s
Total time: 1.3827s
Total row - BS before match: 2938
Total row - BS after match: 2938
Total row: 292
Exact Matches: 0
No Matches: 292


In [354]:
#combine all exact matches
dfExactMatchFinal = pd.concat([dfExactMatchEU, dfExactMatchCOP, dfExactMatchManageCOP,
                               dfExactMatchBP, dfExactMatchMerchant]).copy()
matchedBSIndex = dfExactMatchFinal[bsIndex].values

dfExactMatchFinal.shape, len(dfExactMatchFinal[bsIndex].unique()), dfBankStatement2.shape[0]-len(bsIndexList)

((3424, 12), 3424, 3424)

In [355]:
dfNoMatchList = pd.DataFrame([
                            ["EU", noMatchEUList],
                            ["COP", noMatchCOPList],
                            ["ManageCOP", noMatchManageCOPList],
                            ["BP", noMatchBPList],
                            ["Merchant", noMatchMerchantList],
                            ["BS", bsIndexList]
                             ],
                            columns = ["DataType", "NoMatchList"])

#back up bsIndexList
bsBackUp = bsIndexList.copy()

# np.all(np.equal(bsBackUp, bsIndexList))

excelFile = 'ExactMatch/' + 'EM_' + bsExcelFile
print(excelFile)
with pd.ExcelWriter(excelFile) as writer:
    dfExactMatchFinal.to_excel(writer, index = False, sheet_name = "ExactMatch")
    dfNoMatchList.to_excel(writer, index = False, sheet_name = "NoMatchList")

ExactMatch/EM_31_12 3207971832_2020.xlsx


### Partial Matching

In [356]:
def matchFirst2Terms(name, series):
    ruleName = "matchFirst2Terms"
    
    #match series with first 1 or 2 terms of names
    if len(name.split(' ')) >= 3:
        match = ' '.join(name.split(' ')[:2])
        resultArray = series.map(lambda x:ruleName if re.search('[\s\d]%s'%re.escape(match), 
                                                                re.sub("\xa0", "", str(x)), flags = re.IGNORECASE)
                                else "False").values
    elif len(name.split(' ')) == 2:
        match = name.split(' ')[0]
        resultArray = series.map(lambda x:ruleName if re.search('[\s\d]%s'%re.escape(match), 
                                                                re.sub(" |\xa0","",str(x)), flags = re.IGNORECASE)
                                else "False").values
    else:
        resultArray = np.full(len(series), "False")

    return resultArray

# def matchTruncated(name, series):
#     ruleName = "matchTruncated"
#     #match series with truncated name if there is 
#     cleanName = re.sub(" binti ", " bt ", str(name), flags = re.IGNORECASE)
#     rule1 =  series\
#             .map(lambda x: True if re.search('%s'%re.escape(str(name)), str(x), flags = re.IGNORECASE) 
#                  and str(x) not in ['', 'nan'] else False).values
#     #match name with truncated name in the series if there is
#     rule2 = series\
#                 .map(lambda x: True if re.search('%s'%re.escape(re.sub("ENCIK |COP REFUND", '', 
#                                                                 re.sub(" binti ", " bt ", str(x), flags = re.I), 
#                                                                        flags = re.IGNORECASE).strip()), 
#                                                  str(name), flags = re.IGNORECASE) 
#                      and re.sub("ENCIK |COP REFUND", '', 
#                                 re.sub(" binti ", " bt ", str(x), flags = re.I), 
#                                 flags = re.IGNORECASE).strip() not in ['', 'nan'] else False).values
#     resultArray = (rule1|rule2)
#     resultArray = np.where(resultArray, ruleName, "False")
    
#     return resultArray

#for merchant only
def matchTruncated(name, series):
    ruleName = "matchTruncated"
    #match truncated name series with name if there is 
    rule1 = []
    for value in series:
        if re.search('.*\d\s*(.+?)\s*$', str(value)):
            truncatedName = re.search('.*\d\s*(.+?)\s*$', str(value)).group(1)
            truncatedName = re.escape(re.sub("COP REFUND|binti ", "", truncatedName, flags = re.IGNORECASE))
            if truncatedName not in ['\ ', 'nan'] and re.search(truncatedName, 
                                                                re.sub(" binti ", " ", name, flags = re.IGNORECASE), 
                                                                flags = re.IGNORECASE):
                rule1.append(True)
            else:
                rule1.append(False)
        else:
            rule1.append(False)
    resultArray = np.where(rule1, ruleName, "False")
    return resultArray

def noSpaceMatch(name, series):
    ruleName = "noSpaceMatch"
    cleanName = re.sub(" ", "", str(name))
    #remove space for BS reference to match wth name
    rule1 = series.map(lambda x:True if re.search('%s'%re.escape(cleanName), 
                                                  re.sub(" ", "", x), flags = re.IGNORECASE)
                                    and cleanName not in ['', 'nan']
                                else False).values
#     rule2 = series.map(lambda x:True if re.search('%s'%re.escape(re.sub(' ','',re.sub("COP REFUND", '',str(x)))), 
#                         str(name), flags = re.IGNORECASE) and re.sub(' ','',re.sub("COP REFUND", '',str(x)))!=''
#                             else False).values
#     resultArray = (rule1|rule2)
    resultArray = rule1
    resultArray = np.where(resultArray, ruleName, "False")
    return resultArray

def matchAccountLast4Digit(accountNo, series):
    ruleName = "matchAccountLast4Digit"
    resultArray = series.map(lambda x:ruleName if re.search('%s'%accountNo, str(x), flags = re.IGNORECASE)
                                        and len(accountNo) == 4 else "False").values
    return resultArray

def matchBillTypeRef(billType, series):
    ruleName = "matchBillTypeRef"
    if billType == '':
        resultArray = np.full(len(series), "False")
        reason = ""
    else:
        #if can find billtype - Indah water in reference 
        rule1 =  series\
                .map(lambda x: True if re.search('%s'%billType, str(x), flags = re.IGNORECASE) 
                     and str(x) not in  ['', 'nan'] else False).values
        #if billtype is PAIP and Pengurusan air pahan in reference
        rule2 = series\
                .map(lambda x: True if re.search('%s'%"PENGURUSAN AIR PAHAN", str(x), flags = re.IGNORECASE) 
                     and billType == 'PAIP' else False).values
        #if billtype is PBA and perbadanan bekalan air in reference
        rule3 = series\
                .map(lambda x: True if re.search('%s'%"perbadanan bekalan air|perbadanan bekalan", str(x), 
                                                 flags = re.IGNORECASE) 
                     and billType == 'PBA' else False).values
        resultArray = (rule1|rule2|rule3)

        resultArray = np.where(resultArray, ruleName, "False")
    return resultArray

def matchUniqueAmount(amount, series):
    ruleName = "matchUniqueAmount"
    #if only unique match in bs for amount
    resultArray = series.map(lambda x:True if x == amount else False).values
    if sum(resultArray) == 1:
        return np.full(len(series), ruleName)
    else:
        return np.full(len(series), "False")

def PartialMatch(df, indexCol, notMatchIndex, nameCol, dfBS, ruleType):
    startTime = timer()
    matchResult = []
    bsIndexCheckList = []
    df = df[df[indexCol].isin(notMatchIndex)].copy()
    for no, row in enumerate(list(df.iterrows())):
        #assign values to variables: POname, POamount, AccountNo, BillType
        index = row[1][indexCol]
        POname = row[1][nameCol]
        POname = re.sub("\u200b", "", str(POname))
        POname = re.sub("\xa0", "", str(POname))
        POname = re.sub("\‘|\’", "'", str(POname))
        POamount = row[1][poAmountCol]
        #merchant, other payout
        if "Bank Acc" in df.columns:
            AccountNo = re.sub(' |-|\.0|\(|\)','', str(row[1]['Bank Acc']))[-4:]
            match4 = matchAccountLast4Digit(AccountNo, dfBS[bsMatchCol])
        #bill payment
        elif "Acc Number" in df.columns:
            AccountNo = re.sub(' |-|\.0|\(|\)','', str(row[1]['Acc Number']))[-4:]
            match4 = matchAccountLast4Digit(AccountNo, dfBS[bsMatchCol])
        else:
            AccountNo = ''
            
        if 'Bill Type' in df.columns:
            BillType = row[1]['Bill Type']
            match5 = matchBillTypeRef(BillType, dfBS[bsMatchCol])
        else:
            BillType = ''
        
        #matching rules
        if ruleType in ["General", "Merchant"]:
            match1 = matchTruncated(POname, dfBS[bsMatchCol])
        match2 = matchFirst2Terms(POname, dfBS[bsMatchCol])
#         match3 = noSpacenoCopRefund(POname, dfBankStatement2[bsMatchCol])
        match3 = noSpaceMatch(POname, dfBS[bsMatchCol])
        match6 = matchUniqueAmount(POamount, dfBS[bsAmountCol])
        ruleStr = "|".join(['(match%s!="False")'%i for i in ruleDict[ruleType]])
        checkRule = eval(ruleStr)
        #assign bool statement to checkRule
        #use result of matching rules to subset dfBankStatement2
        dfMatch = dfBS[(dfBS[bsAmountCol] == POamount) & (checkRule)].copy()
        
        #to copy rulename if matches 
        ruleStr = '[' + ','.join(['match%s'%i for i in ruleDict[ruleType]]) + ']'
        #assign list to ruleList
        ruleList = eval(ruleStr)

        if dfMatch.shape[0] == 1:
            BSname = dfMatch[bsMatchCol].values[0]
            BSamount = dfMatch[bsAmountCol].values[0]   
            BSindex = dfMatch[bsIndex].values[0]
            NoRow = list(dfBS[bsIndex]).index(BSindex)
            rule = ','.join([i[NoRow] for i in ruleList if i[NoRow]!="False"])
            if rule != "matchAccountLast4Digit" and rule != "matchUniqueAmount":
                matchResult.append([index, POname, POamount, BSindex, BSname, BSamount, rule])
            if BSindex not in bsIndexCheckList:
                bsIndexCheckList.append(BSindex)
            
        elif dfMatch.shape[0] > 1:
            for no, name in enumerate(dfMatch[bsMatchCol].values):
                BSindex = dfMatch[bsIndex].values[no]
                if BSindex not in bsIndexCheckList:
                    BSname = dfMatch[bsMatchCol].values[no]
                    BSamount = dfMatch[bsAmountCol].values[no]
                    NoRow = list(dfBS[bsIndex]).index(BSindex)
                    rule = ','.join([i[NoRow] for i in ruleList if i[NoRow]!="False"])
                    #if rule not only matchAccountLast4Digit or matchUniqueAmount
                    if rule != "matchAccountLast4Digit" and rule != "matchUniqueAmount":
                        matchResult.append([index, POname, POamount, BSindex, BSname, BSamount, rule])
                        bsIndexCheckList.append(BSindex)
                        break
    
    endTime = timer()
    print("Total time: %0.4fs" % (endTime - startTime))
    return matchResult

def constructTable(matchResult, df, indexName, nameCol, dType):
    pandaList = []
    for i in matchResult:
        pandaList.append(i)
    
    dfMatchFinal = pd.DataFrame(pandaList)
    if dfMatchFinal.shape[0]!=0:
        #[index, POname, POamount, BSindex, BSname, BSamount, rule]
        dfMatchFinal.columns = [indexName, poNameCol, poAmountCol, bsIndex, bsMatchCol, bsAmountCol, 'Reason']
        dfMatchFinal['MatchCategory'] = "PartialMatch"
        dfMatchFinal['DataType'] = dType
        return dfMatchFinal
    else:
        return None

In [357]:
#if start from here: bsBackUp is bsIndexList 
# bsIndexList = bsBackUp

#EU
#782
euList = [3479, 1816, 3517, 
          2418, 215, 1275, 
          2149, 3137,  3389, 554, 1534]
# dfEU2 = dfEU.query("EU_Index in @euList")
dfBankStatementTemp = dfBankStatement2[dfBankStatement2[bsIndex].isin(bsIndexList)].copy()
matchResultEU = PartialMatch(dfEU, euIndex, noMatchEUList,
                            payoutNameCol, dfBankStatementTemp, ruleType = "General")

dfPartialMatchEU = constructTable(matchResultEU, dfEU, euIndex, poNameCol, 'EU')

dfPartialMatchEU.drop_duplicates(bsIndex, inplace = True)

print("Total row - BS before match: %s"%len(bsIndexList))
#update bsIndexList
bsIndexList = [i for i in bsIndexList if i not in dfPartialMatchEU['BS_Index'].unique()]
print("Total row - BS after match: %s"%len(bsIndexList))
print("No Matches Previously: %s"%len(noMatchEUList))
print("Partial Matches: %s"%dfPartialMatchEU.shape[0])

Total time: 171.1259s
Total row - BS before match: 2938
Total row - BS after match: 2548
No Matches Previously: 413
Partial Matches: 390


In [358]:
#COP
bsBackUpCOP = bsIndexList

#85
dfBankStatementTemp = dfBankStatement2[dfBankStatement2[bsIndex].isin(bsIndexList)].copy()
matchResultCOP = PartialMatch(dfCOP, copIndex, noMatchCOPList, 
                              payoutNameCol, dfBankStatementTemp, ruleType = "General")

dfPartialMatchCOP = constructTable(matchResultCOP, dfCOP, copIndex, poNameCol, 'COP')

dfPartialMatchCOP.drop_duplicates(bsIndex, inplace = True)

print("Total row - BS before match: %s"%len(bsIndexList))
#update bsIndexList
bsIndexList = [i for i in bsIndexList if i not in dfPartialMatchCOP['BS_Index'].unique()]
print("Total row - BS after match: %s"%len(bsIndexList))
print("No Matches Previously: %s"%len(noMatchCOPList))
print("Partial Matches: %s"%dfPartialMatchCOP.shape[0])

Total time: 107.3745s
Total row - BS before match: 2548
Total row - BS after match: 2228
No Matches Previously: 328
Partial Matches: 320


In [359]:
bsBackUpManageCOP = bsIndexList
# bsIndexList = bsBackUpManageCOP
#Manage COP
#1
dfBankStatementTemp = dfBankStatement2[dfBankStatement2[bsIndex].isin(bsIndexList)].copy()
matchResultManageCOP = PartialMatch(dfManageCOP, manageCOPIndex, noMatchManageCOPList,
                                   payoutNameCol, dfBankStatementTemp, ruleType = "General")

dfPartialMatchManageCOP = constructTable(matchResultManageCOP, dfManageCOP, manageCOPIndex, poNameCol, 'ManageCOP')

if type(dfPartialMatchManageCOP) == pd.DataFrame:
    dfPartialMatchManageCOP.drop_duplicates(bsIndex, inplace = True)
    
print("Total row - BS before match: %s"%len(bsIndexList))
print("No Matches Previously: %s"%len(noMatchManageCOPList))
if type(dfPartialMatchManageCOP) == pd.DataFrame:
    bsIndexList = [i for i in bsIndexList if i not in dfPartialMatchManageCOP['BS_Index'].unique()]
    print("Total row - BS after match: %s"%len(bsIndexList))
    print("Partial Matches: %s"%dfPartialMatchManageCOP.shape[0])

Total time: 0.7846s
Total row - BS before match: 2228
No Matches Previously: 3
Total row - BS after match: 2225
Partial Matches: 3


In [360]:
bsBackUpMerchant = bsIndexList

# bsIndexList = bsBackUpMerchant

#Merchant
#774
dfBankStatementTemp = dfBankStatement2[dfBankStatement2[bsIndex].isin(bsIndexList)].copy()
MerchantList = []
for colName in ['Merchant', 'Bank Acc Name']:
    #replace noMatchMerchantList with a for testing purpose
#     a = [1093, 1084, 224, 442, 481, 1040, 659, 1468]
    matchResultMerchant = PartialMatch(dfMerchant, merchantIndex, noMatchMerchantList,
                                       colName, dfBankStatementTemp, ruleType = "Merchant")
    dfPartialMatchMerchant = constructTable(matchResultMerchant, dfMerchant, merchantIndex, colName, 'Merchant')
    MerchantList.append(dfPartialMatchMerchant)
    
dfPartialMatchMerchant = pd.concat(MerchantList).copy()
dfPartialMatchMerchant.drop_duplicates(merchantIndex, inplace = True)
dfPartialMatchMerchant.drop_duplicates(bsIndex, inplace = True)

print("Total row - BS before match: %s"%len(bsIndexList))
print("No Matches Previously: %s"%len(noMatchMerchantList))
# print("No Matches Previously: %s"%len(a))
bsIndexList = [i for i in bsIndexList if i not in dfPartialMatchMerchant['BS_Index'].unique()]
print("Total row - BS after match: %s"%len(bsIndexList))
print("Partial Matches: %s"%dfPartialMatchMerchant.shape[0])

Total time: 165.1914s
Total time: 166.1713s
Total row - BS before match: 2225
No Matches Previously: 634
Total row - BS after match: 1596
Partial Matches: 629


In [361]:
bsBackUpBillPayment = bsIndexList

#Bill Payment
#name, acc holder
dfBankStatementTemp = dfBankStatement2[dfBankStatement2[bsIndex].isin(bsIndexList)].copy()
BillPaymentList = []
for colName in [billPaymentNameCol, 'Acc Holder']:
    matchResultBP = PartialMatch(dfBillPayment, billPaymentIndex, noMatchBPList,
                                       colName, dfBankStatementTemp, ruleType = "BillPayment")
    dfPartialMatchBP = constructTable(matchResultBP, dfBillPayment, billPaymentIndex, colName, 'BillPayment')
    BillPaymentList.append(dfPartialMatchBP)
    
#ll only keep d first record of multiple matches
dfPartialMatchBP = pd.concat(BillPaymentList).copy()
dfPartialMatchBP.drop_duplicates(billPaymentIndex, inplace = True)
dfPartialMatchBP.drop_duplicates(bsIndex, inplace = True)

print("Total row - BS before match: %s"%len(bsIndexList))
print("No Matches Previously: %s"%len(noMatchBPList))
bsIndexList = [i for i in bsIndexList if i not in dfPartialMatchBP['BS_Index'].unique()]
print("Total row - BS after match: %s"%len(bsIndexList))
print("Partial Matches: %s"%dfPartialMatchBP.shape[0])

Total time: 23.4766s
Total time: 23.1933s
Total row - BS before match: 1596
No Matches Previously: 292
Total row - BS after match: 1589
Partial Matches: 7


In [362]:
#combine all partial matches - edited
dfPartialMatchFinal = pd.concat([dfPartialMatchEU, dfPartialMatchCOP, dfPartialMatchManageCOP,
                                 dfPartialMatchBP, dfPartialMatchMerchant]).copy()
csvFile =  'PartialMatch/' + 'PM_' + bsExcelFile.split('.')[0] + '.csv'
dfPartialMatchFinal.to_csv(csvFile, index = False)

dfMatchFinal = pd.concat([dfExactMatchFinal, dfPartialMatchFinal]).copy()

In [363]:
reasonList = ['matchUniqueAmount', 'matchAccountLast4Digit']

dfMatchFinal = dfMatchFinal.query('Reason not in @reasonList')

dfMatchFinal['Reason'].unique()

array([nan, 'matchTruncated,matchFirst2Terms',
       'matchTruncated,matchFirst2Terms,matchUniqueAmount',
       'matchTruncated',
       'matchFirst2Terms,noSpaceMatch,matchUniqueAmount',
       'matchFirst2Terms', 'matchFirst2Terms,matchUniqueAmount',
       'matchTruncated,matchUniqueAmount',
       'matchFirst2Terms,noSpaceMatch',
       'matchTruncated,matchFirst2Terms,matchAccountLast4Digit,matchUniqueAmount',
       'matchTruncated,matchFirst2Terms,matchAccountLast4Digit',
       'matchBillTypeRef'], dtype=object)

### Summary

In [364]:
reconciledEUIndex = dfMatchFinal[pd.notnull(dfMatchFinal['EU_Index'])]['EU_Index'].values
reconciledCOPIndex = dfMatchFinal[pd.notnull(dfMatchFinal['COP_Index'])]['COP_Index'].values
reconciledManageCOPIndex = dfMatchFinal[pd.notnull(dfMatchFinal['ManageCOP_Index'])]['ManageCOP_Index'].values
reconciledBillPaymentIndex = dfMatchFinal[pd.notnull(dfMatchFinal['BP_Index'])]['BP_Index'].values
reconciledMerchantIndex = dfMatchFinal[pd.notnull(dfMatchFinal['Merchant_Index'])]['Merchant_Index'].values
# reconciledBankStatementIndex = dfMatchFinal[pd.notnull(dfMatchFinal['BS_Index'])]['BS_Index'].values
# noMatchesBankStatementIndex = bsIndexList
noMatchesBankStatementIndex = set(dfBankStatement2['BS_Index'].values).difference(set(dfMatchFinal['BS_Index'].values))

reconciledEU_Sum = dfEU[dfEU['EU_Index'].isin(reconciledEUIndex)]['Amount'].sum()
reconciledCOP_Sum  = dfCOP[dfCOP['COP_Index'].isin(reconciledCOPIndex)]['Amount'].sum()
reconciledManageCOP_Sum  = dfManageCOP[dfManageCOP['ManageCOP_Index'].isin(reconciledManageCOPIndex)]['Amount'].sum()
reconciledBillPayment_Sum  = dfBillPayment[dfBillPayment['BP_Index'].isin(reconciledBillPaymentIndex)]['Amount'].sum()
reconciledMerchant_Sum  = dfMerchant[dfMerchant['Merchant_Index'].isin(reconciledMerchantIndex)]['Amount'].sum()

noReconciledEU_Sum = dfEU[~dfEU['EU_Index'].isin(reconciledEUIndex)]['Amount'].sum()
noReconciledCOP_Sum  = dfCOP[~dfCOP['COP_Index'].isin(reconciledCOPIndex)]['Amount'].sum()
noReconciledManageCOP_Sum  = dfManageCOP[~dfManageCOP['ManageCOP_Index'].isin(reconciledManageCOPIndex)]['Amount'].sum()
noReconciledBillPayment_Sum  = dfBillPayment[~dfBillPayment['BP_Index'].isin(reconciledBillPaymentIndex)]['Amount'].sum()
noReconciledMerchant_Sum  = dfMerchant[~dfMerchant['Merchant_Index'].isin(reconciledMerchantIndex)]['Amount'].sum()
noReconciledBankStatement_Sum = dfBankStatement2[dfBankStatement2['BS_Index'].isin(noMatchesBankStatementIndex)]['Debit Amount'].sum()

# pendingEU_Sum = dfEU[dfEU['Status'] == "Pending"]['Amount'].sum()
pendingEU_Sum = dfEU['Amount'].sum()
pendingCOP_Sum = dfCOP[dfCOP['Status'] == "Pending"]['Amount'].sum()
pendingManageCOP_Sum = dfManageCOP[dfManageCOP['Status'] == "Pending"]['Amount'].sum()
pendingMerchant_Sum = dfMerchant[dfMerchant['Status'] == "Pending"]['Amount'].sum()
billPayment_Sum = dfBillPayment['Amount'].sum()

#handling charges
dfHC = dfBankStatement[dfBankStatement['Reference 2'].map(lambda x:
                                                    True if re.search(filterTransDesc, str(x))
                                                           else False)].copy()
HandlingSum = dfHC['Debit Amount'].sum()

In [365]:
tableList = []
tableList.append(["Journal", "BS vs Payout (A)", "In BS, not in Payout (B)", "Not in BS (C)"])

tableList.append(['EU', reconciledEU_Sum, "", noReconciledEU_Sum])
tableList.append(['COP', "{:.2f}".format(reconciledCOP_Sum), "", noReconciledCOP_Sum])
tableList.append(['Manage COP', reconciledManageCOP_Sum, "", 
                  noReconciledManageCOP_Sum])
tableList.append(['Bill Payment', reconciledBillPayment_Sum, "", 
                  noReconciledBillPayment_Sum])
tableList.append(['Merchant', reconciledMerchant_Sum, "", 
                  noReconciledMerchant_Sum])
tableList.append(['DIS', "", "", ""])
tableList.append(['QS DIS', "", "", ""])
tableList.append(['Handling Charges', "", HandlingSum, ""])
tableList.append(['Misc', "", noReconciledBankStatement_Sum, ""])
totalReconciled = reconciledEU_Sum + reconciledCOP_Sum + reconciledManageCOP_Sum\
                  + reconciledBillPayment_Sum + reconciledMerchant_Sum
totalnoReconciled = noReconciledEU_Sum + noReconciledCOP_Sum + \
                    noReconciledManageCOP_Sum + noReconciledBillPayment_Sum + \
                    noReconciledMerchant_Sum 
tableList.append(['Total', "{:.2f}".format(totalReconciled), "{:.2f}".format(noReconciledBankStatement_Sum + HandlingSum),
                  "{:.2f}".format(totalnoReconciled)])
tableList.append(['Total (A)+(B) / (A)+(C)', 
                  "{:.2f}".format(totalReconciled + noReconciledBankStatement_Sum),
                 "", "{:.2f}".format(totalReconciled + totalnoReconciled)])
pendingTotal = pendingEU_Sum + pendingCOP_Sum + pendingManageCOP_Sum\
                + pendingMerchant_Sum + billPayment_Sum
tableList.append(['As per BS / Payout', "({:.2f})".format(totalReconciled + noReconciledBankStatement_Sum),
                                  "", "({:.2f})".format(pendingTotal)])
tableList.append(['Variance', "", "", totalReconciled + totalnoReconciled - pendingTotal])
unconvertedBSpercent = (noReconciledBankStatement_Sum/(totalReconciled + noReconciledBankStatement_Sum + HandlingSum))*100
unconvertedPOpercent = (totalnoReconciled/(totalReconciled + totalnoReconciled))*100
tableList.append(["Unconverted percent", "", "{:.2f}".format(unconvertedBSpercent),
                 "{:.2f}".format(unconvertedPOpercent)])

In [366]:
summaryTable = pd.DataFrame(tableList)
summaryTable.columns = ["", "Reconciled", "No reconciled", "No reconciled2"]
summaryTable

Unnamed: 0,Unnamed: 1,Reconciled,No reconciled,No reconciled2
0,Journal,BS vs Payout (A),"In BS, not in Payout (B)",Not in BS (C)
1,EU,2.42677e+06,,28786
2,COP,8586698.00,,38834
3,Manage COP,26758,,0
4,Bill Payment,273.2,,61089
5,Merchant,2.01228e+06,,1100.5
6,DIS,,,
7,QS DIS,,,
8,Handling Charges,,471,
9,Misc,,1.24283e+07,


In [367]:
13052778.35 + 12428781.64

25481559.990000002

In [368]:
excelFile

'ExactMatch/EM_31_12 3207971832_2020.xlsx'

In [369]:
#no reconciled bs
noReconciledBSdf = dfBankStatement2[dfBankStatement2['BS_Index'].isin(noMatchesBankStatementIndex)].copy()
#no reconciled payout
dfNoMatchFinal = pd.concat([dfEU[~dfEU['EU_Index'].isin(reconciledEUIndex)],
     
                            dfCOP[~dfCOP['COP_Index'].isin(reconciledCOPIndex)],
                           dfManageCOP[~dfManageCOP['ManageCOP_Index'].isin(reconciledManageCOPIndex)],
                           dfBillPayment[~dfBillPayment['BP_Index'].isin(reconciledBillPaymentIndex)],
                           dfMerchant[~dfMerchant['Merchant_Index'].isin(reconciledMerchantIndex)]]).copy()

excelFile = outputFolder + '/' +  bsExcelFile
print(excelFile)
with pd.ExcelWriter(excelFile) as writer:
    summaryTable.to_excel(writer, index = False, sheet_name = 'Summary')
    dfMatchFinal.to_excel(writer, index = False, sheet_name = 'Reconciled')
    noReconciledBSdf.to_excel(writer, index = False, sheet_name = "NotReconciled_BS")
    dfNoMatchFinal.to_excel(writer, index = False, sheet_name = "NotReconciled_Payout")
    dfEU.to_excel(writer, index = False, sheet_name = "EU")
    dfCOP.to_excel(writer, index = False, sheet_name = "COP")
    dfManageCOP.to_excel(writer, index = False, sheet_name = "ManageCOP")
    dfMerchant.to_excel(writer, index = False, sheet_name = "Merchant")
    dfBillPayment.to_excel(writer, index = False, sheet_name = "BillPayment")
    dfBankStatement2.to_excel(writer, index = False, sheet_name = "BankStatement")

Dec2020_MatchingResult/31_12 3207971832_2020.xlsx
