In [3]:
import sys
import time
import datetime
import logging
import DWM10_Parms
import DWM14_BuildRefDict
import DWM15_BuildLinkIndex
import DWM16_BuildTokenFreqDict
import DWM25_Global_Token_Replace
import DWM42_BuildBlockPairs
import DWM55_LinkBlockPairs
import DWM80_TransitiveClosure
import DWM90_IterateClusters
import DWM96_WriteLinkIndex
import DWM97_ClusterProfile
import DWM99_ERmetrics

In [4]:
# Main Driver for Refactored Data Washing Machine
# Version 1.20 creates a log file with same information being written to console
# Version 1.30 creates cluster profile at end of program and evaluates ER statistics
# Version 1.40 FK - added module DWM25 to do global level token replacement
#              JRT - added DWM65_ScoringMatrix to allow ScoringMatrix as a comparator type
# Version 1.50 Revised and corrected scoring matrix
#              Revised DWM25 Global Replacement to reuse Tokenizer Dictionary and use DWM_WordList.txt
# Version 1.60 Implemented 2 versions of Scoring Rule - Standard (Std) and Weighted (Kris)
#              Changed Parms to be a class imported by all modules
# Version 1.70 Added new parameter minBlkTokenLen to set a minimum length for blocking tokens
#              Improved performance of global cleaning
# Version 1.80 Added new parameter excludeNumericBlocks when True does not block on numeric tokens
#              Added new parameter removeExcludedBlkTokens when True removes tokens excluded by
#                 minBlkTokenLen & excludeNumericBlocks
#              Added timer and added Total Runtime to logging statistics
# Version 1.90 Extensive refactor of the processing logic separating stop word removal from blocking.
#              Also generate and deduplicate pairs from all blocks before comparing or correcting references
#              Added a new parameter blockByPairs requiring refs in a block to share 2 tokens
#              Added MultiDriver for testing the 18 test files.
version = 1.90


def logBoth(*args):
    """Print ``args`` to the console and, identically, to the log file.

    Uses the module-level ``logFile``, so it may only be called while that
    file is open for writing.
    """
    print(*args)
    print(*args, file=logFile)


# get start time for timer
startTime = time.time()
# date time is used to label the logfile (YYYYMMDD_HH_MM)
now = datetime.datetime.now()
tag = now.strftime('%Y%m%d_%H_%M')
# 'with' guarantees the log is flushed and closed even if one of the runs raises
with open('DWM_Log_Multi_'+tag+'.txt', 'w') as logFile:
    logBoth("Data Washing Machine Refactor Version", version)
    logBoth("Date/Time", tag)

    # input.txt lists one parameter-file name per line
    fileName = "input.txt"
    with open(fileName, 'r') as file1:
        for rawLine in file1:
            # a line containing '#' marks the end of the list of runs
            if "#" in rawLine:
                break
            parmFileName = rawLine.strip()
            # skip blank lines instead of handing an empty file name to getParms
            if not parmFileName:
                continue
            now1 = datetime.datetime.now()
            # the raw line (with its line ending) is printed here so the
            # console/log layout stays exactly as before
            print('\n\nRunning parms file', rawLine)
            print('\nRunning parms file ', rawLine, file=logFile)
            DWM10_Parms.getParms(parmFileName, logFile)
            # Create refDict, a dictionary where key=refID, value is list of reference tokens
            refDict = DWM14_BuildRefDict.tokenizeInput()
            # Create linkIndex, a dictionary where key=refID, value is cluster ID
            linkIndex = DWM15_BuildLinkIndex.buildLinkIndex(refDict)
            # Create tokenFreqDict, a dictionary where key=token, value is token frequency
            tokenFreqDict = DWM16_BuildTokenFreqDict.buildTokenFreqDict(refDict)
            # if global replacement is configured, rewrite refDict via DWM25 and
            # rebuild the token frequencies from the corrected references
            if DWM10_Parms.runGlobalCorrection:
                refDict = DWM25_Global_Token_Replace.globalReplace(refDict, tokenFreqDict)
                tokenFreqDict = DWM16_BuildTokenFreqDict.buildTokenFreqDict(refDict)
            moreToDo = True
            logBoth('\n>>Starting Iterations')
            mu = DWM10_Parms.mu
            logBoth('mu start value=', mu)
            muIterate = DWM10_Parms.muIterate
            logBoth('mu iterate value=', muIterate)
            epsilon = DWM10_Parms.epsilon
            logBoth('epsilon start value=', epsilon)
            epsilonIterate = DWM10_Parms.epsilonIterate
            logBoth('epsilon iterate value=', epsilonIterate)
            comparator = DWM10_Parms.comparator
            logBoth('comparator =', comparator)

            # Each pass blocks, links and clusters; the loop stops as soon as a
            # stage yields nothing or once mu has been raised above 1.0
            while moreToDo:
                logBoth('\n****New Iteration\nSize of refDict =', len(refDict))
                blockPairList = DWM42_BuildBlockPairs.buildBlockPairs(refDict, linkIndex, tokenFreqDict)
                if len(blockPairList) == 0:
                    logBoth('--Ending because blockPairList is empty')
                    break
                linkedPairList = DWM55_LinkBlockPairs.linkBlockPairs(blockPairList, refDict, tokenFreqDict)
                if len(linkedPairList) == 0:
                    logBoth('Ending because linkedPairList is empty')
                    break
                clusterList = DWM80_TransitiveClosure.transitiveClosure(linkedPairList)
                if len(clusterList) == 0:
                    logBoth('--Ending because clusterList is empty')
                    break
                DWM90_IterateClusters.iterateClusters(clusterList, refDict, linkIndex)
                logBoth('\n>>End of Iteration, Resetting mu and epsilon')
                # round after each step so repeated float additions do not drift
                mu = round(mu + muIterate, 2)
                # the new threshold is published through the shared parms module
                DWM10_Parms.mu = mu
                logBoth('>>>New Value of mu = ', mu)
                epsilon = round(epsilon + epsilonIterate, 2)
                DWM10_Parms.epsilon = epsilon
                logBoth('>>>New Value of epsilon = ', epsilon)
                if mu > 1.0:
                    moreToDo = False
                    logBoth('Ending because mu > 1.0')
            # End of iterations
            # write Link Index to text file
            DWM96_WriteLinkIndex.writeLinkIndex(linkIndex)
            # Generate Cluster Profile
            DWM97_ClusterProfile.generateProfile(linkIndex)
            # Generate ER Metrics if truthFileName was given
            if DWM10_Parms.truthFileName != '':
                DWM99_ERmetrics.generateMetrics(linkIndex)
            now2 = datetime.datetime.now()
            # console and log word the per-file summary differently; kept as is
            print("\nTotal File Runtime =", now2-now1, file=logFile)
            print("\nEnd of File ", parmFileName)
            print('Time to run File ', now2-now1)
            print("End of File ", parmFileName, file=logFile)
        print('\nEnd of the parmFileName Runs')
    endTime = time.time()
    totalTime = endTime - startTime
    logBoth("All Files Total Runtime =", totalTime/60, " minutes")
    logBoth("End of Program")

Data Washing Machine Refactor Version 1.9
Date/Time 20210831_13_13


Running parms file S5-parms.txt


>> Starting DWM14
Input Reference File Name = S5G.txt
Input File has Header Records = True
Input File Delimiter = ,
Tokenizer Function Type = Splitter
Remove Duplicate Reference Tokens = False
Total References Read= 3004
Total Tokens Found = 38106

>>Starting DWM15
LinkIndex created, record count = 3004

>> Starting DWM16
Total References Read= 3004
Total Tokens Found = 38106
Total Unique Tokens = 7190
Minimum Token Frequency = 1
Maximum Token Frequency = 2853
Top Five Tokens by Freqency
  Token= NC Frequency= 2853
  Token= SALEM Frequency= 1881
  Token= WINSTON Frequency= 1869
  Token= DR Frequency= 813
  Token= RD Frequency= 626
Average Token Frequency = 5.299860917941586
Standard Deviation of Token Frequency = 50.2307658022669

>>Starting Iterations
mu start value= 0.7
mu iterate value= 0.05
epsilon start value= 0.78
epsilon iterate value= 0.0
comparator = ScoringMatrixKris

****Ne

L= 1563.0 E= 1526.0 TP= 1371.0
Precision= 0.8772
Recall= 0.8984
F-measure= 0.8877

End of File  S5-parms.txt
Time to run File  0:00:10.752581


Running parms file S5-parms1.txt


>> Starting DWM14
Input Reference File Name = S5G.txt
Input File has Header Records = True
Input File Delimiter = ,
Tokenizer Function Type = Splitter
Remove Duplicate Reference Tokens = False
Total References Read= 3004
Total Tokens Found = 38106

>>Starting DWM15
LinkIndex created, record count = 3004

>> Starting DWM16
Total References Read= 3004
Total Tokens Found = 38106
Total Unique Tokens = 7190
Minimum Token Frequency = 1
Maximum Token Frequency = 2853
Top Five Tokens by Freqency
  Token= NC Frequency= 2853
  Token= SALEM Frequency= 1881
  Token= WINSTON Frequency= 1869
  Token= DR Frequency= 813
  Token= RD Frequency= 626
Average Token Frequency = 5.299860917941586
Standard Deviation of Token Frequency = 50.2307658022669

>>Starting Iterations
mu start value= 0.7
mu iterate value= 0.05
epsilon start v

L= 1619.0 E= 1526.0 TP= 1354.0
Precision= 0.8363
Recall= 0.8873
F-measure= 0.861

End of File  S5-parms1.txt
Time to run File  0:00:09.756495


Running parms file S5-parms2.txt


>> Starting DWM14
Input Reference File Name = S5G.txt
Input File has Header Records = True
Input File Delimiter = ,
Tokenizer Function Type = Splitter
Remove Duplicate Reference Tokens = False
Total References Read= 3004
Total Tokens Found = 38106

>>Starting DWM15
LinkIndex created, record count = 3004

>> Starting DWM16
Total References Read= 3004
Total Tokens Found = 38106
Total Unique Tokens = 7190
Minimum Token Frequency = 1
Maximum Token Frequency = 2853
Top Five Tokens by Freqency
  Token= NC Frequency= 2853
  Token= SALEM Frequency= 1881
  Token= WINSTON Frequency= 1869
  Token= DR Frequency= 813
  Token= RD Frequency= 626
Average Token Frequency = 5.299860917941586
Standard Deviation of Token Frequency = 50.2307658022669

>>Starting Iterations
mu start value= 0.7
mu iterate value= 0.05
epsilon start v

L= 1619.0 E= 1526.0 TP= 1354.0
Precision= 0.8363
Recall= 0.8873
F-measure= 0.861

End of File  S5-parms2.txt
Time to run File  0:00:09.798955

End of the parmFileName Runs
All Files Total Runtime = 0.5052000641822815  minutes
End of Program
