In [1]:
import sys
import datetime
import logging
import DWM10_Parms
import DWM20_TokenizerFunctions
import DWM25_Global_Token_Replace
import DWM30_BuildRefList
import DWM40_BuildBlocks
import DWM50_IterateBlocks
import DWM70_GeneratePairs
import DWM80_TransitiveClosure
import DWM90_IterateClusters
import DWM97_ClusterProfile
import DWM99_ERmetrics

In [2]:
# Main Driver for Refactored Data Washing Machine
# Version 1.20 creates a log file with same information being written to console
# Version 1.30 creates cluster profile at end of program and evaluates ER statistics
# Version 1.40 FK - added module DWM25 to do global level token replacement
#              JRT - added DWM65_ScoringMatrix to allow ScoringMatrix as a comparitor type
# Version 1.50 Revised and corrected scoring matrix
#              Revised DWM25 Global Replacement to reuse Tokenizer Dictionary and use DWM_WordList.txt
# Version 1.60 Implemented 2 versions of Scoring Rule - Standard (Std) and Weighted (Kris)
#              Changed Parms to be a class imported by all modules
version = 1.60
# date time is used to label the logfile
now = datetime.datetime.now()
tag = str(now.year)+(str(now.month)).zfill(2)+(str(now.day)).zfill(2)
tag = tag+'_'+(str(now.hour)).zfill(2)+'_'+(str(now.minute)).zfill(2)
logFile = open('DWM_Log_'+tag+'.txt','w')
print("Data Washing Machine Refactor Version",version)
print("Data Washing Machine Refactor Version",version, file=logFile)
#parmFileName = input('Enter Parameter File Name->')

fileName = input('Enter Parameter File Name->')
file1 = open(fileName, 'r')

while True:
    now1 = datetime.datetime.now()
    parmFileName = file1.readline()
    if not parmFileName:
        print('\nEnd of Parmfile Runs')
        break
    print('Running parms file',parmFileName)
    print('\nRunning parms file ',parmFileName, file=logFile)
    parmFileName = parmFileName.replace('\n','')

    DWM10_Parms.getParms(parmFileName)
    tokenFreqDict = DWM20_TokenizerFunctions.tokenizeInput(logFile)
    # create dictionary of corrections (stdTokenDict), leave empty if not running replacement
    stdTokenDict = {}
    #if global replacement configured, populate stdTokenDict of corrections in DWM25
    if DWM10_Parms.runReplacement:
        DWM25_Global_Token_Replace.globalReplace(logFile, tokenFreqDict, stdTokenDict)
    refList = DWM30_BuildRefList.buildRefList(logFile, stdTokenDict)
    moreToDo = True
    linkIndex =[]
    print('\n>>Starting Iterations')
    print('\n>>Starting Iterations', file=logFile)
    mu = DWM10_Parms.mu
    print('mu start value=', mu)
    print('mu start value=', mu, file=logFile)
    muIterate = DWM10_Parms.muIterate
    print('mu iterate value=', muIterate)
    print('mu iterate value=', muIterate, file=logFile)
    epsilon = DWM10_Parms.epsilon
    print('epsilon start value=', epsilon)
    print('epsilon start value=', epsilon, file=logFile)
    epsilonIterate = DWM10_Parms.epsilonIterate
    print('epsilon iterate value=', epsilonIterate)
    print('epsilon iterate value=', epsilonIterate, file=logFile)
    comparator = DWM10_Parms.comparator
    print('comparator =', comparator)
    print('comparator =', comparator, file=logFile)

    while moreToDo:
        print('\n****New Iteration\nSize of refList =', len(refList), 'Size of linkIndex =', len(linkIndex))   
        print('\n****New Iteration\nSize of refList =', len(refList), 'Size of linkIndex =', len(linkIndex), file=logFile)  
        blockList = DWM40_BuildBlocks.buildBlocks(logFile, refList, tokenFreqDict)
        if len(blockList)==0:
            print('--Ending because blockList is empty')
            print('--Ending because blockList is empty', file=logFile)
            break
        blockList.sort()
        compareCache = DWM50_IterateBlocks.iterateBlocks(logFile, blockList)
        pairList = DWM70_GeneratePairs.generatePairs(logFile, mu, compareCache)
        if len(pairList)==0:
            print('Ending because pairList is empty')
            print('Ending because pairList is empty', file=logFile)
            break
        clusterList = DWM80_TransitiveClosure.transitiveClosure(logFile, pairList)
        if len(clusterList)==0:
            print('--Ending because clusterList is empty') 
            print('--Ending because clusterList is empty', file=logFile)
            break  
        DWM90_IterateClusters.iterateClusters(logFile, clusterList, refList, linkIndex)
        print('\n>>End of Iteration, Resetting mu and epsilon')
        print('\n>>End of Iteration, Resetting mu and epsilon', file=logFile)
        mu += muIterate
        mu = round(mu, 2)
        DWM10_Parms.mu = mu
        print('>>>New Value of mu = ',mu)
        print('>>>New Value of mu = ',mu, file=logFile)
        epsilon += epsilonIterate
        DWM10_Parms.epsilon = epsilon
        print('>>>New Value of epsilon = ',epsilon)
        print('>>>New Value of epsilon = ',epsilon, file=logFile)
        if mu > 1.0:
            moreToDo = False
            print('Ending because mu > 1.0')
            print('Ending because mu > 1.0', file=logFile)
    # End of iterations
    # Add unclustered references to linkIndex
    for x in refList:
        refID = x[1]
        body = x[2]
        newTuple = (refID, refID)
        linkIndex.append(newTuple)
    # sort linkIndex by cluster IDs
    linkIndex.sort()
    # write out linkFile, but put RefID first and ClusterID second
    periodIndex = DWM10_Parms.inputFileName.rfind('.')
    inputPrefix = DWM10_Parms.inputFileName[0:periodIndex]
    linkFileName = inputPrefix+'-LinkIndex.txt'
    linkFile = open(linkFileName,'w')
    linkFile.write('RefID, ClusterID\n')
    for c in linkIndex:
        linkFile.write(c[1]+','+c[0]+'\n')
    linkFile.close()
    print('Record written to',linkFileName, '=',len(linkIndex))
    print('Record written to',linkFileName, '=',len(linkIndex), file=logFile)
    # Generate Cluster Profile
    profile = DWM97_ClusterProfile.generateProfile(linkIndex)
    print('\nCluster Profile')
    print('\nCluster Profile', file=logFile)
    print('Size\tCount')
    print('Size\tCount', file=logFile)
    total = 0
    for key in sorted(profile.keys()) :
        clusterTotal = key*profile[key]
        total +=clusterTotal
        print(key, '\t', profile[key], '\t', clusterTotal)
        print(key, '\t', profile[key], '\t', clusterTotal, file=logFile)
    print('\tTotal\t', total)
    print('\tTotal\t', total, file=logFile)
    # Generat ER Metrics if truthFileName was given
    if DWM10_Parms.truthFileName != '':
        DWM99_ERmetrics.generateMetrics(logFile, linkIndex, DWM10_Parms.truthFileName)
    print("End of Program")
    print("End of Program", file=logFile)
    now2 = datetime.datetime.now()
    print('\ntime to run ', now2-now1)
logFile.close()
file1.close()

Data Washing Machine Refactor Version 1.6
Enter Parameter File Name->S8-parms.txt
Running parms file # Test of Parameter File



FileNotFoundError: [Errno 2] No such file or directory: '# Test of Parameter File'