# Prep 1: steps to install the packages

In [1]:
!pip install py_entitymatching
!pip install scipy
!pip install numpy
!pip install pandas

Collecting py_entitymatching
[?25l  Downloading https://files.pythonhosted.org/packages/ee/d3/2eacdb4ee0e268eb4c041fc2921e880262658b24e15ae470559fb1999eab/py_entitymatching-0.3.1.tar.gz (2.0MB)
[K     |████████████████████████████████| 2.0MB 2.8MB/s 
[?25hCollecting PyPrind (from py_entitymatching)
  Downloading https://files.pythonhosted.org/packages/6c/b3/c3420d9a05e8fd0677907aab873998afd473af41aaf8d3bc557e8f35832c/PyPrind-2.11.2.tar.gz
Collecting py_stringsimjoin==0.3.0 (from py_entitymatching)
[?25l  Downloading https://files.pythonhosted.org/packages/b9/32/28a76b430e092a330850707e34f89419ce77b06dc303e6c5f6cd701ad5ba/py_stringsimjoin-0.3.0.tar.gz (786kB)
[K     |████████████████████████████████| 788kB 38.2MB/s 
Collecting py_stringmatching>=0.2.1 (from py_stringsimjoin==0.3.0->py_entitymatching)
[?25l  Downloading https://files.pythonhosted.org/packages/3e/a3/89c3d02bbf1e24868673702ebd38a3b76259cb124a5d26d46a050d3fccf2/py_stringmatching-0.4.1.tar.gz (646kB)
[K     |█████████

# Prep 2: enter the locations of the files on your hard disk

In [0]:
# Paths of the input files; edit these to point at your own copies.
# NOTE(review): the .xls files are read with pd.read_csv / em.read_csv_metadata
# elsewhere in this notebook, so they are presumably CSV-formatted - confirm.
table_a = "Table A.xls"                    # first input table
table_b = "Table B.xls"                    # second input table
candidate_set = "reducedTuplepairs.csv"    # candidate pairs (columns A_id, B_id)
prediction_set = "Prediction list.xls"     # predicted matches

# Prep 3: read the files into pandas DataFrames

In [0]:
import pandas as pd
# Load the four configured input files, in order: table A, table B,
# the candidate set and the prediction list.
dfa, dfb, dfc, dfp = (
    pd.read_csv(path)
    for path in (table_a, table_b, candidate_set, prediction_set)
)

# Module: debug_blocker

In [0]:
# Example input format:
# Format of table_a:
# _id, attribute1, attribute2, ....., attributen

# Format of table_b:
# _id, attribute1, attribute2, ....., attributen

# Format of candidate_set
# A_id,B_id
# where A_id is _id from table_a and B_id is the _id column value from table_b

In [0]:
import py_entitymatching as em
import pandas as pd

def run_debug_blocker(table_a, table_b, table_a_key, table_b_key, candidate_set):
    """Run em.debug_blocker on a candidate set stored as a CSV file.

    table_a, table_b         - paths of the two tables, read with em.read_csv_metadata
    table_a_key, table_b_key - name of the key column of each table
    candidate_set            - path of a CSV file with the candidate pairs in
                               columns A_id and B_id

    Side effect: writes 'cand_set_with_index.csv' (a relative path, so it lands
    in the current working directory).

    return:
        the result of em.debug_blocker for the candidate set and the two tables
    """
    left_table = em.read_csv_metadata(table_a, key=table_a_key)
    right_table = em.read_csv_metadata(table_b, key=table_b_key)

    # De-duplicate the candidate pairs and write them back out with the row
    # index exported as an 'id' column; that column is the key when re-reading.
    indexed_path = 'cand_set_with_index.csv'
    unique_pairs = pd.read_csv(candidate_set).drop_duplicates()
    unique_pairs.to_csv(indexed_path, index_label='id')

    cand_table = em.read_csv_metadata(
        indexed_path,
        key='id',
        ltable=left_table,
        rtable=right_table,
        fk_ltable='A_id',
        fk_rtable='B_id'
    )

    # Ask the debugger for the records in A x B that are not in the candidate set.
    return em.debug_blocker(cand_table, left_table, right_table)

In [0]:
# Both tables use '_id' as their key column.
debug_file = run_debug_blocker(
    table_a=table_a,
    table_b=table_b,
    table_a_key='_id',
    table_b_key='_id',
    candidate_set=candidate_set,
)

In [0]:
# Show the result returned by run_debug_blocker as this cell's output.
debug_file

# Module: estimate_precision_recall

In [0]:
import pandas as pd
from scipy.stats import norm
from numpy import sqrt

# Significance level of the reported intervals.
delta = .05
# Two-sided standard-normal critical value for that level.
Z = norm.ppf(1 - (delta / 2))

def estimate_PR(labeled_pairs, reduced_cands, predicted_matches):
    '''
    Estimate interval bounds for recall and precision from a labeled sample.

    labeled_pairs - a pandas dataframe with schema id1,id2,label
                    (columns are taken by position; exactly three are required)
                    Note label needs to be Boolean

    reduced_cands - a pandas dataframe with schema id1,id2
                    (columns are taken by position; exactly two are required)
    predicted_matches - a pandas dataframe with columns named id1 and id2

    None of the three dataframes is modified: duplicates are dropped and
    columns are renamed on internal copies only.

    return:
        ( (recall lower bound, recall upper bound), (precision lower bound, precision upper bound) )

    raises:
        ValueError - labeled_pairs does not have three columns or
                     reduced_cands does not have two columns
        ZeroDivisionError - the labeled sample has no positive label, no labeled
                     pair is among the predicted matches, predicted_matches has
                     fewer than two distinct pairs, or the maximum number of
                     positives in the candidate set is exactly one
    '''

    # drop_duplicates() returns a new frame, so renaming its columns leaves the
    # caller's dataframe (rows and column names) untouched.
    labeled = labeled_pairs.drop_duplicates()
    labeled.columns = ['id1', 'id2', 'label']

    # Read the candidate pairs by position instead of renaming the caller's columns.
    if reduced_cands.shape[1] != 2:
        raise ValueError('reduced_cands must have exactly 2 columns (id1, id2), got %d'
                         % reduced_cands.shape[1])
    reduced_cand_set = set(zip(reduced_cands.iloc[:, 0], reduced_cands.iloc[:, 1]))
    predicted_set = set(zip(predicted_matches.id1, predicted_matches.id2))

    labeled_keys = list(zip(labeled.id1, labeled.id2))
    labeled_set = set(labeled_keys)

    # estimate the recall
    # number of positives in the labeled sample
    actual_pos = float(labeled.label.sum())
    # the maximum number of positives in the candidate set: every unlabeled
    # candidate pair is counted as a potential positive
    max_actual_pos = float(actual_pos + len(reduced_cand_set) - len(labeled))

    # true positives in the labeled sample: labeled positive AND predicted as a match
    is_predicted = pd.Series([key in predicted_set for key in labeled_keys],
                             index=labeled.index, dtype=bool)
    true_pos = float(labeled.label[is_predicted].sum())
    # estimated recall
    recall = float(true_pos / actual_pos)

    # normal-approximation half-width, scaled by a finite-population correction
    recall_error = Z * sqrt( ((recall * (1 - recall)) / (actual_pos)) * ((max_actual_pos - actual_pos) / (max_actual_pos - 1)) )


    # estimate Precision
    # labeled pairs that were predicted as matches
    predicted_pos = float(len(labeled_set & predicted_set))

    predicted_pos_in_reduced_cand_set = float(len(reduced_cand_set & predicted_set))

    # fraction of the predicted matches that lie inside the candidate set
    alpha = predicted_pos_in_reduced_cand_set / len(predicted_set)
    precision = alpha * (true_pos / predicted_pos)

    # NOTE(review): the variance term below uses the alpha-scaled precision rather
    # than the sample proportion true_pos / predicted_pos - confirm this is intended.
    precision_error = alpha * Z * sqrt( ((precision * (1 - precision)) / predicted_pos) * (float((len(predicted_set) - predicted_pos)) / (len(predicted_set) - 1)) )

    return ((recall - recall_error, recall + recall_error),
            (precision - precision_error, precision + precision_error))

# Estimating Precision and Recall

In [7]:
# Load the labeled sample of pairs, i.e. the file that carries the labels.
labeled_pairs = pd.read_csv('labelPairs_400.csv')
# Print ((recall lower, recall upper), (precision lower, precision upper)).
recall_bounds, precision_bounds = estimate_PR(labeled_pairs, dfc, dfp)
print((recall_bounds, precision_bounds))

((0.970093033882644, 0.9846239472494315), (0.9337969948257316, 0.9539917840521561))
