In [1]:
import pandas as pd
from scipy.stats import norm
from numpy import sqrt

# Paths of the input CSV files (passed to pd.read_csv exactly as written).
table_a = 'TableA'
table_b = 'TableB'
candidate_set = 'CandidateSet'
prediction_set = 'PredictionList'

# Fixed seed for the shuffle below, so that a fresh "Restart & Run All"
# reproduces the same row order (and therefore the same labelling sample).
SHUFFLE_SEED = 0

dfa = pd.read_csv(table_a)
dfb = pd.read_csv(table_b)
dfc = pd.read_csv(candidate_set)
dfp = pd.read_csv(prediction_set)

# frac=1 returns every row of dfc in random order; the original row labels are kept.
shuffled_dfc = dfc.sample(frac=1, random_state=SHUFFLE_SEED)
shuffled_dfc.to_csv('shuffled_dfc', index=False)

## Blocking Rules ( generates <em>ReducedCandidateSet.csv</em> )

In [2]:
import re
def _title_tokens(title):
    """Return the set of alphanumeric words contained in ``title``."""
    # findall never yields empty strings, whereas replacing punctuation with a
    # space and splitting on ' ' produces '' for leading/trailing punctuation,
    # which two unrelated titles would then "share".
    return set(re.findall('[0-9a-zA-Z]+', title))


def blocking_rules(threshold=0.10):
    """Apply the blocking rules and write the survivors to ``ReducedCandidateSet.csv``.

    Reads the module-level frames ``shuffled_dfc``, ``dfa`` and ``dfb``. The
    ``A_id`` / ``B_id`` values of each candidate pair are used as positional
    row numbers (``iloc``) into ``dfa`` / ``dfb``. A pair is kept only if:

    1. both ``title`` values are non-empty strings;
    2. both rows have the same ``book_format``;
    3. the fraction ``common title words / all distinct title words`` is
       strictly greater than ``threshold``.

    Titles that contain no alphanumeric word at all cannot be compared and are
    blocked. Kept rows are written in their ``shuffled_dfc`` order; if nothing
    survives, the CSV still contains the header row.

    Args:
        threshold: minimum (exclusive) word-overlap fraction for rule 3.

    Returns:
        None. The result is written to ``ReducedCandidateSet.csv``.
    """
    print('Running all blocking rules...')
    print('This will generate `ReducedCandidateSet.csv` after applying all blocking rules.')

    kept_positions = []
    id_pairs = zip(shuffled_dfc['A_id'], shuffled_dfc['B_id'])
    for position, (idxa, idxb) in enumerate(id_pairs):
        rowa = dfa.iloc[idxa, :]
        rowb = dfb.iloc[idxb, :]
        title_a = rowa['title']
        title_b = rowb['title']

        # Blocking Rule 1: block pairs with a missing `title` value
        # (missing values are not str instances, e.g. NaN/None).
        if not (isinstance(title_a, str) and isinstance(title_b, str)):
            continue
        if not title_a or not title_b:
            continue

        # Blocking Rule 2: block pairs with non-matching `book_format`.
        if rowa['book_format'] != rowb['book_format']:
            continue

        # Blocking Rule 3: fraction of common title words must exceed `threshold`.
        words_a = _title_tokens(title_a)
        words_b = _title_tokens(title_b)
        all_words = words_a | words_b
        if not all_words:
            # Neither title has an alphanumeric word; also avoids dividing by zero.
            continue
        if len(words_a & words_b) / len(all_words) > threshold:
            kept_positions.append(position)

    # Selecting rows from the source frame keeps its columns even when no pair survives.
    updated_cdf = shuffled_dfc.iloc[kept_positions]
    updated_cdf.to_csv('ReducedCandidateSet.csv', index=False)

# Run the blocking pass over the shuffled candidate set; the result is written to ReducedCandidateSet.csv.
blocking_rules()

Running all blocking rules...
This will generate `ReducedCandidateSet.csv` after applying all blocking rules.


In [3]:
# From here on `shuffled_dfc` refers to the reduced candidate set read back from disk.
shuffled_dfc = pd.read_csv('ReducedCandidateSet.csv')
reduced_size = shuffled_dfc.shape[0]
print('Size of Reduced Candidate Set: ', reduced_size)

Size of Reduced Candidate Set:  1520


In [4]:
import warnings
# Silence every warning from this point on (no category or module filter is given).
warnings.filterwarnings('ignore')

# Number of candidate pairs requested from get_sample() in the call further down.
SAMPLE_SIZE = 50

def get_sample(start=0, n=50):
    """Write up to ``n`` candidate pairs, starting at row ``start``, to ``labelled<n>.csv``.

    Reads the module-level frames ``shuffled_dfc``, ``dfa``, ``dfb`` and ``dfp``.
    Every pair becomes two consecutive output rows: the ``dfa`` row first, then
    the ``dfb`` row (``A_id`` / ``B_id`` are used as positional row numbers).
    Each output row carries:

    * ``Up_System`` - the pair's position in ``shuffled_dfc`` followed by
      ``'A'`` or ``'B'`` (this replaces the value loaded from the table);
    * ``predicted`` - True when ``dfp`` holds exactly one row whose
      ``id1`` / ``id2`` equal the two rows' ``_id`` values;
    * ``manual`` - an empty string, to be filled in by hand.

    The source frames are not modified. If the requested slice is empty, a CSV
    containing only the header row is written.

    Args:
        start: position of the first pair in ``shuffled_dfc``.
        n: maximum number of pairs to write; also part of the output file name.

    Returns:
        None.
    """
    print('Running in progres...')
    # Report the size actually requested rather than a fixed "50".
    print('This will return sample of ' + str(n) + ' pairs. You may set value of `SAMPLE_SIZE` to get desired sample size.')

    # Output layout: bookkeeping columns first, then the remaining dfa columns.
    table_cols = [c for c in dfa.columns.tolist() if c not in ('Up_System', '_id')]
    cols = ['_id', 'Up_System', 'predicted', 'manual'] + table_cols

    sample = shuffled_dfc.iloc[start:start + n]
    records = []
    for offset, (idxa, idxb) in enumerate(zip(sample['A_id'], sample['B_id'])):
        pair_no = str(start + offset)
        # to_dict() yields independent copies, so dfa/dfb are never written to.
        rec_a = dfa.iloc[idxa, :].to_dict()
        rec_b = dfb.iloc[idxb, :].to_dict()

        matching = dfp[(dfp['id1'] == rec_a['_id']) & (dfp['id2'] == rec_b['_id'])]
        predicted = len(matching) == 1

        for rec, side in ((rec_a, 'A'), (rec_b, 'B')):
            rec['Up_System'] = pair_no + side
            rec['manual'] = ''
            rec['predicted'] = predicted
            records.append(rec)

    # dtype=object keeps each value as-is, so ints are not turned into floats
    # when a column is missing from one of the two tables.
    Sdf = pd.DataFrame(records, columns=cols, dtype=object)
    Sdf.to_csv('labelled' + str(n) + '.csv', index=False)

    print('Done.')


# Write the first SAMPLE_SIZE candidate pairs; the output file name is 'labelled' + str(SAMPLE_SIZE) + '.csv'.
get_sample(0, SAMPLE_SIZE)

Running in progres...
This will return sample of 50 pairs. You may set value of `SAMPLE_SIZE` to get desired sample size.
Done.


## Run this cell to generate a sample of 400 tuple pairs to label manually in the next step.

In [None]:
# Passing n=400 (instead of SAMPLE_SIZE=50) writes the first 400 candidate pairs to `labelled400.csv` for manual labelling.
get_sample(0, 400)

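## [Run after generating the sample of 400 pairs in the cell above.]
### Manually label the tuple pairs in the <em>manual</em> column (enter 1 on the first row of a pair to mark a match; any other value counts as a non-match). Then run this cell to generate the `labeled_pairs.csv` file.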

In [10]:
# Run this if you want to generate `labeled_pairs.csv` file.
# Input: `labelled400.csv`, where rows 2k and 2k+1 form one pair.
# Output: one row per pair with columns id1, id2, label.
ldf = pd.read_csv('labelled400.csv')
print('Generating `labeled_pairs.csv` file.')

if len(ldf) % 2 != 0:
    # Every pair occupies two consecutive rows; an odd count means a row is missing.
    raise ValueError('labelled400.csv must contain an even number of rows (two per pair).')

# Read the two needed columns directly: taking whole rows with .iloc[i, :]
# can upcast integer ids to float when every column is numeric.
pair_ids = ldf['_id'].tolist()
manual_labels = ldf['manual'].tolist()

# The label is taken from the first row of each pair; only the value 1 counts as a match.
labelled_pairs = {
    pair_no: {
        'id1': pair_ids[row],
        'id2': pair_ids[row + 1],
        'label': manual_labels[row] == 1,
    }
    for pair_no, row in enumerate(range(0, len(ldf), 2))
}
counter = len(labelled_pairs)

# Explicit columns keep the header in the CSV even when there are no pairs.
lpdf = pd.DataFrame(list(labelled_pairs.values()), columns=['id1', 'id2', 'label'])
lpdf.to_csv('labeled_pairs.csv', index=False)
print('Done.')

Generating `labeled_pairs.csv` file.
Done.
