Skip to content

Commit

Permalink
support creating blocks based on previous blocks
Browse files Browse the repository at this point in the history
  • Loading branch information
GreatYYX committed Apr 8, 2019
1 parent bf6c61f commit 8775230
Show file tree
Hide file tree
Showing 5 changed files with 104 additions and 35 deletions.
13 changes: 13 additions & 0 deletions examples/blocking/generate_blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,3 +57,16 @@ def last_name(self):
pairs = rltk.get_record_pairs(ds1, ds2, block=block2)
for r1, r2 in pairs:
print(r1.id, r1.first_name, '\t', r2.id, r2.first_name)


print('--- block on first_name[:3] based on previous blocks ---')
bg3 = rltk.HashBlockGenerator()
block3 = bg3.generate(
bg3.block(ds1, function_=lambda r: r.first_name[:3], base_on=block2),
bg3.block(ds2, function_=lambda r: r.first_name[:3], base_on=block2))
pairs = rltk.get_record_pairs(ds1, ds2, block=block3)
for r1, r2 in pairs:
print(r1.id, r1.first_name, '\t', r2.id, r2.first_name)
print('inside blocks:')
for b, d, r in block3:
print(b, d ,r)
3 changes: 2 additions & 1 deletion rltk/blocking/block_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ class BlockGenerator(object):
"""

def block(self, dataset: 'Dataset', function_: Callable = None, property_: str = None,
block: Block = None, block_black_list: BlockBlackList = None):
block: Block = None, block_black_list: BlockBlackList = None, base_on: Block = None):
"""
Block on property or by function for dataset.
Expand All @@ -22,6 +22,7 @@ def block(self, dataset: 'Dataset', function_: Callable = None, property_: str =
property_ (str): The property in Record object.
block (Block): Where to write blocks. If None, a new block will be created. Defaults to None.
block_black_list (BlockBlackList, optional): Where all blacklisted blocks are stored. Defaults to None.
base_on (Block, optional): The current block is generated based on this block. Defaults to None.
Returns:
Block:
Expand Down
40 changes: 29 additions & 11 deletions rltk/blocking/canopy_block_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,21 +28,39 @@ def __init__(self, t1, t2, distance_metric):
self._distance_metric = distance_metric

def block(self, dataset, function_: Callable = None, property_: str = None,
block: Block = None, block_black_list: BlockBlackList = None):
block: Block = None, block_black_list: BlockBlackList = None, base_on: Block = None):
"""
The return of `property_` or `function_` should be a vector (list).
"""
block = super()._block_args_check(function_, property_, block)
for r in dataset:
value = function_(r) if function_ else getattr(r, property_)
k = self._encode_key(value)
if block_black_list and block_black_list.has(k):
continue
if not isinstance(value, list):
raise ValueError('Return of the function or property should be a vector (list)')
block.add(k, dataset.id, r.id)
if block_black_list:
block_black_list.add(k, block)

if base_on:
for block_id, dataset_id, record_id in base_on:
if dataset.id == dataset_id:
r = dataset.get_record(record_id)
value = function_(r) if function_ else getattr(r, property_)
if not isinstance(value, list):
raise ValueError('Return of the function or property should be a vector (list)')
value = block_id + value
k = self._encode_key(value)
if block_black_list and block_black_list.has(k):
continue
block.add(k, dataset.id, r.id)
if block_black_list:
block_black_list.add(k, block)

else:
for r in dataset:
value = function_(r) if function_ else getattr(r, property_)
if not isinstance(value, list):
raise ValueError('Return of the function or property should be a vector (list)')
k = self._encode_key(value)
if block_black_list and block_black_list.has(k):
continue
block.add(k, dataset.id, r.id)
if block_black_list:
block_black_list.add(k, block)

return block

@staticmethod
Expand Down
37 changes: 27 additions & 10 deletions rltk/blocking/hash_block_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,20 +13,37 @@ class HashBlockGenerator(BlockGenerator):
"""

def block(self, dataset, function_: Callable = None, property_: str = None,
block: Block = None, block_black_list: BlockBlackList = None):
block: Block = None, block_black_list: BlockBlackList = None, base_on: Block = None):
"""
The return of `property_` or `function_` should be string.
"""
block = super()._block_args_check(function_, property_, block)
for r in dataset:
value = function_(r) if function_ else getattr(r, property_)
if block_black_list and block_black_list.has(value):
continue
if not isinstance(value, str):
raise ValueError('Return of the function or property should be a string')
block.add(value, dataset.id, r.id)
if block_black_list:
block_black_list.add(value, block)

if base_on:
for block_id, dataset_id, record_id in base_on:
if dataset.id == dataset_id:
r = dataset.get_record(record_id)
value = function_(r) if function_ else getattr(r, property_)
if not isinstance(value, str):
raise ValueError('Return of the function or property should be a string')
value = block_id + '-' + value
if block_black_list and block_black_list.has(value):
continue
block.add(value, dataset.id, r.id)
if block_black_list:
block_black_list.add(value, block)

else:
for r in dataset:
value = function_(r) if function_ else getattr(r, property_)
if not isinstance(value, str):
raise ValueError('Return of the function or property should be a string')
if block_black_list and block_black_list.has(value):
continue
block.add(value, dataset.id, r.id)
if block_black_list:
block_black_list.add(value, block)

return block

def generate(self, block1: Block, block2: Block, output_block: Block = None):
Expand Down
46 changes: 33 additions & 13 deletions rltk/blocking/token_block_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,23 +13,43 @@ class TokenBlockGenerator(BlockGenerator):
"""

def block(self, dataset, function_: Callable = None, property_: str = None,
block: Block = None, block_black_list: BlockBlackList = None):
block: Block = None, block_black_list: BlockBlackList = None, base_on: Block = None):
"""
The return of `property_` or `function_` should be list or set.
"""
block = super()._block_args_check(function_, property_, block)
for r in dataset:
value = function_(r) if function_ else getattr(r, property_)
if not isinstance(value, list) and not isinstance(value, set):
raise ValueError('Return of the function or property should be a list')
for v in value:
if block_black_list and block_black_list.has(v):
continue
if not isinstance(v, str):
raise ValueError('Elements in return list should be string')
block.add(v, dataset.id, r.id)
if block_black_list:
block_black_list.add(v, block)

if base_on:
for block_id, dataset_id, record_id in base_on:
if dataset.id == dataset_id:
r = dataset.get_record(record_id)
value = function_(r) if function_ else getattr(r, property_)
if not isinstance(value, list) and not isinstance(value, set):
raise ValueError('Return of the function or property should be a list')
for v in value:
if not isinstance(v, str):
raise ValueError('Elements in return list should be string')
if block_black_list and block_black_list.has(v):
continue
v = block_id + v
block.add(v, dataset.id, r.id)
if block_black_list:
block_black_list.add(v, block)

else:
for r in dataset:
value = function_(r) if function_ else getattr(r, property_)
if not isinstance(value, list) and not isinstance(value, set):
raise ValueError('Return of the function or property should be a list')
for v in value:
if not isinstance(v, str):
raise ValueError('Elements in return list should be string')
if block_black_list and block_black_list.has(v):
continue
block.add(v, dataset.id, r.id)
if block_black_list:
block_black_list.add(v, block)

return block

def generate(self, block1: Block, block2: Block, output_block: Block = None):
Expand Down

0 comments on commit 8775230

Please sign in to comment.