Skip to content

Commit

Permalink
add docstring for blocking classes
Browse files Browse the repository at this point in the history
  • Loading branch information
GreatYYX committed Sep 4, 2018
1 parent 6242772 commit e5108da
Show file tree
Hide file tree
Showing 7 changed files with 98 additions and 4 deletions.
27 changes: 27 additions & 0 deletions docs/mod_blocking.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
Blocking
========

.. automodule:: rltk.blocking.block_generator
:members:
:special-members:
:exclude-members: __dict__, __weakref__, __init__

.. automodule:: rltk.blocking.custom_block_generator
:members:
:special-members:
:exclude-members: __dict__, __weakref__, __init__

.. automodule:: rltk.blocking.inverted_index_block_generator
:members:
:special-members:
:exclude-members: __dict__, __weakref__, __init__

.. automodule:: rltk.blocking.canopy_block_generator
:members:
:special-members:
:exclude-members: __dict__, __weakref__, __init__

.. automodule:: rltk.blocking.blocking_helper
:members:
:special-members:
:exclude-members: __dict__, __weakref__, __init__
4 changes: 2 additions & 2 deletions docs/modules.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@ API Reference
:maxdepth: 4

mod_dataset.rst
mod_parallel_processor.rst
mod_record.rst
mod_similarity.rst
mod_evaluation.rst
mod_blocking.rst
mod_parallel_processor.rst
.. mod_io.rst
.. mod_blocking.rst
.. mod_tokenizer.rst
20 changes: 20 additions & 0 deletions rltk/blocking/block_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,30 @@


class BlockGenerator(object):
"""
Base class for all block generators.
Args:
dataset1 (Dataset): Dataset 1.
dataset2 (Dataset): Dataset 2.
writer (BlockWriter): Block writer.
**kwargs: Keyword arguments used by the concrete class.
"""

def __init__(self, dataset1: Dataset, dataset2: Dataset, writer: BlockWriter, **kwargs):
    """Store the two datasets, the block writer and any extra options."""
    # Extra keyword arguments are kept untouched so concrete classes can read them.
    self._kwargs = kwargs
    self._dataset1, self._dataset2 = dataset1, dataset2
    self._writer = writer

def generate(self):
"""
Generate blocks (:meth:`_generate_blocks`) and return handler.
Returns:
obj: Writer handler, which can be used by corresponding Reader.
"""

self._generate_blocks()

Expand All @@ -18,4 +35,7 @@ def generate(self):
return handler

def _generate_blocks(self):
    """
    Produce the blocks. Concrete classes must override this method.

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    raise NotImplementedError
16 changes: 15 additions & 1 deletion rltk/blocking/blocking_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,25 @@


class BlockingHelper(object):
def __init__(self, reader1 : BlockReader, reader2 : BlockReader):
"""
It provides some useful blocking helper methods.
Args:
reader1 (BlockReader): BlockReader 1.
reader2 (BlockReader): BlockReader 2.
"""

def __init__(self, reader1: BlockReader, reader2: BlockReader):
    """Keep references to the two block readers used by the helper methods."""
    self._reader1, self._reader2 = reader1, reader2

def union(self, writer: BlockWriter):
"""
Union two blocks.
Args:
writer (BlockWriter): Block writer.
"""

for id1, id2 in self._reader1:
writer.write(id1, id2)
Expand Down
1 change: 1 addition & 0 deletions rltk/blocking/canopy_block_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ def _run_canopy_clustering(dataset, t1, t2, distance_metric):
if distance < t2:
delete_list.append(d_idx)

# delete vector from dataset from backward
for d_idx in sorted(delete_list, reverse=True):
del dataset[d_idx]
new_canopy.append(center_vec) # add center
Expand Down
7 changes: 7 additions & 0 deletions rltk/blocking/custom_block_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,13 @@


class CustomBlockGenerator(BlockGenerator):
"""
Generate blocks based on custom function.
Args:
custom_function (Callable): Custom function which decides if two records belong to one block.
The signature is `custom_function(r1: Record1, r2: Record2) -> bool`.
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
if 'custom_function' not in self._kwargs:
Expand Down
27 changes: 26 additions & 1 deletion rltk/blocking/inverted_index_block_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,36 @@


class InvertedIndexBlockGenerator(BlockGenerator):
"""
Generic inverted index based block generator.
Args:
tokenizer (Callable): The tokenizer for records from both dataset 1 and dataset 2.
The signature is `tokenizer(r: Record) -> List[str]`.
tokenizer1 (Callable): The tokenizer for records from dataset 1.
tokenizer2 (Callable): The tokenizer for records from dataset 2.
buffer_size (int, optional): The maximum size of the in-memory buffer. Defaults to 10,000.
token_size (int, optional): The maximum number of documents in which a token may be used. Defaults to 1,000.
temp_dir_path (str, optional): Where temporary data is stored when the in-memory buffer is full.
Defaults to the system's temporary directory (:py:func:`tempfile.gettempdir`).
Note:
Either `tokenizer` or both `tokenizer1` and `tokenizer2` should be given.
Note:
If you use QGram as tokenizer, this becomes QGram block generator.
Note:
If tokens are pre-calculated, they can be returned directly in tokenizer function (e.g., `return r.name_tokens`).
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._tokenizer = self._kwargs.get('tokenizer')
self._tokenizer = self._kwargs.get('tokenizer', None)
self._tokenizer1 = self._kwargs.get('tokenizer1', self._tokenizer)
self._tokenizer2 = self._kwargs.get('tokenizer2', self._tokenizer)
if not self._tokenizer1 or not self._tokenizer2:
raise ValueError('Tokenizer is not properly set')
self._buffer_size = self._kwargs.get('buffer_size') or 10000
self._token_size = self._kwargs.get('token_size') or 1000
self._temp_dir_path = self._kwargs.get('temp_dir_path', tempfile.gettempdir())
Expand Down

0 comments on commit e5108da

Please sign in to comment.