Skip to content

Commit

Permalink
Merge pull request #1 from Chinmay26/qgram_indexer
Browse files Browse the repository at this point in the history
add Qgram indexer
  • Loading branch information
GreatYYX committed Apr 26, 2017
2 parents cb1d2a1 + 0b6364e commit 4281efe
Show file tree
Hide file tree
Showing 14 changed files with 192,494 additions and 3 deletions.
1,574 changes: 1,574 additions & 0 deletions datasets/ima.json

Large diffs are not rendered by default.

2,228 changes: 2,228 additions & 0 deletions datasets/puam.json

Large diffs are not rendered by default.

187,485 changes: 187,485 additions & 0 deletions datasets/ulan.json

Large diffs are not rendered by default.

18 changes: 18 additions & 0 deletions examples/qgram_indexing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import os
import sys

# Make the rltk package importable when this example is run from a source
# checkout: the "rltk" directory two levels above this script's folder is
# appended to the module search path.
script_dir = os.path.dirname(os.path.realpath(__file__))
sys.path.append(os.path.realpath(os.path.join(script_dir, "../../rltk")))

import rltk

tk = rltk.init()

# Selector strings handed to the toolkit for the record id and the indexed value.
ID_PATH = 'uri[*].value'
NAME_PATH = 'name[*].value'
# Value passed (inside a list) as the q / q1 / q2 argument of q_gram_blocking.
QGRAM_SIZE = 3


def open_json_lines(dataset_path):
    """Return a toolkit file iterator for ``dataset_path`` (type 'json_line')."""
    return tk.get_file_iterator(dataset_path, type='json_line', id_path=ID_PATH)


# --- Single-dataset run ------------------------------------------------------
# NOTE(review): dataset and output paths are relative; how they are resolved
# (current directory vs. a toolkit root) is decided inside rltk -- confirm.
print("Running Qgram indexing on GM museum dataset")
ima_records = open_json_lines('../datasets/ima.json')
tk.q_gram_blocking(
    file_iter=ima_records,
    q=[QGRAM_SIZE],
    value_path=[NAME_PATH],
    output_file_path='./ima_qgrams.json',
)

# --- Two-dataset run ---------------------------------------------------------
print("Running Qgram indexing on ULAN and GM museum datasets")
ulan_records = open_json_lines('../datasets/ulan.json')
# A fresh iterator over ima.json is created rather than reusing the one above.
ima_records_again = open_json_lines('../datasets/ima.json')

tk.q_gram_blocking(
    file_iter1=ulan_records,
    q1=[QGRAM_SIZE],
    value_path1=[NAME_PATH],
    file_iter2=ima_records_again,
    q2=[QGRAM_SIZE],
    value_path2=[NAME_PATH],
    output_file_path='./qgram_ulan_gm.json',
)

8 changes: 5 additions & 3 deletions rltk/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from classifier import *
from similarity import utils
from file_iterator import FileIterator
from indexer import *


class Core(object):
Expand Down Expand Up @@ -1131,17 +1132,18 @@ def nysiis_similarity(self, s1, s2):
"""
return nysiis_similarity(s1, s2)

def q_gram_blocking(self, iter1, output_file_path, iter2=None):
def q_gram_blocking(self, output_file_path, **kwargs):
    """
    Q-Gram blocking.

    Resolves ``output_file_path`` through ``self._get_abs_path`` and forwards
    it, together with every other keyword argument, to ``qgram_indexing``.

    Args:
        output_file_path (str): Output file path; passed through
            ``self._get_abs_path`` before being handed to the indexer.
        **kwargs: Keyword arguments forwarded unchanged to ``qgram_indexing``.
            NOTE(review): the accepted keys are defined by ``qgram_indexing``,
            which is not visible here -- confirm against the indexer module.

    Returns:
        Whatever ``qgram_indexing`` returns.
    """
    output_file_path = self._get_abs_path(output_file_path)
    # Re-attach the resolved path so the indexer receives it as a keyword argument.
    kwargs['output_file_path'] = output_file_path
    return qgram_indexing(**kwargs)

def canopy_blocking(self, iter1, output_file_path, iter2=None):
"""
Expand Down
1 change: 1 addition & 0 deletions rltk/indexer/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from qgram import qgram_indexing

0 comments on commit 4281efe

Please sign in to comment.