
API cleanup

  o simplified/clarified the doc_reader.py API; HTTP requests are no
    longer made implicitly (fetching a URL is a separate API call now)
  o rolled sim_index.query_by_string() into sim_index.query(), and made
    sim_index._query() the hook for concrete subclasses to override
commit d4728a8109b6ada510e237a0e0e04d4a5ce82ab9 (1 parent: 1f998ea), committed by @taherh on Sep 22, 2011
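Taken together, the query()/query_by_string() merge gives a single entry point that accepts either a string or a query vector. A quick sketch of the new call pattern (illustrative only; sim_index stands for any concrete index instance):

    # before: two separate methods
    #   sim_index.query_by_string('stanford university')
    #   sim_index.query(query_vec)
    # after: one entry point handles both forms
    results = sim_index.query('stanford university')             # string -> term vector
    results = sim_index.query({'stanford': 1, 'university': 1})  # vector passed through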
@@ -35,7 +35,7 @@
import codecs
import io
-from itertools import izip as zip
+from itertools import chain
import re
@@ -46,57 +46,65 @@
# get_text_file() needs an http object
_HTTP = httplib2.Http(str('.cache')) # httplib2 doesn't like unicode arg
-def get_text_file(name):
-    '''Returns a text stream from filename or url'''
-    file = None
+def get_text_file(filename):
+    '''Returns file for filename
+
+    TODO: detect html and parse
+    '''
+    return codecs.open(filename, encoding='utf-8')
+
+def get_url(url):
    http_pattern = '^http://'
-    if re.search(http_pattern, name):
-        (response, content) = _HTTP.request(name)
+    if re.search(http_pattern, url):
+        (response, content) = _HTTP.request(url)
        html_tree = lxml.html.fromstring(content)
        clean_html(html_tree)  # removes crud from html
        clean_html_string = lxml.html.tostring(html_tree,
                                               encoding=unicode,
                                               method='text')
-        file = io.StringIO(clean_html_string)
+        return io.StringIO(clean_html_string)
    else:
-        file = codecs.open(name, encoding='utf-8')
-    return file
-
-def get_text_files(*names):
-    '''Returns iterator of files from filenames and/or urls'''
-    return (get_text_file(name) for name in names)
+        raise Exception("Bad url: {}".format(url))

-def get_named_text_files(*names):
+def get_text_files(filenames=None):
    '''
-    Returns an iterator of (filename, file) tuples from filenames
-    and/or urls (convenience function)
-    '''
-    return zip(names, get_text_files(*names))
-
-def term_vec(file, stoplist = None):
+    Returns an iterator of (name, file) tuples for filenames
+
+    Params:
+        filenames: list of filenames
    '''
-    Returns a term vector for 'file', represented as a dictionary
-    of the form {term: frequency}
+    if filenames is not None:
+        return ((name, get_text_file(name)) for name in filenames)
+
+def get_urls(urls=None):
    '''
-    # default args:
-    if stoplist is None:
-        stoplist = set()
+    Returns an iterator of (name, file) tuples for urls

-    tf_dict = {}
-    for line in file:
-        for term in line.split():
-            if term not in stoplist:
-                if term not in tf_dict: tf_dict[term] = 0
-                tf_dict[term] += 1
-    return tf_dict
+    Params:
+        urls: list of urls to fetch
+    '''
+    if urls is not None:
+        return ((url, get_url(url)) for url in urls)

-def term_vec_from_string(s):
+def term_vec(input, stoplist = None):
    '''
-    Returns term vector for string s, represented as a dictionary of the
-    from {term: frequency}
-
-    (Convenience function - wraps term_vec())
+    Returns a term vector for ``input``, represented as a dictionary
+    of the form {term: frequency}
+
+    ``input`` can be either a string or a file
    '''
-    with io.StringIO(s) as string_buffer:
-        return term_vec(string_buffer)
-
+    if isinstance(input, basestring):
+        with io.StringIO(input) as string_buffer:
+            return term_vec(string_buffer)
+    else:
+        # default args:
+        if stoplist is None:
+            stoplist = set()
+
+        tf_dict = {}
+        for line in input:
+            for term in line.split():
+                if term not in stoplist:
+                    if term not in tf_dict: tf_dict[term] = 0
+                    tf_dict[term] += 1
+        return tf_dict
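After this hunk, reading local files and fetching urls are separate, explicit calls, and term_vec() absorbs the old term_vec_from_string(). A usage sketch (Python 2, matching the code; the pysimsearch package name is assumed from the project layout):

    from pysimsearch import doc_reader

    # local files: no http requests happen implicitly
    for name, file in doc_reader.get_text_files(['doc1.txt', 'doc2.txt']):
        print name, doc_reader.term_vec(file)

    # urls: fetching (and html cleaning) is now an explicit, separate call
    for url, file in doc_reader.get_urls(['http://www.stanford.edu']):
        print url, doc_reader.term_vec(file)

    # term_vec() also accepts a (unicode) string directly
    print doc_reader.term_vec(u'to be or not to be')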
@@ -157,7 +157,7 @@ def postings_list(self, term):
        return self._term_index.get(term, [])

-    def query(self, query_vec):
+    def _query(self, query_vec):
        '''Finds documents similar to query_vec

        Params:
@@ -71,7 +71,7 @@ class RemoteSimIndex(object):
    Instantiate a ``RemoteSimIndex`` as follows:

    >>> remote_index = RemoteSimIndex('http://localhost:9001/RPC2')
-    >>> remote_index.query_by_string('university')
+    >>> remote_index.query('university')
    ...

    '''
@@ -45,7 +45,7 @@
    pprint(list(sim_index.docnames_with_terms('university', 'california')))

    sim_index.set_query_scorer('simple_count')
-    pprint(list(sim_index.query_by_string("stanford university")))
+    pprint(list(sim_index.query("stanford university")))

'''
@@ -113,7 +113,7 @@ def __delitem__(self, key):
    def __iter__(self):
        raise Exception('Unsupported')
-        return iter(self._map)
+        # return iter(self._map)

    def __len__(self):
        return len(self._map)
@@ -121,7 +121,7 @@ def set_query_scorer(self, query_scorer):
    @abc.abstractmethod
    def index_files(self, named_files):
-        '''Adds files given in named_files to the index.
+        '''Add ``named_files`` to the index

        Params:
            named_files: iterable of (filename, file) pairs.
@@ -130,18 +130,27 @@ def index_files(self, named_files):
        return

    def index_filenames(self, *filenames):
-        '''Build a similarity index over files given by filenames
+        '''Add ``filenames`` to the index

        Convenience method that wraps :meth:`index_files()`

        Params:
-            ``*filenames``: list of filenames to add to the index.
+            ``filenames``: list of filenames to add to the index.
        '''
-        return self.index_files(zip(filenames,
-                                    doc_reader.get_text_files(*filenames)))
+        return self.index_files(doc_reader.get_text_files(filenames))
+
+    def index_urls(self, *urls):
+        '''Add ``urls`` to the index
+
+        Convenience method that wraps :meth:`index_files()`
+
+        Params:
+            ``urls``: list of urls of web pages to add to the index.
+        '''
+        return self.index_files(doc_reader.get_urls(urls))

    def index_string_buffers(self, named_string_buffers):
-        '''Adds string buffers to the index.
+        '''Add ``named_string_buffers`` to the index

        Params:
            named_string_buffers: iterable of (name, string) tuples, where
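The new convenience methods mirror the doc_reader split: index_filenames() reads local files where the call is made, while index_urls() goes through the explicit fetch path. A usage sketch (index stands for any concrete index instance):

    index.index_filenames('doc1.txt', 'doc2.txt')  # local files, via get_text_files()
    index.index_urls('http://www.stanford.edu')    # web pages, via get_urls()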
@@ -190,8 +199,21 @@ def docnames_with_terms(self, *terms):
        '''Returns an iterable of docnames containing terms'''
        return (self.docid_to_name(docid) for docid in self.docids_with_terms(terms))

+    def query(self, q):
+        '''Finds documents similar to q.
+
+        Params:
+            query: the query given as either a string or query vector
+        '''
+        if isinstance(q, basestring):
+            if isinstance(q, str):
+                q = unicode(q)
+            return self._query(doc_reader.term_vec(q))
+        else:
+            return self._query(q)
+
    @abc.abstractmethod
-    def query(self, query_vec):
+    def _query(self, query_vec):
        '''Finds documents similar to query_vec

        Params:
@@ -202,14 +224,3 @@ def query(self, query_vec):
        '''
        return

-    def query_by_string(self, query_string):
-        '''Finds documents similar to query_string.
-
-        Convenience method that calls ``self.query()``
-
-        Params:
-            query_string: the query given as a string
-        '''
-        if isinstance(query_string, str):
-            query_string = unicode(query_string)
-        return self.query(doc_reader.term_vec_from_string(query_string))
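query() is now a template method: it normalizes its argument (str to unicode, string to term vector via doc_reader.term_vec()) and delegates to the abstract _query() hook. A hypothetical minimal subclass to illustrate the hook; the base class name SimIndex and the (docid, freq) postings shape are assumptions here, and the other abstract methods are omitted:

    class ToySimIndex(SimIndex):
        # only the _query() hook is overridden; the string/vector
        # normalization in query() is inherited from the base class
        def _query(self, query_vec):
            scores = {}
            for term, q_freq in query_vec.iteritems():
                # postings assumed to hold (docid, freq) pairs
                for (docid, d_freq) in self.postings_list(term):
                    scores[docid] = scores.get(docid, 0) + q_freq * d_freq
            return ((self.docid_to_name(docid), score)
                    for (docid, score) in scores.iteritems())

Both index.query('stanford university') and index.query(term_vec) then funnel into the same _query().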
@@ -44,7 +44,7 @@
                          'http://www.ucla.edu',
                          'http://www.mit.edu')

-    pprint(index_coll.query_by_string('stanford university'))
+    pprint(index_coll.query('stanford university'))

'''
@@ -191,15 +191,7 @@ def index_string_buffers(self, named_string_buffers):
    @update_trigger
    def index_urls(self, *urls):
-        '''Index web pages given by urls
-
-        We expose this as a separate api from ``index_filenames()``, so that
-        backends can fetch and index urls themselves.
-
-        In contrast, ``index_files()`` and ``index_filenames()`` read/collect
-        data centrally, then dispatch fully materialized input data to backends
-        for indexing.
-        '''
+        '''Index web pages given by urls'''
        # minimize rpcs by collecting (name, buffer) tuples for
        # different shards up-front
        sharded_input_map = defaultdict(list)
@@ -209,7 +201,7 @@ def index_urls(self, *urls):
        # issue an indexing call to each sharded backend that has some input
        # TODO: use non-blocking rpc's
        for shard_id in sharded_input_map:
-            self._shards[shard_id].index_filenames(
+            self._shards[shard_id].index_urls(
                *sharded_input_map[shard_id]
            )
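The middle of index_urls() falls outside the diff context, but given the defaultdict(list) setup and the per-shard dispatch above, it presumably assigns each url to a shard before the loop. A sketch under that assumption (shard_id_for_name() is a hypothetical stand-in for the collection's actual routing):

    sharded_input_map = defaultdict(list)
    for url in urls:
        # hypothetical routing, e.g. hash(url) % len(self._shards)
        sharded_input_map[shard_id_for_name(url)].append(url)
    # one rpc per shard; each backend fetches and indexes its own urls
    for shard_id in sharded_input_map:
        self._shards[shard_id].index_urls(*sharded_input_map[shard_id])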
@@ -249,7 +241,7 @@ def set_query_scorer(self, query_scorer):
        for shard in self._shards:
            shard.set_query_scorer(query_scorer)

-    def query(self, query_vec):
+    def _query(self, query_vec):
        '''Issues query to collection and returns merged results

        TODO: use a merge alg. (heapq.merge doesn't have a key= arg yet)
@@ -303,7 +295,7 @@ def merge_df_map(target, source):
            gdocid = self.make_node_docid(shard_id, docid)
            self._name_to_docid_map[name] = gdocid
            self._docid_to_name_map[gdocid] = name
-
+
    def broadcast_node_stats(self):
        # Broadcast global stats. Only called by collection root node.
        for shard in self._shards:
@@ -94,7 +94,7 @@ class SimIndexService(object):
    '''Provide access to sim_index as an RPC service'''

    PREFIX = 'sim_index'

-    EXPORTED_METHODS = {'index_filenames',
+    EXPORTED_METHODS = {'index_urls',
                        'index_string_buffers',
                        'docid_to_name',
                        'name_to_docid',
@@ -103,7 +103,6 @@ class SimIndexService(object):
                        'docnames_with_terms',
                        'set_query_scorer',
                        'query',
-                       'query_by_string',
                        'set_global_N',
                        'get_local_N',
                        'set_global_df_map',
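EXPORTED_METHODS reads as the service's whitelist of callable names: query_by_string drops out and index_urls replaces index_filenames, matching the API changes above. A hypothetical sketch of how such a whitelist is typically consulted at dispatch time (the actual SimIndexService dispatch code is not part of this diff; _sim_index is an assumed attribute):

    def _dispatch(self, method, params):
        # strip the 'sim_index.' prefix, then check the whitelist
        name = method[len(self.PREFIX) + 1:]
        if name not in self.EXPORTED_METHODS:
            raise Exception('method "{}" is not supported'.format(name))
        return getattr(self._sim_index, name)(*params)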