Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP

Loading…

Fix rich content extraction method & tests #63

Merged
merged 1 commit

2 participants

@acdha
Collaborator
  • Update test setup instructions with content extraction handler dependencies
  • Enable file upload support to _send_request
  • Added extraction tests

One side note: the change to the otherwise unrelated log_body handling in _send_request makes me think we should stop doing that and simply pass extra data to logging so things which are prepared to format the bodies can do so without _send_request having to do as much string processing.

@acdha acdha Fix rich content extraction method & tests
* Update test setup instructions with content extraction handler
  dependencies
* Enable file upload support to _send_request
* Added simple extract test
e22d7d1
@toastdriven
Owner

Looks great! :shipit: & preferably roll a v3.0.2 release.

log_body was only added to prevent mixing bytestrings & Unicode in the logging messages. I'm good with whatever reduces that pain.

@acdha
Collaborator

I'll defer log_body changes for now – let's discuss logging with other users (#haystack?) to make sure I'm not pushing too far into Sentry-specific behaviour

@acdha acdha merged commit e22d7d1
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Commits on Jan 23, 2013
  1. @acdha

    Fix rich content extraction method & tests

    acdha authored
    * Update test setup instructions with content extraction handler
      dependencies
    * Enable file upload support to _send_request
    * Added simple extract test
This page is out of date. Refresh to see the latest.
Showing with 51 additions and 7 deletions.
  1. +6 −0 README.rst
  2. +7 −4 pysolr.py
  3. +38 −3 tests/client.py
View
6 README.rst
@@ -105,6 +105,9 @@ Setup looks like::
curl -O http://apache.osuosl.org/lucene/solr/4.1.0/solr-4.1.0.tgz
tar xvzf solr-4.1.0.tgz
cp -r solr-4.1.0/example solr4
+ # Used by the content extraction and clustering handlers:
+ mv solr-4.1.0/dist solr4/
+ mv solr-4.1.0/contrib solr4/
rm -rf solr-4.1.0*
cd solr4
rm -rf example-DIH exampledocs
@@ -112,6 +115,9 @@ Setup looks like::
mv multicore solr
cp -r solrsinglecoreanduseless/collection1/conf/* solr/core0/conf/
cp -r solrsinglecoreanduseless/collection1/conf/* solr/core1/conf/
+ # Fix paths for the content extraction handler:
+ perl -p -i -e 's|<lib dir="../../../contrib/|<lib dir="../../contrib/|'g solr/*/conf/solrconfig.xml
+ perl -p -i -e 's|<lib dir="../../../dist/|<lib dir="../../dist/|'g solr/*/conf/solrconfig.xml
# Now run Solr.
java -jar start.jar
View
11 pysolr.py
@@ -237,13 +237,15 @@ def _create_full_url(self, path=''):
# No path? No problem.
return self.url
- def _send_request(self, method, path='', body=None, headers=None):
+ def _send_request(self, method, path='', body=None, headers=None, files=None):
url = self._create_full_url(path)
method = method.lower()
log_body = body
if log_body is None:
log_body = ''
+ elif not isinstance(log_body, str):
+ log_body = repr(body)
self.log.debug("Starting request to '%s' (%s) with body '%s'...",
url, method, log_body[:10])
@@ -268,7 +270,8 @@ def _send_request(self, method, path='', body=None, headers=None):
for k, v in headers.items():
bytes_headers[force_bytes(k)] = force_bytes(v)
- resp = requests_method(url, data=bytes_body, headers=bytes_headers, timeout=self.timeout)
+ resp = requests_method(url, data=bytes_body, headers=bytes_headers, files=files,
+ timeout=self.timeout)
except requests.exceptions.Timeout as err:
error_message = "Connection to server '%s' timed out: %s"
self.log.error(error_message, [url, err], exc_info=True)
@@ -858,8 +861,8 @@ def extract(self, file_obj, extractOnly=True, **kwargs):
try:
# We'll provide the file using its true name as Tika may use that
# as a file type hint:
- resp = self._send_request('post', "update/extract",
- headers=params,
+ resp = self._send_request('post', 'update/extract',
+ body=params,
files={'file': (file_obj.name, file_obj)})
except (IOError, SolrError) as err:
self.log.error("Failed to extract document metadata: %s", err,
View
41 tests/client.py
@@ -2,8 +2,9 @@
from __future__ import unicode_literals
import datetime
-from pysolr import Solr, Results, SolrError, unescape_html, safe_urlencode, \
- force_unicode, force_bytes, sanitize, json, ET, IS_PY3
+
+from pysolr import (Solr, Results, SolrError, unescape_html, safe_urlencode,
+ force_unicode, force_bytes, sanitize, json, ET, IS_PY3)
try:
import unittest2 as unittest
@@ -15,6 +16,11 @@
except ImportError:
from urllib import unquote_plus
+if IS_PY3:
+ from io import StringIO
+else:
+ from StringIO import StringIO
+
class UtilsTestCase(unittest.TestCase):
def test_unescape_html(self):
@@ -404,4 +410,33 @@ def test_optimize(self):
self.assertEqual(len(self.solr.search('doc')), 4)
def test_extract(self):
- self.fail("Dear Chris, Please fix me. Love, pysolr")
+ fake_f = StringIO("""
+ <html>
+ <head>
+ <meta charset="utf-8">
+ <meta name="haystack-test" content="test 1234">
+ <title>Test Title ☃&#x2603;</title>
+ </head>
+ <body>foobar</body>
+ </html>
+ """)
+ fake_f.name = "test.html"
+ extracted = self.solr.extract(fake_f)
+
+ # Verify documented response structure:
+ self.assertIn('contents', extracted)
+ self.assertIn('metadata', extracted)
+
+ self.assertIn('foobar', extracted['contents'])
+
+ m = extracted['metadata']
+
+ self.assertEqual([fake_f.name], m['stream_name'])
+
+ self.assertIn('haystack-test', m, "HTML metadata should have been extracted!")
+ self.assertEqual(['test 1234'], m['haystack-test'])
+
+ # Note the underhanded use of a double snowman to verify both that Tika
+ # correctly decoded entities and that our UTF-8 characters survived the
+ # round-trip:
+ self.assertEqual(['Test Title ☃☃'], m['title'])
Something went wrong with that request. Please try again.