-
Notifications
You must be signed in to change notification settings - Fork 66
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing 48 changed files with 1,520 additions and 0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
#!/usr/bin/env python

import sys
import xapian

### Start of example code.
def delete_docs(dbpath, identifiers):
    """Delete every document matching one of `identifiers` from the database.

    Each identifier corresponds to the unique 'Q'-prefixed boolean term
    that the indexer added, so one identifier removes exactly the
    document(s) carrying that term.
    """
    # The database must already exist: DB_OPEN will not create one.
    db = xapian.WritableDatabase(dbpath, xapian.DB_OPEN)

    for ident in identifiers:
        # Deleting by term removes all documents indexed with it.
        db.delete_document(u'Q' + ident)
### End of example code.

if len(sys.argv) < 3:
    print("Usage: %s DBPATH ID..." % sys.argv[0])
    sys.exit(1)

delete_docs(dbpath=sys.argv[1], identifiers=sys.argv[2:])
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
import re | ||
import BeautifulSoup | ||
import time | ||
import eventlet | ||
from eventlet.green import urllib2 | ||
import csv | ||
import sys | ||
|
||
def pull(title):
    """Pull the infobox fields for `title` from Wikipedia.

    Returns a dict that may contain 'name', 'capital', 'admitted',
    'population', 'latitude', 'longitude', 'motto' and 'description',
    or None when the page cannot be fetched or parsed, or has no infobox.
    """
    url = "http://en.wikipedia.org/w/index.php?action=render&title=%s" % title
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0 (user-agent-restrictions-are-silly)')]
    try:
        html = opener.open(url.encode("utf-8")).read()
    except Exception:
        # Was a bare `except:`, which also swallowed KeyboardInterrupt and
        # SystemExit; narrowed to Exception so the process stays killable.
        print((u" Could not fetch %s" % url).encode('utf-8'))
        return None
    try:
        soup = BeautifulSoup.BeautifulSoup(html)
    except Exception:
        # Same narrowing as above: report the bad page, keep going.
        print((u" Could not parse %s" % url).encode('utf-8'))
        return None
    # Extract information: the infobox is a <table> whose class contains
    # the word "infobox".
    infobox = soup.find("table", { 'class': re.compile(r'\binfobox\b') })
    if not infobox:
        print((u" No infobox found in %s" % url).encode('utf-8'))
        return None

    information = {}
    name = infobox.find("th", { 'class': 'fn org' })
    if name:
        information['name'] = extract_text(name)

    def grab(info, name=None):
        # Find the label matching `info` and store the text of the
        # adjacent <td> under `name` (defaults to the lowercased label).
        if name is None:
            name = info.lower()
        text = infobox.find("text", text=info)
        if text:
            information[name] = extract_text(text.parent.findNext("td"))

    grab("Capital")
    grab("Admission to Union", "admitted")
    pop = infobox.find("text", text="Population")
    if pop:
        # The population row we want is the "... Total" sub-row.
        text = pop.findNext("text", text=re.compile("Total$"))
        if text:
            information['population'] = extract_text(text.parent.findNext("td"))
    grab(re.compile("Latitude$"), "latitude")
    grab(re.compile("Longitude$"), "longitude")
    text = infobox.find("text", text=re.compile("Motto"))
    if text:
        information["motto"] = extract_text(text.findNext("i"))
    information["description"] = extract_text(infobox.findNext("p"))

    return information
|
||
def extract_text(tag):
    """Flatten a BeautifulSoup node into whitespace-normalised plain text."""
    if tag is None:
        return ""
    if isinstance(tag, BeautifulSoup.NavigableString):
        return tag.string
    # Recurse into the children, collapse any run of whitespace to a single
    # space, then tidy the spacing that inline markup leaves around
    # brackets and colons.
    pieces = [extract_text(child) for child in tag.contents]
    flat = " ".join(" ".join(pieces).split())
    for before, after in (("( ", "("), (" )", ")"), (" :", ":")):
        flat = flat.replace(before, after)
    return flat
|
||
|
||
# Column order for the output CSV; matches the keys populated by pull().
columns = [
    'name',
    'capital',
    'admitted',
    'population',
    'latitude',
    'longitude',
    'motto',
    'description',
]
# Fetch up to 10 pages concurrently; pull() blocks on eventlet's green
# urllib2, so the pool overlaps the network waits.
pool = eventlet.GreenPool(size=10)
# One page title per stdin line.
# NOTE(review): readlines() keeps the trailing newline on each title that
# gets interpolated into the URL in pull() — confirm this is intended.
results = pool.imap(
    pull,
    sys.stdin.readlines(),
)
with open("data/states.csv", "w") as fh:
    w = csv.writer(fh, dialect='excel')
    # Header row first, then one row per successfully-scraped page.
    w.writerow(columns)
    for result in results:
        if result is None:
            continue
        # Missing fields become empty strings; values are UTF-8 encoded
        # for the (Python 2) csv writer.
        w.writerow([ result.get(col, u"").encode('utf-8') for col in columns ])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
#!/usr/bin/env python

import json
import sys
import xapian
from support import parse_csv_file

### Start of example code.
def index(datapath, dbpath):
    """Index each row of the CSV at `datapath` into the database at `dbpath`.

    Rows are keyed on their id_NUMBER, so re-running the indexer updates
    existing documents rather than duplicating them.
    """
    # Writable database, created on the first run.
    db = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OPEN)

    # One TermGenerator reused for every document, with English stemming.
    tg = xapian.TermGenerator()
    tg.set_stemmer(xapian.Stem("en"))

    for row in parse_csv_file(datapath):
        # 'row' maps field name to value; pick out what we index.
        title = row.get('TITLE', u'')
        description = row.get('DESCRIPTION', u'')
        ident = row.get('id_NUMBER', u'')

        doc = xapian.Document()
        tg.set_document(doc)

        # Prefixed terms allow field-specific queries...
        tg.index_text(title, 1, 'S')
        tg.index_text(description, 1, 'XD')

        # ...and unprefixed terms allow general free-text search.  The
        # termpos gap keeps phrases from matching across the boundary.
        tg.index_text(title)
        tg.increase_termpos()
        tg.index_text(description)

        # Keep the whole record around for result display.
        doc.set_data(json.dumps(row))

        # Unique id term: replace_document makes indexing idempotent no
        # matter how many times the indexer runs.
        idterm = u"Q" + ident
        doc.add_boolean_term(idterm)
        db.replace_document(idterm, doc)
### End of example code.

if len(sys.argv) != 3:
    print("Usage: %s DATAPATH DBPATH" % sys.argv[0])
    sys.exit(1)

index(datapath=sys.argv[1], dbpath=sys.argv[2])
Empty file.
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
#!/usr/bin/env python

import json
import sys
import xapian
from support import parse_csv_file

### Start of example code.
def index(datapath, dbpath):
    """Index each row of the CSV at `datapath` into the database at `dbpath`.

    Besides the searchable text, each document stores its COLLECTION in
    value slot 0 and its MAKER in value slot 1.
    """
    # Writable database, created on the first run.
    db = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OPEN)

    # One TermGenerator reused for every document, with English stemming.
    tg = xapian.TermGenerator()
    tg.set_stemmer(xapian.Stem("en"))

    for row in parse_csv_file(datapath):
        # 'row' maps field name to value; pick out what we index/store.
        title = row.get('TITLE', u'')
        description = row.get('DESCRIPTION', u'')
        ident = row.get('id_NUMBER', u'')
        collection = row.get('COLLECTION', u'')
        maker = row.get('MAKER', u'')

        doc = xapian.Document()
        tg.set_document(doc)

        # Prefixed terms allow field-specific queries...
        tg.index_text(title, 1, 'S')
        tg.index_text(description, 1, 'XD')

        # ...and unprefixed terms allow general free-text search.  The
        # termpos gap keeps phrases from matching across the boundary.
        tg.index_text(title)
        tg.increase_termpos()
        tg.index_text(description)

        # Value slots: 0 = collection, 1 = maker.
        doc.add_value(0, collection)
        doc.add_value(1, maker)

        # Keep the whole record around for result display.
        doc.set_data(json.dumps(row))

        # Unique id term: replace_document makes indexing idempotent no
        # matter how many times the indexer runs.
        idterm = u"Q" + ident
        doc.add_boolean_term(idterm)
        db.replace_document(idterm, doc)
### End of example code.

if len(sys.argv) != 3:
    print("Usage: %s DATAPATH DBPATH" % sys.argv[0])
    sys.exit(1)

index(datapath=sys.argv[1], dbpath=sys.argv[2])
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
#!/usr/bin/env python

import json
import sys
import xapian
from support import parse_csv_file

def index(datapath, dbpath):
    """Index each row of the CSV at `datapath` into the database at `dbpath`.

    In addition to the free-text fields, each entry in the semicolon-
    separated MATERIALS field becomes an 'XM'-prefixed boolean term so
    results can later be filtered by material.
    """
    # Writable database, created on the first run.
    db = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OPEN)

    # One TermGenerator reused for every document, with English stemming.
    tg = xapian.TermGenerator()
    tg.set_stemmer(xapian.Stem("en"))

    for row in parse_csv_file(datapath):
        # 'row' maps field name to value; pick out what we index.
        title = row.get('TITLE', u'')
        description = row.get('DESCRIPTION', u'')
        ident = row.get('id_NUMBER', u'')

        doc = xapian.Document()
        tg.set_document(doc)

        # Prefixed terms allow field-specific queries...
        tg.index_text(title, 1, 'S')
        tg.index_text(description, 1, 'XD')

        # ...and unprefixed terms allow general free-text search.  The
        # termpos gap keeps phrases from matching across the boundary.
        tg.index_text(title)
        tg.increase_termpos()
        tg.index_text(description)

        ### Start of new indexing code.
        # MATERIALS is a semicolon-separated list; normalise each entry
        # and add it as a boolean filter term.
        raw_materials = row.get('MATERIALS', u'').split(';')
        for mat in (m.strip().lower() for m in raw_materials):
            if mat:
                doc.add_boolean_term('XM' + mat)
        ### End of new indexing code.

        # Keep the whole record around for result display.
        doc.set_data(json.dumps(row))

        # Unique id term: replace_document makes indexing idempotent no
        # matter how many times the indexer runs.
        idterm = u"Q" + ident
        doc.add_boolean_term(idterm)
        db.replace_document(idterm, doc)

if len(sys.argv) != 3:
    print("Usage: %s DATAPATH DBPATH" % sys.argv[0])
    sys.exit(1)

index(datapath=sys.argv[1], dbpath=sys.argv[2])
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
#!/usr/bin/env python

import json
import sys
import xapian
from support import numbers_from_string, parse_csv_file

def index(datapath, dbpath):
    """Index each row of the CSV at `datapath` into the database at `dbpath`.

    Numeric data is stored as sortable values: slot 0 holds the largest
    number found in MEASUREMENTS, slot 1 the first number in DATE_MADE.
    """
    # Writable database, created on the first run.
    db = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OPEN)

    # One TermGenerator reused for every document, with English stemming.
    tg = xapian.TermGenerator()
    tg.set_stemmer(xapian.Stem("en"))

    for row in parse_csv_file(datapath):
        # 'row' maps field name to value; pick out what we index.
        title = row.get('TITLE', u'')
        description = row.get('DESCRIPTION', u'')
        ident = row.get('id_NUMBER', u'')

        doc = xapian.Document()
        tg.set_document(doc)

        # Prefixed terms allow field-specific queries...
        tg.index_text(title, 1, 'S')
        tg.index_text(description, 1, 'XD')

        # ...and unprefixed terms allow general free-text search.  The
        # termpos gap keeps phrases from matching across the boundary.
        tg.index_text(title)
        tg.increase_termpos()
        tg.index_text(description)

        # Keep the whole record around for result display.
        doc.set_data(json.dumps(row))

        ### Start of example code.
        # Slot 0: largest dimension mentioned in MEASUREMENTS, serialised
        # so it sorts numerically.
        measurements = row.get('MEASUREMENTS', u'')
        if measurements:
            nums = numbers_from_string(measurements)
            if nums:
                doc.add_value(0, xapian.sortable_serialise(max(nums)))

        # Slot 1: first number in DATE_MADE, taken as the year.
        years = numbers_from_string(row.get('DATE_MADE', u''))
        if years:
            doc.add_value(1, xapian.sortable_serialise(years[0]))
        ### End of example code.

        # Unique id term: replace_document makes indexing idempotent no
        # matter how many times the indexer runs.
        idterm = u"Q" + ident
        doc.add_boolean_term(idterm)
        db.replace_document(idterm, doc)

if len(sys.argv) != 3:
    print("Usage: %s DATAPATH DBPATH" % sys.argv[0])
    sys.exit(1)

index(datapath=sys.argv[1], dbpath=sys.argv[2])
Empty file.
Oops, something went wrong.