Skip to content

Commit

Permalink
Add Python3 example code
Browse files Browse the repository at this point in the history
  • Loading branch information
ojwb committed Nov 1, 2016
1 parent bddb4df commit b30ba49
Show file tree
Hide file tree
Showing 48 changed files with 1,520 additions and 0 deletions.
20 changes: 20 additions & 0 deletions code/python3/delete1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#!/usr/bin/env python

import sys
import xapian

### Start of example code.
def delete_docs(dbpath, identifiers):
    """Remove documents from the Xapian database at dbpath.

    dbpath -- path to an existing writable Xapian database
    identifiers -- iterable of unique document identifiers (without
                   the 'Q' prefix they were indexed under)
    """
    # Open the existing database for writing (DB_OPEN fails rather than
    # creating a new database if dbpath doesn't exist).
    database = xapian.WritableDatabase(dbpath, xapian.DB_OPEN)

    # Each document was indexed with a unique 'Q'-prefixed boolean term,
    # so deleting by that term removes exactly that document.
    for ident in identifiers:
        database.delete_document(u'Q' + ident)
### End of example code.

# Command-line interface: requires a database path plus at least one
# document identifier to delete.
if len(sys.argv) < 3:
    print("Usage: %s DBPATH ID..." % sys.argv[0])
    sys.exit(1)

delete_docs(dbpath = sys.argv[1], identifiers=sys.argv[2:])
Empty file added code/python3/delete1.py.out
Empty file.
88 changes: 88 additions & 0 deletions code/python3/from_wikipedia.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
import re
import BeautifulSoup
import time
import eventlet
from eventlet.green import urllib2
import csv
import sys

def pull(title):
    """Fetch the Wikipedia article *title* and scrape its infobox.

    Returns a dict which may contain the keys: name, capital, admitted,
    population, latitude, longitude, motto, description.  Returns None
    if the page can't be fetched or parsed, or contains no infobox.

    NOTE(review): this module uses Python 2-only APIs (urllib2 via
    eventlet.green, BeautifulSoup 3) despite living in the python3
    examples — presumably copied verbatim; confirm the intended
    interpreter before relying on it.
    """
    url = "http://en.wikipedia.org/w/index.php?action=render&title=%s" % title
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0 (user-agent-restrictions-are-silly)')]
    try:
        html = opener.open(url.encode("utf-8")).read()
    # Narrowed from a bare "except:" so KeyboardInterrupt/SystemExit
    # aren't swallowed while still treating any fetch failure as a skip.
    except Exception:
        print((u" Could not fetch %s" % url).encode('utf-8'))
        return None
    try:
        soup = BeautifulSoup.BeautifulSoup(html)
    # Narrowed from a bare "except:" for the same reason as above.
    except Exception:
        print((u" Could not parse %s" % url).encode('utf-8'))
        return None
    # Extract information
    infobox = soup.find("table", { 'class': re.compile(r'\binfobox\b') })
    if not infobox:
        print((u" No infobox found in %s" % url).encode('utf-8'))
        return None

    information = {}
    name = infobox.find("th", { 'class': 'fn org' })
    if name:
        information['name'] = extract_text(name)

    def grab(info, name=None):
        # Find a label node matching *info* inside the infobox and store
        # the text of the following <td> under key *name* (defaults to the
        # lowercased label — so regex labels must pass an explicit name).
        if name is None:
            name = info.lower()
        text = infobox.find("text", text=info)
        if text:
            information[name] = extract_text(text.parent.findNext("td"))

    grab("Capital")
    grab("Admission to Union", "admitted")
    # Population is nested: find the "Population" label, then the
    # "...Total" row after it.
    pop = infobox.find("text", text="Population")
    if pop:
        text = pop.findNext("text", text=re.compile("Total$"))
        if text:
            information['population'] = extract_text(text.parent.findNext("td"))
    grab(re.compile("Latitude$"), "latitude")
    grab(re.compile("Longitude$"), "longitude")
    text = infobox.find("text", text=re.compile("Motto"))
    if text:
        information["motto"] = extract_text(text.findNext("i"))
    # First paragraph after the infobox serves as the description.
    information["description"] = extract_text(infobox.findNext("p"))

    return information

def extract_text(tag):
    """Recursively flatten a BeautifulSoup node into normalized text.

    Collapses runs of whitespace to single spaces and tidies spacing
    around parentheses and colons.  A None tag yields "".
    """
    if tag is None:
        return ""
    # Leaf text node: return its string content directly.
    if isinstance(tag, BeautifulSoup.NavigableString):
        return tag.string
    # Element node: join all children's text, then normalize whitespace
    # and punctuation spacing.
    joined = " ".join(extract_text(child) for child in tag.contents)
    collapsed = " ".join(joined.split())
    return collapsed.replace("( ", "(").replace(" )", ")").replace(" :", ":")


# CSV columns to emit, in output order.
columns = [
    'name',
    'capital',
    'admitted',
    'population',
    'latitude',
    'longitude',
    'motto',
    'description',
]
# Fetch up to 10 pages concurrently with eventlet green threads.
pool = eventlet.GreenPool(size=10)
# One article title per stdin line.
# NOTE(review): readlines() keeps the trailing newline on each title —
# confirm pull() tolerates that in the URL it builds.
results = pool.imap(
    pull,
    sys.stdin.readlines(),
)
with open("data/states.csv", "w") as fh:
    w = csv.writer(fh, dialect='excel')
    w.writerow(columns)
    # imap preserves input order; skip articles that had no infobox.
    for result in results:
        if result is None:
            continue
        # NOTE(review): encode() yields bytes — correct under Python 2,
        # but under Python 3 csv would write their repr; verify the
        # target interpreter.
        w.writerow([ result.get(col, u"").encode('utf-8') for col in columns ])
52 changes: 52 additions & 0 deletions code/python3/index1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#!/usr/bin/env python

import json
import sys
import xapian
from support import parse_csv_file

### Start of example code.
def index(datapath, dbpath):
    """Index the CSV records at datapath into a Xapian database at dbpath.

    Creates the database if it doesn't exist.  Re-running with the same
    data is idempotent thanks to the unique 'Q'-prefixed ID terms.
    """
    # Create or open the target database for writing.
    db = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OPEN)

    # One TermGenerator, reused for every document, with English stemming.
    tg = xapian.TermGenerator()
    tg.set_stemmer(xapian.Stem("en"))

    for record in parse_csv_file(datapath):
        # 'record' maps field name to value; pick out what we index.
        title = record.get('TITLE', u'')
        description = record.get('DESCRIPTION', u'')
        identifier = record.get('id_NUMBER', u'')

        # Fresh document, and point the term generator at it.
        doc = xapian.Document()
        tg.set_document(doc)

        # Prefixed terms support field-specific searches.
        tg.index_text(title, 1, 'S')
        tg.index_text(description, 1, 'XD')

        # Unprefixed terms support general search; bump the term
        # position between fields so phrases can't span them.
        tg.index_text(title)
        tg.increase_termpos()
        tg.index_text(description)

        # Keep the whole record around for displaying results.
        doc.set_data(json.dumps(record))

        # The unique ID term means each object ends up in the database
        # only once however many times we run the indexer.
        idterm = u"Q" + identifier
        doc.add_boolean_term(idterm)
        db.replace_document(idterm, doc)
### End of example code.

# Command-line interface: requires exactly a data path and a database path.
if len(sys.argv) != 3:
    print("Usage: %s DATAPATH DBPATH" % sys.argv[0])
    sys.exit(1)

index(datapath = sys.argv[1], dbpath = sys.argv[2])
Empty file.
Empty file.
60 changes: 60 additions & 0 deletions code/python3/index_facets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#!/usr/bin/env python

import json
import sys
import xapian
from support import parse_csv_file

### Start of example code.
def index(datapath, dbpath):
    """Index CSV records into a Xapian database, storing facet values.

    In addition to the text indexing, each document gets its COLLECTION
    in value slot 0 and its MAKER in value slot 1 for faceting.
    """
    # Create or open the target database for writing.
    db = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OPEN)

    # One TermGenerator, reused for every document, with English stemming.
    tg = xapian.TermGenerator()
    tg.set_stemmer(xapian.Stem("en"))

    for record in parse_csv_file(datapath):
        # 'record' maps field name to value; pick out what we use.
        title = record.get('TITLE', u'')
        description = record.get('DESCRIPTION', u'')
        identifier = record.get('id_NUMBER', u'')
        collection = record.get('COLLECTION', u'')
        maker = record.get('MAKER', u'')

        # Fresh document, and point the term generator at it.
        doc = xapian.Document()
        tg.set_document(doc)

        # Prefixed terms support field-specific searches.
        tg.index_text(title, 1, 'S')
        tg.index_text(description, 1, 'XD')

        # Unprefixed terms support general search; bump the term
        # position between fields so phrases can't span them.
        tg.index_text(title)
        tg.increase_termpos()
        tg.index_text(description)

        # Facet values: collection in slot 0, maker in slot 1.
        doc.add_value(0, collection)
        doc.add_value(1, maker)

        # Keep the whole record around for displaying results.
        doc.set_data(json.dumps(record))

        # The unique ID term means each object ends up in the database
        # only once however many times we run the indexer.
        idterm = u"Q" + identifier
        doc.add_boolean_term(idterm)
        db.replace_document(idterm, doc)

# Command-line interface: requires exactly a data path and a database path.
if len(sys.argv) != 3:
    print("Usage: %s DATAPATH DBPATH" % sys.argv[0])
    sys.exit(1)

index(datapath = sys.argv[1], dbpath = sys.argv[2])
Empty file.
58 changes: 58 additions & 0 deletions code/python3/index_filters.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#!/usr/bin/env python

import json
import sys
import xapian
from support import parse_csv_file

def index(datapath, dbpath):
    """Index CSV records into a Xapian database, adding material filters.

    Besides the text indexing, each semicolon-separated entry in the
    MATERIALS field becomes an 'XM'-prefixed boolean term for filtering.
    """
    # Create or open the target database for writing.
    db = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OPEN)

    # One TermGenerator, reused for every document, with English stemming.
    tg = xapian.TermGenerator()
    tg.set_stemmer(xapian.Stem("en"))

    for record in parse_csv_file(datapath):
        # 'record' maps field name to value; pick out what we index.
        title = record.get('TITLE', u'')
        description = record.get('DESCRIPTION', u'')
        identifier = record.get('id_NUMBER', u'')

        # Fresh document, and point the term generator at it.
        doc = xapian.Document()
        tg.set_document(doc)

        # Prefixed terms support field-specific searches.
        tg.index_text(title, 1, 'S')
        tg.index_text(description, 1, 'XD')

        # Unprefixed terms support general search; bump the term
        # position between fields so phrases can't span them.
        tg.index_text(title)
        tg.increase_termpos()
        tg.index_text(description)

        ### Start of new indexing code.
        # Each normalized (stripped, lowercased) material becomes a
        # boolean filter term with the 'XM' prefix; empty entries skipped.
        for raw in record.get('MATERIALS', u'').split(';'):
            material = raw.strip().lower()
            if material:
                doc.add_boolean_term('XM' + material)
        ### End of new indexing code.

        # Keep the whole record around for displaying results.
        doc.set_data(json.dumps(record))

        # The unique ID term means each object ends up in the database
        # only once however many times we run the indexer.
        idterm = u"Q" + identifier
        doc.add_boolean_term(idterm)
        db.replace_document(idterm, doc)

# Command-line interface: requires exactly a data path and a database path.
if len(sys.argv) != 3:
    print("Usage: %s DATAPATH DBPATH" % sys.argv[0])
    sys.exit(1)

index(datapath = sys.argv[1], dbpath = sys.argv[2])
Empty file.
64 changes: 64 additions & 0 deletions code/python3/index_ranges.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
#!/usr/bin/env python

import json
import sys
import xapian
from support import numbers_from_string, parse_csv_file

def index(datapath, dbpath):
    """Index CSV records into a Xapian database with sortable range values.

    Besides the text indexing, the largest number found in MEASUREMENTS
    goes into value slot 0 and the first number found in DATE_MADE into
    value slot 1, both serialised for range searching.
    """
    # Create or open the target database for writing.
    db = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OPEN)

    # One TermGenerator, reused for every document, with English stemming.
    tg = xapian.TermGenerator()
    tg.set_stemmer(xapian.Stem("en"))

    for record in parse_csv_file(datapath):
        # 'record' maps field name to value; pick out what we index.
        title = record.get('TITLE', u'')
        description = record.get('DESCRIPTION', u'')
        identifier = record.get('id_NUMBER', u'')

        # Fresh document, and point the term generator at it.
        doc = xapian.Document()
        tg.set_document(doc)

        # Prefixed terms support field-specific searches.
        tg.index_text(title, 1, 'S')
        tg.index_text(description, 1, 'XD')

        # Unprefixed terms support general search; bump the term
        # position between fields so phrases can't span them.
        tg.index_text(title)
        tg.increase_termpos()
        tg.index_text(description)

        # Keep the whole record around for displaying results.
        doc.set_data(json.dumps(record))

        ### Start of example code.
        # parse the two values we need
        measurements = record.get('MEASUREMENTS', u'')
        if measurements:
            numbers = numbers_from_string(measurements)
            if numbers:
                # Largest dimension, serialised so values sort numerically.
                doc.add_value(0, xapian.sortable_serialise(max(numbers)))

        date_made = record.get('DATE_MADE', u'')
        years = numbers_from_string(date_made)
        if years:
            # First number is taken as the year of manufacture.
            doc.add_value(1, xapian.sortable_serialise(years[0]))
        ### End of example code.

        # The unique ID term means each object ends up in the database
        # only once however many times we run the indexer.
        idterm = u"Q" + identifier
        doc.add_boolean_term(idterm)
        db.replace_document(idterm, doc)

# Command-line interface: requires exactly a data path and a database path.
if len(sys.argv) != 3:
    print("Usage: %s DATAPATH DBPATH" % sys.argv[0])
    sys.exit(1)

index(datapath = sys.argv[1], dbpath = sys.argv[2])
Empty file.

0 comments on commit b30ba49

Please sign in to comment.