-
Notifications
You must be signed in to change notification settings - Fork 66
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing 48 changed files with 1,520 additions and 0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
#!/usr/bin/env python

import sys
import xapian

### Start of example code.
def delete_docs(dbpath, identifiers):
    """Delete every document matching one of `identifiers` from the database.

    Each identifier corresponds to the unique 'Q'-prefixed boolean term
    that the indexer added, so one identifier removes exactly the
    document(s) carrying that term.
    """
    # The database must already exist: DB_OPEN will not create one.
    db = xapian.WritableDatabase(dbpath, xapian.DB_OPEN)

    for ident in identifiers:
        # Deleting by term removes all documents indexed with it.
        db.delete_document(u'Q' + ident)
### End of example code.

if len(sys.argv) < 3:
    print("Usage: %s DBPATH ID..." % sys.argv[0])
    sys.exit(1)

delete_docs(dbpath=sys.argv[1], identifiers=sys.argv[2:])
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
import re | ||
import BeautifulSoup | ||
import time | ||
import eventlet | ||
from eventlet.green import urllib2 | ||
import csv | ||
import sys | ||
|
||
def pull(title):
    """Pull the infobox fields for `title` from Wikipedia.

    Returns a dict that may contain 'name', 'capital', 'admitted',
    'population', 'latitude', 'longitude', 'motto' and 'description',
    or None when the page cannot be fetched or parsed, or has no infobox.
    """
    url = "http://en.wikipedia.org/w/index.php?action=render&title=%s" % title
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0 (user-agent-restrictions-are-silly)')]
    try:
        html = opener.open(url.encode("utf-8")).read()
    except Exception:
        # Was a bare `except:`, which also swallowed KeyboardInterrupt and
        # SystemExit; narrowed to Exception so the process stays killable.
        print((u" Could not fetch %s" % url).encode('utf-8'))
        return None
    try:
        soup = BeautifulSoup.BeautifulSoup(html)
    except Exception:
        # Same narrowing as above: report the bad page, keep going.
        print((u" Could not parse %s" % url).encode('utf-8'))
        return None
    # Extract information: the infobox is a <table> whose class contains
    # the word "infobox".
    infobox = soup.find("table", { 'class': re.compile(r'\binfobox\b') })
    if not infobox:
        print((u" No infobox found in %s" % url).encode('utf-8'))
        return None

    information = {}
    name = infobox.find("th", { 'class': 'fn org' })
    if name:
        information['name'] = extract_text(name)

    def grab(info, name=None):
        # Find the label matching `info` and store the text of the
        # adjacent <td> under `name` (defaults to the lowercased label).
        if name is None:
            name = info.lower()
        text = infobox.find("text", text=info)
        if text:
            information[name] = extract_text(text.parent.findNext("td"))

    grab("Capital")
    grab("Admission to Union", "admitted")
    pop = infobox.find("text", text="Population")
    if pop:
        # The population row we want is the "... Total" sub-row.
        text = pop.findNext("text", text=re.compile("Total$"))
        if text:
            information['population'] = extract_text(text.parent.findNext("td"))
    grab(re.compile("Latitude$"), "latitude")
    grab(re.compile("Longitude$"), "longitude")
    text = infobox.find("text", text=re.compile("Motto"))
    if text:
        information["motto"] = extract_text(text.findNext("i"))
    information["description"] = extract_text(infobox.findNext("p"))

    return information
|
||
def extract_text(tag):
    """Flatten a BeautifulSoup node into whitespace-normalised plain text."""
    if tag is None:
        return ""
    if isinstance(tag, BeautifulSoup.NavigableString):
        return tag.string
    # Recurse into the children, collapse any run of whitespace to a single
    # space, then tidy the spacing that inline markup leaves around
    # brackets and colons.
    pieces = [extract_text(child) for child in tag.contents]
    flat = " ".join(" ".join(pieces).split())
    for before, after in (("( ", "("), (" )", ")"), (" :", ":")):
        flat = flat.replace(before, after)
    return flat
|
||
|
||
# Column order for the output CSV; matches the keys populated by pull().
columns = [
    'name',
    'capital',
    'admitted',
    'population',
    'latitude',
    'longitude',
    'motto',
    'description',
]
# Fetch up to 10 pages concurrently; pull() blocks on eventlet's green
# urllib2, so the pool overlaps the network waits.
pool = eventlet.GreenPool(size=10)
# One page title per stdin line.
# NOTE(review): readlines() keeps the trailing newline on each title that
# gets interpolated into the URL in pull() — confirm this is intended.
results = pool.imap(
    pull,
    sys.stdin.readlines(),
)
with open("data/states.csv", "w") as fh:
    w = csv.writer(fh, dialect='excel')
    # Header row first, then one row per successfully-scraped page.
    w.writerow(columns)
    for result in results:
        if result is None:
            continue
        # Missing fields become empty strings; values are UTF-8 encoded
        # for the (Python 2) csv writer.
        w.writerow([ result.get(col, u"").encode('utf-8') for col in columns ])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
#!/usr/bin/env python

import json
import sys
import xapian
from support import parse_csv_file

### Start of example code.
def index(datapath, dbpath):
    """Index each row of the CSV at `datapath` into the database at `dbpath`.

    Rows are keyed on their id_NUMBER, so re-running the indexer updates
    existing documents rather than duplicating them.
    """
    # Writable database, created on the first run.
    db = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OPEN)

    # One TermGenerator reused for every document, with English stemming.
    tg = xapian.TermGenerator()
    tg.set_stemmer(xapian.Stem("en"))

    for row in parse_csv_file(datapath):
        # 'row' maps field name to value; pick out what we index.
        title = row.get('TITLE', u'')
        description = row.get('DESCRIPTION', u'')
        ident = row.get('id_NUMBER', u'')

        doc = xapian.Document()
        tg.set_document(doc)

        # Prefixed terms allow field-specific queries...
        tg.index_text(title, 1, 'S')
        tg.index_text(description, 1, 'XD')

        # ...and unprefixed terms allow general free-text search.  The
        # termpos gap keeps phrases from matching across the boundary.
        tg.index_text(title)
        tg.increase_termpos()
        tg.index_text(description)

        # Keep the whole record around for result display.
        doc.set_data(json.dumps(row))

        # Unique id term: replace_document makes indexing idempotent no
        # matter how many times the indexer runs.
        idterm = u"Q" + ident
        doc.add_boolean_term(idterm)
        db.replace_document(idterm, doc)
### End of example code.

if len(sys.argv) != 3:
    print("Usage: %s DATAPATH DBPATH" % sys.argv[0])
    sys.exit(1)

index(datapath=sys.argv[1], dbpath=sys.argv[2])
Empty file.
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
#!/usr/bin/env python

import json
import sys
import xapian
from support import parse_csv_file

### Start of example code.
def index(datapath, dbpath):
    """Index each row of the CSV at `datapath` into the database at `dbpath`.

    Besides the searchable text, each document stores its COLLECTION in
    value slot 0 and its MAKER in value slot 1.
    """
    # Writable database, created on the first run.
    db = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OPEN)

    # One TermGenerator reused for every document, with English stemming.
    tg = xapian.TermGenerator()
    tg.set_stemmer(xapian.Stem("en"))

    for row in parse_csv_file(datapath):
        # 'row' maps field name to value; pick out what we index/store.
        title = row.get('TITLE', u'')
        description = row.get('DESCRIPTION', u'')
        ident = row.get('id_NUMBER', u'')
        collection = row.get('COLLECTION', u'')
        maker = row.get('MAKER', u'')

        doc = xapian.Document()
        tg.set_document(doc)

        # Prefixed terms allow field-specific queries...
        tg.index_text(title, 1, 'S')
        tg.index_text(description, 1, 'XD')

        # ...and unprefixed terms allow general free-text search.  The
        # termpos gap keeps phrases from matching across the boundary.
        tg.index_text(title)
        tg.increase_termpos()
        tg.index_text(description)

        # Value slots: 0 = collection, 1 = maker.
        doc.add_value(0, collection)
        doc.add_value(1, maker)

        # Keep the whole record around for result display.
        doc.set_data(json.dumps(row))

        # Unique id term: replace_document makes indexing idempotent no
        # matter how many times the indexer runs.
        idterm = u"Q" + ident
        doc.add_boolean_term(idterm)
        db.replace_document(idterm, doc)
### End of example code.

if len(sys.argv) != 3:
    print("Usage: %s DATAPATH DBPATH" % sys.argv[0])
    sys.exit(1)

index(datapath=sys.argv[1], dbpath=sys.argv[2])
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
#!/usr/bin/env python

import json
import sys
import xapian
from support import parse_csv_file

def index(datapath, dbpath):
    """Index each row of the CSV at `datapath` into the database at `dbpath`.

    In addition to the free-text fields, each entry in the semicolon-
    separated MATERIALS field becomes an 'XM'-prefixed boolean term so
    results can later be filtered by material.
    """
    # Writable database, created on the first run.
    db = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OPEN)

    # One TermGenerator reused for every document, with English stemming.
    tg = xapian.TermGenerator()
    tg.set_stemmer(xapian.Stem("en"))

    for row in parse_csv_file(datapath):
        # 'row' maps field name to value; pick out what we index.
        title = row.get('TITLE', u'')
        description = row.get('DESCRIPTION', u'')
        ident = row.get('id_NUMBER', u'')

        doc = xapian.Document()
        tg.set_document(doc)

        # Prefixed terms allow field-specific queries...
        tg.index_text(title, 1, 'S')
        tg.index_text(description, 1, 'XD')

        # ...and unprefixed terms allow general free-text search.  The
        # termpos gap keeps phrases from matching across the boundary.
        tg.index_text(title)
        tg.increase_termpos()
        tg.index_text(description)

        ### Start of new indexing code.
        # MATERIALS is a semicolon-separated list; normalise each entry
        # and add it as a boolean filter term.
        raw_materials = row.get('MATERIALS', u'').split(';')
        for mat in (m.strip().lower() for m in raw_materials):
            if mat:
                doc.add_boolean_term('XM' + mat)
        ### End of new indexing code.

        # Keep the whole record around for result display.
        doc.set_data(json.dumps(row))

        # Unique id term: replace_document makes indexing idempotent no
        # matter how many times the indexer runs.
        idterm = u"Q" + ident
        doc.add_boolean_term(idterm)
        db.replace_document(idterm, doc)

if len(sys.argv) != 3:
    print("Usage: %s DATAPATH DBPATH" % sys.argv[0])
    sys.exit(1)

index(datapath=sys.argv[1], dbpath=sys.argv[2])
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
#!/usr/bin/env python

import json
import sys
import xapian
from support import numbers_from_string, parse_csv_file

def index(datapath, dbpath):
    """Index each row of the CSV at `datapath` into the database at `dbpath`.

    Numeric data is stored as sortable values: slot 0 holds the largest
    number found in MEASUREMENTS, slot 1 the first number in DATE_MADE.
    """
    # Writable database, created on the first run.
    db = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OPEN)

    # One TermGenerator reused for every document, with English stemming.
    tg = xapian.TermGenerator()
    tg.set_stemmer(xapian.Stem("en"))

    for row in parse_csv_file(datapath):
        # 'row' maps field name to value; pick out what we index.
        title = row.get('TITLE', u'')
        description = row.get('DESCRIPTION', u'')
        ident = row.get('id_NUMBER', u'')

        doc = xapian.Document()
        tg.set_document(doc)

        # Prefixed terms allow field-specific queries...
        tg.index_text(title, 1, 'S')
        tg.index_text(description, 1, 'XD')

        # ...and unprefixed terms allow general free-text search.  The
        # termpos gap keeps phrases from matching across the boundary.
        tg.index_text(title)
        tg.increase_termpos()
        tg.index_text(description)

        # Keep the whole record around for result display.
        doc.set_data(json.dumps(row))

        ### Start of example code.
        # Slot 0: largest dimension mentioned in MEASUREMENTS, serialised
        # so it sorts numerically.
        measurements = row.get('MEASUREMENTS', u'')
        if measurements:
            nums = numbers_from_string(measurements)
            if nums:
                doc.add_value(0, xapian.sortable_serialise(max(nums)))

        # Slot 1: first number in DATE_MADE, taken as the year.
        years = numbers_from_string(row.get('DATE_MADE', u''))
        if years:
            doc.add_value(1, xapian.sortable_serialise(years[0]))
        ### End of example code.

        # Unique id term: replace_document makes indexing idempotent no
        # matter how many times the indexer runs.
        idterm = u"Q" + ident
        doc.add_boolean_term(idterm)
        db.replace_document(idterm, doc)

if len(sys.argv) != 3:
    print("Usage: %s DATAPATH DBPATH" % sys.argv[0])
    sys.exit(1)

index(datapath=sys.argv[1], dbpath=sys.argv[2])
Empty file.
Oops, something went wrong.