Skip to content

Commit

Permalink
import script improvements
Browse files Browse the repository at this point in the history
  • Loading branch information
yourcelf committed Oct 31, 2010
1 parent 26e9739 commit 9da371f
Show file tree
Hide file tree
Showing 4 changed files with 91 additions and 47 deletions.
116 changes: 76 additions & 40 deletions afg/management/commands/import_wikileaks.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from __future__ import print_function
import re
import os
import csv
Expand All @@ -12,29 +13,50 @@

from afg.models import DiaryEntry, import_fields

def clean_summary(text):
    """Normalize ampersand escaping in a report summary.

    Ampersands that were escaped one or more times ("&amp;", "&amp;amp;",
    ...) are first collapsed to a bare "&".  Every "&" that does not start
    an entity (named such as "&lt;" or numeric such as "&#39;") is then
    escaped exactly once as "&amp;".

    text -- the summary string to clean.
    Returns the cleaned string.
    """
    # Collapse repeated escaping; each pass strips one level of "&amp;".
    while "&amp;" in text:
        text = text.replace("&amp;", "&")
    # Python patterns carry no trailing "/gi" flags: substitution is already
    # global, and case-insensitivity is requested through re.IGNORECASE.
    bare_ampersand = re.compile(r'&(?!(#[a-z\d]+|\w+);)', re.IGNORECASE)
    return bare_ampersand.sub("&amp;", text)
# Word counts of the phrases extracted from each report summary; every entry
# is one phrase length (in words) that the import command collects.
PHRASE_LENGTHS = [2]
# [lower, upper] bounds on how many reports may share a phrase for it to be
# linked.  Both bounds are exclusive: the import command links a phrase only
# when lower < number of reports < upper.  The upper bound is also used while
# loading to stop collecting report keys for phrases that already exceed it.
PHRASE_LINK_LIMITS = [1, 10]
# Relative path of the CSV file that the import command writes and then
# hands to psql's \copy.
OUTPUT_NAME = "data/processed.csv"

class StatusPrinter(object):
    """Show an in-place "count / total" progress indicator on stdout.

    Each call to print() backspaces over the text written by the previous
    call and writes the current "c / n" in its place, so the indicator
    stays on one terminal line until end() is called.

    c -- starting value of the counter.
    n -- total shown after the slash.
    """

    def __init__(self, c=0, n=0):
        self.c = c
        self.n = n
        # Text written by the last print(); its length tells the next call
        # how many characters to backspace over.
        self.previous = ""

    def inc(self):
        """Advance the counter by one."""
        self.c += 1

    def print(self):
        """Replace the previously printed status with the current one."""
        # Imported here so the class does not depend on the module's
        # import block.
        import sys

        print("\b" * len(self.previous), end="")
        self.previous = "{0} / {1}".format(self.c, self.n)
        print(self.previous, end="")
        # No newline is written, so a line-buffered stdout would hold the
        # text back until end(); flush to make the progress visible now.
        sys.stdout.flush()

    def end(self):
        """Finish the status line by emitting a newline."""
        print()

def thru(string):
    """Pass a CSV cell through unchanged, blanking the null placeholder.

    string -- the raw cell text.
    Returns "" when the cell, ignoring surrounding whitespace, is exactly
    '<null value>'; otherwise returns the original string untouched.
    """
    is_placeholder = string.strip() == '<null value>'
    return "" if is_placeholder else string

class Command(BaseCommand):
args = '<csv_file> <release_name>'
help = """Import the wikileaks Afghan War Diary CSV file(s)."""

def handle(self, *args, **kwargs):
print args
if len(args) < 2:
print """Requires two arguments: the path to the wikileaks Afghan War Diary CSV file, and a string identifying this release (e.g. "2010 July 25"). The CSV file can be downloaded here:
print("""Requires two arguments: the path to the wikileaks Afghan War Diary CSV file, and a string identifying this release (e.g. "2010 July 25"). The CSV file can be downloaded here:
http://wikileaks.org/wiki/Afghan_War_Diary,_2004-2010
"""
""")
return

fields = [a[0] for a in import_fields]
thru = lambda f: f
conversions = []
for f in import_fields:
if len(f) > 1:
Expand All @@ -44,67 +66,81 @@ def handle(self, *args, **kwargs):

rows = []
phrases = defaultdict(set)

for i in range(0, len(args), 2):
filename = args[i]
release = args[i + 1]

print("Loading", filename)
sp = StatusPrinter()

with open(filename) as fh:
reader = csv.reader(fh)
reader = csv.reader(fh, delimiter=",", quotechar='"')
for c, row in enumerate(reader):
print "Loading", filename, c
if len(row) == 0:
continue
sp.print()
sp.inc()
values = map(lambda t: conversions[t[0]](t[1]), enumerate(row))
kwargs = dict(zip(fields, values))
kwargs['release'] = release
rows.append(kwargs)

# get phrases
summary = re.sub(r'<[^>]*?>', '', kwargs['summary'])
summary = re.sub(r'&[^;\s]+;', ' ', summary)
summary = re.sub(r'[^A-Z ]', ' ', summary.upper())
summary = re.sub(r'\s+', ' ', summary).strip()
words = summary.split(' ')
for i in range(3, 1, -1):
for i in PHRASE_LENGTHS:
for j in range(i, len(words)):
phrases[" ".join(words[j-i:j])].add(kwargs['report_key'])
phrase = " ".join(words[j-i:j])
if len(phrases[phrase]) <= PHRASE_LINK_LIMITS[1]:
phrases[phrase].add(kwargs['report_key'])
sp.end()

print "Calcuting phrase links..."
print("Calcuting phrase links...")
phrase_links = defaultdict(dict)
n = len(phrases)
for c, (phrase, report_keys) in enumerate(phrases.iteritems()):
print "Phrases:", c, n
if len(report_keys) > 2 and len(report_keys) < 10:
sp = StatusPrinter(0, n)
for phrase, report_keys in phrases.iteritems():
sp.print()
sp.inc()
if len(report_keys) > PHRASE_LINK_LIMITS[0] and \
len(report_keys) < PHRASE_LINK_LIMITS[1]:
key_list = list(report_keys)
for report_key in report_keys:
phrase_links[report_key][phrase] = key_list
phrases = None
sp.end()

print "Writing CSV"
print("Writing CSV")
# Write to CSV and bulk import.
fields = rows[0].keys()
fields.append('phrase_links')
temp = tempfile.NamedTemporaryFile(delete=False)
writer = csv.writer(temp)
name = temp.name
n = len(rows)
c = 0
# Pop rows to preserve memory (adding the json in phrase_links grows
# too fast).
while len(rows) > 0:
row = rows.pop(0)
print "CSV", c, len(rows), n
row['phrase_links'] = json.dumps(phrase_links[row['report_key']])
writer.writerow([row[f] for f in fields])
c += 1
temp.close()

print "Loading into postgres"
cmd = '''psql -U %(user)s -c "\copy %(table)s (%(fields)s) FROM '%(filename)s' WITH CSV NULL AS 'NULL' "''' % {
with open(OUTPUT_NAME, 'w') as fh:
writer = csv.writer(fh)
n = len(rows)
c = 0
# Pop rows to preserve memory (adding the json in phrase_links grows
# too fast).
sp = StatusPrinter(c, n)
while len(rows) > 0:
row = rows.pop(0)
sp.print()
sp.inc()
row['phrase_links'] = json.dumps(phrase_links[row['report_key']])
writer.writerow([row[f] for f in fields])
c += 1
sp.end()

print("Loading into postgres")
cmd = '''psql -U %(user)s -c "\copy %(table)s (%(fields)s) FROM '%(filename)s' WITH CSV NULL AS '<null value>' "''' % {
'user': connection.settings_dict['USER'],
'table': DiaryEntry._meta.db_table,
'fields': ",".join('"%s"' % f for f in fields),
'filename': name,
'filename': OUTPUT_NAME,
}
print cmd
print(cmd)
proc = subprocess.Popen(cmd, shell=True)
proc.wait()
os.remove(name)
16 changes: 12 additions & 4 deletions afg/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

def clean_summary(text):
# Fix ampersand mess
if text.strip() == "<null value>":
return ""
while text.find("&amp;") != -1:
text = text.replace("&amp;", "&")
text = re.sub('&(?!(#[a-z\d]+|\w+);)/gi', "&amp;", text)
Expand All @@ -17,7 +19,13 @@ def force_int(a):
def float_or_null(f):
    """Convert a CSV cell to a float, or to the null placeholder.

    f -- the raw cell value (normally a string).
    Returns float(f) for a non-empty value.  Returns the placeholder
    "<null value>" when the value is empty/falsy, blank, or already the
    placeholder itself; this is the same text the import command declares
    as NULL in its psql \\copy statement.
    Raises ValueError when the value is neither empty, the placeholder,
    nor parseable as a float.
    """
    # Strip strings so a padded placeholder or a blank cell is recognized
    # instead of being handed to float().
    value = f.strip() if hasattr(f, "strip") else f
    if not value or value == "<null value>":
        return "<null value>"
    return float(value)

def complex_attack(f):
    """Convert the complex-attack CSV cell to a boolean or the null marker.

    f -- the raw cell value (normally a string).
    Returns "<null value>" when the cell, ignoring surrounding whitespace,
    is the null placeholder.  Otherwise returns a bool: False for empty
    text and for the usual textual spellings of false ("false", "f", "0",
    "no", "n", case-insensitive), True for any other text.  Non-string
    values fall back to plain truthiness.
    """
    value = f.strip() if hasattr(f, "strip") else f
    if value == "<null value>":
        return "<null value>"
    if hasattr(value, "lower"):
        # bool() on a string is True for any non-empty text, including
        # "FALSE" and "0", so the text itself has to be inspected.
        return value.lower() not in ("", "0", "false", "f", "no", "n")
    return bool(value)

import_fields = [
("report_key",), # 0
Expand All @@ -29,7 +37,7 @@ def float_or_null(f):
("summary", clean_summary), # 6
("region",), # 7
("attack_on",), # 8
("complex_attack", lambda f: bool(f)), # 9
("complex_attack", complex_attack), # 9
("reporting_unit",), # 10
("unit_name",), # 11
("type_of_unit",), # 12
Expand All @@ -47,7 +55,7 @@ def float_or_null(f):
("longitude", float_or_null), # 24
("originator_group",), # 25
("updated_by_group",), # 26
("ccir", lambda f: f or ""), # 27
("ccir",), # 27
("sigact",), # 28
("affiliation",), # 29
("dcolor",), # 30
Expand All @@ -65,7 +73,7 @@ class DiaryEntry(models.Model):
summary = models.TextField()
region = models.CharField(max_length=255)
attack_on = models.CharField(max_length=255)
complex_attack = models.BooleanField()
complex_attack = models.NullBooleanField(null=True)
reporting_unit = models.CharField(max_length=255)
unit_name = models.CharField(max_length=255)
type_of_unit = models.CharField(max_length=255)
Expand Down
4 changes: 2 additions & 2 deletions afg/search_indexes.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ class DiaryEntryIndex(indexes.SearchIndex):
summary = indexes.CharField(model_attr='summary')
region = indexes.CharField(model_attr='region', faceted=True)
attack_on = indexes.CharField(model_attr='attack_on', faceted=True)
complex_attack = indexes.BooleanField(model_attr='complex_attack', faceted=True)
complex_attack = indexes.BooleanField(model_attr='complex_attack', faceted=True, null=True)
reporting_unit = indexes.CharField(model_attr='reporting_unit', faceted=True)
unit_name = indexes.CharField(model_attr='unit_name', faceted=True)
type_of_unit = indexes.CharField(model_attr='type_of_unit', faceted=True)
Expand All @@ -40,7 +40,7 @@ class DiaryEntryIndex(indexes.SearchIndex):
classification = indexes.CharField(model_attr='classification', faceted=True)
total_casualties = indexes.IntegerField(model_attr='total_casualties', faceted=True)

search_facet_display = ('date', 'type_', 'region', 'attack_on', 'type_of_unit', 'affiliation', 'dcolor', 'classification', 'category', 'total_casualties', 'civilian_kia', 'civilian_wia', 'host_nation_kia', 'host_nation_wia', 'friendly_kia', 'friendly_wia', 'enemy_kia', 'enemy_wia', 'enemy_detained')
search_facet_display = ('release', 'date', 'type_', 'region', 'attack_on', 'type_of_unit', 'affiliation', 'dcolor', 'classification', 'category', 'total_casualties', 'civilian_kia', 'civilian_wia', 'host_nation_kia', 'host_nation_wia', 'friendly_kia', 'friendly_wia', 'enemy_kia', 'enemy_wia', 'enemy_detained')
offer_to_sort_by = (('Date', 'date'), ('Casualties', 'total_casualties')) # (display, field) pairs

min_date = datetime.datetime(2004, 1, 1, 0, 0, 0)
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
django==1.2.3
django>=1.2.3
-e git://github.com/toastdriven/django-haystack.git#egg=django-haystack
pysolr
httplib2
Expand Down

0 comments on commit 9da371f

Please sign in to comment.