Permalink
Browse files

full solr functionality, many improvements

  • Loading branch information...
1 parent 74d0410 commit 67a0a8ac7511185919c4f3c0caac74ee35c15fba Charles DeTar committed Sep 10, 2010
@@ -7,50 +7,13 @@
from django.core.management.base import BaseCommand
from django.db import connection, transaction
-from afg.models import DiaryEntry, Phrase
-
-fields = ["report_key", # 0
- "date", # 1
- "type", # 2
- "category", # 3
- "tracking_number", # 4
- "title", # 5
- "summary", # 6
- "region", # 7
- "attack_on", # 8
- "complex_attack", # 9
- "reporting_unit", # 10
- "unit_name", # 11
- "type_of_unit", # 12
- "friendly_wia", # 13
- "friendly_kia", # 14
- "host_nation_wia", # 15
- "host_nation_kia", # 16
- "civilian_wia", # 17
- "civilian_kia", # 18
- "enemy_wia", # 19
- "enemy_kia", # 20
- "enemy_detained", # 21
- "mgrs", # 22
- "latitude", # 23
- "longitude", # 24
- "originator_group", # 25
- "updated_by_group", # 26
- "ccir", # 27
- "sigact", # 28
- "affiliation", # 29
- "dcolor", # 30
- "classification", # 31
-]
+from afg.models import DiaryEntry, Phrase, import_fields
def clean_summary(text):
    """Normalise ampersand escaping in a report summary.

    Repeatedly-escaped ampersands (``&amp;amp;`` and so on) are first
    collapsed down to a bare ``&``. Every ``&`` that does not start a
    character entity (``&name;`` or ``&#code;``) is then escaped exactly
    once as ``&amp;``. Newlines are returned unchanged.
    """
    # Collapse any depth of double-escaping down to a bare ampersand.
    while "&amp;" in text:
        text = text.replace("&amp;", "&")
    # Escape ampersands that are not the start of an entity. The inline
    # (?i) flag replaces the JavaScript-style "/gi" suffix, which Python
    # treated as literal pattern text so the substitution never matched.
    return re.sub(r'(?i)&(?!(#[a-z\d]+|\w+);)', "&amp;", text)
class Command(BaseCommand):
@@ -66,18 +29,26 @@ def handle(self, *args, **kwargs):
"""
return
+ fields = [a[0] for a in import_fields]
+ thru = lambda f: f
+ conversions = []
+ for f in import_fields:
+ if len(f) > 1:
+ conversions.append(f[1])
+ else:
+ conversions.append(thru)
+
+
phrases = defaultdict(set)
with open(args[0]) as fh:
reader = csv.reader(fh)
for c, row in enumerate(reader):
- print c
- for i in range(13, 22):
- row[i] = int(row[i] or 0)
- kwargs = dict(zip(fields, row))
- kwargs['summary'] = clean_summary(kwargs['summary'])
- kwargs['latitude'] = float(kwargs['latitude']) if kwargs['latitude'] else None
- kwargs['longitude'] = float(kwargs['longitude']) if kwargs['longitude'] else None
+ if c % 1000 == 0:
+ print c
+ values = map(lambda t: conversions[t[0]](t[1]), enumerate(row))
+ kwargs = dict(zip(fields, values))
entry = DiaryEntry.objects.create(**kwargs)
+
# Get words for phrases
summary = re.sub(r'<[^>]*?>', '', kwargs['summary'])
summary = re.sub(r'&[^;\s]+;', ' ', summary)
@@ -91,13 +62,22 @@ def handle(self, *args, **kwargs):
n = len(phrases)
cursor = connection.cursor()
+ # Drop the join reference constraint for efficiency. We're confident
+ # that the 4 million rows we're about to add all satisfy the
+ # constraint, and it saves about 5 hours of computation time.
+ cursor.execute('''ALTER TABLE "afg_phrase_entries" DROP CONSTRAINT "phrase_id_refs_id_48aa97f2"''')
+ transaction.commit_unless_managed()
for c, (phrase, entry_ids) in enumerate(phrases.iteritems()):
- if len(entry_ids) > 1:
- print c, n
+ if len(entry_ids) > 1 and len(entry_ids) <= 10:
+ if c % 1000 == 0:
+ print c, n
cursor.execute("INSERT INTO afg_phrase (phrase, entry_count) VALUES (%s, %s) RETURNING id", (phrase, len(entry_ids)))
phrase_id = cursor.fetchone()[0]
cursor.execute("""
INSERT INTO afg_phrase_entries (phrase_id, diaryentry_id) VALUES
""" + ",".join("(%s, %s)" % (phrase_id, entry_id) for entry_id in entry_ids)
)
transaction.commit_unless_managed()
+ cursor.execute('''ALTER TABLE "afg_phrase_entries" ADD CONSTRAINT "phrase_id_refs_id_48aa97f2" FOREIGN KEY ("phrase_id") REFERENCES "afg_phrase" ("id") DEFERRABLE INITIALLY DEFERRED;''')
+ transaction.commit_unless_managed()
+
View
@@ -1,5 +1,57 @@
+import re
+import datetime
+
from django.db import models
def clean_summary(text):
    """Normalise ampersand escaping in a report summary.

    Repeatedly-escaped ampersands (``&amp;amp;`` and so on) are first
    collapsed down to a bare ``&``. Every ``&`` that does not start a
    character entity (``&name;`` or ``&#code;``) is then escaped exactly
    once as ``&amp;``.
    """
    # Collapse any depth of double-escaping down to a bare ampersand.
    while "&amp;" in text:
        text = text.replace("&amp;", "&")
    # Escape ampersands that are not the start of an entity. The inline
    # (?i) flag replaces the JavaScript-style "/gi" suffix, which Python
    # treated as literal pattern text so the substitution never matched.
    return re.sub(r'(?i)&(?!(#[a-z\d]+|\w+);)', "&amp;", text)
+
def force_int(a):
    """Convert a raw cell value to an int, treating blank values as 0.

    ``None``, the empty string and whitespace-only strings all yield 0;
    anything else is passed to ``int()`` and may raise ``ValueError``.
    """
    # Strip string-like values first so a cell holding only spaces counts
    # as empty instead of raising ValueError inside int().
    if hasattr(a, "strip"):
        a = a.strip()
    return int(a or 0)
+
def float_or_null(f):
    """Convert a raw cell value to a float, or return None when it is blank.

    Falsy values (``None``, the empty string) and whitespace-only strings
    yield ``None``; anything else is passed to ``float()`` and may raise
    ``ValueError``.
    """
    # Strip string-like values first so a cell holding only spaces counts
    # as empty instead of raising ValueError inside float().
    if hasattr(f, "strip"):
        f = f.strip()
    if f:
        return float(f)
    return None
+
# Column layout of the import CSV, listed in column order; the trailing
# "# N" on each entry is its zero-based column index. Each entry is a tuple
# of the DiaryEntry field name and, optionally, a converter callable that
# the import command applies to the raw cell value. Entries without a
# converter are passed through unchanged.
+import_fields = [
+ ("report_key",), # 0
+ ("date",), # 1
+ ("type",), # 2
+ ("category",), # 3
+ ("tracking_number",), # 4
+ ("title",), # 5
+ ("summary", clean_summary), # 6
+ ("region",), # 7
+ ("attack_on",), # 8
+ ("complex_attack",), # 9
+ ("reporting_unit",), # 10
+ ("unit_name",), # 11
+ ("type_of_unit",), # 12
+ ("friendly_wia", force_int), # 13
+ ("friendly_kia", force_int), # 14
+ ("host_nation_wia", force_int), # 15
+ ("host_nation_kia", force_int), # 16
+ ("civilian_wia", force_int), # 17
+ ("civilian_kia", force_int), # 18
+ ("enemy_wia", force_int), # 19
+ ("enemy_kia", force_int), # 20
+ ("enemy_detained", force_int), # 21
+ ("mgrs",), # 22
+ ("latitude", float_or_null), # 23
+ ("longitude", float_or_null), # 24
+ ("originator_group",), # 25
+ ("updated_by_group",), # 26
+ ("ccir",), # 27
+ ("sigact",), # 28
+ ("affiliation",), # 29
+ ("dcolor",), # 30
+ ("classification",), # 31
+]
# No DB indexes because we're kicking all that to SOLR.
class DiaryEntry(models.Model):
report_key = models.CharField(max_length=255, unique=True)
@@ -35,7 +87,6 @@ class DiaryEntry(models.Model):
dcolor = models.CharField(max_length=255)
classification = models.CharField(max_length=255)
- # denormalization for sorting
def total_casualties(self):
    """Return the sum of the wounded and killed counts for all four groups.

    Adds the ``*_wia`` and ``*_kia`` attributes for friendly, host nation,
    civilian and enemy; ``enemy_detained`` is not included.
    """
    counted = (
        'friendly_wia', 'friendly_kia',
        'host_nation_wia', 'host_nation_kia',
        'civilian_wia', 'civilian_kia',
        'enemy_wia', 'enemy_kia',
    )
    return sum(getattr(self, attr) for attr in counted)
@@ -52,20 +103,6 @@ class Meta:
ordering = ['date']
verbose_name_plural = 'Diary entries'
- def casualty_summary(self):
- parts = []
- for attr in ('civilian', 'host_nation', 'friendly', 'enemy'):
- k = getattr(self, attr + '_kia')
- w = getattr(self, attr + '_wia')
- if k or w:
- counts = []
- if k:
- counts.append("%i killed" % k)
- if w:
- counts.append("%i wounded" % w)
- parts.append("%s: %s" % (attr.title().replace("_", " "), ", ".join(counts)))
- return "; ".join(parts)
-
class Phrase(models.Model):
phrase = models.CharField(max_length=255, unique=True, db_index=True)
entries = models.ManyToManyField(DiaryEntry)
@@ -39,6 +39,12 @@ class DiaryEntryIndex(indexes.SearchIndex):
classification = indexes.CharField(model_attr='classification', faceted=True)
total_casualties = indexes.IntegerField(model_attr='total_casualties', faceted=True)
+ search_facet_display = ('date', 'type_', 'region', 'attack_on', 'type_of_unit', 'affiliation', 'dcolor', 'classification', 'category', 'total_casualties', 'civilian_kia', 'civilian_wia', 'host_nation_kia', 'host_nation_wia', 'friendly_kia', 'friendly_wia', 'enemy_kia', 'enemy_wia', 'enemy_detained')
+ offer_to_sort_by = (('Date', 'date'), ('Casualties', 'total_casualties')) # (display, field) pairs
+
+ min_date = datetime.datetime(2004, 1, 1, 0, 0, 0)
+ max_date = datetime.datetime(2010, 1, 1, 0, 0, 0)
+
# Return the unfiltered queryset of every DiaryEntry row.
def get_queryset(self):
return DiaryEntry.objects.all()
@@ -1,11 +1,12 @@
+{% load afg %}
<ul>
{% for entry, stub in entries %}
<li class='searchresult'>
<a href='{% url afg.show_entry entry.report_key %}'>{{ entry.title }}</a>
<span class='date'>{{ entry.date }}</span><br />
<span class='category'>{{ entry.category }}</span>,
<span class='region'>{{ entry.region }}</span>,
- <span class='casualties' title='{{ entry.casualty_summary }}'>{{ entry.total_casualties }} casualties</span>
+ <span class='casualties' title='{{ entry|casualty_summary }}'>{{ entry.total_casualties }} casualties</span>
<p>{{ stub }}</p>
</li>
{% endfor %}
Oops, something went wrong.

0 comments on commit 67a0a8a

Please sign in to comment.