Permalink
Browse files

Flattening and denormalizing DiaryEntry, removing Phrase table, much improved import, far faster
  • Loading branch information...
1 parent 04e2944 commit bca7c3ef478d0ce2fcb3e84d4d9802fbf74efbe8 @yourcelf committed Sep 13, 2010
Showing with 75 additions and 113 deletions.
  1. +2 −6 afg/admin.py
  2. +0 −1 afg/fixtures/initial_data.json
  3. +59 −41 afg/management/commands/import_wikileaks.py
  4. +7 −15 afg/models.py
  5. +1 −7 afg/templates/afg/entry.html
  6. +4 −41 afg/views.py
  7. +2 −2 media/js/script.js
View
@@ -1,14 +1,10 @@
from django.contrib import admin
-from afg import models
+from afg.models import DiaryEntry
class DiaryAdmin(admin.ModelAdmin):
    """Admin options used when browsing diary entries."""

    # Date-based drill-down navigation keyed on the ``date`` field.
    date_hierarchy = 'date'

    # Fields consulted by the admin search box.
    search_fields = ('title', 'summary')

    # Filters offered alongside the change list.
    list_filter = (
        'type',
        'region',
        'attack_on',
        'category',
        'complex_attack',
    )

    # Change-list columns: identifying fields first, then the wounded (wia)
    # and killed (kia) counts for each group.
    list_display = (
        'title',
        'type',
        'category',
        'host_nation_wia',
        'host_nation_kia',
        'civilian_wia',
        'civilian_kia',
        'enemy_wia',
        'enemy_kia',
        'friendly_wia',
        'friendly_kia',
    )
-admin.site.register(models.DiaryEntry, DiaryAdmin)
-
-class PhraseAdmin(admin.ModelAdmin):
- filter_horizontal = ('entries',)
-admin.site.register(models.Phrase, PhraseAdmin)
+admin.site.register(DiaryEntry, DiaryAdmin)
@@ -1 +0,0 @@
-[{"pk": 22, "model": "auth.permission", "fields": {"codename": "add_logentry", "name": "Can add log entry", "content_type": 8}}, {"pk": 23, "model": "auth.permission", "fields": {"codename": "change_logentry", "name": "Can change log entry", "content_type": 8}}, {"pk": 24, "model": "auth.permission", "fields": {"codename": "delete_logentry", "name": "Can delete log entry", "content_type": 8}}, {"pk": 25, "model": "auth.permission", "fields": {"codename": "add_diaryentry", "name": "Can add diary entry", "content_type": 9}}, {"pk": 26, "model": "auth.permission", "fields": {"codename": "change_diaryentry", "name": "Can change diary entry", "content_type": 9}}, {"pk": 27, "model": "auth.permission", "fields": {"codename": "delete_diaryentry", "name": "Can delete diary entry", "content_type": 9}}, {"pk": 28, "model": "auth.permission", "fields": {"codename": "add_phrase", "name": "Can add phrase", "content_type": 10}}, {"pk": 29, "model": "auth.permission", "fields": {"codename": "change_phrase", "name": "Can change phrase", "content_type": 10}}, {"pk": 30, "model": "auth.permission", "fields": {"codename": "delete_phrase", "name": "Can delete phrase", "content_type": 10}}, {"pk": 4, "model": "auth.permission", "fields": {"codename": "add_group", "name": "Can add group", "content_type": 2}}, {"pk": 5, "model": "auth.permission", "fields": {"codename": "change_group", "name": "Can change group", "content_type": 2}}, {"pk": 6, "model": "auth.permission", "fields": {"codename": "delete_group", "name": "Can delete group", "content_type": 2}}, {"pk": 10, "model": "auth.permission", "fields": {"codename": "add_message", "name": "Can add message", "content_type": 4}}, {"pk": 11, "model": "auth.permission", "fields": {"codename": "change_message", "name": "Can change message", "content_type": 4}}, {"pk": 12, "model": "auth.permission", "fields": {"codename": "delete_message", "name": "Can delete message", "content_type": 4}}, {"pk": 1, "model": "auth.permission", "fields": 
{"codename": "add_permission", "name": "Can add permission", "content_type": 1}}, {"pk": 2, "model": "auth.permission", "fields": {"codename": "change_permission", "name": "Can change permission", "content_type": 1}}, {"pk": 3, "model": "auth.permission", "fields": {"codename": "delete_permission", "name": "Can delete permission", "content_type": 1}}, {"pk": 7, "model": "auth.permission", "fields": {"codename": "add_user", "name": "Can add user", "content_type": 3}}, {"pk": 8, "model": "auth.permission", "fields": {"codename": "change_user", "name": "Can change user", "content_type": 3}}, {"pk": 9, "model": "auth.permission", "fields": {"codename": "delete_user", "name": "Can delete user", "content_type": 3}}, {"pk": 13, "model": "auth.permission", "fields": {"codename": "add_contenttype", "name": "Can add content type", "content_type": 5}}, {"pk": 14, "model": "auth.permission", "fields": {"codename": "change_contenttype", "name": "Can change content type", "content_type": 5}}, {"pk": 15, "model": "auth.permission", "fields": {"codename": "delete_contenttype", "name": "Can delete content type", "content_type": 5}}, {"pk": 16, "model": "auth.permission", "fields": {"codename": "add_session", "name": "Can add session", "content_type": 6}}, {"pk": 17, "model": "auth.permission", "fields": {"codename": "change_session", "name": "Can change session", "content_type": 6}}, {"pk": 18, "model": "auth.permission", "fields": {"codename": "delete_session", "name": "Can delete session", "content_type": 6}}, {"pk": 19, "model": "auth.permission", "fields": {"codename": "add_site", "name": "Can add site", "content_type": 7}}, {"pk": 20, "model": "auth.permission", "fields": {"codename": "change_site", "name": "Can change site", "content_type": 7}}, {"pk": 21, "model": "auth.permission", "fields": {"codename": "delete_site", "name": "Can delete site", "content_type": 7}}, {"pk": 1, "model": "auth.user", "fields": {"username": "admin", "first_name": "", "last_name": "", 
"is_active": true, "is_superuser": true, "is_staff": true, "last_login": "2010-09-07 16:13:29", "groups": [], "user_permissions": [], "password": "sha1$4ff10$21ae972ee9dbaaae152d861e77d2c2570f284511", "email": "admin@admin.com", "date_joined": "2010-09-07 16:13:29"}}]
@@ -1,13 +1,17 @@
import re
+import os
import csv
+import json
import datetime
+import tempfile
import itertools
+import subprocess
from collections import defaultdict
from django.core.management.base import BaseCommand
from django.db import connection, transaction
-from afg.models import DiaryEntry, Phrase, import_fields
+from afg.models import DiaryEntry, import_fields
def clean_summary(text):
# Fix ampersand mess
@@ -29,7 +33,6 @@ def handle(self, *args, **kwargs):
"""
return
- release = args[1]
fields = [a[0] for a in import_fields]
thru = lambda f: f
conversions = []
@@ -39,48 +42,63 @@ def handle(self, *args, **kwargs):
else:
conversions.append(thru)
-
+ rows = []
phrases = defaultdict(set)
- with open(args[0]) as fh:
- reader = csv.reader(fh)
- for c, row in enumerate(reader):
- if c % 1000 == 0:
- print c
- values = map(lambda t: conversions[t[0]](t[1]), enumerate(row))
- kwargs = dict(zip(fields, values))
- kwargs['release'] = release
- entry = DiaryEntry.objects.create(**kwargs)
+ for i in range(0, len(args), 2):
+ filename = args[i]
+ release = args[i + 1]
- # Get words for phrases
- summary = re.sub(r'<[^>]*?>', '', kwargs['summary'])
- summary = re.sub(r'&[^;\s]+;', ' ', summary)
- summary = re.sub(r'[^A-Z ]', ' ', summary.upper())
- summary = re.sub(r'\s+', ' ', summary).strip()
- words = summary.split(' ')
- for i in range(3, 1, -1):
- for j in range(i, len(words)):
- print entry.id
- phrases[" ".join(words[j-i:j])].add(entry.id)
+ with open(filename) as fh:
+ reader = csv.reader(fh)
+ for c, row in enumerate(reader):
+ print "Loading", filename, c
+ values = map(lambda t: conversions[t[0]](t[1]), enumerate(row))
+ kwargs = dict(zip(fields, values))
+ kwargs['release'] = release
+ rows.append(kwargs)
+
+ # get phrases
+ summary = re.sub(r'<[^>]*?>', '', kwargs['summary'])
+ summary = re.sub(r'&[^;\s]+;', ' ', summary)
+ summary = re.sub(r'[^A-Z ]', ' ', summary.upper())
+ summary = re.sub(r'\s+', ' ', summary).strip()
+ words = summary.split(' ')
+ for i in range(3, 1, -1):
+ for j in range(i, len(words)):
+ phrases[" ".join(words[j-i:j])].add(kwargs['report_key'])
+ print "Calcuting phrase links..."
+ phrase_links = defaultdict(dict)
n = len(phrases)
- cursor = connection.cursor()
- transaction.commit_unless_managed()
- # Drop the join reference constraint for efficiency. We're confident
- # that the 4 million rows we're about to add all satisfy the
- # constraint, and it saves about 5 hours of computation time.
- cursor.execute('''ALTER TABLE "afg_phrase_entries" DROP CONSTRAINT "phrase_id_refs_id_48aa97f2"''')
- for c, (phrase, entry_ids) in enumerate(phrases.iteritems()):
- if c % 10000 == 0:
- transaction.commit_unless_managed()
- print c, n
- if len(entry_ids) > 1 and len(entry_ids) <= 10:
- cursor.execute("INSERT INTO afg_phrase (phrase, entry_count) VALUES (%s, %s) RETURNING id", (phrase, len(entry_ids)))
- phrase_id = cursor.fetchone()[0]
- phrase_entries = []
- for entry_id in entry_ids:
- phrase_entries.append((phrase_id, entry_id))
- cursor.executemany("""INSERT INTO afg_phrase_entries (phrase_id, diaryentry_id) VALUES (%s, %s)""", phrase_entries)
+ for c, (phrase, report_keys) in enumerate(phrases.iteritems()):
+ print "Phrases:", c, n
+ if len(report_keys) > 2 and len(report_keys) < 10:
+ key_list = list(report_keys)
+ for report_key in report_keys:
+ phrase_links[report_key][phrase] = key_list
- cursor.execute('''ALTER TABLE "afg_phrase_entries" ADD CONSTRAINT "phrase_id_refs_id_48aa97f2" FOREIGN KEY ("phrase_id") REFERENCES "afg_phrase" ("id") DEFERRABLE INITIALLY DEFERRED;''')
- transaction.commit_unless_managed()
+ print "Writing CSV"
+ # Write to CSV and bulk import.
+ fields = rows[0].keys()
+ fields.append('phrase_links')
+ temp = tempfile.NamedTemporaryFile(delete=False)
+ name = temp.name
+ writer = csv.writer(temp)
+ n = len(rows)
+ for c, row in enumerate(rows):
+ print "CSV", c, n
+ row['phrase_links'] = json.dumps(phrase_links[row['report_key']])
+ writer.writerow([row[f] for f in fields])
+ temp.close()
+ print "Loading into postgres"
+ cmd = '''psql -U %(user)s -c "\copy %(table)s (%(fields)s) FROM '%(filename)s' WITH CSV NULL AS 'NULL' "''' % {
+ 'user': connection.settings_dict['USER'],
+ 'table': DiaryEntry._meta.db_table,
+ 'fields': ",".join('"%s"' % f for f in fields),
+ 'filename': name,
+ }
+ print cmd
+ proc = subprocess.Popen(cmd, shell=True)
+ proc.wait()
+ os.remove(name)
View
@@ -16,7 +16,7 @@ def force_int(a):
def float_or_null(f):
if f:
return float(f)
- return None
+ return "NULL"
import_fields = [
("report_key",), # 0
@@ -28,7 +28,7 @@ def float_or_null(f):
("summary", clean_summary), # 6
("region",), # 7
("attack_on",), # 8
- ("complex_attack",), # 9
+ ("complex_attack", lambda f: bool(f)), # 9
("reporting_unit",), # 10
("unit_name",), # 11
("type_of_unit",), # 12
@@ -46,7 +46,7 @@ def float_or_null(f):
("longitude", float_or_null), # 24
("originator_group",), # 25
("updated_by_group",), # 26
- ("ccir",), # 27
+ ("ccir", lambda f: f or ""), # 27
("sigact",), # 28
("affiliation",), # 29
("dcolor",), # 30
@@ -55,7 +55,7 @@ def float_or_null(f):
# No DB indexes because we're kicking all that to SOLR.
class DiaryEntry(models.Model):
release = models.CharField(max_length=255)
- report_key = models.CharField(max_length=255, unique=True)
+ report_key = models.CharField(max_length=255, primary_key=True)
date = models.DateTimeField()
type = models.CharField(max_length=255)
category = models.CharField(max_length=255)
@@ -82,12 +82,14 @@ class DiaryEntry(models.Model):
longitude = models.FloatField(blank=True, null=True)
originator_group = models.CharField(max_length=255)
updated_by_group = models.CharField(max_length=255)
- ccir = models.CharField(max_length=255)
+ ccir = models.CharField(max_length=255, default="")
sigact = models.CharField(max_length=255)
affiliation = models.CharField(max_length=255)
dcolor = models.CharField(max_length=255)
classification = models.CharField(max_length=255)
+ phrase_links = models.TextField(blank=True, default="")
+
def total_casualties(self):
return self.friendly_wia + self.friendly_kia + self.host_nation_wia + self.host_nation_kia + self.civilian_wia + self.civilian_kia + self.enemy_wia + self.enemy_kia
@@ -103,13 +105,3 @@ def to_dict(self):
class Meta:
ordering = ['date']
verbose_name_plural = 'Diary entries'
-
-class Phrase(models.Model):
- phrase = models.CharField(max_length=255, unique=True, db_index=True)
- entries = models.ManyToManyField(DiaryEntry)
-
- # denormalization for performance
- entry_count = models.IntegerField(default=0, db_index=True)
-
- def __unicode__(self):
- return self.phrase
@@ -195,13 +195,7 @@ <h3>Limited script-free view:</h3>
<script type='text/javascript'>
// Load data
var summary = "{{ entry.summary|escapejs }}";
- var phrases = {{% for phrase, dest_ids in phrase_entries %}
- "{{ phrase.phrase|escapejs }}": [
- {% for dest_id in dest_ids %}{% if dest_id != entry.id %}
- "{{ dest_id }}"{% if not forloop.last %},{% endif %}
- {% endif %}{% endfor %}
- ]{% if not forloop.last %},{% endif %}
- {% endfor %}};
+ var phrases = {{ entry.phrase_links|safe }};
var linkPhrases = {};
for (var phrase in phrases) {
for (var i = 0; i < phrases[phrase].length; i++) {
View
@@ -1,4 +1,5 @@
import re
+import json
import urllib
import random
import datetime
@@ -14,7 +15,7 @@
from haystack.utils import Highlighter
import haystack
-from afg.models import DiaryEntry, Phrase
+from afg.models import DiaryEntry
from afg.search_indexes import DiaryEntryIndex
from afg import utils
@@ -25,58 +26,20 @@ def show_entry(request, rid, template='afg/entry_page.html', api=False):
try:
entry = DiaryEntry.objects.get(report_key=rid)
except DiaryEntry.DoesNotExist:
- try:
- entry = DiaryEntry.objects.get(id=int(rid))
- except (ValueError, DiaryEntry.DoesNotExist):
- raise Http404
-
- phrases = Phrase.objects.filter(entry_count__gt=1,
- entry_count__lt=10, entries=entry)
-# Equivalent query without denormalization:
-# phrases = list(Phrase.objects.raw("""
-# SELECT sub.* FROM
-# (SELECT p.id, p.phrase, COUNT(pe2.diaryentry_id) AS entry_count FROM
-# afg_phrase_entries pe2, afg_phrase p
-# INNER JOIN afg_phrase_entries pe1 ON pe1.phrase_id = p.id
-# WHERE pe1.diaryentry_id=%s AND p.id=pe2.phrase_id
-# GROUP BY p.phrase, p.id) AS sub
-# WHERE entry_count > 1 AND entry_count < 10;
-# """, [entry.id]))
-
- phrase_ids = [p.id for p in phrases]
-
- dest_ids = defaultdict(list)
- if phrase_ids:
- cursor = connection.cursor()
- # Using modulus not params here because we need to do funky literalizing of
- # the table
- cursor.execute("""
- SELECT pe.phrase_id, d.id FROM afg_phrase_entries pe
- INNER JOIN afg_diaryentry d ON pe.diaryentry_id=d.id
- WHERE pe.phrase_id IN (SELECT * FROM (VALUES %s) AS phrase_id_set);
- """ % (",".join("(%s)" % i for i in phrase_ids)))
- for row in cursor.fetchall():
- dest_ids[int(row[0])].append(row[1])
-
- phrase_entries = [(phrase, dest_ids[phrase.id]) for phrase in phrases]
+ raise Http404
if api:
return utils.render_json(request, {
'entry': entry.to_dict(),
- 'phrase_entries': [{
- 'phrase': p.phrase,
- 'entry_ids': ids,
- } for p, ids in phrase_entries],
})
return utils.render_request(request, template, {
'entry': entry,
- 'phrase_entries': phrase_entries,
})
def entry_popup(request):
try:
- rids = [int(r) for r in request.GET.get('rids').split(',')]
+ rids = [r for r in request.GET.get('rids').split(',')]
clicked = request.GET.get('clicked')
join_to = request.GET.get('entry')
texts = [urllib.unquote(t) for t in request.GET.get('texts').split(',')]
View
@@ -109,9 +109,9 @@ function displayText(tokens, div, popup_url, loading_text) {
pop.offset(newOffset); // chrome/safari bug; have to do this twice
var ids = [];
$.each(this.className.split(' '), function(i, classname) {
- var match = /key(\d+)/.exec(classname);
+ var match = /key(.+)/.exec(classname);
if (match) {
- ids.push(parseInt(match[1]));
+ ids.push(match[1]);
}
});

0 comments on commit bca7c3e

Please sign in to comment.