Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.
Download ZIP
Browse files

importer speed improvements

  • Loading branch information...
commit 67f49c271c008aa9cee337372a633f0684be3d1b 1 parent 3e1599c
@yourcelf authored
Showing with 21 additions and 13 deletions.
  1. +21 −13 afg/management/commands/import_wikileaks.py
View
34 afg/management/commands/import_wikileaks.py
@@ -66,6 +66,7 @@ def handle(self, *args, **kwargs):
"""
return
+ phrases = defaultdict(set)
with open(args[0]) as fh:
reader = csv.reader(fh)
for c, row in enumerate(reader):
@@ -73,12 +74,6 @@ def handle(self, *args, **kwargs):
for i in range(13, 22):
row[i] = int(row[i] or 0)
kwargs = dict(zip(fields, row))
- try:
- DiaryEntry.objects.get(report_key=kwargs['report_key'])
- continue
- except DiaryEntry.DoesNotExist:
- pass
-
kwargs['summary'] = clean_summary(kwargs['summary'])
kwargs['latitude'] = float(kwargs['latitude']) if kwargs['latitude'] else None
kwargs['longitude'] = float(kwargs['longitude']) if kwargs['longitude'] else None
@@ -91,12 +86,25 @@ def handle(self, *args, **kwargs):
words = summary.split(' ')
for i in range(3, 1, -1):
for j in range(i, len(words)):
- phrase, created = Phrase.objects.get_or_create(phrase=" ".join(words[j-i:j])[:255])
- entry.phrase_set.add(phrase)
-
- # dennormalize entry counts
+ print entry.id
+ phrases[" ".join(words[j-i:j])].add(entry.id)
+
+ n = len(phrases)
+ phrase_mappings = []
+ phrase_counts = []
+ for c, (phrase, entry_ids) in enumerate(phrases.iteritems()):
+ if len(entry_ids) > 1:
+ print c, n
+ phrase = Phrase.objects.create(phrase=phrase)
+ phrase_id = phrase.id
+ phrase_counts.append((len(entry_ids), phrase_id))
+ for entry_id in entry_ids:
+ phrase_mappings.append((phrase_id, entry_id))
+
+ # Quickly as possible, update phrase mappings.
cursor = connection.cursor()
- cursor.execute("""
-UPDATE afg_phrase SET entry_count = (SELECT COUNT(pe.*) FROM afg_phrase_entries pe WHERE pe.phrase_id = afg_phrase.id);
- """)
+ print 'phrase_entries...'
+ cursor.executemany("INSERT INTO afg_phrase_entries (phrase_id, diaryentry_id) VALUES (%s, %s)", phrase_mappings)
+ print 'phrase entry counts...'
+ cursor.executemany("""UPDATE afg_phrase SET entry_count=%s WHERE id=%s""", phrase_counts)
transaction.commit_unless_managed()
Please sign in to comment.
Something went wrong with that request. Please try again.