Improve user ID randomization, avoid non-ISBN items

By creating a separate table, we can generate a much more random string to use as a surrogate for the user, rather than directly translating their user ID into a hash or including the timestamp as salt (where the timestamp, as a static value, could be brute-forced). Also avoid generating item entries where we have no ISBNs because we consequently have no matchpoints, either. Signed-off-by: Dan Scott <dan@coffeecode.net>
wdenton · Feb 17, 2012 · 26d40c6 · 26d40c6
1 parent d513965
commit 26d40c6
Show file tree

Hide file tree

Showing 2 changed files with 33 additions and 11 deletions.
diff --git a/README.txt b/README.txt
@@ -0,0 +1,13 @@
+Evergreen Level 0 Data Extract
+==============================
+
+Horribly simple approach, good enough for demonstration purposes for getting
+data to build a recommendation engine. Just plain old SQL to run against the
+Evergreen database.
+
+You will need to update:
+
+1. Institution IDs from which you want to extract data (we're assuming
+   a hierarchy where a parent institution contains the children from
+   which you want data)
+2. Output file names to something not so hardcoded.
diff --git a/evergreen/extract_data_lvl_0.sql b/evergreen/extract_data_lvl_0.sql
@@ -3,26 +3,35 @@
 --   aou.parent_ou: 105 = LUSYS, 106 = WINDSYS
 --   Persistent URL is lame but better than pointing at the JSPAC;
 --   - need to find out what this is actually used for
-
-COPY (SELECT acp.id, array_to_string(rsr.isbn, '|') AS isbns, rsr.title, rsr.author, rsr.publisher, rsr.pubdate, 'http://laurentian.concat.ca/opac/extras/supercat/retrieve/marcxml-full/record/' || rsr.id AS "Persistent URL"
+COPY (SELECT DISTINCT acp.id, array_to_string(rsr.isbn, '|') AS isbns, rsr.title,
+    rsr.author, rsr.publisher, rsr.pubdate,
+    'http://laurentian.concat.ca/opac/extras/supercat/retrieve/marcxml-full/record/' || rsr.id AS "Persistent URL"
     FROM asset.copy acp
         INNER JOIN asset.call_number acn ON acn.id = acp.call_number
         INNER JOIN reporter.materialized_simple_record rsr ON rsr.id = acn.record
         INNER JOIN action.circulation acirc ON acirc.target_copy = acp.id
         INNER JOIN actor.org_unit aou ON aou.id = acirc.circ_lib
-    WHERE acirc.xact_start < NOW() - '1 year'::interval
+    WHERE acirc.xact_start > NOW() - '1 year'::interval
         AND aou.parent_ou IN (105, 106)
-) TO '/tmp/items_conifer.txt' NULL '';
+        AND array_to_string(rsr.isbn, '') != ''
+) TO '/tmp/conifer.items.txt' NULL '';
+
+-- Create a table of randomized values for user IDs
+DROP TABLE IF EXISTS scratchpad.random_user_id;
+CREATE TABLE scratchpad.random_user_id (id BIGINT, rand_id TEXT);
+INSERT INTO scratchpad.random_user_id (id, rand_id) SELECT au.id, md5(random()::text || md5(random()::text)) FROM actor.usr au;
+CREATE INDEX CONCURRENTLY ON scratchpad.random_user_id(id);
 
--- Gets the raw transaction data
--- "Randomizes" the user ID with MD5 hex digest
-COPY (SELECT EXTRACT(epoch FROM acirc.xact_start) AS "timestamp", acp.id AS "Item ID", md5(md5(extract(epoch FROM NOW())::text) || au.id::text) AS "User ID"
+-- Gets the raw transaction data with randomized user IDs
+COPY (SELECT DISTINCT EXTRACT(epoch FROM acirc.xact_start) AS "timestamp", acp.id AS "Item ID", scruid.rand_id AS "User ID"
     FROM action.circulation acirc
         INNER JOIN asset.copy acp ON acp.id = acirc.target_copy
-        INNER JOIN actor.usr au ON au.id = acirc.usr
+        INNER JOIN asset.call_number acn ON acn.id = acp.call_number
+        INNER JOIN reporter.materialized_simple_record rsr ON rsr.id = acn.record
         INNER JOIN actor.org_unit aou ON aou.id = acirc.circ_lib
-    WHERE acirc.xact_start < NOW() - '1 year'::interval
+        INNER JOIN scratchpad.random_user_id scruid ON scruid.id = acirc.usr
+    WHERE acirc.xact_start > NOW() - '1 year'::interval
         AND aou.parent_ou IN (105, 106)
+        AND array_to_string(rsr.isbn, '') != ''
         ORDER BY 1 DESC
-        LIMIT 10
-) TO '/tmp/transactions_conifer.txt' NULL '';
+) TO '/tmp/conifer.transactions.txt' NULL '';