Permalink
Browse files

Improve user ID randomization, avoid non-ISBN items

By creating a separate table, we can generate a much more random string
to use as a surrogate for the user, rather than directly translating
their user ID into a hash or including the timestamp as salt (where the
timestamp, as a static value, could be brute-forced).

Also avoid generating item entries where we have no ISBNs because we
consequently have no matchpoints, either.

Signed-off-by: Dan Scott <dan@coffeecode.net>
  • Loading branch information...
1 parent d513965 commit 26d40c6e8b1732d65db12728fd497412a86b5b3c @dbs dbs committed with Feb 13, 2012
Showing with 33 additions and 11 deletions.
  1. +13 −0 README.txt
  2. +20 −11 evergreen/extract_data_lvl_0.sql
View
@@ -0,0 +1,13 @@
+Evergreen Level 0 Data Extract
+==============================
+
+Horribly simple approach, good enough for demonstration purposes for getting
+data to build a recommendation engine. Just plain old SQL to run against the
+Evergreen database.
+
+You will need to update:
+
+1. Institution IDs from which you want to extract data (we're assuming
+ a hierarchy where a parent institution contains the children from
+ which you want data)
+2. Output file names to something not so hardcoded.
@@ -3,26 +3,35 @@
-- aou.parent_ou: 105 = LUSYS, 106 = WINDSYS
-- Persistent URL is lame but better than pointing at the JSPAC;
-- - need to find out what this is actually used for
-
-COPY (SELECT acp.id, array_to_string(rsr.isbn, '|') AS isbns, rsr.title, rsr.author, rsr.publisher, rsr.pubdate, 'http://laurentian.concat.ca/opac/extras/supercat/retrieve/marcxml-full/record/' || rsr.id AS "Persistent URL"
+COPY (SELECT DISTINCT acp.id, array_to_string(rsr.isbn, '|') AS isbns, rsr.title,
+ rsr.author, rsr.publisher, rsr.pubdate,
+ 'http://laurentian.concat.ca/opac/extras/supercat/retrieve/marcxml-full/record/' || rsr.id AS "Persistent URL"
FROM asset.copy acp
INNER JOIN asset.call_number acn ON acn.id = acp.call_number
INNER JOIN reporter.materialized_simple_record rsr ON rsr.id = acn.record
INNER JOIN action.circulation acirc ON acirc.target_copy = acp.id
INNER JOIN actor.org_unit aou ON aou.id = acirc.circ_lib
- WHERE acirc.xact_start < NOW() - '1 year'::interval
+ WHERE acirc.xact_start > NOW() - '1 year'::interval
AND aou.parent_ou IN (105, 106)
-) TO '/tmp/items_conifer.txt' NULL '';
+ AND array_to_string(rsr.isbn, '') != ''
+) TO '/tmp/conifer.items.txt' NULL '';
+
+-- Create a table of randomized values for user IDs
+DROP TABLE IF EXISTS scratchpad.random_user_id;
+CREATE TABLE scratchpad.random_user_id (id BIGINT, rand_id TEXT);
+INSERT INTO scratchpad.random_user_id (id, rand_id) SELECT au.id, md5(random()::text || md5(random()::text)) FROM actor.usr au;
+CREATE INDEX CONCURRENTLY ON scratchpad.random_user_id(id);
--- Gets the raw transaction data
--- "Randomizes" the user ID with MD5 hex digest
-COPY (SELECT EXTRACT(epoch FROM acirc.xact_start) AS "timestamp", acp.id AS "Item ID", md5(md5(extract(epoch FROM NOW())::text) || au.id::text) AS "User ID"
+-- Gets the raw transaction data with randomized user IDs
+COPY (SELECT DISTINCT EXTRACT(epoch FROM acirc.xact_start) AS "timestamp", acp.id AS "Item ID", scruid.rand_id AS "User ID"
FROM action.circulation acirc
INNER JOIN asset.copy acp ON acp.id = acirc.target_copy
- INNER JOIN actor.usr au ON au.id = acirc.usr
+ INNER JOIN asset.call_number acn ON acn.id = acp.call_number
+ INNER JOIN reporter.materialized_simple_record rsr ON rsr.id = acn.record
INNER JOIN actor.org_unit aou ON aou.id = acirc.circ_lib
- WHERE acirc.xact_start < NOW() - '1 year'::interval
+ INNER JOIN scratchpad.random_user_id scruid ON scruid.id = acirc.usr
+ WHERE acirc.xact_start > NOW() - '1 year'::interval
AND aou.parent_ou IN (105, 106)
+ AND array_to_string(rsr.isbn, '') != ''
ORDER BY 1 DESC
- LIMIT 10
-) TO '/tmp/transactions_conifer.txt' NULL '';
+) TO '/tmp/conifer.transactions.txt' NULL '';

0 comments on commit 26d40c6

Please sign in to comment.