Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse code

Improve user ID randomization, avoid non-ISBN items

By creating a separate table, we can generate a much more random string
to use as a surrogate for the user, rather than directly translating
their user ID into a hash or including the timestamp as salt (where the
timestamp, as a static value, could be brute-forced).

Also avoid generating item entries for records that have no ISBNs, because
without ISBNs we have no matchpoints for them either.

Signed-off-by: Dan Scott <dan@coffeecode.net>
  • Loading branch information...
commit 26d40c6e8b1732d65db12728fd497412a86b5b3c 1 parent d513965
Dan Scott dbs authored committed

Showing 2 changed files with 33 additions and 11 deletions. Show diff stats Hide diff stats

  1. +13 0 README.txt
  2. +20 11 evergreen/extract_data_lvl_0.sql
13 README.txt
... ... @@ -0,0 +1,13 @@
  1 +Evergreen Level 0 Data Extract
  2 +==============================
  3 +
  4 +A horribly simple approach, but good enough to demonstrate extracting the
  5 +data needed to build a recommendation engine: just plain old SQL to run
  6 +against the Evergreen database.
  7 +
  8 +You will need to update:
  9 +
  10 +1. Institution IDs from which you want to extract data (we're assuming
  11 + a hierarchy where a parent institution contains the children from
  12 + which you want data)
  13 +2. Output file names (currently hardcoded to files under /tmp/).
31 evergreen/extract_data_lvl_0.sql
@@ -3,26 +3,35 @@
3 3 -- aou.parent_ou: 105 = LUSYS, 106 = WINDSYS
4 4 -- Persistent URL is lame but better than pointing at the JSPAC;
5 5 -- - need to find out what this is actually used for
6   -
7   -COPY (SELECT acp.id, array_to_string(rsr.isbn, '|') AS isbns, rsr.title, rsr.author, rsr.publisher, rsr.pubdate, 'http://laurentian.concat.ca/opac/extras/supercat/retrieve/marcxml-full/record/' || rsr.id AS "Persistent URL"
  6 +COPY (SELECT DISTINCT acp.id, array_to_string(rsr.isbn, '|') AS isbns, rsr.title,
  7 + rsr.author, rsr.publisher, rsr.pubdate,
  8 + 'http://laurentian.concat.ca/opac/extras/supercat/retrieve/marcxml-full/record/' || rsr.id AS "Persistent URL"
8 9 FROM asset.copy acp
9 10 INNER JOIN asset.call_number acn ON acn.id = acp.call_number
10 11 INNER JOIN reporter.materialized_simple_record rsr ON rsr.id = acn.record
11 12 INNER JOIN action.circulation acirc ON acirc.target_copy = acp.id
12 13 INNER JOIN actor.org_unit aou ON aou.id = acirc.circ_lib
13   - WHERE acirc.xact_start < NOW() - '1 year'::interval
  14 + WHERE acirc.xact_start > NOW() - '1 year'::interval
14 15 AND aou.parent_ou IN (105, 106)
15   -) TO '/tmp/items_conifer.txt' NULL '';
  16 + AND array_to_string(rsr.isbn, '') != ''
  17 +) TO '/tmp/conifer.items.txt' NULL '';
  18 +
  19 +-- Create a table of randomized values for user IDs
  20 +DROP TABLE IF EXISTS scratchpad.random_user_id;
  21 +CREATE TABLE scratchpad.random_user_id (id BIGINT, rand_id TEXT);
  22 +INSERT INTO scratchpad.random_user_id (id, rand_id) SELECT au.id, md5(random()::text || md5(random()::text)) FROM actor.usr au;
  23 +CREATE INDEX CONCURRENTLY ON scratchpad.random_user_id(id);
16 24
17   --- Gets the raw transaction data
18   --- "Randomizes" the user ID with MD5 hex digest
19   -COPY (SELECT EXTRACT(epoch FROM acirc.xact_start) AS "timestamp", acp.id AS "Item ID", md5(md5(extract(epoch FROM NOW())::text) || au.id::text) AS "User ID"
  25 +-- Gets the raw transaction data with randomized user IDs
  26 +COPY (SELECT DISTINCT EXTRACT(epoch FROM acirc.xact_start) AS "timestamp", acp.id AS "Item ID", scruid.rand_id AS "User ID"
20 27 FROM action.circulation acirc
21 28 INNER JOIN asset.copy acp ON acp.id = acirc.target_copy
22   - INNER JOIN actor.usr au ON au.id = acirc.usr
  29 + INNER JOIN asset.call_number acn ON acn.id = acp.call_number
  30 + INNER JOIN reporter.materialized_simple_record rsr ON rsr.id = acn.record
23 31 INNER JOIN actor.org_unit aou ON aou.id = acirc.circ_lib
24   - WHERE acirc.xact_start < NOW() - '1 year'::interval
  32 + INNER JOIN scratchpad.random_user_id scruid ON scruid.id = acirc.usr
  33 + WHERE acirc.xact_start > NOW() - '1 year'::interval
25 34 AND aou.parent_ou IN (105, 106)
  35 + AND array_to_string(rsr.isbn, '') != ''
26 36 ORDER BY 1 DESC
27   - LIMIT 10
28   -) TO '/tmp/transactions_conifer.txt' NULL '';
  37 +) TO '/tmp/conifer.transactions.txt' NULL '';

0 comments on commit 26d40c6

Please sign in to comment.
Something went wrong with that request. Please try again.