-
Notifications
You must be signed in to change notification settings - Fork 19
/
importer.clj
196 lines (175 loc) · 8.19 KB
/
importer.clj
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
; Copyright 2020 Mark Wardle and Eldrix Ltd
;
; Licensed under the Apache License, Version 2.0 (the "License");
; you may not use this file except in compliance with the License.
; You may obtain a copy of the License at
;
; http://www.apache.org/licenses/LICENSE-2.0
;
; Unless required by applicable law or agreed to in writing, software
; distributed under the License is distributed on an "AS IS" BASIS,
; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
; See the License for the specific language governing permissions and
; limitations under the License.
;;;;
(ns com.eldrix.hermes.importer
"Provides import functionality for processing directories of files"
(:require
[cheshire.core :as json]
[clojure.core.async :as async]
[clojure.java.io :as io]
[clojure.string :as str]
[clojure.tools.logging.readable :as log]
[com.eldrix.hermes.snomed :as snomed])
(:import (java.io File)
(com.fasterxml.jackson.core JsonParseException)))
(defn is-snomed-file?
  "Parses the file name of `f` as a SNOMED release file name.
  `f` is coerced with `clojure.java.io/file`; only the final path segment
  (the file name, not its directory) is handed to
  `snomed/parse-snomed-filename`, whose result is returned unchanged."
  [f]
  (-> f
      io/file
      .getName
      snomed/parse-snomed-filename))
(defn snomed-file-seq
  "Walks `dir` recursively and returns a lazy sequence of maps, one for each
  file whose path is recognised as a SNOMED CT data file.
  Each map is the result of `snomed/parse-snomed-filename` applied to the path
  of the file; entries without a `:component` are dropped. The map carries the
  SNOMED information encoded in the filename as per the
  [release file documentation](https://confluence.ihtsdotools.org/display/DOCRELFMT/3.3.2+Release+File+Naming+Convention),
  with additional keys:
  path            : the path of the file,
  component       : the canonical name of the SNOMED component (e.g. 'Concept', 'SimpleRefset')
  component-order : the sort order as defined by component type"
  [dir]
  (for [^File f (file-seq (io/file dir))
        :let [snofile (snomed/parse-snomed-filename (.getPath f))]
        :when (:component snofile)]
    snofile))
(defn importable-files
  "Returns a lazy sequence of the importable files found under `dir`.
  A file from `snomed-file-seq` is importable when its `:release-type` is
  \"Snapshot\" and it has a `:parser`."
  [dir]
  (let [snapshot-with-parser? (fn [snofile]
                                (and (= "Snapshot" (:release-type snofile))
                                     (:parser snofile)))]
    (filter snapshot-with-parser? (snomed-file-seq dir))))
(defn read-metadata
  "Reads the release metadata from the JSON file `f`, returning a map with
  keywordised keys.
  Unfortunately, some UK releases have invalid JSON in their metadata, so
  we log a warning and avoid throwing an exception; the result then contains
  the default name and an `:error` entry.
  Raised as issue #34057 with NHS Digital.
  Unfortunately the *name* of the release is not included currently, but as the
  metadata file exists at the root of the release, we can guess the name from
  the parent directory and use that if a 'name' isn't in the metadata.
  Raised as issue #32991 with Snomed International."
  [^File f]
  ;; Resolve the parent via the absolute file: a relative File with no
  ;; directory part (e.g. `(io/file \"release_package_information.json\")`)
  ;; has a nil parent, which previously caused a NullPointerException.
  (let [parent (.getParentFile (.getAbsoluteFile f))
        default {:name (some-> parent .getName)}]
    (try (merge default (json/parse-string (slurp f) true))
         (catch JsonParseException _e
           (log/warn "invalid metadata in distribution file" (:name default))
           (assoc default :error "invalid metadata: invalid json in file")))))
(defn metadata-files
  "Finds the release package information files beneath `dir`.
  Walks the directory tree and returns a lazy sequence of `java.io.File`, one
  for every file named exactly `release_package_information.json`.
  These files have been issued since the July 2020 International edition release."
  [dir]
  (for [^File f (file-seq (io/file dir))
        :when (= "release_package_information.json" (.getName f))]
    f))
(defn all-metadata
  "Returns the release metadata for every release package information file
  found beneath `dir`, as a fully realised sequence: each file located by
  `metadata-files` is read eagerly with `read-metadata`."
  [dir]
  (let [files (metadata-files dir)]
    (doall (map read-metadata files))))
(defn- process-file
  "Process the specified file, streaming batched results to the channel
  specified, blocking if channel not being drained.
  The file is read line by line and each line is split on tab characters. The
  first line is treated as the headings; the remaining lines are grouped into
  batches of at most `batch-size` rows (default 1000).
  Each batch is a map with keys
  - :type     : the `:identifier` parsed from the filename
  - :parser   : a parser that can take each row and give you data
  - :headings : a sequence of headings from the original file
  - :data     : a sequence of vectors representing each column.
  Nothing is sent if the filename has no associated parser.
  Throws InterruptedException if the output channel is closed while sending."
  [filename out-c & {:keys [batch-size] :or {batch-size 1000}}]
  (with-open [reader (io/reader filename)]
    (let [snofile (snomed/parse-snomed-filename filename)
          parser (:parser snofile)]
      (when parser
        ;; NOTE(review): `str/split` without a limit drops trailing empty
        ;; strings, so a row ending in empty columns yields fewer fields -
        ;; confirm whether the parsers rely on this before passing a limit of -1.
        (let [csv-data (map #(str/split % #"\t") (line-seq reader))
              headings (first csv-data)
              data (rest csv-data)
              batches (->> data
                           (partition-all batch-size)
                           (map #(hash-map :type (:identifier snofile)
                                           :parser parser
                                           :headings headings
                                           :data %)))]
          (log/info "Processing: " filename " type: " (:component snofile))
          ;; Batches are counted as they are sent. Counting the lazy sequence
          ;; up front (as the debug log used to) realised every row of the
          ;; file in memory before the first batch was put on the channel.
          (loop [remaining (seq batches)
                 sent 0]
            (if-let [batch (first remaining)]
              (do
                (log/debug "Processing batch " {:batch (dissoc batch :data) :first-data (-> batch :data first)})
                (when-not (async/>!! out-c batch)
                  (log/debug "Processing cancelled (output channel closed)")
                  (throw (InterruptedException. "process cancelled")))
                (recur (next remaining) (inc sent)))
              (log/debug "Processed " sent " batches"))))))))
(defn test-csv
  "Checks that every data row of the tab-delimited file `filename` can be
  parsed by the parser associated with that filename.
  The first line is printed and is not passed to the parser; its column count
  is used as the expected count for all later lines, and any line with a
  different count is reported on standard output.
  If the parser throws, an `ex-info` is thrown carrying `:filename`, `:line`
  (zero-based line index), `:content` and `:error` (the ex-data of the
  original exception, if any), with the original exception as its cause.
  Prints a message and does nothing else if there is no parser for the file.
  Returns nil."
  [filename]
  (with-open [rdr (io/reader filename)]
    (if-let [parser (:parser (snomed/parse-snomed-filename filename))]
      (loop [i 0
             n 0
             data (map #(str/split % #"\t") (line-seq rdr))]
        (when-let [line (first data)]
          (when (= i 0)
            (println "Processing " filename "\n" line))
          (try
            (when (> i 0) (parser line))
            (catch Throwable e
              ;; Chain the original exception: `ex-data` is nil for ordinary
              ;; exceptions, so without the cause the real failure is lost.
              (throw (ex-info "Error parsing file"
                              {:filename filename
                               :line i :content line
                               :error (ex-data e)}
                              e))))
          (when (and (not= 0 n) (not= n (count line)))
            (println "incorrect number of columns; expected" n " got:" (count line)))
          (recur (inc i)
                 ;; the first line seen fixes the expected number of columns
                 (if (= n 0) (long (count line)) n)
                 (next data))))
      (println "no parser for file: " filename))))
(defn test-all-csv
  "Runs `test-csv` against the `:path` of every file returned by
  `importable-files` for `dir`, one file after another. Returns nil."
  [dir]
  (run! test-csv (map :path (importable-files dir))))
(defn load-snomed-files
  "Imports a SNOMED-CT distribution from the specified files, returning a
  channel of parsed batches which will be closed once all files have been
  sent through. Any exceptions will be passed on the channel.
  `files` is a collection of maps each with a `:path`.
  Options:
  - :nthreads   : parallelism of the parsing pipeline (default 4)
  - :batch-size : number of rows per batch (default 5000)"
  [files & {:keys [nthreads batch-size] :or {nthreads 4 batch-size 5000}}]
  (let [;; raw batches: :type, :headings and :data, with :data as raw strings
        raw-ch (async/chan)
        ;; parsed batches: as above, after `snomed/parse-batch`
        parsed-ch (async/chan)
        ;; logs a pipeline failure and returns it so that it is placed on the output
        on-pipeline-error (fn ex-handler [err]
                            (log/debug "Error during import pipeline: " (ex-data err))
                            err)]
    ;; Reader thread: streams each file in turn into the raw channel; a
    ;; failure is forwarded to the output channel, and the raw channel is
    ;; closed afterwards either way.
    (async/thread
      (log/info "Processing " (count files) " files")
      (try
        (doseq [file files]
          (process-file (:path file) raw-ch :batch-size batch-size))
        (catch Throwable e
          (log/debug "Error during raw SNOMED file import: " e)
          (async/>!! parsed-ch e)))
      (async/close! raw-ch))
    ;; Parsing stage: the output channel is closed when the raw channel closes.
    (async/pipeline-blocking nthreads
                             parsed-ch
                             (map snomed/parse-batch)
                             raw-ch
                             true
                             on-pipeline-error)
    parsed-ch))
(defn load-snomed
  "Imports a SNOMED-CT distribution from the specified directory, returning
  results on the returned channel which will be closed once all files have been
  sent through. Any exceptions will be passed on the channel.
  `opts` are the keyword options accepted by `load-snomed-files`
  (`:nthreads`, `:batch-size`).
  This streams data in a single pass; in general usage you will usually want
  to stream data in multiple passes."
  [dir & opts]
  (let [files (snomed-file-seq dir)]
    ;; `opts` is the rest-args sequence, so it must be spread with `apply`;
    ;; passing it as a single argument meant the options never reached
    ;; `load-snomed-files` as keyword arguments.
    (apply load-snomed-files files opts)))
(defn examine-distribution-files
  "Loads the distribution in `dir` and prints running totals of the number of
  rows seen for each batch `:type`, followed by the final totals once the
  results channel is closed.
  Exceptions delivered on the results channel are printed and are not
  included in the counts; the channel continues to be drained afterwards.
  Returns nil."
  [dir]
  (let [results-c (load-snomed dir :batch-size 5000)]
    (loop [counts {}
           batch (async/<!! results-c)]
      (cond
        ;; channel closed: nothing more to read
        (not batch)
        (do (print "Total statistics: \n" counts)
            (flush))

        ;; errors are passed on the channel as values; report them rather
        ;; than silently tallying them as a nil type with zero rows
        (instance? Throwable batch)
        (do (println "Error during import:" batch)
            (recur counts (async/<!! results-c)))

        :else
        (do (print counts "\r")
            ;; `print` does not flush, so flush to make the progress line visible
            (flush)
            (recur
              (merge-with + counts {(:type batch) (count (:data batch))})
              (async/<!! results-c)))))))
;; REPL scratchpad. The forms below are wrapped in `comment`, so they are not
;; evaluated when the namespace is loaded; evaluate them one at a time from an
;; editor-connected REPL.
;; NOTE: the path bound to `filename` is a hard-coded absolute path to a local
;; download - replace it with a file that exists on your own machine.
(comment
(snomed/parse-snomed-filename "sct2_Concept_Full_INT_20190731.txt")
(def filename "/Users/mark/Downloads/uk_sct2cl_30.0.0_20200805000001/SnomedCT_InternationalRF2_PRODUCTION_20190731T120000Z/Snapshot/Refset/Map/der2_iisssccRefset_ExtendedMapSnapshot_INT_20190731.txt")
(test-csv filename))