Permalink
Fetching contributors…
Cannot retrieve contributors at this time
262 lines (218 sloc) 7.87 KB

Contents

Namespace: thi.ng.fabric.facts.io.ntriples

This namespace provides a simple NTriples parser without any other dependencies.

Implementation notes

  • All subjects & predicates are always returned as strings
  • Object URIs and string literals are returned as strings
  • Object string literals with a language tag or an unknown type URI are returned as a map
  • BNode IDs are returned as is (as strings), i.e. no ID mangling takes place; BNodes can be identified via their : char prefix
  • Line comments are supported (# comment text...)

Object string literal handling

See http://www.w3.org/TR/n-triples/#sec-literals for details about literal type mapping.

Language tags

If an object literal string has an attached language tag, it is returned as a map with these keys:

  • :lit : original parsed string
  • :lang : language tag (excluding @ prefix)

Type mapping

The parser provides an extensible type conversion mechanism via its literal-value multimethod. By default, only boolean and common numeric type conversions are implemented. If a conversion isn’t available or fails, the typed object literal is returned as a map with these keys:

  • :lit : original parsed string
  • :type : type URI
(defn vocab-map
  "Returns a map associating every term in `xs` with the string built
  by appending the term's name to the `uri` prefix."
  [uri & xs]
  (zipmap xs (map #(str uri (name %)) xs)))

;; XML Schema datatype URIs, keyed by their local name.
(def xsd
  (vocab-map
   "http://www.w3.org/2001/XMLSchema#"
   ;; boolean
   :boolean
   ;; integral types
   :byte :short :int :integer :long
   ;; fractional types
   :float :double :decimal))

;; Converts the string form of a typed literal into a native value.
;; Called as (literal-value lit type-uri); dispatch is on the type URI only.
(defmulti literal-value (fn [_ type-uri] type-uri))

;; No conversion registered for the given type URI.
(defmethod literal-value :default [_ _] nil)

;; Boolean
(defmethod literal-value (xsd :boolean) [lit _] (strf/parse-boolean lit))

;; Integral types (parsed with radix 10)
(defmethod literal-value (xsd :byte)    [lit _] (strf/parse-int lit 10))
(defmethod literal-value (xsd :short)   [lit _] (strf/parse-int lit 10))
(defmethod literal-value (xsd :int)     [lit _] (strf/parse-int lit 10))
(defmethod literal-value (xsd :integer) [lit _] (strf/parse-int lit 10))
(defmethod literal-value (xsd :long)    [lit _] (strf/parse-long lit 10))

;; Fractional types
(defmethod literal-value (xsd :float)   [lit _] (strf/parse-float lit))
(defmethod literal-value (xsd :double)  [lit _] (strf/parse-double lit))
(defmethod literal-value (xsd :decimal) [lit _] (strf/parse-double lit))

BNode handling

Using the map-bnodes function below, RDF BNodes in the graph can be converted to alternative representations. If no ID generator fn is given, BNode identifiers will be converted into UUIDv4s.

The function consumes a seq of parsed NTriples and returns the seq with transformed BNodes.

(defn bnodes
  "Returns the set of all strings starting with \"_:\" found among the
  elements of the given facts (a seq of seqs)."
  [facts]
  ;; NOTE(review): parse-subject / parse-object in this file hand the
  ;; stream *after* the leading `_` to parse-bnode, so parser output
  ;; appears to be of the form \":id\" rather than \"_:id\" - confirm
  ;; which prefix is intended to be matched here.
  (let [bnode? (fn [term]
                 (and (string? term)
                      (zero? (.indexOf ^String term "_:"))))]
    (into #{} (comp cat (filter bnode?)) facts)))

(defn map-bnodes
  "Replaces every BNode (as found by `bnodes`) in `facts` with an ID
  obtained by calling the no-arg fn `idgen` (default: strf/new-uuid).
  The same BNode always maps to the same new ID. Returns a seq of
  vectors; all other fact elements are passed through unchanged."
  ([facts]
   (map-bnodes strf/new-uuid facts))
  ([idgen facts]
   (let [fresh-ids (repeatedly idgen)
         renames   (zipmap (bnodes facts) fresh-ids)
         rename    (fn [term] (get renames term term))]
     (map (fn [fact] (mapv rename fact)) facts))))

Parser

#?(:clj
   (defn hex->str
     "Takes a seq of hex digit chars and returns the string for the
     Unicode code point they denote."
     [chars]
     (-> (Integer/parseInt (apply str chars) 16)
         (Character/toChars)
         (String.)))
   :cljs
   (defn hex->str
     "Takes a seq of hex digit chars and returns the string for the
     Unicode code point they denote."
     [chars]
     (let [cp (js/parseInt (apply str chars) 16)]
       (if (> cp 0xFFFF)
         ;; String.fromCharCode only takes 16-bit code units, so code
         ;; points outside the BMP must be emitted as a surrogate pair
         (let [off (- cp 0x10000)]
           (.fromCharCode js/String
                          (+ 0xD800 (bit-shift-right off 10))
                          (+ 0xDC00 (bit-and off 0x3FF))))
         (.fromCharCode js/String cp)))))

;; Whitespace chars allowed between the terms of a triple.
;; Includes carriage return so that CRLF line endings are accepted.
(def WS #{\space \tab \newline \return})

(defn parse-escape
  "Takes escape seq w/o leading backslash. Returns vector of
  [decoded stream-after], where decoded is a char or string.
  Supported escapes: t b n r f, double quote, single quote, backslash
  and the hex forms xHH, uHHHH and UHHHHHHHH.
  Calls err/illegal-arg! for any other escape char."
  [str]
  (condp = (first str)
    \x [(hex->str (take 2 (next str))) (drop 2 (next str))]
    \u [(hex->str (take 4 (next str))) (drop 4 (next str))]
    ;; 8 digit form, needed for code points beyond U+FFFF
    \U [(hex->str (take 8 (next str))) (drop 8 (next str))]
    \n ["\n" (next str)]
    \r ["\r" (next str)]
    \b ["\b" (next str)]
    \t ["\t" (next str)]
    \f ["\f" (next str)]
    \" [\" (next str)]
    \' [\' (next str)]
    \\ [\\ (next str)]
    (err/illegal-arg! "illegal escape sequence")))

(defn discard-until
  "Skips chars of `stream` until it is exhausted or `stop` returns
  truthy for the current char. Returns the stream from the stop char
  on (nil if exhausted). A `#` char starts a line comment, which is
  skipped up to and including the next newline. Every other skipped
  char must satisfy `allowed`, else err/illegal-arg! is called."
  [stream allowed stop]
  (loop [stream (seq stream)]
    (let [ch (first stream)]
      (cond
        (or (nil? ch) (stop ch))
        stream

        ;; Skip the rest of the line in one go. Comment text is not
        ;; interpreted, so a further `#` inside the comment must not
        ;; start a nested comment (which would swallow the next line).
        (= \# ch)
        (recur (next (drop-while #(not= \newline %) (next stream))))

        (allowed ch)
        (recur (next stream))

        :else
        (err/illegal-arg! (str "illegal character: " ch))))))

(defn read-until
  "Collects chars of `stream` into a string until the stream is
  exhausted or `stop` returns truthy for the current char. Backslash
  escape seqs are decoded via parse-escape.
  Returns [token stream-after], with stream-after starting at the
  stop char."
  [stream stop]
  (loop [acc (transient [])
         in  (seq stream)]
    (let [ch (first in)]
      (cond
        (or (nil? ch) (stop ch))
        [(apply str (persistent! acc)) in]

        (= \\ ch)
        (let [[decoded after-esc] (parse-escape (next in))]
          (recur (conj! acc decoded) after-esc))

        :else
        (recur (conj! acc ch) (next in))))))

(defn parse-uri
  "Reads a URI from `str` up to (excluding) the closing `>`.
  Returns [uri stream-after]."
  [str]
  (let [closing? (fn [ch] (= \> ch))]
    (read-until str closing?)))

(defn parse-bnode
  "Reads a blank node label from `str` up to (excluding) the next
  space or tab char. Returns [label stream-after]."
  ;; Tab is a legal separator between the terms of a triple, so it
  ;; must end the label just like a space does.
  [str] (read-until str #{\space \tab}))

(defn parse-literal-type
  "Parses the datatype URI following a string literal and attempts to
  convert the literal string `lit` via the literal-value multimethod.
  `stream` may start with `^` chars, which are skipped up to the
  opening `<` of the type URI.
  Returns [value stream-after], where value is the converted literal
  or, if literal-value returned nil, a map of :type (type URI) and
  :lit (original string)."
  [lit stream]
  (let [stream (discard-until stream #(= \^ %) #(= \< %))
        [turi stream] (parse-uri (next stream))
        val (literal-value lit turi)]
    ;; Only nil means \"no conversion\". A falsy but valid result (e.g.
    ;; a boolean literal converted to false) must be kept as value.
    [(if (nil? val) {:type turi :lit lit} val) stream]))

(defn parse-lang
  "Reads the language tag (w/o `@` prefix) following a string literal.
  The tag ends at the next space, tab or `.` char.
  Returns [{:lit lit :lang tag} stream-after]."
  [lit stream]
  ;; Language tags never contain whitespace or dots, so also stop at
  ;; a tab separator and at a statement terminator directly following
  ;; the tag, instead of absorbing them into the tag.
  (let [[lang stream] (read-until stream #{\space \tab \.})]
    [{:lit lit :lang lang} stream]))

(defn parse-string
  "Reads a string literal (w/o opening quote) up to its closing quote,
  then inspects the char directly after that quote:
  `^` delegates to parse-literal-type, `@` to parse-lang, anything
  else yields the plain string.
  Returns [value stream-after]."
  [stream]
  (let [[text at-quote] (read-until stream (fn [ch] (= \" ch)))
        marker          (fnext at-quote)]
    (cond
      (= \^ marker) (parse-literal-type text (nnext at-quote))
      (= \@ marker) (parse-lang text (nnext at-quote))
      :else         [text at-quote])))

(defn parse-subject
  "NT subject is <uri> or _:bnode.
  Skips leading whitespace, then reads the term following the `<` or
  `_` start char. Returns the remaining stream itself if neither
  start char is found."
  [str]
  (let [in (discard-until str WS #{\< \_})]
    (case (first in)
      \< (parse-uri (next in))
      \_ (parse-bnode (next in))
      in)))

(defn parse-pred
  "NT predicate is always <uri>. Skips leading whitespace up to the
  opening `<` and reads the URI after it."
  [str]
  (let [opening? (fn [ch] (= \< ch))]
    (-> str
        (discard-until WS opening?)
        next
        parse-uri)))

(defn parse-object
  "NT object is: <uri>, _:bnode or \"string\".
  Skips leading whitespace, then reads the term following the start
  char. Returns the remaining stream itself if no start char is
  found."
  [str]
  (let [start? (fn [ch] (or (= \< ch) (= \_ ch) (= \" ch)))
        in     (discard-until str WS start?)]
    (case (first in)
      \< (parse-uri (next in))
      \_ (parse-bnode (next in))
      \" (parse-string (next in))
      in)))

(defn parse-ntriple
  "NT triple is \"subject <uri> object .\" (trailing dot).
  Returns [triple stream-after], where triple is a vector of
  [subject predicate object], or nil if subject or predicate is
  empty. stream-after starts after the trailing dot."
  [str]
  (let [[subj after-subj] (parse-subject str)
        [pred after-pred] (parse-pred (next after-subj))
        [obj after-obj]   (parse-object (next after-pred))
        [_ at-dot]        (read-until after-obj (fn [ch] (= \. ch)))
        remaining         (next at-dot)
        triple            (when (and (seq subj) (seq pred))
                            [subj pred obj])]
    [triple remaining]))

(defn parse-ntriples
  "Takes NT string and returns vector of triples. Statements for
  which parse-ntriple yields no triple are skipped."
  [str]
  (loop [triples   (transient [])
         remaining str]
    (if-not remaining
      (persistent! triples)
      (let [[triple more] (parse-ntriple remaining)]
        (recur (if triple (conj! triples triple) triples)
               more)))))

(defn parse-ntriples-lazy
  "Takes NT string and returns lazy-seq of triples. Statements for
  which parse-ntriple yields no triple are skipped."
  [str]
  (lazy-seq
   (when str
     (let [[spo str] (parse-ntriple str)]
       (if spo
         (cons spo (parse-ntriples-lazy str))
         ;; A rejected statement must not end the seq while input
         ;; remains; keep going, same as the eager parse-ntriples.
         (parse-ntriples-lazy str))))))

Complete namespace definition

(ns thi.ng.fabric.facts.io.ntriples
  (:require
   [thi.ng.strf.core :as strf]
   [thi.ng.xerror.core :as err]))

<<xsd>>

<<parser>>