Skip to content

Commit

Permalink
Added wvlib format support for loading word vectors
Browse files Browse the repository at this point in the history
  • Loading branch information
vseloved committed Aug 11, 2017
1 parent 54df6f0 commit e78ef8f
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 2 deletions.
6 changes: 5 additions & 1 deletion cl-nlp-contrib.asd
Expand Up @@ -39,7 +39,11 @@
(:file "ptb")
(:file "nps-chat")
(:file "semcor")
(:file "reuters")))))
(:file "reuters")))
(:module #:embeddings
:depends-on ("packages")
:components
((:file "wvlib")))))
#+dev
(:module #:test
:components
Expand Down
29 changes: 29 additions & 0 deletions contrib/embeddings/wvlib.lisp
@@ -0,0 +1,29 @@
;;; (c) 2017 Vsevolod Dyomkin

(in-package #:nlp.embeddings)
(named-readtables:in-readtable rutilsx-readtable)

#+sbcl
(defmethod init-vecs ((vecs mem-vecs) (format (eql :wvlib)) file &key prolog)
  "Fill VECS with word vectors read from the binary FILE and return VECS.

   Each record is read as: the bytes of a word, a single space byte,
   then `@vecs.order` 4-byte single-floats that are copied verbatim into
   the vector's storage, then one trailing byte (expected to be a newline)
   that is skipped. When PROLOG is true, everything up to and including
   the first newline is skipped before the records are read.

   Word bytes are decoded as UTF-8 (malformed sequences become U+FFFD).
   The resulting table, keyed by (normalize vecs word), is stored into
   `@vecs.dict`. Reading stops at end of file when it occurs at a word
   boundary; an error is signaled if the file ends inside a vector."
  (let ((dict #h(equal))
        (nbytes (* 4 @vecs.order))
        ;; A replacement character keeps one malformed word from aborting
        ;; the load of the whole file.
        (utf-8 (list :utf-8 :replacement (code-char #xFFFD))))
    (block reading
      (with-open-file (in file :element-type 'unsigned-byte)
        (when prolog
          (loop :for ch := (read-byte in)
                :until (char= #\Newline (code-char ch))))
        (loop :for cc :from 0 :do
          (let ((word (loop :for ch := (read-byte in nil)
                            ;; EOF where a word should start/continue:
                            ;; normal end of input
                            :unless ch :do (return-from reading)
                            :until (= ch (char-code #\Space))
                            :collect ch :into octets
                            :finally
                               ;; decode the whole word at once so that
                               ;; multi-byte characters are kept intact
                               (return
                                 (sb-ext:octets-to-string
                                  (coerce octets '(vector (unsigned-byte 8)))
                                  :external-format utf-8))))
                (vec (make-array @vecs.order :element-type 'single-float)))
            (file-position in (file-position in)) ; sync file position for unix-read
            ;; The raw address of VEC is only stable while the object is
            ;; pinned; otherwise GC could move it during the read.
            (sb-sys:with-pinned-objects (vec)
              (loop :with sap := (sb-sys:vector-sap vec)
                    :with fd := (sb-sys:fd-stream-fd in)
                    :with done := 0
                    :while (< done nbytes)
                    :do (multiple-value-bind (n errno)
                            (sb-unix:unix-read fd
                                               (sb-sys:sap+ sap done)
                                               (- nbytes done))
                          ;; N is NIL on error and 0 at EOF: either way
                          ;; the vector would be left partly filled.
                          (unless (and n (plusp n))
                            (error "Failed to read vector for ~S from ~A: ~
                                    got ~A of ~A bytes~@[ (errno ~A)~]"
                                   word file done nbytes (unless n errno)))
                          (incf done n))))
            (:= (? dict (normalize vecs word)) vec)
            (read-byte in nil)          ; skip newline
            (when (zerop (rem cc 1000)) (format *debug-io* "."))))))
    (:= @vecs.dict dict)
    vecs))

2 changes: 1 addition & 1 deletion version.txt
@@ -1,3 +1,3 @@
0.2.1
0.0.1
0.1.0
0.0.3

0 comments on commit e78ef8f

Please sign in to comment.