Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Fetching contributors…

Cannot retrieve contributors at this time

524 lines (507 sloc) 20.208 kB
;;;
;;; This code was written by:
;;;
;;; Lawrence E. Freil <lef@freil.com>
;;; National Science Center Foundation
;;; Augusta, Georgia 30909
;;;
;;; This program was released into the public domain on 2005-08-31.
;;; (See the slime-devel mailing list archive for details.)
;;;
;;; nregex.lisp - My 4/8/92 attempt at a Lisp based regular expression
;;; parser.
;;;
;;; This regular expression parser operates by taking a
;;; regular expression and breaking it down into a list
;;; consisting of lisp expressions and flags. The list
;;; of lisp expressions is then taken in turned into a
;;; lambda expression that can be later applied to a
;;; string argument for parsing.
;;;;
;;;; Modifications made 6 March 2001 By Chris Double (chris@double.co.nz)
;;;; to get working with Corman Lisp 1.42, add package statement and export
;;;; relevant functions.
;;;;
(in-package :cl-user)
;; Renamed to slime-nregex avoid name clashes with other versions of
;; this file. -- he
;;;; CND - 6/3/2001
(defpackage slime-nregex
(:use #:common-lisp)
(:export
#:regex
#:regex-compile
))
;;;; CND - 6/3/2001
(in-package :slime-nregex)
;;;
;;; First we create a copy of macros to help debug the beast
(eval-when (:compile-toplevel :load-toplevel :execute)
(defvar *regex-debug* nil) ; Set to nil for no debugging code
)
(defmacro info (message &rest args)
(if *regex-debug*
`(format *standard-output* ,message ,@args)))
;;;
;;; Declare the global variables for storing the paren index list.
;;;
(defvar *regex-groups* (make-array 10))
(defvar *regex-groupings* 0)
;;;
;;; Declare a simple interface for testing. You probably wouldn't want
;;; to use this interface unless you were just calling this once.
;;;
(defun regex (expression string)
"Usage: (regex <expression> <string)
This function will call regex-compile on the expression and then apply
the string to the returned lambda list."
(let ((findit (cond ((stringp expression)
(regex-compile expression))
((listp expression)
expression)))
(result nil))
(if (not (funcall (if (functionp findit)
findit
(eval `(function ,findit))) string))
(return-from regex nil))
(if (= *regex-groupings* 0)
(return-from regex t))
(dotimes (i *regex-groupings*)
(push (funcall 'subseq
string
(car (aref *regex-groups* i))
(cadr (aref *regex-groups* i)))
result))
(reverse result)))
;;;
;;; Declare some simple macros to make the code more readable.
;;;
(defvar *regex-special-chars* "?*+.()[]\\${}")
(defmacro add-exp (list)
"Add an item to the end of expression"
`(setf expression (append expression ,list)))
;;;
;;; Define a function that will take a quoted character and return
;;; what the real character should be plus how much of the source
;;; string was used. If the result is a set of characters, return an
;;; array of bits indicating which characters should be set. If the
;;; expression is one of the sub-group matches return a
;;; list-expression that will provide the match.
;;;
(defun regex-quoted (char-string &optional (invert nil))
"Usage: (regex-quoted <char-string> &optional invert)
Returns either the quoted character or a simple bit vector of bits set for
the matching values"
(let ((first (char char-string 0))
(result (char char-string 0))
(used-length 1))
(cond ((eql first #\n)
(setf result #\NewLine))
((eql first #\c)
(setf result #\Return))
((eql first #\t)
(setf result #\Tab))
((eql first #\d)
(setf result #*0000000000000000000000000000000000000000000000001111111111000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000))
((eql first #\D)
(setf result #*1111111111111111111111111111111111111111111111110000000000111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111))
((eql first #\w)
(setf result #*0000000000000000000000000000000000000000000000001111111111000000011111111111111111111111111000010111111111111111111111111110000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000))
((eql first #\W)
(setf result #*1111111111111111111111111111111111111111111111110000000000111111100000000000000000000000000111101000000000000000000000000001111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111))
((eql first #\b)
(setf result #*0000000001000000000000000000000011000000000010100000000000100000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000))
((eql first #\B)
(setf result #*1111111110111111111111111111111100111111111101011111111111011111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111))
((eql first #\s)
(setf result #*0000000001100000000000000000000010000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000))
((eql first #\S)
(setf result #*1111111110011111111111111111111101111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111))
((and (>= (char-code first) (char-code #\0))
(<= (char-code first) (char-code #\9)))
(if (and (> (length char-string) 2)
(and (>= (char-code (char char-string 1)) (char-code #\0))
(<= (char-code (char char-string 1)) (char-code #\9))
(>= (char-code (char char-string 2)) (char-code #\0))
(<= (char-code (char char-string 2)) (char-code #\9))))
;;
;; It is a single character specified in octal
;;
(progn
(setf result (do ((x 0 (1+ x))
(return 0))
((= x 2) return)
(setf return (+ (* return 8)
(- (char-code (char char-string x))
(char-code #\0))))))
(setf used-length 3))
;;
;; We have a group number replacement.
;;
(let ((group (- (char-code first) (char-code #\0))))
(setf result `((let ((nstring (subseq string (car (aref *regex-groups* ,group))
(cadr (aref *regex-groups* ,group)))))
(if (< length (+ index (length nstring)))
(return-from compare nil))
(if (not (string= string nstring
:start1 index
:end1 (+ index (length nstring))))
(return-from compare nil)
(incf index (length nstring)))))))))
(t
(setf result first)))
(if (and (vectorp result) invert)
(bit-xor result #*1111111110011111111111111111111101111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111 t))
(values result used-length)))
;;;
;;; Now for the main regex compiler routine.
;;;
(defun regex-compile (source &key (anchored nil))
"Usage: (regex-compile <expression> [ :anchored (t/nil) ])
This function take a regular expression (supplied as source) and
compiles this into a lambda list that a string argument can then
be applied to. It is also possible to compile this lambda list
for better performance or to save it as a named function for later
use"
(info "Now entering regex-compile with \"~A\"~%" source)
;;
;; This routine works in two parts.
;; The first pass take the regular expression and produces a list of
;; operators and lisp expressions for the entire regular expression.
;; The second pass takes this list and produces the lambda expression.
(let ((expression '()) ; holder for expressions
(group 1) ; Current group index
(group-stack nil) ; Stack of current group endings
(result nil) ; holder for built expression.
(fast-first nil)) ; holder for quick unanchored scan
;;
;; If the expression was an empty string then it alway
;; matches (so lets leave early)
;;
(if (= (length source) 0)
(return-from regex-compile
'(lambda (&rest args)
(declare (ignore args))
t)))
;;
;; If the first character is a caret then set the anchored
;; flags and remove if from the expression string.
;;
(cond ((eql (char source 0) #\^)
(setf source (subseq source 1))
(setf anchored t)))
;;
;; If the first sequence is .* then also set the anchored flags.
;; (This is purely for optimization, it will work without this).
;;
(if (>= (length source) 2)
(if (string= source ".*" :start1 0 :end1 2)
(setf anchored t)))
;;
;; Also, If this is not an anchored search and the first character is
;; a literal, then do a quick scan to see if it is even in the string.
;; If not then we can issue a quick nil,
;; otherwise we can start the search at the matching character to skip
;; the checks of the non-matching characters anyway.
;;
;; If I really wanted to speed up this section of code it would be
;; easy to recognize the case of a fairly long multi-character literal
;; and generate a Boyer-Moore search for the entire literal.
;;
;; I generate the code to do a loop because on CMU Lisp this is about
;; twice as fast a calling position.
;;
(if (and (not anchored)
(not (position (char source 0) *regex-special-chars*))
(not (and (> (length source) 1)
(position (char source 1) *regex-special-chars*))))
(setf fast-first `((if (not (dotimes (i length nil)
(if (eql (char string i)
,(char source 0))
(return (setf start i)))))
(return-from final-return nil)))))
;;
;; Generate the very first expression to save the starting index
;; so that group 0 will be the entire string matched always
;;
(add-exp '((setf (aref *regex-groups* 0)
(list index nil))))
;;
;; Loop over each character in the regular expression building the
;; expression list as we go.
;;
(do ((eindex 0 (1+ eindex)))
((= eindex (length source)))
(let ((current (char source eindex)))
(info "Now processing character ~A index = ~A~%" current eindex)
(case current
((#\.)
;;
;; Generate code for a single wild character
;;
(add-exp '((if (>= index length)
(return-from compare nil)
(incf index)))))
((#\$)
;;
;; If this is the last character of the expression then
;; anchor the end of the expression, otherwise let it slide
;; as a standard character (even though it should be quoted).
;;
(if (= eindex (1- (length source)))
(add-exp '((if (not (= index length))
(return-from compare nil))))
(add-exp '((if (not (and (< index length)
(eql (char string index) #\$)))
(return-from compare nil)
(incf index))))))
((#\*)
(add-exp '(ASTRISK)))
((#\+)
(add-exp '(PLUS)))
((#\?)
(add-exp '(QUESTION)))
((#\()
;;
;; Start a grouping.
;;
(incf group)
(push group group-stack)
(add-exp `((setf (aref *regex-groups* ,(1- group))
(list index nil))))
(add-exp `(,group)))
((#\))
;;
;; End a grouping
;;
(let ((group (pop group-stack)))
(add-exp `((setf (cadr (aref *regex-groups* ,(1- group)))
index)))
(add-exp `(,(- group)))))
((#\[)
;;
;; Start of a range operation.
;; Generate a bit-vector that has one bit per possible character
;; and then on each character or range, set the possible bits.
;;
;; If the first character is carat then invert the set.
(let* ((invert (eql (char source (1+ eindex)) #\^))
(bitstring (make-array 256 :element-type 'bit
:initial-element
(if invert 1 0)))
(set-char (if invert 0 1)))
(if invert (incf eindex))
(do ((x (1+ eindex) (1+ x)))
((eql (char source x) #\]) (setf eindex x))
(info "Building range with character ~A~%" (char source x))
(cond ((and (eql (char source (1+ x)) #\-)
(not (eql (char source (+ x 2)) #\])))
(if (>= (char-code (char source x))
(char-code (char source (+ 2 x))))
(error "Invalid range \"~A-~A\". Ranges must be in acending order"
(char source x) (char source (+ 2 x))))
(do ((j (char-code (char source x)) (1+ j)))
((> j (char-code (char source (+ 2 x))))
(incf x 2))
(info "Setting bit for char ~A code ~A~%" (code-char j) j)
(setf (sbit bitstring j) set-char)))
(t
(cond ((not (eql (char source x) #\]))
(let ((char (char source x)))
;;
;; If the character is quoted then find out what
;; it should have been
;;
(if (eql (char source x) #\\ )
(let ((length))
(multiple-value-setq (char length)
(regex-quoted (subseq source x) invert))
(incf x length)))
(info "Setting bit for char ~A code ~A~%" char (char-code char))
(if (not (vectorp char))
(setf (sbit bitstring (char-code (char source x))) set-char)
(bit-ior bitstring char t))))))))
(add-exp `((let ((range ,bitstring))
(if (>= index length)
(return-from compare nil))
(if (= 1 (sbit range (char-code (char string index))))
(incf index)
(return-from compare nil)))))))
((#\\ )
;;
;; Intreprete the next character as a special, range, octal, group or
;; just the character itself.
;;
(let ((length)
(value))
(multiple-value-setq (value length)
(regex-quoted (subseq source (1+ eindex)) nil))
(cond ((listp value)
(add-exp value))
((characterp value)
(add-exp `((if (not (and (< index length)
(eql (char string index)
,value)))
(return-from compare nil)
(incf index)))))
((vectorp value)
(add-exp `((let ((range ,value))
(if (>= index length)
(return-from compare nil))
(if (= 1 (sbit range (char-code (char string index))))
(incf index)
(return-from compare nil)))))))
(incf eindex length)))
(t
;;
;; We have a literal character.
;; Scan to see how many we have and if it is more than one
;; generate a string= verses as single eql.
;;
(let* ((lit "")
(term (dotimes (litindex (- (length source) eindex) nil)
(let ((litchar (char source (+ eindex litindex))))
(if (position litchar *regex-special-chars*)
(return litchar)
(progn
(info "Now adding ~A index ~A to lit~%" litchar
litindex)
(setf lit (concatenate 'string lit
(string litchar)))))))))
(if (= (length lit) 1)
(add-exp `((if (not (and (< index length)
(eql (char string index) ,current)))
(return-from compare nil)
(incf index))))
;;
;; If we have a multi-character literal then we must
;; check to see if the next character (if there is one)
;; is an astrisk or a plus or a question mark. If so then we must not use this
;; character in the big literal.
(progn
(if (or (eql term #\*)
(eql term #\+)
(eql term #\?))
(setf lit (subseq lit 0 (1- (length lit)))))
(add-exp `((if (< length (+ index ,(length lit)))
(return-from compare nil))
(if (not (string= string ,lit :start1 index
:end1 (+ index ,(length lit))))
(return-from compare nil)
(incf index ,(length lit)))))))
(incf eindex (1- (length lit))))))))
;;
;; Plug end of list to return t. If we made it this far then
;; We have matched!
(add-exp '((setf (cadr (aref *regex-groups* 0))
index)))
(add-exp '((return-from final-return t)))
;;
;;; (print expression)
;;
;; Now take the expression list and turn it into a lambda expression
;; replacing the special flags with lisp code.
;; For example: A BEGIN needs to be replace by an expression that
;; saves the current index, then evaluates everything till it gets to
;; the END then save the new index if it didn't fail.
;; On an ASTRISK I need to take the previous expression and wrap
;; it in a do that will evaluate the expression till an error
;; occurs and then another do that encompases the remainder of the
;; regular expression and iterates decrementing the index by one
;; of the matched expression sizes and then returns nil. After
;; the last expression insert a form that does a return t so that
;; if the entire nested sub-expression succeeds then the loop
;; is broken manually.
;;
(setf result (copy-tree nil))
;;
;; Reversing the current expression makes building up the
;; lambda list easier due to the nexting of expressions when
;; and astrisk has been encountered.
(setf expression (reverse expression))
(do ((elt 0 (1+ elt)))
((>= elt (length expression)))
(let ((piece (nth elt expression)))
;;
;; Now check for PLUS, if so then ditto the expression and then let the
;; ASTRISK below handle the rest.
;;
(cond ((eql piece 'PLUS)
(cond ((listp (nth (1+ elt) expression))
(setf result (append (list (nth (1+ elt) expression))
result)))
;;
;; duplicate the entire group
;; NOTE: This hasn't been implemented yet!!
(t
(error "GROUP repeat hasn't been implemented yet~%")))))
(cond ((listp piece) ;Just append the list
(setf result (append (list piece) result)))
((eql piece 'QUESTION) ; Wrap it in a block that won't fail
(cond ((listp (nth (1+ elt) expression))
(setf result
(append `((progn (block compare
,(nth (1+ elt)
expression))
t))
result))
(incf elt))
;;
;; This is a QUESTION on an entire group which
;; hasn't been implemented yet!!!
;;
(t
(error "Optional groups not implemented yet~%"))))
((or (eql piece 'ASTRISK) ; Do the wild thing!
(eql piece 'PLUS))
(cond ((listp (nth (1+ elt) expression))
;;
;; This is a single character wild card so
;; do the simple form.
;;
(setf result
`((let ((oindex index))
(block compare
(do ()
(nil)
,(nth (1+ elt) expression)))
(do ((start index (1- start)))
((< start oindex) nil)
(let ((index start))
(block compare
,@result))))))
(incf elt))
(t
;;
;; This is a subgroup repeated so I must build
;; the loop using several values.
;;
))
)
(t t)))) ; Just ignore everything else.
;;
;; Now wrap the result in a lambda list that can then be
;; invoked or compiled, however the user wishes.
;;
(if anchored
(setf result
`(lambda (string &key (start 0) (end (length string)))
(setf *regex-groupings* ,group)
(block final-return
(block compare
(let ((index start)
(length end))
,@result)))))
(setf result
`(lambda (string &key (start 0) (end (length string)))
(setf *regex-groupings* ,group)
(block final-return
(let ((length end))
,@fast-first
(do ((marker start (1+ marker)))
((> marker end) nil)
(let ((index marker))
(if (block compare
,@result)
(return t)))))))))))
;; (provide 'nregex)
Jump to Line
Something went wrong with that request. Please try again.