44var html_entity_decode = require ( './entities' ) . decode ;
55
// Strings that must never survive cleaning, mapped to the text that replaces them.
// Script-related fragments are swapped for the '[removed]' marker (same marker as
// never_allowed_regex); markup delimiters are swapped for their entity-escaped
// form so they render as text instead of opening a comment/CDATA section.
// NOTE(review): the consumer of this table is not visible here. The CDATA key is
// written with regex escaping, so keys are presumably compiled as patterns — confirm.
var never_allowed_str = {
    'document.cookie': '[removed]',
    'document.write': '[removed]',
    '.parentNode': '[removed]',
    '.innerHTML': '[removed]',
    'window.location': '[removed]',
    '-moz-binding': '[removed]',
    // A replacement identical to its key would be a no-op, so these are escaped.
    '<!--': '&lt;!--',
    '-->': '--&gt;',
    '(<!\\[CDATA\\[)': '&lt;![CDATA[',
    '<comment>': '&lt;comment&gt;'
};
1718
// Regular-expression sources (as strings) that must never survive cleaning,
// mapped to the '[removed]' marker that replaces each match.
// NOTE(review): the consumer is not visible here; the keys are presumably passed
// to `new RegExp(...)` — confirm which flags are used.
var never_allowed_regex = {
    'javascript\\s*:': '[removed]',
    // Matches "expression(" with either a literal paren or its &#40; entity.
    'expression\\s*(\\(|&\\#40;)': '[removed]',
    'vbscript\\s*:': '[removed]',
    'Redirect\\s+302': '[removed]',
    // Base64 data: URIs, optionally quoted. Inside a character class `\1` is NOT a
    // backreference (it is the control character U+0001), so `[^\1]*?` behaves as
    // "lazily match almost anything"; only the trailing `\1?` refers to the quote.
    "([\"'])?data\\s*:[^\\1]*?base64[^\\1]*?,[^\\1]*?\\1?": '[removed]'
};
2426
2527var non_displayables = [
@@ -32,8 +34,8 @@ var non_displayables = [
3234
// Keywords of interest to the filter, stored without surrounding whitespace so
// each entry is exactly the word itself.
// NOTE(review): the code that consumes this list is not visible here; judging by
// the name it is presumably used to collapse spaced-out spellings such as
// "j a v a s c r i p t" — confirm against the caller.
var compact_words = [
    'javascript', 'expression', 'vbscript',
    'script', 'base64', 'applet', 'alert',
    'document', 'write', 'cookie', 'window'
];
3840
3941exports . clean = function ( str , is_image ) {
@@ -43,6 +45,8 @@ exports.clean = function(str, is_image) {
4345 for ( var i in str ) {
4446 str [ i ] = exports . clean ( str [ i ] ) ;
4547 }
48+ //We emulate the PHP behavior in CodeIgniter.
49+ str . toString = function ( ) { return 'Array' ; }
4650 return str ;
4751 }
4852
@@ -55,10 +59,14 @@ exports.clean = function(str, is_image) {
5559 // ensure str does not contain hash before inserting it
5660 hash = xss_hash ( ) ;
5761 } while ( str . indexOf ( hash ) >= 0 )
58- str = str . replace ( / \& ( [ a - z \_ 0 - 9 ] + ) \= ( [ a - z \_ 0 - 9 ] + ) / ig, hash + '$1=$2' ) ;
62+ str = str . replace ( / \& ( [ a - z \_ 0 - 9 \- ] + ) \= ( [ a - z \_ 0 - 9 \- ] + ) / ig, hash + '$1=$2' ) ;
63+
64+ //Validate standard character entities. Add a semicolon if missing. We do this to enable
65+ //the conversion of entities to ASCII later.
66+ str = str . replace ( / ( & # ? [ 0 - 9 a - z ] { 2 , } ) ( [ \x00 - \x20 ] ) * ; ? / ig, '$1;$2' ) ;
5967
6068 //Validate UTF16 two byte encoding (x00) - just as above, adds a semicolon if missing.
61- str = str . replace ( / ( & \ #x ? ) ( [ 0 - 9 A - F ] + ) ; ? / ig, '$1$2;' ) ;
69+ str = str . replace ( / ( & # x ? ) ( [ 0 - 9 A - F ] + ) ; ? / ig, '$1$2;' ) ;
6270
6371 //Un-protect query string variables
6472 str = str . replace ( new RegExp ( hash , 'g' ) , '&' ) ;
@@ -77,6 +85,9 @@ exports.clean = function(str, is_image) {
7785 str = str . replace ( / [ a - z ] + = ( [ \' \" ] ) .* ?\1/ gi, function ( m , match ) {
7886 return m . replace ( match , convert_attribute ( match ) ) ;
7987 } ) ;
88+ str = str . replace ( / < \w + .* / gi, function ( m ) {
89+ return m . replace ( m , html_entity_decode ( m ) ) ;
90+ } ) ;
8091
8192 //Remove invisible characters again
8293 str = remove_invisible_characters ( str ) ;
@@ -113,41 +124,54 @@ exports.clean = function(str, is_image) {
113124
114125 if ( str . match ( / < a / i) ) {
115126 str = str . replace ( / < a \s + ( [ ^ > ] * ?) ( > | $ ) / gi, function ( m , attributes , end_tag ) {
116- attributes = filter_attributes ( attributes . replace ( '<' , '' ) . replace ( '>' , '' ) ) ;
117- if ( attributes . match ( / h r e f = .* ?( a l e r t \( | a l e r t & \# 4 0 ; | j a v a s c r i p t \: | c h a r s e t \= | w i n d o w \. | d o c u m e n t \. | \. c o o k i e | < s c r i p t | < x s s | b a s e 6 4 \s * , ) / gi) ) {
118- return m . replace ( attributes , '' ) ;
119- }
120- return m ;
127+ var filtered_attributes = filter_attributes ( attributes . replace ( '<' , '' ) . replace ( '>' , '' ) ) ;
128+ filtered_attributes = filtered_attributes . replace ( / h r e f = .* ?(?: a l e r t \( | a l e r t & # 4 0 ; | j a v a s c r i p t : | l i v e s c r i p t : | m o c h a : | c h a r s e t = | w i n d o w \. | d o c u m e n t \. | \. c o o k i e | < s c r i p t | < x s s | d a t a \s * : ) / gi, '' ) ;
129+ return m . replace ( attributes , filtered_attributes ) ;
121130 } ) ;
122131 }
123132
124133 if ( str . match ( / < i m g / i) ) {
125134 str = str . replace ( / < i m g \s + ( [ ^ > ] * ?) ( \s ? \/ ? > | $ ) / gi, function ( m , attributes , end_tag ) {
126- attributes = filter_attributes ( attributes . replace ( '<' , '' ) . replace ( '>' , '' ) ) ;
127- if ( attributes . match ( / s r c = .* ?( a l e r t \( | a l e r t & \# 4 0 ; | j a v a s c r i p t \: | c h a r s e t \= | w i n d o w \. | d o c u m e n t \. | \. c o o k i e | < s c r i p t | < x s s | b a s e 6 4 \s * , ) / gi) ) {
128- return m . replace ( attributes , '' ) ;
129- }
130- return m ;
135+ var filtered_attributes = filter_attributes ( attributes . replace ( '<' , '' ) . replace ( '>' , '' ) ) ;
136+ filtered_attributes = filtered_attributes . replace ( / s r c = .* ?(?: a l e r t \( | a l e r t & # 4 0 ; | j a v a s c r i p t : | l i v e s c r i p t : | m o c h a : | c h a r s e t = | w i n d o w \. | d o c u m e n t \. | \. c o o k i e | < s c r i p t | < x s s | b a s e 6 4 \s * , ) / gi, '' ) ;
137+ return m . replace ( attributes , filtered_attributes ) ;
131138 } ) ;
132139 }
133140
134141 if ( str . match ( / s c r i p t / i) || str . match ( / x s s / i) ) {
135- str = str . replace ( / < ( \/ * ) ( s c r i p t | x s s ) ( .* ?) \> / gi, '' ) ;
142+ str = str . replace ( / < ( \/ * ) ( s c r i p t | x s s ) ( .* ?) \> / gi, '[removed] ' ) ;
136143 }
137144
138- } while ( original != str ) ;
145+ } while ( original !== str ) ;
139146
140- //Remove JavaScript Event Handlers - Note: This code is a little blunt. It removes the event
141- //handler and anything up to the closing >, but it's unlikely to be a problem.
142- var event_handlers = [ '[^a-z_\-]on\\w*' ] ;
147+ // Remove Evil HTML Attributes (like event handlers and style)
148+ var event_handlers = [ 'on\\w*' , 'style' , 'formaction' ] ;
143149
144150 //Adobe Photoshop puts XML metadata into JFIF images, including namespacing,
145151 //so we have to allow this for images
146152 if ( ! is_image ) {
147153 event_handlers . push ( 'xmlns' ) ;
148154 }
149155
150- str = str . replace ( new RegExp ( "<([^><]+?)(" + event_handlers . join ( '|' ) + ")(\\s*=\\s*[^><]*)([><]*)" , 'i' ) , '<$1$4' ) ;
156+ do {
157+ var attribs = [ ] ;
158+ var count = 0 ;
159+
160+ attribs = attribs . concat ( str . match ( new RegExp ( "(" + event_handlers . join ( '|' ) + ")\\s*=\\s*(\\x22|\\x27)([^\\2]*?)(\\2)" , 'ig' ) ) ) ;
161+ attribs = attribs . concat ( str . match ( new RegExp ( "(" + event_handlers . join ( '|' ) + ")\\s*=\\s*([^\\s>]*)" , 'ig' ) ) ) ;
162+ attribs = attribs . filter ( function ( element ) { return element !== null ; } ) ;
163+
164+ if ( attribs . length > 0 ) {
165+ for ( var i = 0 ; i < attribs . length ; ++ i ) {
166+ attribs [ i ] = attribs [ i ] . replace ( new RegExp ( '[.\\\\+*?\\[\\^\\]$(){}=!<>|:\\-]' , 'g' ) , '\\$&' )
167+ }
168+
169+ str = str . replace ( new RegExp ( "(<?)(\/?[^><]+?)([^A-Za-z<>\\-])(.*?)(" + attribs . join ( '|' ) + ")(.*?)([\\s><]?)([><]*)" , 'i' ) , function ( m , a , b , c , d , e , f , g , h ) {
170+ ++ count ;
171+ return a + b + ' ' + d + f + g + h ;
172+ } ) ;
173+ }
174+ } while ( count > 0 ) ;
151175
152176 //Sanitize naughty HTML elements
153177 //If a tag containing any of the words in the list
@@ -200,13 +224,15 @@ function convert_attribute(str) {
200224}
201225
/**
 * Rebuilds an attribute string from its quoted `name="value"` / `name='value'`
 * pairs, with any C-style comments removed from each pair.
 *
 * Only quoted attributes are kept; anything else in `str` (unquoted attributes,
 * stray text) is dropped from the result.
 *
 * @param {string} str - Raw attribute text taken from inside a tag.
 * @returns {string} The concatenated quoted attributes with comments stripped,
 *                   or '' when no quoted attribute is found.
 */
function filter_attributes(str) {
    // `[^\1]` is not a backreference inside a character class (it means "any
    // character except U+0001"); the lazy quantifier plus the trailing `\1` is
    // what actually stops the value at the matching closing quote.
    var quoted_attributes = str.match(/\s*[a-z-]+\s*=\s*(\x22|\x27)([^\1]*?)\1/ig) || [];

    // The slashes must be part of the pattern: without them the expression strips
    // any `*...*` run and leaves the `/` delimiters of a real comment behind.
    // `[\s\S]` lets a comment span line breaks inside a multi-line value.
    var comments = /\/\*[\s\S]*?\*\//g;

    return quoted_attributes.map(function (attribute) {
        return attribute.replace(comments, '');
    }).join('');
}
212238
0 commit comments