Defend against RCE and XSS; modernize.

RCE was theoretically possible, depending on how strictly cgi.parse_header() parses Content-Type. If it ever returned shell metacharacters, those would be passed directly to the shell. This patch switches the code from popen2 to subprocess, invoking lynx and aspell with argument lists so that no shell is involved. It also silences the deprecation warnings: popen2 is deprecated, and we get lots of warnings about that in our Apache error logs.
w3c · Jan 31, 2014 · d6c21fd · d6c21fd
1 parent 971e2a9
commit d6c21fd
Showing 1 changed file with 93 additions and 109 deletions.
diff --git a/spellchecker b/spellchecker
@@ -15,25 +15,24 @@ $Id$
  branched from v 1.46
 """
 
-import os
 import cgi
+import checkremote
+import http_auth
+import os
 import sys
-import string
 import urlparse
-import popen2
 
-customized_dico="/usr/local/share/aspell/w3c.dat"
+from cleanhtml import *
+from subprocess import Popen, PIPE
+
 languages = {"en_US":"English","fr":"French"}
 def format_option(a,b,c):
         if a:
                 selected=""
                 if a==c:
                         selected=" selected='selected'"
-                return "<option value='%s'%s>%s</option>" % (a,selected,b)
-
-def concat(a,b):
-        return a+b
-
+                return clean_format("<option value='%s'%s>%s</option>",
+                                    a, selected, b)
 
 Page1 ="""Content-Type:text/html; charset=utf-8
 
@@ -93,108 +92,93 @@ Last Modified: $Date$
 """
 
 def format(fp,suggest):
-	line = fp.readline()
 	words = {}
-	count = 0
-	while line!="":
-		if line!="\n" and line !="*\n" and line[0]!="@":
-			line = line[:-1]
-			parts = string.split(line,": ")
-			fields = string.split(parts[0]," ")
-			if fields[0]=="&":
-				values = string.split(parts[1],", ")
-				if (not words.has_key(fields[1])):
-					words[fields[1]]=values
+        for line in fp:
+                line = line.rstrip('\n')
+		if line and line != "*" and line[0] != "@":
+			parts = line.split(': ')
+			fields = parts[0].split()
+                        if fields[1] in words:
+                                continue
+			elif fields[0]=="&":
+                                words[fields[1]] = parts[1].split(", ")
 			elif fields[0]=="#":
-				if (not words.has_key(fields[1])):
-					words[fields[1]]=[]
-		elif line=="\n":
-			count = count + 1
-		line = fp.readline()
-	offsets = {}
-	count = 0
-	if len(words):
-                keys = words.keys()
-                keys.sort()
-		print "<form action=\"http://www.w3.org/Team/update_dictionary\" method=\"post\"><ol>"
-		for error in keys:
-			print "<li>\"<span class='no'>%s</span>\" (<input type=\"checkbox\" name=\"list[]\" value=\"%s\"/> add to the dictionary)" % (error,error)
-
-			if len(words[error]) and suggest:
-				print "; suggestions:<ul class='suggestions'>"
-				for option in words[error]:
-					print "<li>%s</li>" % option
-				print "</ul>"				
-			print "</li>"
-		print "</ol><p><label><input type=\"submit\" value=\"Update dictionary\"/> (W3C Comm Team only)</label></p></form>"
-	else:
+                                words[fields[1]] = []
+	if not words:
 		print "<p><span class='yes'>No errors</span> found.</p>"
+                return
+        for error in sorted(words):
+                clean_print("<li>\"<span class='no'>%s</span>\"", error)
+                if words[error] and suggest:
+                        print "; suggestions:<ul class='suggestions'>"
+                        for option in words[error]:
+                                clean_print("<li>%s</li>", option)
+                        print "</ul>"				
+                print "</li>"
+
+def getLangSetup(fields):
+        lang = fields.getfirst('lang')
+        if lang not in languages:
+                lang = 'en_US'
+        lang_opts = ''.join(format_option(code, languages[code], lang)
+                            for code in languages)
+        return lang, lang_opts
+
+def getSuggestSetup(fields):
+        if fields.getfirst('suggest') == 'on':
+                return True, " checked='checked'"
+        return False, ""
+
+def getURI(fields):
+        uri = fields.getfirst('uri')
+        if (not uri) and ('referrer' in fields):
+                uri = os.environ.get('HTTP_REFERER')
+        return uri
 
 if __name__ == '__main__':
-	if  os.environ.has_key('SCRIPT_NAME'):
-		fields = cgi.FieldStorage()
-		uri ="" 
-		uri_text =""
-		uri_text1=""
-		suggest=0
-		suggest_txt=''
-		if fields.has_key('uri'):
-			uri = fields['uri'].value
-                elif fields.has_key('referrer') and os.environ.has_key('HTTP_REFERER'):
-                        uri = os.environ['HTTP_REFERER']
-                if uri:
-			uri_text1="for %s" % (cgi.escape(uri))
-			uri_text=" for <a href=\"%s\">%s</a>" %(cgi.escape(uri),cgi.escape(uri))
-                lang = "en_US"
-                if fields.has_key('lang') and fields['lang'].value in languages.keys():
-                        lang=fields['lang'].value
-                languages_options = reduce(concat,map(format_option,languages.keys(),languages.values(),[lang for x in languages.keys()]))
-
-		if fields.has_key('suggest'):
-			if fields['suggest'].value=='on':
-				suggest=1
-				suggest_txt=" checked='checked'"
-		if uri:
-			import http_auth
-			url_opener = http_auth.ProxyAuthURLopener()
-			try:
-				fp = url_opener.open(uri)
-			except IOError as e:
-				url_opener.error = "I/O error: %s %s" % (e.errno,e.strerror)
-				fp = None
-			print Page1 % ('<meta name="ROBOTS" content="NOINDEX,NOFOLLOW"/>',uri_text1,uri_text,cgi.escape(uri),languages_options,suggest_txt)
-			if fp:
-                                personal = "--personal=%s" % customized_dico
-                                if lang!="en_US":
-                                        personal = ""
-                                headers = fp.info()
-                                charset_opt = ""
-                                if headers.has_key('Content-Type'):
-                                        contentType = cgi.parse_header(headers["Content-Type"])
-                                        if contentType[1].has_key('charset'):
-                                                charset_opt = "-assume_charset=%s" % contentType[1]['charset']
-	        		command = "/usr/bin/lynx  %s -cfg=/usr/local/lib/lynx.cfg -nolist -dump -stdin|/usr/bin/aspell --encoding=utf-8 --lang %s -a %s --sug-mode=fast" % (charset_opt,lang,personal)
-
-        	                (piperfd,pipewfd,pipeErr) = popen2.popen3(command)
-
-				pipewfd.write(fp.read())
-				fp.close()
-				pipewfd.close()
-				# Need to find a way to display any errors if relevant
-				processingErrors=""
-				if (processingErrors):
-					print "<p>The following error occurred when trying to process your request :</p><pre class='no'>"
-					print "</pre>"
-					pipeErr.close()
-				if (piperfd):
-					print "<h2>Errors found in the page</h2>"
-					format(piperfd,suggest)
-					piperfd.close()
-			else:
-				print "<p><span class='no'>Unable to read</span> <a href='%s'>%s</a> (%s). Sorry, check the URI.</p>" % (cgi.escape(uri),cgi.escape(uri), url_opener.error)
-		else:
-			print Page1 % ('',uri_text1,uri_text,cgi.escape(uri),languages_options,suggest_txt)
-		print Page2
-
-
-
+        fields = cgi.FieldStorage()
+        lang, languages_options = getLangSetup(fields)
+        suggest, suggest_txt = getSuggestSetup(fields)
+        uri = getURI(fields)
+        if not uri:
+                print Page1 % ('', '', '', clean_str(uri), languages_options,
+                               suggest_txt)
+                print Page2
+                sys.exit()
+
+        uri_text1 = clean_format("for %s", uri)
+        uri_text = clean_format(" for <a href=\"%s\">%s</a>", uri, uri)
+        print Page1 % ('<meta name="ROBOTS" content="NOINDEX,NOFOLLOW"/>',
+                       uri_text1, uri_text, clean_str(uri), languages_options,
+                       suggest_txt)
+
+        url_opener = http_auth.ProxyAuthURLopener()
+        try:
+                fp = url_opener.open(uri)
+        except IOError as e:
+                url_opener.error = "I/O error: %s %s" % (e.errno,e.strerror)
+                fp = None
+        if fp is None:
+                clean_print("<p><span class='no'>Unable to read</span> <a href='%s'>%s</a> (%s). Sorry, check the URI.</p>", uri, uri, url_opener.error)
+                print Page2
+                sys.exit()
+
+        headers = fp.info()
+        lynx_cmd = ['/usr/bin/lynx', '-cfg=/usr/local/lib/lynx.cfg', '-nolist',
+                    '-dump', '-stdin']
+        if headers.has_key('Content-Type'):
+                contentType = cgi.parse_header(headers['Content-Type'])
+                if contentType[1].has_key('charset'):
+                        lynx_cmd.append('-assume_charset=%s' %
+                                        contentType[1]['charset'])
+
+        lynx_proc = Popen(lynx_cmd, stdin=fp, stdout=PIPE)
+        aspell_proc = Popen(['/usr/bin/aspell', '-a', '--encoding=utf-8',
+                             '--sug-mode=fast', '--lang', lang],
+                            stdin=lynx_proc.stdout, stdout=PIPE)
+        lynx_proc.wait()
+        lynx_proc.stdout.close()
+        print "<h2>Errors found in the page</h2>"
+        format(aspell_proc.stdout, suggest)
+        aspell_proc.stdout.close()
+        print Page2