Skip to content


Subversion checkout URL

You can clone with
Download ZIP
Fetching contributors…

Cannot retrieve contributors at this time

211 lines (167 sloc) 7.416 kB
#!/usr/bin/env python
# Copyright 2009 Facebook
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
"""Escaping/unescaping methods for HTML, JSON, URLs, and others."""
import htmlentitydefs
import re
import xml.sax.saxutils
import urllib
# json module is in the standard library as of python 2.6; fall back to
# simplejson if present for older versions.
import json
assert hasattr(json, "loads") and hasattr(json, "dumps")
_json_decode = json.loads
_json_encode = json.dumps
import simplejson
_json_decode = lambda s: simplejson.loads(_unicode(s))
_json_encode = lambda v: simplejson.dumps(v)
except ImportError:
# For Google AppEngine
from django.utils import simplejson
_json_decode = lambda s: simplejson.loads(_unicode(s))
_json_encode = lambda v: simplejson.dumps(v)
except ImportError:
def _json_decode(s):
raise NotImplementedError(
"A JSON parser is required, e.g., simplejson at "
_json_encode = _json_decode
def xhtml_escape(value):
"""Escapes a string so it is valid within XML or XHTML."""
return xml.sax.saxutils.escape(value, {'"': """})
def xhtml_unescape(value):
"""Un-escapes an XML-escaped string."""
return re.sub(r"&(#?)(\w+?);", _convert_entity, _unicode(value))
def json_encode(value):
"""JSON-encodes the given Python object."""
# JSON permits but does not require forward slashes to be escaped.
# This is useful when json data is emitted in a <script> tag
# in HTML, as it prevents </script> tags from prematurely terminating
# the javscript. Some json libraries do this escaping by default,
# although python's standard library does not, so we do it here.
return _json_encode(value).replace("</", "<\\/")
def json_decode(value):
"""Returns Python objects for the given JSON string."""
return _json_decode(value)
def squeeze(value):
"""Replace all sequences of whitespace chars with a single space."""
return re.sub(r"[\x00-\x20]+", " ", value).strip()
def url_escape(value):
"""Returns a valid URL-encoded version of the given value."""
return urllib.quote_plus(utf8(value))
def url_unescape(value):
"""Decodes the given value from a URL."""
return _unicode(urllib.unquote_plus(value))
def utf8(value):
if isinstance(value, unicode):
return value.encode("utf-8")
assert isinstance(value, str)
return value
# I originally used the regex from
# but it gets all exponential on certain patterns (such as too many trailing
# dots), causing the regex matcher to never return.
# This regex should avoid those problems.
_URL_RE = re.compile(ur"""\b((?:([\w-]+):(/{1,3})|www[.])(?:(?:(?:[^\s&()]|&amp;|&quot;)*(?:[^!"#$%&'()*+,.:;<=>?@\[\]^`{|}~\s]))|(?:\((?:[^\s&()]|&amp;|&quot;)*\)))+)""")
def linkify(text, shorten=False, extra_params="",
require_protocol=False, permitted_protocols=["http", "https"]):
"""Converts plain text into HTML with links.
For example: linkify("Hello!") would return
Hello <a href=""></a>!
shorten: Long urls will be shortened for display.
extra_params: Extra text to include in the link tag,
e.g. linkify(text, extra_params='rel="nofollow" class="external"')
require_protocol: Only linkify urls which include a protocol. If this is
False, urls such as will also be linkified.
permitted_protocols: List (or set) of protocols which should be linkified,
e.g. linkify(text, permitted_protocols=["http", "ftp", "mailto"]).
It is very unsafe to include protocols such as "javascript".
if extra_params:
extra_params = " " + extra_params.strip()
def make_link(m):
url =
proto =
if require_protocol and not proto:
return url # not protocol, no linkify
if proto and proto not in permitted_protocols:
return url # bad protocol, no linkify
href =
if not proto:
href = "http://" + href # no proto specified, use http
params = extra_params
# clip long urls. max_len is just an approximation
max_len = 30
if shorten and len(url) > max_len:
before_clip = url
if proto:
proto_len = len(proto) + 1 + len( or "") # +1 for :
proto_len = 0
parts = url[proto_len:].split("/")
if len(parts) > 1:
# Grab the whole host part plus the first bit of the path
# The path is usually not that interesting once shortened
# (no more slug, etc), so it really just provides a little
# extra indication of shortening.
url = url[:proto_len] + parts[0] + "/" + \
if len(url) > max_len * 1.5: # still too long
url = url[:max_len]
if url != before_clip:
amp = url.rfind('&')
# avoid splitting html char entities
if amp > max_len - 5:
url = url[:amp]
url += "..."
if len(url) >= len(before_clip):
url = before_clip
# full url is visible on mouse-over (for those who don't
# have a status bar, such as Safari by default)
params += ' title="%s"' % href
return u'<a href="%s"%s>%s</a>' % (href, params, url)
# First HTML-escape so that our strings are all safe.
# The regex is modified to avoid character entites other than &amp; so
# that we won't pick up &quot;, etc.
text = _unicode(xhtml_escape(text))
return _URL_RE.sub(make_link, text)
def _unicode(value):
if isinstance(value, str):
return value.decode("utf-8")
assert isinstance(value, unicode)
return value
def _convert_entity(m):
if == "#":
return unichr(int(
except ValueError:
return "&#%s;" %
except KeyError:
return "&%s;" %
def _build_unicode_map():
unicode_map = {}
for name, value in htmlentitydefs.name2codepoint.iteritems():
unicode_map[name] = unichr(value)
return unicode_map
_HTML_UNICODE_MAP = _build_unicode_map()
Jump to Line
Something went wrong with that request. Please try again.