In [None]:
# NOTE: this notebook is Python 2 (print statements, `unicode` builtin).
# The same Greek text, first as a plain byte string (`str`)...
string = "Γειά σου Κόσμε"
print string
# repr() shows the individual bytes as escapes instead of rendered characters.
print repr(string), '\n'

# ...and then as a unicode string.
ustring = u"Γειά σου Κόσμε"
print ustring
# repr() of the unicode object shows code points rather than bytes.
print repr(ustring)
# Encoding to UTF-8 produces a byte string again (presumably the same bytes
# as `string`, assuming the literal above was stored as UTF-8).
print repr(ustring.encode('utf-8'))

In [None]:
# Alright, let's start breaking things.
# str() of something that is already a byte string changes nothing.
print str(string)
# str() of a unicode string has to encode it with the default codec (ASCII
# unless the interpreter was reconfigured), so the non-ASCII text is expected
# to raise UnicodeEncodeError.
print str(ustring)

In [None]:
# So str-casting unicode with non-ascii characters is out.
# "".format()-ing is essentially the same thing: a byte-string template has
# to turn each argument into a byte string.
print "{}".format(string)
# Expected to fail the same way str(ustring) does.
print "{}".format(ustring)

In [None]:
# The args must be able to convert to the format string's type, so let's try
# a unicode template with a unicode argument:
print u"{}".format(ustring)

# But what if the unicode template is handed the non-ASCII byte string?
print u"{}".format(string)

In [None]:
# ...yeah, no: the byte string has to be decoded implicitly, and the default
# codec (ASCII) cannot decode these bytes.
# Ok, so what if we unicode cast it?
# unicode() without an explicit encoding falls back to that same default codec.
print u"{}".format(unicode(string))

In [None]:
# Not like that, clearly.
# Naming the encoding tells unicode() how to decode the bytes.
print u"{}".format(unicode(string, 'utf-8'))

In [None]:
# Better.
# Just be careful how you cast unicode strings...
# unicode() of a value that is already unicode is fine:
print unicode(ustring)
# ...but passing an encoding asks unicode() to *decode* its argument, and a
# unicode object cannot be decoded again (Python 2 raises
# "TypeError: decoding Unicode is not supported").
print unicode(ustring, 'utf-8')

In [None]:
# So we can't just use `unicode(s, 'utf-8')` on any `basestring`.
# If the value might be either str or unicode, decode only when it is a str
# and pass everything else to unicode() without an encoding:
print unicode(string, 'utf-8') if isinstance(string, str) else unicode(string)
print unicode(ustring, 'utf-8') if isinstance(ustring, str) else unicode(ustring)

In [None]:
# This is almost identical to the `unicode_str` function I've added in a couple repos,
# but my function adds one more arg. Why?
# chr(255) is the single byte 0xFF, which can never appear in valid UTF-8.
bad_string = 'foo' + chr(255) + 'bar'
print bad_string
print repr(bad_string)
# Decoding is strict by default, so this is expected to raise
# UnicodeDecodeError on the 0xFF byte.
print unicode(bad_string, 'utf-8')

In [None]:
# The third arg to unicode() is how to respond to decoding errors.
# The default, obviously, is to raise the error.
# You can also ignore the offending bytes (they are silently dropped)
print unicode(bad_string, 'utf-8', 'ignore')
# or replace them with the replacement character (U+FFFD). I do this in unicode_str.
print unicode(bad_string, 'utf-8', 'replace')

In [None]:
# What about printing/writing json?
import json

# Neither call breaks, but the output is not UTF-8: with its default settings
# json.dumps writes non-ASCII characters as \uXXXX escapes, so the result is
# plain ASCII.
print json.dumps([string])
print json.dumps([ustring])

# Because the dumped text is ASCII-only, writing it to a file is not a problem.
with open('sample.json', 'w') as f:
    f.write(json.dumps([string, ustring]))

# Read the file back and show it three ways: raw, as a repr, and parsed.
with open('sample.json') as f:
    text = f.read()
    print '\n', text
    print repr(text)
    print json.loads(text)

In [None]:
# That's because json.dumps defaults to ensure_ascii=True, which escapes every
# non-ASCII character instead of emitting it (so the output is ASCII, not UTF-8).
# If we tell it not to...
# (repr() is used so the type and exact contents of each result are visible.)
print repr(json.dumps([string], ensure_ascii=False))
print repr(json.dumps([ustring], ensure_ascii=False))

In [None]:
# And while we can write the result for the str input (it is already bytes),
with open('no-error.json', 'w') as f:
    f.write(json.dumps([string], ensure_ascii=False))

# Read it back and show it raw, as a repr, and parsed.
with open('no-error.json') as f:
    text = f.read()
    print text
    print repr(text)
    print json.loads(text)

# we can't write the result for the unicode input: the file object has to turn
# it into bytes itself and uses the default (ASCII) codec, so this is expected
# to raise UnicodeEncodeError.
with open('error.json', 'w') as f:
    f.write(json.dumps([ustring], ensure_ascii=False))

In [None]:
# If the strings are unicode, we can then encode the json.dumps output
# explicitly to UTF-8 before printing or writing it.
print json.dumps([ustring], ensure_ascii=False).encode('utf-8')

with open('sample2.json', 'w') as f:
    f.write(json.dumps([ustring], ensure_ascii=False).encode('utf-8'))

# Read the file back and show it raw, as a repr, and parsed.
with open('sample2.json') as f:
    text = f.read()
    print '\n', text
    print repr(text)
    print json.loads(text)

# But if they're not... calling .encode() on a byte string first decodes it
# implicitly with the default (ASCII) codec, so the non-ASCII bytes are
# expected to raise UnicodeDecodeError.
print json.dumps([string], ensure_ascii=False).encode('utf-8')

In [None]:
# mongoengine/MongoDB, requests, Flask, and click all use unicode strings.
# I'm not sure if click does this in all cases, or just infers when a string needs to be unicode.
# In any case, our main hangup seems to be that we keep trying to str()-cast and "".format() basestrings that
# may in fact contain non-ascii characters. Let's stop that.
# If we want to ensure something is unicode (not merely some basestring):

def unicode_str(s):
    """Return `s` as a text (unicode) string, decoding byte strings as UTF-8.

    Byte strings (`str` on Python 2, `bytes` on Python 3) are decoded as
    UTF-8 with the 'replace' error handler, so bytes that are not valid
    UTF-8 become U+FFFD instead of raising UnicodeDecodeError. Any other
    value is passed to the text constructor without an encoding: text comes
    back as-is and other objects are converted the way `unicode(obj)`
    (Python 2) or `str(obj)` (Python 3) converts them.
    """
    try:
        text_type = unicode  # Python 2
    except NameError:
        text_type = str  # Python 3: `str` already is the text type
    # `bytes` is an alias of `str` on Python 2.6+, so on Python 2 this is the
    # same check as `isinstance(s, str)`; on Python 3 it selects real bytes.
    if isinstance(s, bytes):
        return text_type(s, 'utf-8', 'replace')
    return text_type(s)

ustrings = map(unicode_str, [string, ustring, bad_string])
for u in ustrings:
    print u, '\t\t', repr(u)
    print u"{}".format(u), '\t\t', repr(u"{}".format(u)), '\n'

In [None]:
with open('sample3.json', 'w') as f:
    f.write(json.dumps(map(unicode_str, [string, ustring, bad_string]), indent=2, ensure_ascii=False).encode('utf-8'))

with open('sample3.json') as f:
    text = f.read()
    print '\n', text
    print repr(text)
    print json.loads(text)