# Simplifying Regular Expressions

Rewrite each regular expression to be simpler. (*Simpler* isn't a super-precise technical term. Often, shorter is simpler, but not always.)

In [30]:
import re
def abbrev(s):
    if len(s) > 20:
        return s[:10] + "..." + s[-10:]
    return s
def print_match(r, s):
    print "re.match(%s, %s) = %s" % (abbrev(repr(r)), abbrev(repr(s)), bool(re.match(r, s)))

In [33]:
r = "^[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]$"
print_match(r, str(2**46))
print_match(r, str(2**50))
print_match(r, str(2**54))

re.match('^[0-9][0-...-9][0-9]$', '70368744177664') = False
re.match('^[0-9][0-...-9][0-9]$', '1125899906842624') = True
re.match('^[0-9][0-...-9][0-9]$', '18014398509481984') = False


In [None]:
r = r"^http://|^https://"
print_match(r, "http://example.com")
print_match(r, "https://example.com")
print_match(r, "url=http://example.com")

In [23]:
r = "^[hH][tT][tT][pP][sS]://"
print_match(r, "https://example.com")
print_match(r, "HTTPS://EXAMPLE.COM")
print_match(r, "http://example.com")

re.match('[hH][tT][tT][pP][sS]', 'https://example.com') = True
re.match('[hH][tT][tT][pP][sS]', 'HTTPS://EXAMPLE.COM') = True
re.match('[hH][tT][tT][pP][sS]', 'http://example.com') = False


In [35]:
r = ".*mad|.*mid|.*mod|.*mud"
print_match(r, "mad hatter")
print_match(r, "rain makes mud")
print_match(r, "medical bay")

re.match('.*mad|.*m...mod|.*mud', 'mad hatter') = True
re.match('.*mad|.*m...mod|.*mud', 'rain makes mud') = True
re.match('.*mad|.*m...mod|.*mud', 'medical bay') = False


In [38]:
r = "^[abcdefghjklmnopqrstuvwxyz]+$"
print_match(r, "hello")
print_match(r, "world")
print_match(r, "igloo")

re.match('^[abcdefg...uvwxyz]+$', 'hello') = True
re.match('^[abcdefg...uvwxyz]+$', 'world') = True
re.match('^[abcdefg...uvwxyz]+$', 'igloo') = False


# Manipulating matches

Modify the code as instructed in the comments.

In [45]:
def reorder_columns(text):
    """Swaps the second and fourth columns of a multi-line text.
    
    Example:
    
        reorder_columns("1 2 3 4\na b c d\nalpha beta gamma delta\n")
          => "1 4 3 2\na d c b\nalpha delta gamma beta\n"
    """
    matches = re.findall(r"^(\s*\S+\s+)(\S+)(\s+\S+\s+)(\S+)(\s*)$", text, re.M)
    # Finish this method. You can change the code above too, if you want.
    for n, m in enumerate(matches):
        print "Match #%d: groups=%s" % (n, m)

reorder_columns("1 2 3 4\na b c d\nalpha beta gamma delta\n")

Match #0: groups=('1 ', '2', ' 3 ', '4', '')
Match #1: groups=('a ', 'b', ' c ', 'd', '')
Match #2: groups=('alpha ', 'beta', ' gamma ', 'delta', '\n')
1 4 3 2
a d c b
alpha delta gamma beta



In [52]:
def count_vowels(text):
    # Re-implement more efficiently. My best solution took
    # under a second in the timeit.
    count = 0
    for c in text:
        if c in "aeiou":
            count += 1
    return count

s = "hello world " * 2000000
%timeit count_vowels(s)

1 loop, best of 3: 7.81 s per loop


In [60]:
def add_emphasis(text, keywords):
    # Only emphasize the words in the set `keywords`. Other words should remain unchanged.
    def emphasize(word):
        return word.group().upper()
    return re.sub(r"\w+", emphasize, text)

add_emphasis("This is a very very long sentence. It took a very long time "
             "to type. I'm really sorry you had to read the whole thing.",
            {"very", "really", "whole"})

"This is a VERY VERY long sentence. It took a VERY long time to type. I'm REALLY sorry you had to read the WHOLE thing."