# Regular Expressions

RegEx - pattern matching language
Used for searches, find, replace
An expression is built from a string of characters: metacharacters (which have a special meaning) and regular characters (which match themselves literally)
Simple to learn, hard + long to master

Regex is efficient for finding patterns in text

Regex allows string searching and manipulation - frequently used for web scraping

In [12]:
# Metacharacters
# . - matches any single character (except a newline, unless re.S is set)
# [ ] - matches a single character contained in [], can use ranges [a-z]
# [^ ] - matches a single char not contained in []
# ^ - matches expression if at the start of the string
# $ - matches expression if at the end of the string
# () - groups a sub-expression and captures the text it matches (like brackets in BODMAS)
# * - matches the preceding element zero or more times

# Option Flags
# re.I - ignore case matching
# re.M - makes $ match end of line and ^ start of line
# re.S - makes . match any char, even new line char
# re.U - interprets in Unicode
# re.X - verbose mode: ignores whitespace (and allows comments) within the pattern

# Base Methods:
# match() - checks to see if the expression matches at the start of the string
# search() - checks to see if there is a match anywhere in the string

# for performance - compile the pattern once (don't recompile if using the same pattern over and over)
# my_reg = re.compile(pattern)
# then result = my_reg.match/search(string)

In [2]:
# when constructing a pattern/expression - use a raw string, r'pattern', to avoid issues with Python's own backslash escapes

In [5]:
import re # import regex

def main():
    """Contrast re.match() with re.search() on a sample sentence.

    Prints the outcome of each call. Takes no arguments and returns None.
    """
    line = 'I think I understand regular expressions'
    flags = re.M | re.I

    # match() only succeeds when the pattern matches at the very START of
    # the string, so 'think' (which comes after 'I ') is not found here.
    match_result = re.match(r'think', line, flags)
    if match_result:  # a match object is truthy, None means no match
        print('Match Found: {}'.format(match_result.group()))
    else:
        print('No match found...')

    # search() scans the whole string and returns the first match anywhere.
    search_result = re.search(r'think', line, flags)
    if search_result:
        print('Search found: {}'.format(search_result.group()))
    else:
        print('Nothing found in search')

In [6]:
main()

No match found...
Search found: think


In [7]:
# lookup program - takes a textfile and word to search for - find word in text file
import re
# import argparse
def lookup(word, filename):
    """Print every line of ``filename`` that contains ``word``.

    ``word`` is handed to the regex engine as a pattern and matched
    case-insensitively. Each hit is printed as
    ``<line number> : <line text>``, with line numbers starting at 1.
    Returns None.
    """
    # compile once - the same pattern is reused for every line of the file
    pattern = re.compile(word, re.M | re.I)
    # 'with' guarantees the file is closed, even if reading or matching raises
    with open(filename) as search_file:
        # iterate the file lazily instead of loading it all with readlines()
        for line_number, line in enumerate(search_file, 1):
            line = line.strip('\n\r')  # strip newline and CR chars
            if pattern.search(line):
                # found word in line
                print('{} : {}'.format(line_number, line))

In [8]:
ls # check what files we have here

banner.p                                 List Comprehension.ipynb
[0m[01;32mbull.py[0m*                                 [01;32mlook_and_say.py[0m*
[01;32mchannel.py[0m*                              [01;32mmap_reduce_fillter.py[0m*
channel.pyc                              [01;35moxygen.png[0m
Decorators.ipynb                         [01;32moxygen.py[0m*
equaility.txt                            [01;32mpickle_py.py[0m*
equality.txt                             pickle_py.pyc
[01;32mfind_chars_equlity.py[0m*                   [01;32mpy_main2.py[0m*
[01;32mfind_chars.py[0m*                           [01;32mpy_main.py[0m*
[01;35mfirst+second.png[0m                         random_char.txt
Generators.ipynb                         README.md
[01;32mgood.py[0m*                                 Regular Expressions (RegEx).ipynb
High Level Functional Programming.ipynb  str() vs repr().ipynb
[01;32mintegrity.py[0m*                            Things to learn:


In [10]:
lookup('list', 'map_reduce_fillter.py') # run the lookup function

23 : # using map - map allows you to map a function to every item in a list. map returns a new list of
24 : # items that are the items in the original list modified by the given func
27 : # using filter - we can filter a list to get items that return true to a given test function
52 : # the above is all nice and good . . .but LIST COMPREHENSION is where the real magic lies
53 : print 'List Comprehension....'


In [13]:
# search and replace
# python implements search & replace using a function called sub()
# sub(pattern, repl, string, count=0) - count=0 (the default) means replace every occurrence
# takes a pattern, a string to replace it with and a string to search through, plus how many replacements are allowed

In [14]:
# second tutorial .. . . .
import re

# '\s+' - one or more whitespace chars; split the string on them - produces a list
# ('\s*' can also match the empty string, and Python 3.7+ would then split between every character)
print(re.split(r'\s+', 'here are some words'))
# r'' is a raw string: backslashes are kept as-is, so '\s' reaches the regex engine untouched

['here', 'are', 'some', 'words']


In [15]:
# added () - a capture group: the separators we split on are now included in the result
# ('+' rather than '*' so the pattern can never match the empty string)
print(re.split(r'(\s+)', 'here are some words'))

['here', ' ', 'are', ' ', 'some', ' ', 'words']


In [17]:
# removed the '\' - 's' is now a normal (literal) char, so we split on runs of the letter s
# ('+' rather than '*' so the pattern can never match the empty string)
print(re.split(r'(s+)', 'here are some words'))

['here are ', 's', 'ome word', 's', '']


In [18]:
# [a-f] - a through f: abcdef - CaSe SENSITIVE (by default), so the upper case tail is left alone
print(re.split(r'[a-f]', 'fhasdjkflkjfheofhijgfbgojbfgbkgblkrngjklbgkjlnlkcbnlSADSACFGFD'))

['', 'h', 's', 'jk', 'lkj', 'h', 'o', 'hijg', '', 'goj', '', 'g', 'kg', 'lkrngjkl', 'gkjlnlk', '', 'nlSADSACFGFD']


In [33]:
# [a-fA-F] - list both ranges so upper and lower case letters are split on
# (alternative: keep [a-f] and pass flags=re.I for case-insensitive matching;
#  re.M makes ^ and $ match at the start/end of every line of a multi-line string)
print(re.split(r'[a-fA-F]', 'fhasdjkflkjfheofhijgfbgojbfgbkgblkrngjklbgkjlnlkcbnlSADSACFGFD'))

['', 'h', 's', 'jk', 'lkj', 'h', 'o', 'hijg', '', 'goj', '', 'g', 'kg', 'lkrngjkl', 'gkjlnlk', '', 'nlS', '', 'S', '', '', 'G', '', '']


In [35]:
# [a-f][a-f] - split only where two a-f chars sit next to each other
print(re.split(r'[a-f][a-f]', 'hjhjhjhjkhjkhjkfajlkjkljkjlkcd'))

['hjhjhjhjkhjkhjk', 'jlkjkljkjlk', '']


In [36]:
# find an address in a string
# \d - digits
# \D - non-digits
# \S - non-space
# want to find all instances of a pattern --> use re.findall
# the sample string is: random chars, street number, space, street name, random chars
# '\d' on its own matches one digit at a time, so each digit comes back separately
print(re.findall(r'\d', 'ocinwa324 main st.asdasd'))

['3', '2', '4']


In [38]:
# as seen above, it finds all the digits and returns them separately, want to specify quantities of chars to look for
# * look for 0 or more instances
# + look for 1 or more instances
# ? look for 0 or 1 (only)
# {min,max} look for between min and max instances
# {5} look for exactly five instances
# the quantity specifier goes after the metacharacter it applies to
# '\d{1,5}' - a run of between 1 and 5 digits is now returned as a single string
print(re.findall(r'\d{1,5}', 'ocinwa324 main st.asdasd'))

['324']


In [41]:
# '\w' - alphanumeric
# digits, space, word, space, word, then stop at the '.'
# the '.' must be escaped ('\.') because a bare '.' matches any char
print(re.findall(r'\d{1,5}\s\w+\s\w+\.', 'ocinwa324 main st.asdasd'))

['324 main st.']


In [57]:
# practical example
# parsing web pages --> intro to web scraping
import re
import urllib  # urllib - grab websites
try:
    import urllib.request  # Python 3: urlopen lives in urllib.request
    urlopen = urllib.request.urlopen
except ImportError:
    urlopen = urllib.urlopen  # Python 2: urlopen lives directly in urllib

sites = 'google yahoo cnn msn'.split()
# Compile the pattern once, outside the loop, rather than passing the raw
# pattern string to re.findall() for every site.
# The group (...) makes findall() return only the text between the tags;
# without the group it would return the whole match, <title> and </title> included.
# '.*?' is non-greedy so it stops at the first </title>, and re.S lets a
# title span more than one line.
pat = re.compile(r'<title>(.*?)</title>', re.I | re.S)
for site in sites:
    print('Searching: {}'.format(site))
    try:
        u = urlopen('http://{}.com'.format(site))
        try:
            text = u.read()
        finally:
            u.close()  # always release the connection
    except IOError as err:
        # a failed fetch must not fall through and reuse the previous site's data
        print('Could not fetch {}: {}'.format(site, err))
        continue
    if not isinstance(text, str):
        # Python 3 returns bytes; decode instead of searching the bytes repr
        text = text.decode('utf-8', 'replace')
    title = pat.findall(text)

    if title:
        print(title[0])
    else:
        print('No <title> found for {}'.format(site))

Searching: google
Google
Searching: yahoo
Yahoo
Searching: cnn
CNN - Breaking News, U.S., World, Weather, Entertainment & Video News
Searching: msn
MSN South Africa | Latest News, Live Scores, Hotmail and Outlook
