### Regular Expressions

In [1]:
import re

In [11]:
# Basic search: re.search scans the string and returns a Match object for the
# first place the pattern occurs (or None when it occurs nowhere).
s = "hello, world"
m = re.search('ell', s)

# Inspect the Match object, one attribute per line of output.
print(m.group())      # the matched text
print(m.start())      # index where the match begins
print(m.end())        # index just past the last matched character
print(m.span())       # (start, end) as a tuple
print(m.groups())     # captured subgroups; empty because the pattern has none
print(m.re)           # the compiled pattern that produced the match
print(m.string)       # the string that was searched
print(m.pos)          # index at which the search started
print(m.endpos)       # index beyond which the search did not look
print(m.lastindex)    # last matched capturing group index; None without groups
print(m.lastgroup)    # last matched named group; None without named groups

# start() and end() follow slice conventions (the end index is exclusive), so
# slicing the searched string with them reproduces the matched text.
print(s[m.start():m.end()])

ell
1
4
(1, 4)
()
re.compile('ell')
hello, world
0
12
None
None
ell


In [13]:
# re.search yields None (rather than a Match object) when the pattern
# cannot be found anywhere in the string.
print(re.search(pattern='h', string='world'))

None


In [16]:
# re.split cuts the string at every occurrence of the pattern and returns
# the pieces in between as a list.
s = "my email is zmian@outlook.com"
split_term = '@'
re.split(pattern=split_term, string=s)

['my email is zmian', 'outlook.com']

In [18]:
s = 'To be or not to be bee'
# re.findall collects every non-overlapping occurrence of the pattern,
# including the 'be' at the start of 'bee'.
re.findall(pattern='be', string=s)

['be', 'be', 'be']

#### re Pattern Syntax

In [19]:
def multi_re_find(patterns, phrase):
    """
    Run each regex in `patterns` against `phrase` and print what it finds.

    For every pattern, three things are printed in order: a header line
    showing the pattern's repr, the list returned by re.findall, and a
    blank-line separator. Nothing is returned.
    """
    for regex in patterns:
        header = 'Searching the phrase using the re check: %r' % regex
        print(header)
        found = re.findall(regex, phrase)
        print(found)
        print('\n')
        
# Sample text: runs of 's' and 'd' separated by dots.
test_phrase = 'sdsd..sssddd...sdddsddd...dsds...dsssss...sdddd'

# Quantifier demos: every pattern is an 's' followed by some number of 'd'.
test_patterns = [
    'sd*',       # d repeated zero or more times
    'sd+',       # d repeated one or more times
    'sd?',       # d is optional (zero or one time)
    'sd{3}',     # d repeated exactly three times
    'sd{2,3}',   # d repeated two to three times
]

multi_re_find(test_patterns, test_phrase)

Searching the phrase using the re check: 'sd*'
['sd', 'sd', 's', 's', 'sddd', 'sddd', 'sddd', 'sd', 's', 's', 's', 's', 's', 's', 'sdddd']


Searching the phrase using the re check: 'sd+'
['sd', 'sd', 'sddd', 'sddd', 'sddd', 'sd', 'sdddd']


Searching the phrase using the re check: 'sd?'
['sd', 'sd', 's', 's', 'sd', 'sd', 'sd', 'sd', 's', 's', 's', 's', 's', 's', 'sd']


Searching the phrase using the re check: 'sd{3}'
['sddd', 'sddd', 'sddd', 'sddd']


Searching the phrase using the re check: 'sd{2,3}'
['sddd', 'sddd', 'sddd', 'sddd']




In [20]:
#### Character sets

In [21]:
# Character sets: [...] matches any single character listed inside the brackets.
# The search text is the s/d sample string assigned to test_phrase earlier.
test_patterns = [
    '[sd]',     # one character that is either s or d
    's[sd]+',   # an s, then one or more characters that are each s or d
]

multi_re_find(test_patterns, test_phrase)

Searching the phrase using the re check: '[sd]'
['s', 'd', 's', 'd', 's', 's', 's', 'd', 'd', 'd', 's', 'd', 'd', 'd', 's', 'd', 'd', 'd', 'd', 's', 'd', 's', 'd', 's', 's', 's', 's', 's', 's', 'd', 'd', 'd', 'd']


Searching the phrase using the re check: 's[sd]+'
['sdsd', 'sssddd', 'sdddsddd', 'sds', 'sssss', 'sdddd']




#### Exclusion

In [24]:
# A leading ^ inside [...] negates the set: match runs of characters that are
# NOT '!', '.', '?', ',' or a space, which leaves just the words.
test_phrase = 'I ran. Tomorrow I will run! Will you, too?'
re.findall(pattern='[^!.?, ]+', string=test_phrase)

['I', 'ran', 'Tomorrow', 'I', 'will', 'run', 'Will', 'you', 'too']

#### Character matches

In [25]:
test_phrase = 'This is an example sentence. Lets see if we can find some letters.'

# Ranges inside a character set: a-z and A-Z select letters by case.
test_patterns = [
    '[a-z]+',        # runs of lower-case letters
    '[A-Z]+',        # runs of upper-case letters
    '[a-zA-Z]+',     # runs of letters of either case
    '[A-Z][a-z]+',   # one upper-case letter, then one or more lower-case letters
]

multi_re_find(test_patterns, test_phrase)

Searching the phrase using the re check: '[a-z]+'
['his', 'is', 'an', 'example', 'sentence', 'ets', 'see', 'if', 'we', 'can', 'find', 'some', 'letters']


Searching the phrase using the re check: '[A-Z]+'
['T', 'L']


Searching the phrase using the re check: '[a-zA-Z]+'
['This', 'is', 'an', 'example', 'sentence', 'Lets', 'see', 'if', 'we', 'can', 'find', 'some', 'letters']


Searching the phrase using the re check: '[A-Z][a-z]+'
['This', 'Lets']




#### Escape Codes

|Code|Meaning|
|:-|:-|
|\d|a digit |
|\D |a non-digit |
|\s |whitespace (tab, space, newline, etc.) |
|\S |non-whitespace |
|\w |alphanumeric or underscore (a "word" character) |
|\W |anything other than alphanumeric or underscore |

In [33]:
test_phrase = 'There are 200 million galaxies in the universe (some argue 2 trillion)! #Amazing'

# Raw strings (r'...') leave each backslash as a literal character, so the
# regex engine receives \d, \s, \w and friends exactly as written.
test_patterns = [
    r'\d',    # one digit
    r'\d+',   # a run of digits
    r'\D+',   # a run of non-digit characters
    r'\s+',   # a run of whitespace
    r'\S+',   # a run of non-whitespace characters
    r'\w+',   # a run of word characters (letters, digits, underscore)
    r'\W+',   # a run of non-word characters
]

multi_re_find(test_patterns, test_phrase)

Searching the phrase using the re check: '\\d'
['2', '0', '0', '2']


Searching the phrase using the re check: '\\d+'
['200', '2']


Searching the phrase using the re check: '\\D+'
['There are ', ' million galaxies in the universe (some argue ', ' trillion)! #Amazing']


Searching the phrase using the re check: '\\s+'
[' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ']


Searching the phrase using the re check: '\\S+'
['There', 'are', '200', 'million', 'galaxies', 'in', 'the', 'universe', '(some', 'argue', '2', 'trillion)!', '#Amazing']


Searching the phrase using the re check: '\\w+'
['There', 'are', '200', 'million', 'galaxies', 'in', 'the', 'universe', 'some', 'argue', '2', 'trillion', 'Amazing']


Searching the phrase using the re check: '\\W+'
[' ', ' ', ' ', ' ', ' ', ' ', ' ', ' (', ' ', ' ', ' ', ')! #']


