In [12]:
from IPython.display import Markdown, display
import re

# Introduction

## Reference
This notebook follows the examples and explanations in 
Chapter 2 of Speech and Language Processing by Jurafsky and Martin [here](https://web.stanford.edu/~jurafsky/slp3/2.pdf)

## Basic Usages

In [16]:
# re.match() searches from the beginning
# re.search(): search anywhere
pattern = 'searchme'
s = 'where is searchme, find it.'
r = re.search(pattern, s)
display(Markdown(s[:r.start()] + '**' + s[r.start():r.end()] + '**' + s[r.end():]))

where is **searchme**, find it.

## Display
* For each example, the pattern, the string to search and the matching part (if any) are displayed using the function below.
* The matching part is surrounded by / 

In [89]:
def search_print(pattern: str, comment: str, string: str):
    print(pattern, ':', comment)

    r = re.search(pattern, string)
    if r is None:
        print('\t' + string + ' : no match')
    else:
        s = string
        print('\t' + string + ' : ' + s[:r.start()] + '/' + s[r.start():r.end()] + '/' + s[r.end():])

# Basic

## concatenation
just a sequence of *simple* characters

In [91]:
search_print('woodchucks', 'matches woodchucks', 'intersting links to woodchucks and lemurs')
search_print('a', 'matched a', 'Mary')
search_print('!', 'matches !', 'Again!')

# case sensitive
search_print('woodchuck', 'to Woodchucks and', 'case sensitive')

woodchucks : matches woodchucks
	intersting links to woodchucks and lemurs : intersting links to /woodchucks/ and lemurs
a : matched a
	Mary : M/a/ry
! : matches !
	Again! : Again/!/
woodchuck : to Woodchucks and
	case sensitive : no match


## disjunction: []
For 'OR'

In [92]:
search_print('[wW]oodchuck', 'Woodchuck or woodchuck', 'to Woodchuck and')
search_print('[abc]', 'a, b, or c', 'in soldati')
search_print('[1234567890]', 'any digit', 'plenty of 7 to 5')

[wW]oodchuck : Woodchuck or woodchuck
	to Woodchuck and : to /Woodchuck/ and
[abc] : a, b, or c
	in soldati : in sold/a/ti
[1234567890] : any digit
	plenty of 7 to 5 : plenty of /7/ to 5


## Range: [ - ]
Use '-' within square braces

In [93]:
search_print('[A-Z]', 'upper case letter', 'we Shall', )
search_print('[a-z]', 'lower case letter', 'my beans')
search_print('[0-9]', 'single digit', 'Chapter 1, Down...')

[A-Z] : upper case letter
	we Shall : we /S/hall
[a-z] : lower case letter
	my beans : /m/y beans
[0-9] : single digit
	Chapter 1, Down... : Chapter /1/, Down...


## negation
use caret ^
* **only** if appears as the **first symbol** after the **open square brace**.
* otherwise, it means ^. In Python, looks like we need a backslash \

In [98]:
search_print('[^A-Z]', 'not an upper case letter', 'Oyfn pr')
search_print('[^Ss]', 'neither S nor s', 'I have no')
search_print('[^.]', 'not a period', 'our re')
search_print('[e^]', 'either e or ^', 'look up ^ now')
search_print('[a^b]', 'supposedly, patter a^b. But, looks like a or b in Python.', 'look up a^b now')
search_print('[a\^b]', 'pattern a^b (works with Python)', 'look up a^b now')

[^A-Z] : not an upper case letter
	Oyfn pr : O/y/fn pr
[^Ss] : neither S nor s
	I have no : /I/ have no
[^.] : not a period
	our re : /o/ur re
[e^] : either e or ^
	look up ^ now : look up /^/ now
[a^b] : supposedly, patter a^b. But, looks like a or b in Python.
	look up a^b now : look up /a/^b now
[a\^b] : pattern a^b (works with Python)
	look up a^b now : look up /a/^b now


## Optional: ? 
?: the preceding character or nothing. i.e. zero or one instances of the previous character

In [96]:
search_print('woodchucks?', 'woodchuck or woodchucks', 'woodchuck')
search_print('colou?r', 'color or colour', 'color')

woodchucks? : woodchuck or woodchucks
	woodchuck : /woodchuck/
colou?r : color or colour
	color : /color/


## Kleen *
**zero** or **more** occurrences of the immediately previous character or regular expression

In [108]:
search_print('a*', 'a, aaa, bbb (as has zero a)', 'aa')
search_print('a*', 'a, aaa, bbb (as has zero a)', 'bbb')
search_print('aa*', "a followed by zero or more a's", 'baa!')
search_print('[ab]*', "zero or more a's or b's", 'zooabab')
search_print('zoo[ab]*', "zero or more a's or b's", 'zooabab')
search_print('[0-9][0-9]*', 'integer', 'bd223bd')


a* : a, aaa, bbb (as has zero a)
	aa : /aa/
a* : a, aaa, bbb (as has zero a)
	bbb : //bbb
aa* : a followed by zero or more a's
	baa! : b/aa/!
[ab]* : zero or more a's or b's
	zooabab : //zooabab
zoo[ab]* : zero or more a's or b's
	zooabab : /zooabab/
[0-9][0-9]* : integer
	bd223bd : bd/223/bd
