# greedy vs lazy

In [1]:
import re # regular expression

## concept

In [23]:
# Sample text: every price is followed by a size label in parentheses.
# The cells below work towards extracting the text inside each pair of parentheses.
s = 'price 30 (small S), 45 (medium M), 60 (large L)'

In [7]:
# Greedy is the default: `.*` takes as much as it can, so the match runs from
# the first '(' all the way to the last ')'.
re.compile(r'\(.*\)').findall(s)

['(small S), 45 (medium M), 60 (large L)']

In [21]:
# Lazy: a `?` after a quantifier (`??`, `*?`, `+?`, `{m,n}?`) makes it take as
# little as possible, so each match ends at the nearest ')'.
# Raw string so that `\(` reaches the regex engine instead of being treated as
# an (invalid) Python string escape.
re.findall(r'\(.*?\)', s)

['(small S)', '(medium M)', '(large L)']

In [32]:
# Lookbehind `(?<=\()`: the match must start right after a '(' but the '(' is
# not part of the result. `.*` is still greedy here, so the match runs from
# the first '(' to the end of the string.
re.findall(r'(?<=\().*', s)

['small S), 45 (medium M), 60 (large L)']

In [33]:
# Lookahead `(?=\))`: a ')' must follow the match but is not part of the result.
# `.*` is greedy, so the first match stretches up to the last ')'; findall then
# also reports an empty match at that same position.
re.findall(r'.*(?=\))', s)

['price 30 (small S), 45 (medium M), 60 (large L', '']

In [34]:
# Lookbehind and lookahead together, still greedy: everything between the
# first '(' and the last ')' comes back as a single match.
re.findall(r'(?<=\().*(?=\))', s)

['small S), 45 (medium M), 60 (large L']

In [37]:
# Lookbehind + lazy `.*?` + lookahead: each match starts after a '(' and stops
# at the nearest ')', giving just the text inside each pair of parentheses.
re.findall(r'(?<=\().*?(?=\))', s)

['small S', 'medium M', 'large L']

## html tag

In [8]:
# HTML-like sample: a <b> element and a <u> element inside an <h1>.
s = '<h1>hello <b>world</b> 2021. <u>Welcome to regex.</u></h1>'
s

'<h1>hello <b>world</b> 2021. <u>Welcome to regex.</u></h1>'

In [9]:
# Only one <b>...</b> pair exists, so greedy `.*` still returns just that element.
re.compile('<b>.*</b>').findall(s)

['<b>world</b>']

In [10]:
# `[bu]` accepts either tag. Greedy `.*` spans from the first opening tag to
# the last closing tag, merging both elements into one match.
re.compile('<[bu]>.*</[bu]>').findall(s)

['<b>world</b> 2021. <u>Welcome to regex.</u>']

In [11]:
# Lazy `.*?` stops at the nearest closing tag, so each element is matched separately.
re.compile('<[bu]>.*?</[bu]>').findall(s)

['<b>world</b>', '<u>Welcome to regex.</u>']

In [None]:
# Lookbehind for the opening tag and lookahead for the closing tag: the tags
# must be present but are excluded, leaving only the text between them.
re.compile('(?<=<[bu]>).*?(?=</[bu]>)').findall(s)

In [12]:
# Greedy `.*` followed by '@' takes everything up to and including the '@'.
s = 'Peter Parker spiderman@sony.com '
re.findall('.*@', s)

['Peter Parker spiderman@']

In [14]:
# `\w` matches one word character: a-z, A-Z, 0-9 or underscore.
# It cannot match a space, so only the word directly before '@' is returned.
re.findall(r'\w+@', s)

['spiderman@']

In [19]:
# Lookahead (?=@): the '@' must follow the word but is not included in the match.
re.findall(r'\w+(?=@)', s)[0]

'spiderman'

In [18]:
# Lookbehind (?<=@): the match must start right after '@', which is not included.
# `\S+` stops at whitespace, so the trailing space in `s` is not swept into the
# domain (with `.*` the result would be 'sony.com ' including the space).
re.findall(r'(?<=@)\S+', s)[0]

'sony.com'

# lookaround, lookahead (?=), lookbehind (?<=)

In [None]:
# Four "First Last (email)" records joined by ', '.
s = (
    'Peter Parker (spiderman@sony.com), '
    'Tony Stark (ironman@avenger.com), '
    'Natasha Romanoff (blackwidow@marvel.com), '
    'Clark Kent (superman@dc.com)'
)

In [None]:
# Greedy: one match from the first '(' to the last ')'.
# Raw string so `\(` / `\)` are passed to the regex engine unchanged.
re.findall(r'\(.*\)', s)

In [None]:
# Lazy: one match per parenthesised e-mail address.
re.findall(r'\(.*?\)', s)

In [None]:
# Word characters immediately before each '@' (the '@' itself is excluded).
re.findall(r'\w+(?=@)', s)

In [None]:
# Word characters immediately after each '@'. `\w` does not match '.', so the
# match stops before the dot in the domain.
re.findall(r'(?<=@)\w+', s)

In [None]:
# Adding '.' to the character class lets the match continue across the dot and
# cover the whole domain. Inside `[...]` a dot is literal, so no backslash is needed.
re.findall(r'(?<=@)[\w.]+', s)

In [None]:
import pandas as pd

In [None]:
# Break the string into one piece per record on the ', ' separator.
s.split(', ')

In [None]:
# One row per record, held in a single text column named 'info'.
dt = pd.DataFrame({'info': s.split(', ')})
dt

In [None]:
# Vectorised equivalent of applying re.findall to each row: returns a Series
# whose values are lists of the words found directly before an '@'.
dt['info'].str.findall(r'\w+(?=@)')

In [None]:
# Take the first word that sits directly before an '@' in each row.
# `.str.extract` yields NaN for a row without a match, whereas indexing
# `re.findall(...)[0]` would raise IndexError on such a row.
dt['hero'] = dt['info'].str.extract(r'(\w+)(?=@)', expand=False)
dt

In [None]:
# With two capture groups findall returns (group1, group2) tuples; keep the
# first tuple of each row, i.e. the first two space-separated words.
name_pair = re.compile(r'(\w+) (\w+)')
dt['info'].apply(lambda record: name_pair.findall(record)[0])

In [None]:
# Three capture groups become three columns: first word, second word, and the
# text inside the parentheses.
record_pattern = r'(\w+) (\w+) \((.*)\)'
dt['info'].str.extract(record_pattern)

In [None]:
# Upper-case every row by calling a lambda on each value.
dt['info'].apply(lambda text: text.upper())

In [None]:
# Same result via the vectorised string accessor: no Python-level lambda needed.
dt['info'].str.upper()

In [None]:
# Same result again, passing the unbound `str.upper` method straight to map.
info_column = dt['info']
info_column.map(str.upper)

## findall

In [None]:
# Numbered price list; every amount is directly followed by 'USD'.
s = '1. camera 670USD, 2. lens 1200USD, 3. case 39.95USD'

In [None]:
# Digits directly followed by 'USD'. `\d+` cannot cross a decimal point, so for
# '39.95USD' only the digits after the dot are returned.
re.findall(r'\d+(?=USD)', s)

In [None]:
# Integer part plus an optional '.digits' fraction, directly followed by 'USD'.
# The fraction is an optional group so a single-digit price such as '5USD'
# also matches (`\d+\.?\d+` would require at least two digits).
re.findall(r'\d+(?:\.\d+)?(?=USD)', s)

## `#hashtag`

In [None]:
# '#' followed by one or more word characters: each hashtag including the '#'.
# Raw string so `\w` is passed to the regex engine unchanged.
s = 'microsoft office #excel #powerpoint #word essential skills'
re.findall(r'#\w+', s)

In [None]:
# Lookbehind (?<=#): the word must be preceded by '#', but the '#' is left out.
re.findall(r'(?<=#)\w+', s)

# match object

The Match object has properties and methods used to retrieve information about the search, and the result:

* span() returns a tuple containing the start and end positions of the match.
* string returns the string passed into the function
* group() returns the part of the string where there was a match

In [None]:
s = '1. camera 670USD, 2. lens 1200USD, 3. case 39.95USD'

# re.search returns a Match object for the first occurrence, or None if there is none.
m = re.search('lens', s)
if m is None:
    print('not found')
else:
    print(m.group())  # the matched text
    print(m.span())   # (start, end) positions of the match within s

In [None]:
# First price in the string: integer part, optional '.digits' fraction, and a
# lookahead requiring 'USD' right after it. The lookahead is what skips the
# list numbers ('1.', '2.', ...); `\d+\.?\d+` skipped them only because it
# needs at least two digits, and would miss a single-digit price.
m = re.search(r'\d+(?:\.\d+)?(?=USD)', s)
if m:
    print(m.group())  # the matched text
    print(m.span())   # (start, end) positions of the match within s
else:
    print('not found')