## Regular Expression

#### A Regular Expression is a special text string for describing a search pattern.

### Symbols for writing regular expression

- #### *: The preceding character is repeated zero or more times
- #### +: The preceding character is repeated at least once
- #### {}: The preceding character is repeated the number of times given in the braces, e.g. {3} for exactly three or {2,5} for two to five
- #### .: represents a single occurrence of any character except newline
- #### ?: The preceding character is optional
- #### ^: specifies that the match must start at the beginning of the string

### Regular Expression Operations

In [14]:
#import the library
import re

In [15]:
Nameage='''
Janice is 22 and Theon is 33
Gabriel is 44 and Joey is 21
'''

In [16]:
ages=re.findall(r'\d{1,3}', Nameage)
names=re.findall(r'[A-Z][a-z]*', Nameage)

In [17]:
ageDict={}

In [18]:
x=0

In [20]:
# Pair each captured name with the age found at the same position, printing
# the dictionary after every insertion so its growth is visible.
for eachname in names:
    age = ages[x]
    ageDict.update({eachname: age})
    x = x + 1  # advance to the age belonging to the next name
    print(ageDict)

{'Janice': '22'}
{'Janice': '22', 'Theon': '33'}
{'Janice': '22', 'Theon': '33', 'Gabriel': '44'}
{'Janice': '22', 'Theon': '33', 'Gabriel': '44', 'Joey': '21'}


#### Finding a word in a string

In [8]:
# re.search returns a Match object for the first occurrence, or None when the
# pattern does not occur anywhere in the text.
text_to_scan = "we need to inform him with the latest information"
first_hit = re.search("inform", text_to_scan)
if first_hit is not None:
    print("There is inform")

There is inform


In [9]:
# re.findall collects every non-overlapping occurrence of the pattern, in the
# order they appear; each one is printed on its own line.
scanned_text = "we need to inform him with the latest information"
allinform = re.findall("inform", scanned_text)
for occurrence in allinform:
    print(occurrence)

inform
inform


#### Generating an iterator

In [22]:
# Print the (start, end) index span of every match yielded by re.finditer.
# The text is stored in `sentence` so the built-in `str` type is not shadowed.
sentence = "we need to inform him with the latest information"
for match in re.finditer("inform", sentence):
    loctup = match.span()  # (index of first char, index one past the last char)
    print(loctup)

(11, 17)
(38, 44)


#### Matching words with a particular pattern

In [23]:
str="Sat, hat, mat, pat"

In [26]:
allstr=re.findall("[Shmp]at",str)

In [28]:
for i in allstr:
 print(i)

Sat
hat
mat
pat


#### Matching a range of characters

In [29]:
allstr=re.findall("[h-m]at",str)

In [30]:
for i in allstr:
 print(i)

hat
mat


In [31]:
allstr=re.findall("[^h-m]at",str)

In [32]:
for i in allstr:
 print(i)

Sat
pat


#### Replace string

In [34]:
food="hat rat mat pat"

In [35]:
regex=re.compile("[r]at")

In [36]:
food=regex.sub("food", food)
print(food)

hat food mat pat


#### Solving the backslash problems

In [37]:
randstr="here is \\drogba"
print(randstr)

here is \drogba


In [38]:
# randstr holds a single literal backslash. Inside a raw-string pattern that
# backslash must itself be escaped as \\ to be matched literally; the doubled
# backslash in the printed Match object is only repr() escaping.
print(re.search(r"\\drogba", randstr))

<re.Match object; span=(8, 15), match='\\drogba'>


#### Matching a single character

In [39]:
randstr='''
Keep the blue flag
flying high
Chelsea
'''
print(randstr)


Keep the blue flag
flying high
Chelsea



In [40]:
regex=re.compile("\n")
randstr=regex.sub(" ", randstr)
print(randstr)

 Keep the blue flag flying high Chelsea 


In [41]:
#\b:word boundary (it means backspace only inside a character class, e.g. [\b])
#\f:formfeed
#\r:carriage return
#\t:Tab
#\v:Vertical Tab
#\d:matches all digits
#\D:matches all apart from digits
#\d{n}:matches exactly n consecutive digits (\d{m,n}: between m and n digits)

In [42]:
randstr = "12345"
# Raw string: \d (any digit) reaches the regex engine untouched instead of
# being parsed as an invalid Python string escape.
digit_matches = re.findall(r"\d", randstr)
print("Matches:", len(digit_matches))

Matches: 5


In [43]:
randstr = "12345"
# Raw string: \D (any non-digit) reaches the regex engine untouched instead of
# being parsed as an invalid Python string escape.
non_digit_matches = re.findall(r"\D", randstr)
print("Matches:", len(non_digit_matches))

Matches: 0


In [50]:
randstr = "12345"
# \d{5} matches a run of exactly five digits; the raw string keeps the
# backslash for the regex engine rather than Python's string parser.
five_digit_runs = re.findall(r"\d{5}", randstr)
print("Matches:", len(five_digit_runs))

Matches: 1


In [51]:
# Count the runs made of five to seven digits: \d{5,7}.
# The raw string keeps the backslash for the regex engine rather than Python's
# string parser.
num = "123 1234 12345 123456 1234567"
ranged_digit_runs = re.findall(r"\d{5,7}", num)
print("Matches:", len(ranged_digit_runs))

Matches: 3


### Regular Expression Application

#### Phone number Verification

In [52]:
#\w:[a-zA-Z0-9_]
#\W:[^a-zA-Z0-9_]

In [53]:
phn = "412-555-1212"
# NNN-NNN-NNNN: digits only (\d rather than \w, which also accepts letters and
# "_"), four digits in the last group, and fullmatch so nothing may precede or
# follow the number.
phone_pattern = re.compile(r"\d{3}-\d{3}-\d{4}")
if phone_pattern.fullmatch(phn):
    print("it is a phone number")

it is a phone number


In [54]:
#\s [ \f\n\r\t\v]
#\S [^ \f\n\r\t\v]

In [55]:
# Two words of 2-20 word characters separated by one whitespace character.
# The raw string keeps \w and \s for the regex engine, and fullmatch requires
# the whole input to conform instead of accepting any matching fragment.
fullname_pattern = re.compile(r"\w{2,20}\s\w{2,20}")
if fullname_pattern.fullmatch("Motolani Mercy"):
    print("fullname is valid")

fullname is valid


#### Email Verification

In [56]:
email="sk@aol.com md@.com @seo.com dc@.com"

In [57]:
# The dot in front of the top-level domain is escaped so that only a literal
# "." is accepted there (an unescaped "." matches any character), and the
# pattern is a raw string so \w reaches the regex engine unchanged.
email_pattern = re.compile(r"[\w._%+-]{1,20}@[\w._]{2,20}\.[A-Za-z]{2,3}")
print("EmailMatches:", len(email_pattern.findall(email)))

EmailMatches: 1


#### Web Scraping

In [59]:
import urllib.request
from re import findall

In [60]:
url="http://www.summet.com/dmsi/html/codesamples/addresses.html"

In [61]:
response=urllib.request.urlopen(url)

In [63]:
html=response.read()

In [64]:
htmlStr=html.decode()

In [66]:
# Collect every "(NNN) NNN-NNNN" phone number in the page text. The raw string
# passes \( \) and \d to the regex engine unchanged instead of relying on
# Python tolerating invalid string escapes.
pdata = findall(r"\(\d{3}\) \d{3}-\d{4}", htmlStr)

In [67]:
for item in pdata:
    print(item)

(257) 563-7401
(372) 587-2335
(786) 713-8616
(793) 151-6230
(492) 709-6392
(654) 393-5734
(404) 960-3807
(314) 244-6306
(947) 278-5929
(684) 579-1879
(389) 737-2852
(660) 663-4518
(608) 265-2215
(959) 119-8364
(468) 353-2641
(248) 675-4007
(939) 353-1107
(570) 873-7090
(302) 259-2375
(717) 450-4729
(453) 391-4650
(559) 104-5475
(387) 142-9434
(516) 745-4496
(326) 677-3419
(746) 679-2470
(455) 430-0989
(490) 936-4694
(985) 834-8285
(662) 661-1446
(802) 668-8240
(477) 768-9247
(791) 239-9057
(832) 109-0213
(837) 196-3274
(268) 442-2428
(850) 676-5117
(861) 546-5032
(176) 805-4108
(715) 912-6931
(993) 554-0563
(357) 616-5411
(121) 347-0086
(304) 506-6314
(425) 288-2332
(145) 987-4962
(187) 582-9707
(750) 558-3965
(492) 467-3131
(774) 914-2510
(888) 106-8550
(539) 567-3573
(693) 337-2849
(545) 604-9386
(221) 156-5026
(414) 876-0865
(932) 726-8645
(726) 710-9826
(622) 594-1662
(948) 600-8503
(605) 900-7508
(716) 977-5775
(368) 239-8275
(725) 342-0650
(711) 993-5187
(882) 399-5084
(287) 755-

### More on Basic Regular Expression Operations

In [68]:
str="Abcd 4 computer 765 Python 687"

In [69]:
pattern=r'[a-zA-Z]+'

In [70]:
match=re.findall(pattern, str)
print(match)

['Abcd', 'computer', 'Python']


In [71]:
pattern=r'[0-9]+'

In [72]:
match=re.findall(pattern, str)
print(match)

['4', '765', '687']


In [73]:
pattern=r'[a-zA-Z0-9]+'

In [74]:
match=re.findall(pattern, str)
print(match)

['Abcd', '4', 'computer', '765', 'Python', '687']


In [77]:
pattern=r'.[^ ]+'

In [78]:
match=re.findall(pattern, str)
print(match)

['Abcd', ' 4', ' computer', ' 765', ' Python', ' 687']


In [79]:
str='''
apple
banana
orange
peach
avocado
cherries
'''

In [83]:
pattern=r'.*s'

In [84]:
match=re.findall(pattern, str)
print(match)

['cherries']


In [85]:
for m in match:
    print(m)

cherries


In [86]:
pattern=r'\b[aeiou].+\b'

In [87]:
match=re.findall(pattern, str)
print(match)

['apple', 'orange', 'avocado']


In [95]:
str='''
dfshj@gmail.com
3ytgdg.56
tigacharm56h@hotmail.com
hfg123h@aol
'''

In [96]:
# Lowercase .com address: a local part that starts with a letter and continues
# with letters or digits, "@", a lowercase domain name, then a literal ".com".
# An explicit character class replaces the former greedy ".+", which accepted
# any characters (spaces, extra "@") and made the classes after it redundant.
pattern = r'[a-z][a-z0-9]*@[a-z]+\.com'

In [90]:
match=re.findall(pattern, str)
print(match)

['dfshj@gmail.com', 'tigacharm56h@hotmail.com']


In [98]:
match=re.finditer(pattern, str)
for m in match:
    print(m.span())

(1, 16)
(27, 51)


In [99]:
str='''
Sam
car
2453
Alexa
John
90
'''

In [101]:
pattern=r'\b[A-Z][a-z]+\b'
nstr=re.sub(pattern, "",str)
print(nstr)



car
2453


90

