# Regular Expressions in Python

### Content:
### 1. Basics
### 2. About match,search,findall
### 3. Quantifiers
### 4. Custom Character sets
### 5. Quantifiers with Custom Sets
### 6. Introduction to Groups

## 1.Basics
Python uses the backslash to introduce special characters (escape sequences) inside string literals.
'\n' (a backslash followed by n) denotes a newline.
'\t' denotes a tab.
The 'r' prefix creates a raw string, in which Python does not process backslash escapes (every backslash is kept as a literal character).
r'\n' is therefore a raw string of two characters: '\' and 'n'.

In [1]:
#Some Examples
import re
re.search('n','\n') #No match (None): the string '\n' is a single newline character, so it contains no letter 'n'

In [2]:
#There are two ways to handle this. One way is to escape every backslash with a second backslash ('\\n'), which cancels Python's special treatment of '\n'.
re.search('n','\\n')

<re.Match object; span=(1, 2), match='n'>

In [3]:
#If there are many backslashes, adding an extra backslash to each one quickly becomes impractical
re.search('n','\n\n\n\n\n\n\n')

In [4]:
#So to overcome above problem we can use r
re.search('n',r'\n\n\n\n\n\n\n\n') # r converts into raw string

<re.Match object; span=(1, 2), match='n'>

In [5]:
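#Regular expressions have their own special sequences as well
#As patterns, '\n' and r'\n' both match a newline: in the first Python produces the newline character, in the second the regex engine interprets the backslash-n itself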
re.search('\n','\n\n\n\n\n\n\n\n\n')

<re.Match object; span=(0, 1), match='\n'>

In [6]:
re.search(r'\n','\n\n') #Here also we can see that match found

<re.Match object; span=(0, 1), match='\n'>

In [7]:
#The example below finds no match: the raw string r'\n\n' contains literal backslashes and the letter n, not newlines, while the pattern r'\n' looks for a newline
re.search(r'\n',r'\n\n')

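#The r prefix does not change how the regex engine treats its own special sequences (r'\n' and '\n' both match a newline); it only stops Python from interpreting the backslashes first. Raw strings are still recommended for patterns such as r'\w', because '\w' is not a valid Python string escape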

## 2.Match ,Findall and Search Examples

In [8]:
#re.search
#Scans the whole string and returns the first match found anywhere in it
#Syntax - re.search(pattern, string, flags) # flags: optional modifiers such as re.DOTALL

In [9]:
#re.match
#Matches only at the beginning of the string

In [10]:
re.match('c',"abcdef") #return none because it only looks at the start of the string

In [11]:
re.search('c','abcdef') #searches anywhere 
#The span in the output below shows that the match starts at index 2 and ends at index 3 (the end index is exclusive)

<re.Match object; span=(2, 3), match='c'>

In [12]:
bool(re.match('c','abcdef')) #no match return boolean false

False

In [13]:
bool(re.match('a','abcdef')) #match returns true

True

In [14]:
#A limitation of re.search is that it returns only the first occurrence
re.search('c','abcdefc')

<re.Match object; span=(2, 3), match='c'>

In [15]:
#search also works across lines, i.e. it finds the letter even when it is placed after a \n
re.search('c','abdef\nc')

<re.Match object; span=(6, 7), match='c'>

In [16]:
#Whereas match only checks the very start of the string, so it does not find a 'c' that appears later (even at the start of a new line)
re.match('c','abcdef\nc')

### Printing the output of match and search

In [17]:
re.match('a','abcdef').group() #string output default value is 0

'a'

In [18]:
re.match('a','abcdef').group(0) #both are same

'a'

In [19]:
re.search('n','abcdefnc abcd').group()

'n'

In [20]:
re.search('n.+','abcdefnc abcd').group()

'nc abcd'

In [21]:
#To find index
re.search('c','abdef\nc').start()

6

In [22]:
re.search('c','abdef\nc').end()

7


### Literal Matching

In [23]:
re.search('na','abcdefnc abcd') # no match because there is no 'na' substring

In [24]:
re.search('n|a','abcdefnc abcd') #'n' or 'a': only the first occurrence of either is returned; in this case it is 'a'

<re.Match object; span=(0, 1), match='a'>

In [25]:
re.search('n|a','bcdefnc abcda')

<re.Match object; span=(5, 6), match='n'>

In [26]:
re.search('n|a|b','bcdefnc abcda') #We can give more than 2

<re.Match object; span=(0, 1), match='b'>

In [27]:
###############################              re.findall                     #############################################

In [28]:
re.findall('n|a','bcdefnc abcda') # findall returns every non-overlapping match of the pattern as a list

['n', 'a', 'a']

In [29]:
re.search('abcd','abcdefnc abcd') #Multiple characters literals search

<re.Match object; span=(0, 4), match='abcd'>

In [30]:
re.findall('abcd','abcdefnc abcd')

['abcd', 'abcd']

## Character Sets

In [31]:
#character sets can match a set of characters

In [32]:
# \w matches any single character in the set [a-zA-Z0-9_]

In [33]:
re.search(r'\w\w\w\w','abcdefnc abcd')

<re.Match object; span=(0, 4), match='abcd'>

In [34]:
re.search('\w\w\w\w','abcdefnc abcd')

<re.Match object; span=(0, 4), match='abcd'>

In [35]:
re.search(r'\w\w\w\w','ab_cdefnc abcd') #matches  _ character

<re.Match object; span=(0, 4), match='ab_c'>

In [36]:
re.search('\w\w\w','a3.!-!') #no match found

In [37]:
re.search('\w\w\w','a33-!').group()

'a33'

In [38]:
#\W is the opposite of \w: it matches any single character that is NOT in [a-zA-Z0-9_]

In [39]:
re.search('\w\w\W','a3.-!') #\W matches any character that is not a letter, digit or underscore

<re.Match object; span=(0, 3), match='a3.'>

In [40]:
re.search('\w\w\W','a3 .-_!') #it also matches space as well

<re.Match object; span=(0, 3), match='a3 '>

## 3.Quantifiers

In [41]:
"""
'+' = 1 or more
'?' = 0 or 1
'*' = 0 or more
'{n,m}' = n to m repetitions {,3}, {3,}
"""

"\n'+' = 1 or more\n'?' = 0 or 1\n'*' = 0 or more\n'{n,m}' = n to m repetitions {,3}, {3,}\n"

In [42]:
re.search('\w\w','abcdefnc abcd')

<re.Match object; span=(0, 2), match='ab'>

In [43]:
re.search('\w+','abcdefnc abcd').group() #don't know the number

'abcdefnc'

In [44]:
re.search('\w+\W+\w+','abcdefnc abcd').group() #getting the whole string

'abcdefnc abcd'

In [45]:
re.search('\w+\W+\w+','abcdefnc     abcd').group()

'abcdefnc     abcd'

In [46]:
re.search('\w+\W?\w+','abcdefncabcd').group() # ? = 0 or 1

'abcdefncabcd'

In [47]:
re.search('\w+\W?\w+','abcdefnc abcd').group()

'abcdefnc abcd'

In [48]:
re.search('\w+\W?\w+','abcdefnc  abcd').group() #Here there are two spaces; \W? can consume at most one, so only the first word is matched

'abcdefnc'

In [49]:
#Pulling out specific amounts

In [50]:
re.search('\w{3}','aaaaaaaaaaa') #only 3 \w characters

<re.Match object; span=(0, 3), match='aaa'>

In [51]:
re.search('\w{1,4}','abcdefg').group() #This is kind of a range where 1 is minimum and 4 is maximum

'abcd'

In [52]:
re.search('\w{1,10}\W{1,4}\w+','abcdefn cabcd').group() #1-10 \w characters, 1-4 \W characters, then 1 or more \w characters

'abcdefn cabcd'

In [53]:
re.search('\w{1,}\W{0,}\w+','abcdefnc abcd').group() #{1,} = at least 1 repetition, {0,} = at least 0 repetitions

'abcdefnc abcd'

## 4.Other type of character sets

In [54]:
#'\d' = matches digits [0-9]
#'\D' = This matches any non - digit character

In [55]:
string = '23abced++'
re.search('\d+',string).group()

'23'

In [56]:
#'\s' = matches any whitespace character #new lines, tabs, spaces etc
#'\S' = matches any non-whitespace character

In [57]:
re.search('\S+',string).group()

'23abced++'

In [58]:
string = "A regular expression is a special sequence of characters that helps you match or find other strings or sets of strings, using a specialized syntax held in a pattern. Regular expressions are widely used in UNIX world."

In [59]:
re.findall('\S+',string)

['A',
 'regular',
 'expression',
 'is',
 'a',
 'special',
 'sequence',
 'of',
 'characters',
 'that',
 'helps',
 'you',
 'match',
 'or',
 'find',
 'other',
 'strings',
 'or',
 'sets',
 'of',
 'strings,',
 'using',
 'a',
 'specialized',
 'syntax',
 'held',
 'in',
 'a',
 'pattern.',
 'Regular',
 'expressions',
 'are',
 'widely',
 'used',
 'in',
 'UNIX',
 'world.']

In [60]:
' '.join(re.findall('\S+',string)) #joining with space

'A regular expression is a special sequence of characters that helps you match or find other strings or sets of strings, using a specialized syntax held in a pattern. Regular expressions are widely used in UNIX world.'

In [61]:
# . the dot matches any character except new line    

In [62]:
re.search('.',string).group()

'A'

In [63]:
re.search('.+',string).group()

'A regular expression is a special sequence of characters that helps you match or find other strings or sets of strings, using a specialized syntax held in a pattern. Regular expressions are widely used in UNIX world.'

In [64]:
string1 = """A regular expression is a special sequence of characters that helps you match or find other strings or sets of strings, using a specialized syntax held in a pattern. Regular expressions are widely used in UNIX world.

The module re provides full support for Perl-like regular expressions in Python. The re module raises the exception re.error if an error occurs while compiling or using a regular expression.

We would cover two important functions, which would be used to handle regular expressions. But a small thing first: There are various characters, which would have special meaning when they are used in regular expression. To avoid any confusion while dealing with regular expressions, we would use Raw Strings as r'expression'."""

In [65]:
re.search('.+',string1).group() #no new line

'A regular expression is a special sequence of characters that helps you match or find other strings or sets of strings, using a specialized syntax held in a pattern. Regular expressions are widely used in UNIX world.'

In [66]:
#Here we are using third parameter flags
re.search('.+',string1, flags = re.DOTALL).group() #re.DOTALL pulls even new lines

"A regular expression is a special sequence of characters that helps you match or find other strings or sets of strings, using a specialized syntax held in a pattern. Regular expressions are widely used in UNIX world.\n\nThe module re provides full support for Perl-like regular expressions in Python. The re module raises the exception re.error if an error occurs while compiling or using a regular expression.\n\nWe would cover two important functions, which would be used to handle regular expressions. But a small thing first: There are various characters, which would have special meaning when they are used in regular expression. To avoid any confusion while dealing with regular expressions, we would use Raw Strings as r'expression'."

### Creating your own character sets

In [67]:
# [A-Z] matches any single character from A to Z (all uppercase letters)

In [68]:
string = 'Hello, There, How, Are, You'

In [69]:
re.findall('[A-Z]',string) #Pulls out all capital letters

['H', 'T', 'H', 'A', 'Y']

In [70]:
re.findall('[A-Z,]',string) #Here we search for any capital letters or a comma

['H', ',', 'T', ',', 'H', ',', 'A', ',', 'Y']

In [71]:
string1 = 'Hello, There, How, Are, you'

In [72]:
re.findall('[A-Z,]',string1)

['H', ',', 'T', ',', 'H', ',', 'A', ',']

In [73]:
string = 'Hello, There, How, Are, You...'

In [74]:
re.findall('[A-Z,.]',string)

['H', ',', 'T', ',', 'H', ',', 'A', ',', 'Y', '.', '.', '.']

In [75]:
re.findall('[A-Z\s,.]',string) #\s is a regex special sequence (whitespace), so it also works inside []

['H',
 ',',
 ' ',
 'T',
 ',',
 ' ',
 'H',
 ',',
 ' ',
 'A',
 ',',
 ' ',
 'Y',
 '.',
 '.',
 '.']

## 5.Quantifiers with customs sets

In [76]:
string = 'HELLO, There, How, Are, You...'

In [77]:
re.search('[A-Z]',string).group()

'H'

In [78]:
re.search('[A-Z]+',string).group()

'HELLO'

In [79]:
re.findall('[A-Z]+',string)

['HELLO', 'T', 'H', 'A', 'Y']

In [80]:
re.findall('[A-Z]{2,}',string)

['HELLO']

In [81]:
re.search('[A-Ma-z\s,]+',string).group()

'HELL'

In [82]:
re.findall('[A-Z]?[a-z\s,]+',string)

['O, ', 'There, ', 'How, ', 'Are, ', 'You']

In [83]:
re.search('[^A-Za-z\s,]+',string).group() # ^ as the first character inside the brackets negates the set: match any character NOT listed

'...'

In [84]:
re.findall('[^A-Z]+',string)

[', ', 'here, ', 'ow, ', 're, ', 'ou...']

## 6.Groups

In [85]:
#groups allow us to pull out sections of a match and store them

In [86]:
#Example
string = 'John has 6 cats but I think my friend Susan has 3 dogs and Mike has 8 fishes'

In [87]:
re.findall('[A-Za-z]+ \w+ \d+ \w+',string)

['John has 6 cats', 'Susan has 3 dogs', 'Mike has 8 fishes']

In [88]:
re.findall('\w+\s\w+\s\d+\s\w+',string)

['John has 6 cats', 'Susan has 3 dogs', 'Mike has 8 fishes']

In [89]:
#Parentheses denote a group
# () = grouping metacharacters

In [90]:
re.findall('(\w+) \w+ \d+ \w+',string) #to pull out just the names

['John', 'Susan', 'Mike']

In [91]:
re.findall('\w+ \w+ \d+ (\w+)',string) #to pull out just the animals

['cats', 'dogs', 'fishes']

In [92]:
re.findall('(\w+) \w+ (\d+) (\w+)',string) #to pull out names,numbers,animals

[('John', '6', 'cats'), ('Susan', '3', 'dogs'), ('Mike', '8', 'fishes')]

In [93]:
info = re.findall('(\w+) \w+ (\d+) (\w+)',string) #to pull out names,numbers,animals

In [94]:
list(zip(*info)) # separating the three different things

[('John', 'Susan', 'Mike'), ('6', '3', '8'), ('cats', 'dogs', 'fishes')]

In [95]:
match = re.search('(\w+) \w+ (\d+) (\w+)',string)

In [96]:
match.group(0) #group 0 is the entire match

'John has 6 cats'

In [97]:
match.groups()

('John', '6', 'cats')

In [98]:
match.group(1)  #Group 1

'John'

In [99]:
match.group(2)#Group 2

'6'

In [100]:
match.group(3)#Group 3

'cats'

In [101]:
match.group(1,3) #Multiple groups

('John', 'cats')

In [102]:
match.group(3,2,1,1) #change the order

('cats', '6', 'John', 'John')

In [103]:
match.span() #span is a method to get start and end of the match

(0, 15)

In [104]:
match.span(2) #span of 2nd group

(9, 10)

In [105]:
match.span(3) #span of 3rd group

(11, 15)

In [106]:
match.start(3)

11

In [107]:
#findall returns a list of strings (or tuples of groups), not match objects, so there is no .group() method

In [108]:
data = re.findall('(([A-Za-z]+) \w+ (\d+) (\w+))',string)

In [109]:
data

[('John has 6 cats', 'John', '6', 'cats'),
 ('Susan has 3 dogs', 'Susan', '3', 'dogs'),
 ('Mike has 8 fishes', 'Mike', '8', 'fishes')]

In [110]:
for i in data:
    print(i[0])

John has 6 cats
Susan has 3 dogs
Mike has 8 fishes


In [111]:
#Iteration with re.finditer ### an iterator is exhausted once consumed; call finditer again to iterate a second time

In [118]:
it = re.finditer('(([A-Za-z]+) \w+ (\d+) (\w+))',string)
print(it)

<callable_iterator object at 0x00000237E77BAB60>


In [113]:
next(it).groups()

('John has 6 cats', 'John', '6', 'cats')

In [114]:
for element in it:
    print(element.group(1,3,2))

('Susan has 3 dogs', '3', 'Susan')
('Mike has 8 fishes', '8', 'Mike')


In [120]:
it = re.finditer('(([A-Za-z]+) \w+ (\d+) (\w+))',string)
for element in it:
    print(element.group())

John has 6 cats
Susan has 3 dogs
Mike has 8 fishes


In [116]:
it = re.finditer('(([A-Za-z]+) \w+ (\d+) (\w+))',string)
for element in it:
    print(element.groups())

('John has 6 cats', 'John', '6', 'cats')
('Susan has 3 dogs', 'Susan', '3', 'dogs')
('Mike has 8 fishes', 'Mike', '8', 'fishes')
