# Greedy Vs Non Greedy

# *?, +?, ??

In [45]:
import re

# Both patterns look for '<' ... '>'; only the greediness of the quantifier differs.
print(re.search(r'<.*>', '<a> b <c>')) # greedy: .* takes as much as it can, so the match runs to the last '>'
print(re.search(r'<.*?>', '<a> b <c>')) # lazy: .*? takes as little as it needs, so the match stops at the first '>'

<re.Match object; span=(0, 9), match='<a> b <c>'>
<re.Match object; span=(0, 3), match='<a>'>


In [26]:
# '+' is greedy as well: <.+> runs from the first '<' to the last '>'.
print(re.search(r'<.+>', '<a> b <c>'))
# [^<]* may match zero characters, and the string starts with '<',
# so the first match is the empty string at index 0.
print(re.search(r'[^<]*', '<a> b <c>'))

<re.Match object; span=(0, 9), match='<a> b <c>'>
<re.Match object; span=(0, 0), match=''>


In [8]:
# Lazy \w*? grows one character at a time until the following 'E' can match,
# so the match ends at the first 'E' instead of the last one.
print(re.search(r'\w*?E', '123EEE'))

<re.Match object; span=(0, 4), match='123E'>


In [11]:
# Lazy .*? stops as soon as a 'p' can follow, so the match ends at the first 'p' of 'apples'.
print(re.search(r'.*?p', 'two_apples'))

<re.Match object; span=(0, 6), match='two_ap'>


In [25]:
# more on lazy quantifier
# The lazy group first tries to stay empty, but then 'A$' cannot succeed at
# index 0, so the group is forced to grow to 'A' and the whole 'AA' matches.
print(re.search(r'^(A*?)A$','AA'))

<re.Match object; span=(0, 2), match='AA'>


# Greedy match

A greedy match means that the regex engine (the one which tries to find your pattern in the string) matches as many characters as possible.

In [28]:
# 'a?' is greedy: it takes one 'a' whenever one is available.
# findall lists every non-overlapping match, including the empty match at the end of the string.
print(re.findall('a?', 'aaaa'))
# search returns the first match only.
print(re.search('a?', 'aaaa'))

['a', 'a', 'a', 'a', '']
<re.Match object; span=(0, 1), match='a'>


In [38]:
print(re.findall('a*', 'aaaa')) # 0 or more: takes all four, then an empty match at the end
print(re.findall('a+', 'aaaa')) # 1 or more: matching an empty string is not an option anymore
print(re.findall('a{3}', 'aaaa')) # exactly 3: the single 'a' left over is not enough for a second match
print(re.findall('a{1,2}', 'aaaa')) # 1 to 2, greedy: it takes two each time, which gives two matches

['aaaa', '']
['aaaa']
['aaa']
['aa', 'aa']


# Non Greedy Match

A non-greedy match means that the regex engine matches as few characters as possible—so that it still can match the pattern in the given string.

It gives the shortest possible match 

In [44]:
print(re.findall('a?', 'aaaa')) # greedy version, shown for comparison
print(re.findall('a+?', 'aaaa')) # lazy 1-or-more: a single 'a' per match is enough
print(re.findall('a*?', 'aaaa')) # the empty string is also valid; after matching the empty string it is forced to match 'a'
print(re.findall('a??', 'aaaa')) # lazy 0-or-1: the same alternation of empty and single-'a' matches
print(re.findall('a{2,3}?', 'aaaa')) # lazy: it stops at the minimum of two 'a's per match

['a', 'a', 'a', 'a', '']
['a', 'a', 'a', 'a']
['', 'a', '', 'a', '', 'a', '', 'a', '']
['', 'a', '', 'a', '', 'a', '', 'a', '']
['aa', 'aa']


# Look Around

# Positive Look ahead

Positive look ahead will succeed if the passed non-consuming expression does match against the forthcoming input.

The syntax is A(?=B) where A is the actual expression and B is the non-consuming expression.


In [47]:
# search for "Isaac" only where it is immediately followed by "Asim";
# the look-ahead text is checked but is not part of the match
print(re.search(r'Isaac(?=Asim)','IsaacAsimove'))
print(re.search(r'Isaac(?=Asim)','IsaacThan'))


<re.Match object; span=(0, 5), match='Isaac'>
None


In [49]:
test = 'I love python, I love regax'
# only the second 'love' is followed by ' regax', so the first one is skipped
print(re.search(r'love(?=\sregax)', test))

<re.Match object; span=(17, 21), match='love'>


In [51]:
# find all words that are followed by ',' or '.' (the punctuation itself is not returned)
txt = "My favorite colors are red, green, and blue."
print(re.findall(r'\w+(?=,|\.)', txt))

['red', 'green', 'blue']


In [18]:
test = 'gooooo!'
# match every 'o' that has two more 'o's right after it; the look-ahead
# consumes nothing, so consecutive matches can share the following 'o's
for m in re.finditer(r'o(?=oo)',test):
    print(m)

<re.Match object; span=(1, 2), match='o'>
<re.Match object; span=(2, 3), match='o'>
<re.Match object; span=(3, 4), match='o'>


# Negative Look ahead: A(?!B)

In [53]:
# search for "Isaac" only where it is NOT immediately followed by "Asim"
print(re.search(r'Isaac(?!Asim)','IsaacAsimove'))
print(re.search(r'Isaac(?!Asim)','IsaacThan'))
# only the second call finds a match, because there "Isaac" is followed by "Than"

None
<re.Match object; span=(0, 5), match='Isaac'>


In [54]:
# find 'love' that is not followed by a space and 'regax'
test = 'I love python, I love regax'
# the first 'love' (followed by ' python') qualifies, so the search stops there
print(re.search(r'love(?!\sregax)', test))

<re.Match object; span=(2, 6), match='love'>


# Look Behind
Look behind mechanism checks the match for a non-consuming expression behind a given pattern.


# Positive Look behind

1.Positive look behind will succeed if the passed non-consuming expression matches the input immediately preceding the current position.

2.The syntax is (?<=B)A where A is the actual expression and B is the non-consuming expression.

In [57]:
# match the word characters that come right after "abc"; "abc" itself is not part of the match
print(re.search(r'(?<=abc)\w+','abcPython'))
print(re.search(r'(?<=abc)\w+','defPython')) # no "abc" in the string, so there is no match

<re.Match object; span=(3, 9), match='Python'>
None


In [65]:
txt = "love regex or hate regex, can't ignore regex"
# find 'regex' only where 'love ' or 'hate ' comes right before it
# (both alternatives have the same length; Python's re needs a fixed-width look-behind)
pat = re.compile(r'(?<=(love|hate)\s)regex')
matches = pat.finditer(txt)
for match in matches:
    print(match)

<re.Match object; span=(5, 10), match='regex'>
<re.Match object; span=(19, 24), match='regex'>


# Negative look behind

In [67]:
txt = "love regex or hate regex, can't ignore regex"
# find 'regex' only where neither 'love ' nor 'hate ' comes right before it;
# here that leaves the occurrence after "ignore"
pat = re.compile(r'(?<!(love|hate)\s)regex')
matches = pat.finditer(txt)
for match in matches:
    print(match)

<re.Match object; span=(39, 44), match='regex'>


# Non Capturing-Group

There are cases when we want to use groups but are not interested in extracting the information, i.e. capturing the matched text inside the parentheses. An example is alternation.

In [14]:
# Let's consider an example where we want to find the strings "i love cats" or "i love dog" in the given text.
txt = """
i love cats
i love dog"""
#print(re.findall('i love (cats|dog)', txt))
pattern = re.compile(r'i love (cats|dog)')
matches = pattern.finditer(txt)
for match in matches:
    print(match.group(0)) # the complete match
    print(match.group(1)) # only the text captured by the parentheses



i love cats
cats
i love dog
dog


In [71]:
# Same capturing pattern as before, printed with labels. Relies on txt from the previous cell.
pattern = re.compile('i love (cats|dog)')
for match in pattern.finditer(txt):
    print("Complete regex match (default):", match.group(0))
    print("Match captured by 1st group:", match.group(1))
# As we can see, group 1 holds only 'cats' or 'dog', not the complete sentence.

Complete regex match (default): i love cats
Match captured by 1st group: cats
Complete regex match (default): i love dog
Match captured by 1st group: dog


In [4]:
# With a non-capturing group (?:...) there is nothing to capture,
# so findall returns the whole match for every hit.
print(re.findall('i love (?:cats|dog)', txt))


['i love cats', 'i love dog']


In [5]:
pattern = re.compile('i love (?:cats|dog)')
for match in pattern.finditer(txt):
    print("Complete regex match (default):", match.group(0))
    # (?:...) creates no group 1, so this call raises IndexError on the first iteration
    print("Match captured by 1st group:", match.group(1))
# With the non-capturing syntax the complete match is the same as before,
# but the group cannot be referenced any more.

Complete regex match (default): i love cats


IndexError: no such group

In [16]:
# (?:abc){3} repeats the whole group three times without creating a numbered group
pat = re.compile(r'(?:abc){3}') # we cannot refer to any group
matches = pat.finditer('abcabcabc')
for match in matches:
    print(match.group(0))

abcabcabc


In [86]:
# back reference: \1 must repeat exactly the text that group 1 matched,
# so 'abc=abc' and 'def=def' match but a mixed pair such as 'abc=def' would not
pattern = re.compile(r'(abc|def)=\1')
matches = pattern.finditer('abc=abc or def=def')
for match in matches:
    print(match)

<re.Match object; span=(0, 7), match='abc=abc'>
<re.Match object; span=(11, 18), match='def=def'>


In [6]:
import re 
test_string = """
it is not your fault
it is your fault
"""
# With one capturing group, findall returns that group's text for each match.
# NOTE(review): the pattern has a literal space on both sides of the optional
# group, so a line without 'not' would need two spaces between 'is' and 'your';
# that is why only the first line matches. Confirm whether this is intended.
print(re.findall(r'it is (not)? your fault',test_string))

['not']


# Simple Password Validation

1. The password must have between six and ten word characters \w
2. It must include at least one lowercase character [a-z]
3. It must include at least three uppercase characters [A-Z]
4. It must include at least one digit \d

In [93]:
password = 'ABCa0win12'
# check that the password consists of six to ten \w characters
print(bool(re.search(r'(?=^\w{6,10}$)',password)))
# A look-ahead only tests the condition and consumes nothing. To get the text
# back in the match, add a consuming pattern (here \w+) outside the parentheses.
print(re.search(r'(?=^\w{6,10}$)\w+',password))

True
<re.Match object; span=(0, 10), match='ABCa0win12'>


In [2]:
import re
p2 = 'ABCA0WIN12'
# Each rule is a zero-width look-ahead, so a successful search returns an
# empty match and a failed rule returns None.
# at least one lowercase character
lower_check = r'(?=[^a-z]*[a-z])'
# at least three uppercase characters; the "[^A-Z]*[A-Z]" step is repeated
# three times so the capitals do not have to be next to each other
upper_check = r'(?=(?:[^A-Z]*[A-Z]){3})'
# at least one digit
digit_check = r'(?=\D*\d)'
print(re.search(lower_check, p2))
print(re.search(upper_check, p2))
print(re.search(digit_check, p2))


None
<re.Match object; span=(0, 0), match=''>
<re.Match object; span=(0, 0), match=''>


In [105]:
# By combining all four checks we get a single pattern. The uppercase check
# repeats "[^A-Z]*[A-Z]" three times, so the three capitals need not be adjacent.
print(bool(re.search(r'(?=^\w{6,10}$)(?=[^a-z]*[a-z])(?=(?:[^A-Z]*[A-Z]){3})(?=\D*\d)',password)))

True


In [106]:
def is_valid_password(candidate):
    """Return True when candidate satisfies all four password rules.

    The rules are: six to ten word characters in total, at least one
    lowercase letter, at least three uppercase letters (anywhere in the
    string, not necessarily adjacent) and at least one digit.
    """
    # Every rule is a look-ahead evaluated from the start of the string.
    # The uppercase rule repeats "[^A-Z]*[A-Z]" three times; a plain [A-Z]{3}
    # would wrongly insist on three capitals in a row.
    rules = r'(?=^\w{6,10}$)(?=[^a-z]*[a-z])(?=(?:[^A-Z]*[A-Z]){3})(?=\D*\d)'
    return re.match(rules, candidate) is not None


if __name__ == "__main__":
    # The first input line is the number of passwords; one password per line follows.
    pass_list = []
    for _ in range(int(input())):
        pass_word = input()
        pass_list.append(pass_word)
    for password in pass_list:
        if is_valid_password(password):
            print('Valid')
        else:
            print('Invalid')
    

2
ABCA0WIN12
ABCa0win12
Invalid
Valid


# HackerRank - Creditcard Validating
1. it must start with 4,5 or 6
2. it must contain exactly 16 digits
3. It must consist only of the digits 0-9
4. It may have its digits in groups of 4, separated by one hyphen (-)
5. It must NOT have  4 or more consecutive repeated digits.

In [None]:
# Sample credit card numbers used to try out the validation rules.
num_1, num_2, num_3, num_4 = (
    '4253625879615787',
    '4123-1234-8821-1231',
    '61234-567-8912-3456',
    '5133336789123456',
)

In [107]:
def is_valid_card(number):
    """Return True when number satisfies the credit card rules listed above.

    The number must start with 4, 5 or 6 and be either 16 digits in a row or
    four groups of four digits separated by single hyphens. After removing
    the hyphens it must not contain the same digit four or more times in a row.
    """
    # fullmatch anchors both ends, so anything after the 16th digit is rejected.
    well_formed = re.fullmatch(r'[456](?:\d{15}|\d{3}(?:-\d{4}){3})', number)
    if well_formed is None:
        return False
    # The repeated-digit rule is checked on the digits only, so a run that
    # crosses a hyphen is still caught.
    return re.search(r'(\d)\1\1\1', number.replace("-", "")) is None


if __name__ == "__main__":
    # The first input line is the number of cards; one card number per line follows.
    num = []
    for _ in range(int(input())):
        c_num = input()
        num.append(c_num)
    for n in num:
        if is_valid_card(n):
            print('Valid')
        else:
            print('Invalid')

4
4253625879615787
4123-1234-8821-1231
61234-567-8912-3456
5133336789123456
Valid
Valid
Invalid
Invalid


# Group Number 

In [1]:
# find digits that are repeated at least once in a row (e.g. '22' or '999')
import re
st = '0, 1, 22, 33, 44, 55, 66, 77, 88 or 999'
# findall returns only the captured group, so every run is reported as its single repeated digit
re.findall(r'(\d)\1+',st)


['2', '3', '4', '5', '6', '7', '8', '9']

In [37]:
s = 'malayam'
# with one capturing group, findall returns the group's text for every match:
# here one list entry per word character
re.findall(r'([\w])',s)

['m', 'a', 'l', 'a', 'y', 'a', 'm']

In [38]:
name = 'thanthan'
# capture four characters one by one, then require the same four again in the same order
print(re.match(r'(\w)(\w)(\w)(\w)\1\2\3\4',name))

<re.Match object; span=(0, 8), match='thanthan'>


In [3]:
test = 'o'
# the optional group (b)? takes no part in the match here, and in Python a
# back reference to a group that did not participate fails, so the result is None
re.search(r'(b)?o\1',test)

In [15]:
s = '12345678'
s2 = '12-34-56-87'
# Accept either eight digits in a row or four two-digit groups separated by
# hyphens. The hyphen is captured once and re-used through \1 for the later
# separators. Both alternatives sit inside one (?:...) so that ^ and $ apply
# to each of them; without that, nine or more digits would also be accepted.
digits_pat = r'^(?:\d{8}|\d{2}(-)(\d{2}\1){2}\d{2})$'
re.match(digits_pat, s2)

<re.Match object; span=(0, 11), match='12-34-56-87'>

In [23]:
import re
import random
num = '123245'
num2 = '112325' # not used in this cell
# (\d)(?=\d\1): a digit whose copy appears two positions later,
# i.e. the same digit with exactly one digit in between
print(bool(re.search(r'(\d)(?=\d\1)',num)))
print(bool(re.search(r'(\d)(?=\d\1)','4542867')))
# ^\d{6}$ demands exactly six digits, and this string has seven
print(re.match(r'^\d{6}$','4542867'))

True
True
None


In [66]:
import random 
# random integer between 100000 and 999999 (both ends included), i.e. always six digits
random.randint(100000,999999)

209149

In [37]:
email = """
hackers@hackerrank.com hi man interviewstreet@hackerrank.com this is email product@hackerrank.com
"""
# Collect the complete text of every address found in the sample, in order
# of appearance, then print the addresses joined by ';'.
email_list = [
    found.group(0)
    for found in re.finditer(r'[a-zA-Z\d.-]+@[a-zA-Z-]+\.(com|edu|net)', email)
]
print(";".join(email_list))
    

hackers@hackerrank.com;interviewstreet@hackerrank.com;product@hackerrank.com


# Matching IP address - HackerRank problem

In [1]:
import re
# Dotted-decimal IPv4: four dot-separated parts, each in the range 0-255
# (forms with leading zeros such as 007 are accepted as well).
pat_1 = re.compile(r'^([0-9]|[01]?[0-9][0-9]|2[0-4][0-9]|25[0-5])(\.([0-9]|[01]?[0-9][0-9]|2[0-4][0-9]|25[0-5])){3}$')
# Full-form IPv6: eight colon-separated groups of one to four hex digits.
pat_2 = re.compile(r'^([0-9a-fA-F]){1,4}(:([0-9a-fA-F]{1,4})){7}$')


def check_ip(ip):
    """Print 'IPv4', 'IPv6' or 'Neither' depending on which pattern ip matches.

    ip is the address text to classify. Nothing is returned; the verdict is
    written to standard output.
    """
    if pat_1.search(ip):
        print('IPv4')
    elif pat_2.search(ip):
        print('IPv6')
    else:
        print('Neither')


if __name__ == "__main__":
    # The first input line is the number of addresses; one address per line follows.
    ip_list = [input() for _ in range(int(input()))]
    # The loop stays inside the guard because ip_list only exists when the
    # code is run directly.
    for IP in ip_list:
        check_ip(IP)

1
1050:1000:1000:a000:5:600:300c:326b
IPv6


In [2]:
# Sample addresses for trying out the IP patterns: four IPv6 lines followed by three IPv4 lines.
test_ip = """
1050:1000:1000:a000:5:600:300c:326b
1050:1000:2000:ab00:5:600:300c:326a
1050:1000:3000:abc0:5:600:300c:326c
1051:1000:4000:abcd:5:600:300c:326b
22.231.113.64
22.231.113.164
222.231.113.64
"""

# Detect HTML tags and attributes

In [83]:
import re
html = '<p><a href="http://www.quackit.com/html/tutorial/html_links.cfm">Example Link</a></p><div class="more-info">'
# Grab whatever sits between each '<' and the next '>' (the brackets themselves
# are excluded by the look-behind and look-ahead).
tags = re.findall(r'(?<=<)[^>]+(?=>)', html)
# Keep the opening tags only: closing tags start with '/'.
html_tag = [inner for inner in tags if not re.search(r'^/', inner)]
print(html_tag)
# Reduce every opening tag to its name, which is the first whitespace-separated word.
tagg = [inner.split()[0] for inner in html_tag]
print(tagg)

['p', 'a href="http://www.quackit.com/html/tutorial/html_links.cfm"', 'div class="more-info"']
['p', 'a', 'div']


In [88]:
# For every tag name collected in tagg, look for text that follows
# "<name><whitespace>" and is itself followed by '='.
# NOTE(review): '.+' is greedy, so for 'a' the match runs up to the last '='
# in html instead of stopping after 'href' (see the span in the output below);
# something like [^\s=]+ in place of .+ would probably give the attribute
# name alone - confirm the intent before changing it.
for m in tagg:
    att=re.search(r'(?<={}\s).+(?==)'.format(m),html)
    print(att)

None
<re.Match object; span=(6, 95), match='href="http://www.quackit.com/html/tutorial/html_l>
<re.Match object; span=(90, 95), match='class'>


In [76]:
s = 'div class="more-info"'
# split() without arguments cuts on whitespace, so the tag name ends up as the first item
s.split()

['div', 'class="more-info"']

In [80]:
a = [1,2,3]
b = [10,20,30]
# Walk both lists in lockstep and print each pair as "left:right".
for x,y in zip(a,b):
    print(x, y, sep=':')

1:10
2:20
3:30
