** Exercises of https://www.w3resource.com/python-exercises/re/ **   
** Reference: https://regex101.com/ **

#### check that a string contains only a certain set of characters (in this case a-z, A-Z and 0-9)

In [1]:
import re

s = 'adsfasAdANKJK10Anksdjf'
ss = '^#&&#&'

def match(s):
    """Return True if *s* is made up solely of the characters a-z, A-Z and 0-9.

    ``re.fullmatch`` is used so that every character must belong to the
    allowed set; a plain ``search`` would succeed as soon as a single
    allowed character appears anywhere in the string.  An empty string
    yields False.
    """
    return re.fullmatch(r'[a-zA-Z0-9]+', s) is not None

print(match(s))
print(match(ss))

True
False


####  match a string that has an a followed by zero or more b's

In [2]:
s = 'acdfbd'
ss = 'vbbjdfkjbbbb'

# re.search scans the whole string and returns the first successful match.
# ? matches the preceding subexpression zero or one time, or marks a
#   quantifier as non-greedy.
# . matches any single character except "\n".
only_bs = re.compile(r'b*')
char_then_bs = re.compile(r'.b*')

print(only_bs.search(s))
print(char_then_bs.search(ss))
print(char_then_bs.findall(ss))

<_sre.SRE_Match object; span=(0, 0), match=''>
<_sre.SRE_Match object; span=(0, 3), match='vbb'>
['vbb', 'j', 'd', 'f', 'k', 'jbbbb']


#### match a string that has an a followed by one or more b's

In [3]:
s = 'acdfd'
ss = 'jbbjdfkjbbbb'

# Greedy (b+) versus non-greedy (b+?) repetition of one or more 'b'.
lazy_bs = re.compile(r'b+?')
greedy_bs = re.compile(r'b+')

print(lazy_bs.search(s))
print(greedy_bs.search(ss))
print(lazy_bs.search(ss))

None
<_sre.SRE_Match object; span=(1, 3), match='bb'>
<_sre.SRE_Match object; span=(1, 2), match='b'>


#### match a string that has an a followed by zero or one 'b'.

In [4]:
s = 'acdfd'
ss = 'bbjdfkjbbbbx'

# b? matches zero or one 'b', so it can succeed with an empty match.
optional_b = re.compile(r'b?')
for text in (s, ss):
    print(optional_b.search(text))

# Any single character followed by one or more 'b'.
print(re.compile(r'.b+').findall(ss))

<_sre.SRE_Match object; span=(0, 0), match=''>
<_sre.SRE_Match object; span=(0, 1), match='b'>
['bb', 'jbbbb']


#### match a string that has an a followed by three 'b'.

In [5]:
s = 'acdfd'
ss = 'bbjdfkjbbbb'

# {n} matches exactly n occurrences of the preceding expression.
exactly_three = re.compile(r'.b{3}')
for text in (s, ss):
    print(exactly_three.search(text))

# [^...] matches any character not listed inside the brackets:
# [^abc] matches every character except a, b and c.
three_then_other = re.compile(r'.b{3}[^b]')
print(three_then_other.search(ss))

# {3,} matches three or more occurrences.
three_or_more = re.compile(r'.b{3,}')
print(three_or_more.search(ss))

None
<_sre.SRE_Match object; span=(6, 10), match='jbbb'>
None
<_sre.SRE_Match object; span=(6, 11), match='jbbbb'>


#### match a string that has an a followed by two to three 'b'

In [6]:
s = 'acdfd'
ss = 'bbjdfkjbbbb'

# {2,3} matches between two and three occurrences of 'b'.
two_or_three_bs = re.compile(r'b{2,3}')

print(two_or_three_bs.search(s))
print(two_or_three_bs.search(ss))
print(two_or_three_bs.findall(ss))

None
<_sre.SRE_Match object; span=(0, 2), match='bb'>
['bb', 'bbb']


#### find sequences of lowercase letters joined with an underscore

In [7]:
s = 'bbjUEIRU_jbb_bAb_'

# One or more lowercase letters immediately followed by an underscore.
lower_then_underscore = re.compile(r'[a-z]+_')

print(lower_then_underscore.search(s))
print(lower_then_underscore.findall(s))

<_sre.SRE_Match object; span=(9, 13), match='jbb_'>
['jbb_', 'b_']


#### find sequences of one upper case letter followed by lower case letters

In [8]:
s = 'bbjUEIRUc_jbb_bAbbb_'

# Exactly one uppercase letter followed by one or more lowercase letters.
capitalised = re.compile(r'[A-Z]{1}[a-z]+')

print(capitalised.search(s))
print(capitalised.findall(s))

<_sre.SRE_Match object; span=(7, 9), match='Uc'>
['Uc', 'Abbb']


#### match a string that has an 'a' followed by anything, ending in 'b'

In [9]:
s = 'bbjUEIRUc_jbb_bAbbb_'
ss = 'vbjUEaIRUc_jbb_bAbbb'

# $ anchors the match at the end of the string.
a_to_final_b = re.compile(r'a.*b$')
for text in (s, ss):
    print(a_to_final_b.findall(text))

[]
['aIRUc_jbb_bAbbb']


#### match a word at the beginning of a string.

In [10]:
s = ' bbjUEIRUc_jbb_bAbbb_'
ss = 'abjUEaIRUc_jbb_bAbbb'

# ^ anchors the match at the beginning of the string.
# \w+ consumes only the leading run of word characters, so the match stops
# at the end of the first word instead of running to the end of the line
# (which is what a trailing ".*" would do).
leading_word = re.compile(r'^\w+')

print(leading_word.findall(s))
print(leading_word.findall(ss))

[]
['abjUEaIRUc_jbb_bAbbb']


#### match a word at end of string, with optional punctuation. 

In [11]:
s = ' bbjUEIRUc_jbb_bAbbb_ weter'
ss = 'abjUEaIRUc_jbb_bAbbb8'

# \S matches any non-whitespace character.
# \w matches any word character, including the underscore.
trailing_word = re.compile(r'\w+\S[^0-9]$')
for text in (s, ss):
    print(trailing_word.findall(text))

['weter']
[]


#### match a word containing 'z'.

In [12]:
s = ' bbjUEIRUc_jbb_bAbbb_ weter sdfjkz'
ss = 'abjUEaIRUc_jbb_bAbbb8'

# A word containing 'z': word characters, a 'z', then more word characters.
# No ".*" in the middle: it is greedy and would swallow everything after the
# first 'z' up to the end of the line, not just the rest of the word.
word_with_z = re.compile(r'\w*z\w*')

print(word_with_z.findall(s))
print(word_with_z.findall(ss))

['sdfjkz']
[]


#### match a word containing 'z', but not at the start or end of the word

In [13]:
s = ' bbjUEzIRUc_jbb_bAbbb_ sdfjkz'
ss = 'abjUEaIRUc_jbb_bAbbb8 sdfzjk zjzdk lzzl'

# \b matches a word boundary, i.e. the position between a word and a space.
# \B matches a non-boundary: 'er\B' matches the 'er' in "verb" but not the
#    'er' in "never".
# \S matches any non-whitespace character.
# The pattern below targets words that contain 'z' but neither begin nor
# end with 'z' or whitespace.
inner_z = re.compile(r'[^z\s]\w+z+[^z\s]+')
for text in (s, ss):
    print(inner_z.findall(text))

['bbjUEzIRUc_jbb_bAbbb_']
['sdfzjk', 'lzzl']


#### match a string that contains only upper and lowercase letters, numbers, and underscores

In [14]:
s = ' bbjUEzIRUc_jbb_bAbbb_ sdfjkz'
ss = 'abjUEaIRUc_jbb_bAbbb8 sdfz.jk zjzdk lzzl'

# \w matches any word character including the underscore; for ASCII text it
# is equivalent to '[A-Za-z0-9_]'.
explicit_class = re.compile(r'[0-9A-Za-z_]+')
shorthand_class = re.compile(r'\w+')

print(explicit_class.findall(s))
print(shorthand_class.findall(ss))

['bbjUEzIRUc_jbb_bAbbb_', 'sdfjkz']
['abjUEaIRUc_jbb_bAbbb8', 'sdfz', 'jk', 'zjzdk', 'lzzl']


#### match a string which starts with a specific number

In [15]:
s = '3bbjUEzIRUc_jbb_bAbbb_ sdfjkz'
ss = 'abjUEaIRUc_jbb_bAbbb8 sdfz.jk zjzdk lzzl'

# One or more digits at the very start, followed by word characters.
leading_digits = re.compile(r'^[0-9]+\w+')
for text in (s, ss):
    print(leading_digits.findall(text))

['3bbjUEzIRUc_jbb_bAbbb_']
[]


#### remove leading zeros from an IP address

In [16]:
def strip_leading_zeros(address):
    """Return *address* with leading zeros removed from every octet.

    ``\\b0+(\\d)`` matches a run of zeros at the start of a number only when
    at least one digit follows, and keeps that digit.  An octet that is just
    ``0`` is therefore left intact, and the first octet is handled the same
    way as the others.
    """
    return re.sub(r'\b0+(\d)', r'\1', address)

ip = '192.001.02.3'

strip_leading_zeros(ip)

'192.1.2.3'

#### check for a number at the end of a string

In [17]:
s = '3bbjU3 984jfd3'
ss = 'abjUEaIRUc_jbb_bAbbb8 sdfz.jk zjzdk lzzl'

# Anything, as long as the string finishes with at least one digit.
ends_with_digit = re.compile(r'.*\d+$')
for text in (s, ss):
    print(ends_with_digit.findall(text))

['3bbjU3 984jfd3']
[]


#### search the numbers (0-9) of length between 1 to 3 in a given string.

In [18]:
s = 'bbjU3884'

# Runs of one to three consecutive digits.
short_digit_run = re.compile(r'\d{1,3}')
print(short_digit_run.findall(s))

['388', '4']


#### search for some literal strings in a string

In [19]:
def find_literal(literal, text):
    """Return every occurrence of the plain string *literal* in *text*.

    The literal is passed through ``re.escape`` so that characters such as
    '.', '*' or '(' are matched verbatim rather than interpreted as regular
    expression syntax.
    """
    return re.findall(re.escape(literal), text)

s = 'The quick brown fox jumps over the lazy dog'

lists = ['dog', 'fox']
for literal in lists:
    print(find_literal(literal, s))

['dog']
['fox']


#### search for a literal string in a string and also find the location within the original string where the pattern occurs.

In [20]:
def locate_literal(literal, text):
    """Find the plain string *literal* in *text*.

    Returns a ``(occurrences, span)`` tuple: *occurrences* is the list of all
    matches and *span* is the ``(start, end)`` position of the first one, or
    None when the literal does not occur at all.  The literal is escaped so
    regex metacharacters in it are matched verbatim.
    """
    pattern = re.compile(re.escape(literal))
    first = pattern.search(text)
    # search() returns None on failure; guard before asking for the span.
    span = first.span() if first is not None else None
    return pattern.findall(text), span

s = 'The quick brown fox jumps over the lazy dog'

lists = ['dog', 'fox']
for literal in lists:
    occurrences, span = locate_literal(literal, s)
    print(occurrences)
    print(span)

['dog']
(40, 43)
['fox']
(16, 19)


#### find the substrings within a string.

In [21]:
s = 'abjUEaIRUc_jbb_babbb8 sdfz.jk zjzdk lzzl'
p = 'ab'

# Collect every non-overlapping occurrence of the substring.
occurrences = re.compile(p).findall(s)
print(occurrences)

['ab', 'ab']


#### find the occurrence and position of the substrings within a string

In [22]:
s = 'abjUEaIRUc_jbb_babbb8 sdfz.jk zjzdk lzzl'
p = 'ab'

# re.finditer finds every substring matched by the pattern and returns the
# matches as an iterator.
# The substring held in p is escaped so it is always treated as literal text.
matches = re.finditer(re.escape(p), s)

# Collect the positions in a comprehension: its loop variable stays local,
# so no module-level name called "match" is rebound by this cell.
spans = [(found.start(), found.end()) for found in matches]
for start, end in spans:
    print('{0}:{1}'.format(start, end))

0:2
16:18


#### replace whitespaces with an underscore and vice versa

In [23]:
s = 'abjUEaIRUc_jbb_babbb8 sdfz.jk zjzdk lzzl'

# Signature reminder: re.sub(pattern, repl, string, count=0)
whitespace = re.compile(r'\s')
underscore = re.compile(r'_')

print(whitespace.sub('_', s))
print(underscore.sub(' ', s))

abjUEaIRUc_jbb_babbb8_sdfz.jk_zjzdk_lzzl
abjUEaIRUc jbb babbb8 sdfz.jk zjzdk lzzl


#### extract year, month and date from a URL

In [24]:
url = "https://www.washingtonpost.com/news/football-insider/wp/2016/09/02/odell-beckhams-fame-rests-little-ball-josh-norman-tells-author/"

# Four digits, two digits, two digits, separated by slashes.
date_in_path = re.compile(r'\d{4}/\d{2}/\d{2}')
date_in_path.search(url).group()

'2016/09/02'

#### convert a date of yyyy-mm-dd format to dd-mm-yyyy format

In [25]:
d = '2016-09-02'

# Signature reminder: re.sub(pattern, repl, string, count=0)
# When repl is a string, captured groups can be referenced as \<number>,
# but group number 0 cannot be used this way.
iso_date = re.compile(r'(\d{4})-(\d{2})-(\d{2})')
iso_date.sub(r'\3-\2-\1', d)

'02-09-2016'

#### match two words from a list of words that both start with the letter 'P'.

In [26]:
words = ["Python PHP", "Java JavaScript", "c c++"]

# Two whitespace-separated words, each starting with a capital 'P',
# anchored at the beginning of the string.
two_p_words = re.compile(r'^P\w+\sP\w+')
for w in words:
    print(two_p_words.findall(w))

['Python PHP']
[]
[]
