In [1]:
import re

## basic matching in Python

In [2]:
# Plain substring test with the `in` operator -- no regex involved yet.
pattern1, pattern2 = 'cat', 'bird'
string = "dog runs to cat"
# One print call, one result per line: 'cat' is present, 'bird' is not.
print(pattern1 in string, pattern2 in string, sep="\n")

True
False


## re matching

In [3]:
# The same lookup through the regex engine: re.search gives back a Match
# object for the first hit, or None when the pattern does not occur.
string = "dog runs to cat"
pattern1 = 'cat'
pattern2 = 'bird'
for candidate in (pattern1, pattern2):
    print(re.search(candidate, string))

<re.Match object; span=(12, 15), match='cat'>
None


## multiple matching, use [ ]

In [5]:
# Character set: [au] accepts either letter, so the pattern covers "run" and "ran".
pattern = r"r[au]n"
first_hit = re.search(pattern, 'dog runs to cat')
print(first_hit)

<re.Match object; span=(4, 7), match='run'>


In [7]:
# Ranges inside [ ]: every (pattern, text) pair is searched and printed in order.
range_demos = (
    (r"r[A-Z]n", 'dog runs to cat'),     # uppercase letters only -> no match
    (r"r[a-z]n", 'dog runs to cat'),     # any lowercase letter   -> 'run'
    (r"r[0-9]n", 'dog r2ns to cat'),     # any digit              -> 'r2n'
    (r"r[0-9a-z]n", 'dog runs to cat'),  # digit or lowercase     -> 'run'
)
for range_pattern, text in range_demos:
    print(re.search(range_pattern, text))

None
<re.Match object; span=(4, 7), match='run'>
<re.Match object; span=(4, 7), match='r2n'>
<re.Match object; span=(4, 7), match='run'>


## number matching

In [8]:
text = 'run r4n'
# \d : any decimal digit, so only 'r4n' qualifies
print(re.search(r'r\dn', text))
# \D : any character that is NOT a decimal digit, so 'run' is found first
print(re.search(r'r\Dn', text))

<re.Match object; span=(4, 7), match='r4n'>
<re.Match object; span=(0, 3), match='run'>


## whitespace matching

In [9]:
sample = 'r\nn r4n'  # the first "word" contains a real newline character
# \s : any whitespace character, e.g. [ \t\n\r\f\v] (space, tab, newline, return, ...)
print(re.search(r'r\sn', sample))
# \S : the complement of \s -- any non-whitespace character
print(re.search(r'r\Sn', sample))

<re.Match object; span=(0, 3), match='r\nn'>
<re.Match object; span=(4, 7), match='r4n'>


## alphabet, digit and "_"

In [10]:
# \w : a word character, i.e. [a-zA-Z0-9_]
# \W : the complement of \w (anything that is not a word character)
for word_pattern in (r'r\wn', r'r\Wn'):
    print(re.search(word_pattern, 'r\nn r4n'))

<re.Match object; span=(4, 7), match='r4n'>
<re.Match object; span=(0, 3), match='r\nn'>


## word boundaries (empty-string matches)

In [13]:
# \b : zero-width (empty) match located at the start or end of a word
at_boundary = re.search(r'\bruns\b', 'dog runs to cat')
# \B : zero-width (empty) match located anywhere that is NOT the start or end of a word
off_boundary = re.search(r'\B runs \B', 'dog   runs   to cat')
print(at_boundary, off_boundary, sep="\n")

<re.Match object; span=(4, 8), match='runs'>
<re.Match object; span=(5, 11), match=' runs '>


## special characters

In [26]:
# \\ : match a literal backslash
# The sample text is written as a raw string so that its single backslash is
# spelled unambiguously. The non-raw form "runs\ to me" depends on the
# unrecognised escape "\ ", which Python reports as an invalid escape sequence.
text_with_backslash = r"runs\ to me"
print(re.search(r"runs\\", text_with_backslash))
# . : match any single character (except \n)
print(re.search(r'r.n', 'r[ns to me'))

<re.Match object; span=(0, 5), match='runs\\'>
<re.Match object; span=(0, 3), match='r[n'>


## line beginning and line ending

In [27]:
sentence = "dog runs to cat"
# ^ : anchor the match at the beginning of the line
print(re.search(r'^dog', sentence))
# $ : anchor the match at the end of the line
print(re.search(r'cat$', sentence))

<re.Match object; span=(0, 3), match='dog'>
<re.Match object; span=(12, 15), match='cat'>


## optional match (may or may not occur)

In [28]:
# ? : the preceding group is optional -- it may or may not occur
optional_day = r"Mon(day)?"
for weekday in ("Monday", "Mon"):
    print(re.search(optional_day, weekday))

<re.Match object; span=(0, 6), match='Monday'>
<re.Match object; span=(0, 3), match='Mon'>


## multiple line

In [29]:
# Multi-line text: without a flag, ^ does not match the "I" that starts the
# third line; with re.M (alias of re.MULTILINE) it matches at line starts too.
string = """
dog runs to cat.
I run to dog.
"""
line_start_I = r"^I"
print(
    re.search(line_start_I, string),
    re.search(line_start_I, string, flags=re.MULTILINE),
    sep="\n",
)

None
<re.Match object; span=(18, 19), match='I'>


## occur 0 or more times

In [30]:
# * : the preceding character may occur zero or more times
star_pattern = r"ab*"
for candidate in ("a", "abbbbb"):
    print(re.search(star_pattern, candidate))

<re.Match object; span=(0, 1), match='a'>
<re.Match object; span=(0, 6), match='abbbbb'>


## occur 1 or more times

In [32]:
# + : the preceding character must occur one or more times,
#     so a lone "a" yields None while "abbbbb" matches in full
print(*(re.search(r"ab+", candidate) for candidate in ("a", "abbbbb")), sep="\n")

None
<re.Match object; span=(0, 6), match='abbbbb'>


## occur n to m times

In [33]:
# {n,m} : occur n to m times (written without a space, as in the pattern below)
repeat_2_to_10 = r"ab{2,10}"
print(re.search(repeat_2_to_10, "a"))
print(re.search(repeat_2_to_10, "abbbbb"))

None
<re.Match object; span=(0, 6), match='abbbbb'>


## group

In [34]:
# Capturing groups: group() with no argument is the same as group(0), the
# whole matched text; group(1) and group(2) are the two parenthesised parts.
match = re.search(r"(\d+), Date: (.+)", "ID: 021523, Date: Mar/26/2022")
for group_index in range(3):
    print(match.group(group_index))

021523, Date: Mar/26/2022
021523
Mar/26/2022


In [35]:
# Named groups: (?P<name>...) labels a group so that it can be fetched with
# match.group('name') rather than by its position.
match = re.search(
    r"(?P<id>\d+), Date: (?P<date>.+)",
    "ID: 021523, Date: Mar/26/2022",
)
# group() with several names returns a tuple; print one value per line.
print(*match.group('id', 'date'), sep="\n")

021523
Mar/26/2022


## find all matching

In [37]:
# findall collects every match into a list ("ren" is not covered by [ua])
all_hits = re.findall(r"r[ua]n", "run ran ren")
print(all_hits)

['run', 'ran']


In [38]:
# | : alternation -- match either the expression on its left or the one on its right
either_form = r"run|ran"
print(re.findall(either_form, "run ran ren"))

['run', 'ran']


## replace matching

In [39]:
# re.sub() replaces each match of the pattern with the replacement text
print(re.sub(pattern=r"r[au]ns", repl="catches", string="dog runs to cat"))

dog catches to cat


## split matching

In [43]:
# re.split() cuts the text at every match of the pattern -- here any one of , ; .
pieces = re.split(r"[,;\.]", "a;b,c.d;e")
print(pieces)

['a', 'b', 'c', 'd', 'e']


## compile 

In [44]:
# re.compile() builds a reusable pattern object; its search() method takes
# only the text to scan.
compiled_re = re.compile(r"r[ua]n")
compiled_hit = compiled_re.search("dog ran to cat")
print(compiled_hit)

<re.Match object; span=(4, 7), match='ran'>


## source : morvanzhou 36 RegEx

## Example

In [45]:
import requests
import re

url = 'http://news.baidu.com/'

headers = {
    "User-Agent": 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
}
# Bound the request so an unresponsive server cannot hang the notebook, and
# fail loudly on an HTTP error instead of regex-scanning an error page.
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
# response.text relies on a guessed encoding, which is not accurate here,
# so decode the raw bytes as UTF-8 explicitly.
data = response.content.decode("utf-8")

# Use re to pull out the URL and the title of each news item.
# Sample of the markup being matched:
# <a href="https://baijiahao.baidu.com/s?id=1720708705449601897" target="_blank" mon="a=9">车企“出逃”宁德时代</a>

# Raw string keeps the regex literal free of accidental escape processing.
# Group 1 is the href value, group 2 is the link text; both are non-greedy.
pattern = re.compile(r'<a href="(.*?)" target="_blank" mon="a=9">(.*?)</a>')
# Broader alternative (any <a ...> tag, with . also matching newlines):
# pattern = re.compile('<a (.*?)</a>',re.S)
result = pattern.findall(data)

# findall with two groups yields a list of (url, title) tuples.
print(result)

    

[('https://baijiahao.baidu.com/s?id=1728071133698837512&wfr=spider&for=pc', '俄罗斯航天发动机断供，马斯克坐地起价'), ('https://baijiahao.baidu.com/s?id=1728070458807553772&wfr=spider&for=pc', '苹果服务器两天内两次宕机，服务被切断'), ('https://baijiahao.baidu.com/s?id=1728046404068814219&wfr=spider&for=pc', '黑客宣称窃取了微软37GB源代码'), ('https://baijiahao.baidu.com/s?id=1728053656460983935&wfr=spider&for=pc', '谷歌母公司分拆旗下量子技术部门Sandbox'), ('https://baijiahao.baidu.com/s?id=1728049420613876032&wfr=spider&for=pc', '阿里巴巴1600亿“抄底”阿里巴巴'), ('https://baijiahao.baidu.com/s?id=1727985051609272646&wfr=spider&for=pc', '美团优选、淘菜菜后，京喜也被传将裁员'), ('https://baijiahao.baidu.com/s?id=1728068771521538157&wfr=spider&for=pc', '三星显示将于6月完全关闭LCD面板生产线'), ('https://baijiahao.baidu.com/s?id=1728058832438117951&wfr=spider&for=pc', '华为的全屋智能，和汽车有什么关系？'), ('https://baijiahao.baidu.com/s?id=1728065906197536832&wfr=spider&for=pc', '小米最大利润来源又变成互联网业务'), ('https://baijiahao.baidu.com/s?id=1728043860203897438&wfr=spider&for=pc', 'Facebook母公司遭澳大利亚监管机构起诉'), ('https://baiji