# Module 

In [2]:
import re

# Code 

## Backreference

In [50]:
# Group 1 captures a single character and the backreference \1 requires that
# same character to appear again later in the text.
pair_pattern = re.compile(pattern=r".*(.).*\1")

In [51]:
# Try the compiled pattern against the start of the sample string.
a = pair_pattern.match("77534")

In [53]:
# Group 0 is the whole text matched by the pattern.
a.group(0)

'77'

In [54]:
# Group 1 is the text captured by the first parenthesised group of the pattern.
a.group(1)

'7'

In [28]:
# Calling .group() without arguments returns the whole match (same as group 0).
pair_pattern.match("7171").group()

'7171'

## re.match vs re.search

In [32]:
# re.match only looks for the pattern at the very start of the string,
# so "c" is not found in "abc" and the result is None.
match = re.match("c", "abc")
match

In [33]:
# re.search scans through the string and returns the first position where
# the pattern matches.
match = re.search("c", "abc")
match

<_sre.SRE_Match object; span=(2, 3), match='c'>

In [34]:
# The leading "^" anchors the pattern to the start of the string, so "^c"
# finds nothing in "abc" and the result is None.
match = re.search("^c", "abc")
match

## re.split 

In [35]:
# Split on runs of non-word characters; the trailing "." leaves an empty
# string as the last item of the result.
re.split(r"\W+", "Words, words, words.")

['Words', 'words', 'words', '']

In [36]:

# Wrapping the separator pattern in a capturing group keeps the matched
# separators in the result list.
re.split(r"(\W+)", "Words, words, words.")

['Words', ', ', 'words', ', ', 'words', '.', '']

In [37]:

# maxsplit=1 stops after the first split; the remainder of the string is
# returned unsplit as the final item.  maxsplit is passed by keyword because
# passing it positionally is deprecated since Python 3.13.
re.split(r'\W+', 'Words, words, words.', maxsplit=1)

['Words', 'words, words.']

## Raw string notation 

In [63]:
# The text being searched is a literal backslash followed by "section".  It is
# written as a raw string because \s is not a valid string escape, so the
# non-raw spelling triggers an invalid-escape-sequence warning.
latex_text = r"\section"

# Raw pattern: the regex \\section matches a literal backslash + "section".
print(re.findall(r"\\section", latex_text))
# Non-raw equivalent: every backslash has to be doubled once more for Python.
print(re.findall("\\\\section", latex_text))
# Here the regex engine receives \section, i.e. \s (whitespace) + "ection",
# which does not occur in the text, so nothing is found.
print(re.findall("\\section", latex_text))

['\\section']
['\\section']
[]


In [57]:
# The text being searched is a single backslash character in all three calls.
# Raw pattern \\ : an escaped backslash, matches the literal backslash.
print(re.findall(r"\\", "\\"))
# Non-raw equivalent: four backslashes in source become \\ for the regex engine.
print(re.findall("\\\\", "\\"))
try:
    # Here the regex engine receives a lone backslash, which is an incomplete
    # escape, so the pattern cannot be compiled.
    print(re.findall("\\", "\\"))
except re.error as exc:
    # The failure is the point of the demonstration: show it without aborting
    # the rest of the notebook run.
    print(f"error: {exc}")

['\\']
['\\']


error: bad escape (end of pattern) at position 0

In [44]:
# \w matches a single word character, so each letter is returned separately.
re.findall(r"\w", "hah")

['h', 'a', 'h']

# Use Cases 

## Keyword matching

In [69]:
def build_keyword_regex(key_word):
    """Compile a regex that finds the characters of ``key_word`` in order.

    Consecutive keyword characters may be separated by any run of characters
    (``.*``, which is greedy and does not cross newlines), e.g. 'djm' becomes
    the pattern 'd.*j.*m'.  Every keyword character goes through
    ``re.escape`` so that regex metacharacters in the keyword match literally.

    Args:
        key_word: The keyword whose characters must appear in order.

    Returns:
        The compiled pattern object.
    """
    return re.compile('.*'.join(re.escape(char) for char in key_word))


strings = [
    "恐怖",
    "恐怖袭击",
    "恐--怖",
    "83恐==-怖0--0"
]
key_word = "恐怖"
regex = build_keyword_regex(key_word)
pattern = regex.pattern  # The pattern text, kept available under its original name.

for s in strings:
    # Use the compiled regex rather than re-parsing the pattern text each time.
    match = regex.findall(s)
    print(s)
    print(match)
    print()

恐怖
['恐怖']

恐怖袭击
['恐怖']

恐--怖
['恐--怖']

83恐==-怖0--0
['恐==-怖']



## Filter special characters

In [71]:
# Character class of everything to strip: ASCII digits, whitespace, ASCII
# punctuation and common full-width (CJK) punctuation.  A raw string keeps the
# regex escape \s intact; "-", "[", "\" and "]" are escaped so that each one
# is a literal member of the class (the backslash included).
special_chars = re.compile(
    r'''[0-9\s!"#$%&'()*+,\-./:;<=>?@\[\\\]^_`{|}~，。★、…【】《》？丶“”‘’！]+'''
)

text = "you are 834 *@#$%^&*(())"
string = special_chars.sub("", text)
string

'youare'

## Content between two marks

In [4]:
def mark_pattern(begin, end, greedy=False):
    """Compile a pattern that captures the text between two marks.

    Both marks are passed through ``re.escape`` so that marks such as "(" or
    "$" are matched literally instead of being read as regex syntax.  The
    pattern is compiled with ``re.S`` so the captured text may span lines.

    Args:
        begin: The opening mark.
        end: The closing mark.
        greedy: If true, capture as much as possible (``.*``); otherwise
            capture as little as possible (``.*?``).

    Returns:
        The compiled pattern; its single group is the text between the marks.
    """
    body = '(.*)' if greedy else '(.*?)'
    return re.compile(re.escape(begin) + body + re.escape(end), re.S)


begin = "<"
end = ">"
string = "<hah><ffd>"

# Non-greedy: each capture stops at the nearest closing mark.
pat = mark_pattern(begin, end)
non_greedy_result = pat.findall(string)
# Greedy: the capture runs on to the last closing mark in the string.
pat = mark_pattern(begin, end, greedy=True)
greedy_result = pat.findall(string)
print(f"Non-greedy result: {non_greedy_result}")
print(f"Greedy result: {greedy_result}")

Non-greedy result: ['hah', 'ffd']
Greedy result: ['hah><ffd']
