In [1]:
# Show the value of every top-level expression in a cell, not only the last one;
# the cells below rely on this to display several results each.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## String -- Common string operations

### String constants

In [2]:
import string
string.ascii_letters    # ascii_lowercase + ascii_uppercase; this value is not locale-dependent
string.ascii_lowercase
string.ascii_uppercase
string.digits
string.hexdigits
string.octdigits
string.punctuation
string.printable    # digits + letters + punctuation + whitespace
string.whitespace

'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'

'abcdefghijklmnopqrstuvwxyz'

'ABCDEFGHIJKLMNOPQRSTUVWXYZ'

'0123456789'

'0123456789abcdefABCDEF'

'01234567'

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c'

' \t\n\r\x0b\x0c'

### Custom String Formatting

In [3]:
# str.format(): {0}, {1}, ... select positional arguments; doubled braces {{}} produce a literal {}
"{0} is year, {1} is month, {2} is day. And this is the literal {{}}".format(2023, 8, 5)

# If the arg_name is a number, it refers to a positional argument; otherwise it is a keyword argument.
# The !r conversion applies repr() to the value before it is formatted.
"{0!r} is year, {1} is month, {_} is day. And this is the literal {{}}".format(2023, 8, _=12)

year = 2023
month = 8
day = 5

# f-string: the expressions inside the braces are evaluated in the current scope
f'{year} is year, {month} is month, {day} is day.'

# printf-style formatting with the % operator
'%d is year, %d is month, %d is day.' % (year, month, day)

'2023 is year, 8 is month, 5 is day. And this is the literal {}'

'2023 is year, 8 is month, 12 is day. And this is the literal {}'

'2023 is year, 8 is month, 5 is day.'

'2023 is year, 8 is month, 5 is day.'

In [4]:
# Built-in format() called without a format_spec: the result here equals str(value)
format('123')

'123'

### Metasyntax

*Elements* of metasyntax
- *Terminals*: a stand-alone syntactic structure. Terminals could be denoted by double quoting the name of the terminals.
  e.g. `"else"`, `"if"`, `"then"`, `"while"`
- *Nonterminals*: a symbolic representation defining a set of allowable syntactic structures that is composed of a subset of elements. Nonterminals could be denoted by angle bracketing the name of the nonterminals.
  e.g. `<int>`, `<char>`, `<boolean>`
- *Metasymbol*: a symbolic representation denoting syntactic information.
  e.g. `:=`, `|`, `{}`, `()`, `[]`, `*`


*Methods* of phrase termination
- Juxtaposition: e.g. `A B`
- Alternation: e.g. `A|B`
- Repetition: e.g. `{A B}`
- Optional phrase: e.g. `[A B]`
- Grouping: e.g. `(A|B)`

### Format String Syntax

Format strings contain "replacement fields" surrounded by curly braces `{}`. The grammar for a replacement field is as follows:

```code
replacement_field ::=  "{" [field_name] ["!" conversion] [":" format_spec] "}"
field_name        ::=  arg_name ("." attribute_name | "[" element_index "]")*
arg_name          ::=  [identifier | digit+]
attribute_name    ::=  identifier
element_index     ::=  digit+ | index_string
index_string      ::=  <any source character except "]"> +
conversion        ::=  "r" | "s" | "a"
format_spec       ::=  <described in the next section>
```

In [5]:
name = "Bryant"
players = ["Kobe", "Jamas"]

# Positional field: only argument 0 is referenced; the extra argument (14) is simply unused
"First, thou shalt count to {0}".format(12, 14)
# Keyword field: {name} is looked up among the keyword arguments
"My quest is {name}".format(name=name)
# Attribute access: {0.count} resolves to the bound method list.count, so the rendered
# text contains an object address that differs from run to run
"Weight in tons {0.count}".format(players)
# Index access: {players[0]} takes element 0 of the keyword argument `players`
"Units destroyed: {players[0]}".format(players=players)

'First, thou shalt count to 12'

'My quest is Bryant'

'Weight in tons <built-in method count of list object at 0x10ecc7e80>'

'Units destroyed: Kobe'

In [6]:
# Three conversion flags are currently supported: !s, !r and !a.
# The conversion is applied to the argument before it is formatted.
"Harold's a clever {0!s}".format('boy')        # Calls str() on the argument first
"Bring out the holy {name!r}".format(name='cow')    # Calls repr() on the argument first
"More {!a}".format('A')                      # Calls ascii() on the argument first

"Harold's a clever boy"

"Bring out the holy 'cow'"

"More 'A'"

In [7]:
# Accessing arguments by name: ** unpacks the dict so its keys become keyword arguments
coord = {'latitude': '37.24N', 'longitude': '-115.81W'}
'Coordinates: {latitude}. {longitude}'.format(**coord)

'Coordinates: 37.24N. -115.81W'

In [8]:
# Accessing arguments' attributes: {0.real} and {0.imag} read attributes of positional argument 0
c = 3-5j
# The two adjacent string literals are concatenated, so .format() applies to the whole sentence
('The complex number {0} is formed from the real part {0.real} '
 'and the imaginary part {0.imag}.').format(c)

'The complex number (3-5j) is formed from the real part 3.0 and the imaginary part -5.0.'

In [9]:
# Accessing arguments' items: {0[0]} and {0[1]} index into positional argument 0
coord = (3, 5)
'X: {0[0]};  Y: {0[1]}'.format(coord)

'X: 3;  Y: 5'

In [10]:
# The !r and !s conversions replace the printf-style %r and %s
"repr() shows quotes: {!r}; str() doesn't: {!s}".format('test1', 'test2')

"repr() shows quotes: 'test1'; str() doesn't: test2"

In [11]:
# Aligning the text and specifying a width: '<' left, '>' right, '^' centered; 30 is the field width
'{:<30}'.format('left aligned')
'{:>30}'.format('right aligned')
'{:^30}'.format('centered')
'{:*^30}'.format('centered')    # use '*' as a fill char

'left aligned                  '

'                 right aligned'

'           centered           '

'***********centered***********'

In [12]:
# Replacing %x and %o and converting the value to different bases:
# the same positional argument 0 is rendered with the d, x, o and b presentation types
"int: {0:d}; hex: {0:x}; oct: {0:o}; bin: {0:b}".format(42)

'int: 42; hex: 2a; oct: 52; bin: 101010'

In [13]:
# Number formatting
'{:,}'.format(1234567890)    # using a comma as a thousands separator
'Correct answers: {:.2%}'.format(19/22)    # expressing a percentage with two decimal places

# Using type-specific formatting: the text after ':' is interpreted by the datetime object itself
import datetime
d = datetime.datetime(2010, 7, 4, 12, 15, 58)
'{:%Y-%m-%d %H:%M:%S}'.format(d)

'1,234,567,890'

'Correct answers: 86.36%'

'2010-07-04 12:15:58'

In [14]:
# Nesting arguments: a format_spec may itself contain replacement fields
for align, text in zip('<^>', ['left', 'center', 'right']):
    # the alignment character is also used as the fill character
    '{0:{fill}{align}16}'.format(text, fill=align, align=align)

octets = [192, 168, 0, 1]
# Keep the formatted address in a named variable instead of reading it back through
# IPython's `_` (last output), which is only defined in an interactive session
hex_address = '{:02X}{:02X}{:02X}{:02X}'.format(*octets)
hex_address

int(hex_address, 16)

width = 5
for num in range(5, 12):
    for base in 'dXob':    # d: decimal, X: uppercase hexadecimal, o: octal, b: binary
        print('{0:{width}{base}}'.format(num, base=base, width=width), end=' ')
    print()


'left<<<<<<<<<<<<'

'^^^^^center^^^^^'

'>>>>>>>>>>>right'

'C0A80001'

3232235521

    5     5     5   101 
    6     6     6   110 
    7     7     7   111 
    8     8    10  1000 
    9     9    11  1001 
   10     A    12  1010 
   11     B    13  1011 


### Template strings

In [15]:
from string import Template

# substitute() requires a value for every placeholder
s = Template('$who likes $what')
s.substitute(who='tim', what='kung pao')

# safe_substitute() leaves placeholders that have no value untouched
d = {'who': 'tim'}
s.safe_substitute(d)

'tim likes kung pao'

'tim likes $what'

## re - Regular expression operations

Usually patterns will be expressed in Python code using *raw string notation*.

### Regular Expression Syntax

The special characters:

**Character classes and class-like constructs**:
- `.` In the default mode, this matches any character except a newline. If the DOTALL flag has been specified, this matches any character including a newline.
- `\` Either escapes special characters or signals a special sequence.
- `[]` Used to indicate a set of characters.


- `\d` For Unicode (str) patterns: Matches any Unicode decimal digit (that is, any character in Unicode character category `[Nd]`). This includes `[0-9]`, and also many other digit characters. If the ASCII flag is used only `[0-9]` is matched. For 8-bit (bytes) patterns: Matches any decimal digit; this is equivalent to `[0-9]`.
- `\D` Matches any character which is not a decimal digit. This is the opposite of `\d`. If the ASCII flag is used this becomes the equivalent of `[^0-9]`.

- `\s` Matches Unicode whitespace characters (which includes `[ \t\n\r\f\v]`, and also many other characters, for example, the non-breaking spaces mandated by typography rules in many languages). If the ASCII flag is used, only `[ \t\n\r\f\v]` is matched. For 8-bit (bytes) patterns: Matches characters considered whitespace in the ASCII character set; this is equivalent to `[ \t\n\r\f\v]`.
- `\S` Matches any character which is not a whitespace character. This is the opposite of `\s`. If the ASCII flag is used this becomes the equivalent of `[^ \t\n\r\f\v]`.

- `\w` For Unicode (str) patterns: Matches Unicode word characters; this includes alphanumeric characters (as defined by `str.isalnum()`) as well as the underscore (`_`). If the ASCII flag is used, only `[a-zA-Z0-9_]` is matched. For 8-bit (bytes) patterns: Matches characters considered alphanumeric in the ASCII character set; this is equivalent to `[a-zA-Z0-9_]`. If the LOCALE flag is used, matches characters considered alphanumeric in the current locale and the underscore.
- `\W` Matches any character which is not a word character. This is the opposite of `\w`. If the ASCII flag is used this becomes the equivalent of `[^a-zA-Z0-9_]`. If the [LOCALE](https://docs.python.org/3/library/re.html#re.LOCALE) flag is used, matches characters which are neither alphanumeric in the current locale nor the underscore.

**Anchors（锚点）**:
- `^` Matches the start of the string, and in MULTILINE mode also matches immediately after each newline.
- `$` Matches the end of the string or just before the newline at the end of the string, and in MULTILINE mode also matches before a newline.
- `(?=...)` Matches if `...` matches next, but doesn't consume any of the string. This is called *lookahead assertion*. For example, `Isaac (?=Asimov)` will match `'Isaac '` only if it’s followed by `'Asimov'`.
- `(?!...)` Matches if `...` doesn’t match next. This is a *negative lookahead assertion*. For example, `Isaac (?!Asimov)` will match `'Isaac '` only if it’s not followed by `'Asimov'`.
- `(?<=...)` Matches if the current position in the string is preceded by a match for `...` that ends at the current position. This is called a *positive lookbehind assertion*.

In [16]:
import re
# (?<=abc) is a positive lookbehind: 'def' matches only where it is immediately preceded by 'abc'
m = re.search('(?<=abc)def', 'abcdef')
m.group(0)

# This example looks for a word following a hyphen
m = re.search(r'(?<=-)\w+', 'spam-egg')
m.group(0)

'def'

'egg'

- `(?<!...)` Matches if the current position in the string is not preceded by a match for `...`. This is called a *negative lookbehind assertion*. Patterns which start with negative lookbehind assertions may match at the beginning of the string being searched.

- `\A` Matches only at the start of the string.
- `\b` Matches the empty string, but only at the beginning or end of a word. A word is defined as a sequence of word characters.
- `\B` Matches the empty string, but only when it is *not* at the beginning or end of a word.
- `\Z` Matches only at the end of the string.

**Grouping, Capturing, Conditional, and Control**:
- `*` Causes the resulting RE to match 0 or more repetitions of the preceding RE, as many repetitions as possible. `ab*` will match 'a', 'ab', or 'a' followed by any number of 'b's.
- `+` Causes the resulting RE to match 1 or more repetitions of the preceding RE. `ab+` will match 'a' followed by any non-zero number of 'b's; it will not match just 'a'.
- `?` Causes the resulting RE to match 0 or 1 repetitions of the preceding RE. `ab?` will match either 'a' or 'ab'.
- `*?`, `+?`, `??` Non-greedy quantifiers: the same repetitions as `*`, `+`, `?`, but matching as few characters as possible.
- `*+`, `++`, `?+` Possessive quantifiers, these do not allow back-tracking when the expression following it fails to match.
- `{m}` Specifies that exactly *m* copies of the previous RE should be matched.
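- `{m,n}` Causes the resulting RE to match from *m* to *n* repetitions of the preceding RE, attempting to match as many as possible. Write it without a space after the comma: `{m, n}` is not recognized as a quantifier and is matched as literal text.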
- `|` `A|B`, where *A* and *B* can be arbitrary REs, creates a regular expression that will match either *A* or *B*.
- `(...)` Matches whatever regular expression is inside the parentheses, and indicates the start and end of a group; the contents of a group can be retrieved after a match has been performed, and can be matched later in the string with the `\number` special sequence, described below.

- `(?:...)` *Grouping-only parentheses*. A non-capturing version of regular parentheses.

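- `(?>...)` *Atomic grouping* (固化分组): Attempts to match `...` as if it was a separate regular expression, and if successful, continues to match the rest of the pattern following it. If the rest of the pattern then fails to match, the engine cannot backtrack into the group; it can only back up to a point before the `(?>...)`.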

- `(?P<name>...)` Similar to regular parentheses, but the substring matched by the group is accessible via the symbolic group name *name*.
- `(?P=name)` A backreference to a named group; it matches whatever text was matched by the earlier group named *name*.

- `(?(id/name)yes-pattern|no-pattern)` Will try to match with `yes-pattern` if the group with given *id* or *name* exists, and with `no-pattern` if it doesn’t. `no-pattern` is optional and can be omitted.

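**Mode modifier**: *(?modifier)*, such as *(?a)* or *(?i)*. Flags can only be switched off inside a scoped group such as *(?-i:...)*, not with a global modifier.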

- `(?...)` This is an extension notation. The first character after the `?` determines what the meaning and further syntax of the construct is.

- `(?aiLmsux)` One or more letters from the set 'a', 'i', 'L', 'm', 's', 'u', 'x'. The group matches the empty string; the letters set the corresponding flags: re.A (*ASCII-only matching*), re.I (*ignore case*), re.L (*locale dependent*), re.M (*multi-line*), re.S (*dot matches all*), re.U (*Unicode matching*), and re.X (*verbose*), for the entire regular expression.

Changed in version 3.11: This construction can only be used at the start of the expression.
在 3.11 版更改：**此构造只能在表达式的开头使用**。

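- `(?aiLmsux-imsx:...)` *Mode-modified span (模式作用范围)*, (?*modifier*:...), such as `(?i:...)` or `(?-i:...)`. The flags are set (or, after the `-`, removed) only for the part of the expression enclosed by the group.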

- `(?#...)` A comment; the contents of the parentheses are simply ignored.


### Module Contents

Functions

In [17]:
# re.compile: build the regex object once so it can be reused for many matches
prog = re.compile('abc')
# Named `text` rather than `string` so the `string` module imported earlier is not shadowed
text = 'abc, efg'
result = prog.match(text)
result

# The module-level function gives the same result without an explicit compile step
re.match('abc', text)

<re.Match object; span=(0, 3), match='abc'>

<re.Match object; span=(0, 3), match='abc'>

In [18]:
# re.search scans through the text and returns the first location where the pattern matches
# NOTE(review): the name `string` shadows the `string` module imported at the top of the
# notebook, so re-running the "String constants" cell after this one fails. The re.match and
# re.fullmatch cells below read this variable, so any rename has to cover those cells too.
string = '''
Running throughout the system is a collection of electrical conduits called buses that carry bytes of information back and forth between the components. 
Buses are typically designed to transfer fixed-size chunks of bytes known as words. The number of bytes in a word (the word size) is a fundamental system parameter that varies across systems. 
Most machines today have word sizes of either 4 bytes (32 bits) or 8 bytes (64 bits). In this book, we do not assume any fixed definition of word size. 
Instead, we will specify what we mean by a "word" in any context that requires this to be defined.
'''

pattern = 'size'
re.search(pattern, string)

<re.Match object; span=(202, 206), match='size'>

In [19]:
# re.match succeeds only when the pattern matches at the very beginning of the string
print(re.match(pattern, string))    # prints None: the text does not start with 'size'

s = 'abc, efg'
re.match('abc', s)

None


<re.Match object; span=(0, 3), match='abc'>

In [20]:
# re.fullmatch succeeds only when the whole string matches the pattern
print(re.fullmatch(pattern, string))    # prints None: 'size' is only a small part of the text

s = 'abc'
re.fullmatch(s, s)    # 'abc' serves as both the pattern and the string being matched

None


<re.Match object; span=(0, 3), match='abc'>

In [21]:
# re.split
s1 = 'Words, words, words.'
re.split(r'\W+', s1)                # split on runs of non-word characters
re.split(r'(\W+)', s1)              # a capturing group keeps the separators in the result
# At most one split. maxsplit is passed by keyword because passing it
# positionally is deprecated since Python 3.13.
re.split(r'\W+', s1, maxsplit=1)

re.split('[a-f]+', '0a3B9', flags=re.IGNORECASE)

['Words', 'words', 'words', '']

['Words', ', ', 'words', ', ', 'words', '.', '']

['Words', 'words, words.']

['0', '3', '9']

In [22]:
# re.findall returns all non-overlapping matches of the pattern as a list
re.findall(r'\bf[a-z]*', 'which foot or hand feel fastest')    # no groups: list of matched strings
re.findall(r'(\w+)=(\d+)', 'set width=20 and height=10')    # two groups: list of 2-tuples

['foot', 'feel', 'fastest']

[('width', '20'), ('height', '10')]

In [23]:
# re.finditer returns an iterator that yields one match object per non-overlapping match
match_iter = re.finditer(r'\bf[a-z]*', 'which foot or hand feel fastest')
# Advance with the built-in next() rather than calling the __next__ dunder directly
next(match_iter)
next(match_iter)
next(match_iter)

<re.Match object; span=(6, 10), match='foot'>

<re.Match object; span=(19, 23), match='feel'>

<re.Match object; span=(24, 31), match='fastest'>

In [24]:
# re.sub with a replacement template: \1 inserts the text captured by group 1
re.sub(r'def\s+([a-zA-Z_][a-zA-Z_0-9]*)\s*\(\s*\):',
       r'static PyObject*\npy_\1(void)\n{',
       'def myfunc():')

def dashrepl(matchobj):
    """Replacement callback for re.sub: a single dash becomes a space, anything else a dash."""
    return ' ' if matchobj.group(0) == '-' else '-'

# re.sub with a function: it is called once per match and returns the replacement text
re.sub('-{1,2}', dashrepl, 'pro--gram-files')

'static PyObject*\npy_myfunc(void)\n{'

'pro-gram files'

Exceptions

### Regular Expression Examples

In [30]:
# Checking for a pair
def displaymatch(match):
    """Describe a match object as text showing the matched string and its groups.

    Returns None when `match` is None (i.e. the pattern did not match).
    """
    if match is not None:
        return '<Match: %r, groups=%r>' % (match.group(), match.groups())
    return None

# Accepts exactly five characters, each taken from: a, 2-9, t, j, q, k
valid = re.compile(r"^[a2-9tjqk]{5}$")
displaymatch(valid.match("akt5q"))

# (.) captures one character and \1 requires that same character to occur again later
pair = re.compile(r".*(.).*\1")    # compiled regular expression object
displaymatch(pair.match("717ak"))
displaymatch(pair.match("354aa"))
pair.match("717ak").group(1)    # group(1) of the match object is the character that occurs twice
pair.match("354aa").group(1)

"<Match: 'akt5q', groups=()>"

"<Match: '717', groups=('7',)>"

"<Match: '354aa', groups=('a',)>"

'7'

'a'

In [33]:
# Simulating scanf(): (\S+) captures a run of non-whitespace characters, (\d+) a run of digits
r = re.compile(r"(\S+) - (\d+) errors, (\d+) warnings")
m = r.match("/usr/sbin/sendmail - 0 errors, 4 warnings")
m
# The pattern covers the whole line, so search() and fullmatch() succeed here as well
r.search("/usr/sbin/sendmail - 0 errors, 4 warnings")
r.fullmatch("/usr/sbin/sendmail - 0 errors, 4 warnings")







In [35]:
# search() vs. match() vs. fullmatch()
re.match("c", "abcdef")    # No match: match() checks for a match only at the beginning of the string
re.search("c", "abcdef")    # Match: search() scans through the whole string
re.fullmatch("c", "abcdef")    # No match: fullmatch() requires the entire string to match
re.fullmatch("p.*n", "python")    # Match

<re.Match object; span=(2, 3), match='c'>

<re.Match object; span=(0, 6), match='python'>