In [None]:
%%html
<style>
/* Notebook-wide presentation rules injected by this %%html cell. */

/* Headings: dark blue and bold. */
h1, h2, h3, h4, h5 {
    color: darkblue;
    font-weight: bold !important;
}
/* Section headings get a thick underline ... */
h2 {
    border-bottom: 8px solid darkblue !important;
    padding-bottom: 8px;
}
/* ... and sub-section headings a thin one. */
h3 {
    border-bottom: 2px solid darkblue !important;
    padding-bottom: 6px;
}
/* Shared box geometry for the four message-box classes below. */
.info, .success, .warning, .error {
    border: 1px solid;
    margin: 10px 0px;
    padding:15px 10px;
}
/* Per-class text and background colours. */
.info {
    color: #00529b;
    background-color: #bde5f8;
}
.success {
    color: #4f8a10;
    background-color: #dff2bf;
}
.warning {
    color: #9f6000;
    background-color: #FEEFB3;
}
.error {
    color: #D8000C;
    background-color: #FFBABA;
}
/* NOTE(review): presumably targets rendered bash code blocks - confirm. */
.language-bash {
    font-weight: 900;
}
/* "Example:" labels (used as <span class='ex'> in the markdown cells). */
.ex {
    font-weight: 900;
    color: rgba(27,27,255,0.87) !important;
}
/* Monospace text inside labels (used as <span class='mn'>). */
.mn {
    font-family: Menlo, Consolas, "DejaVu Sans Mono", monospace
}
/* Left-align tables. */
table {
    margin-left: 0 !important;}
</style>

# Day 3: Up and Running with Python

## 3.4 Regular Expression

-   Regular expression is a syntax to search, extract and manipulate string patterns from a large text.


-   Regular expression is used in text validation, natural language processing (NLP) and text mining.


-   Regular expression is implemented by the standard module `re`.


-   There are two parts in using `re`:
    -   Define a regular expression pattern (aka *pattern*) to match other large text
    -   Apply a regular expression method using the pattern on a large text 


-   Online regular expression tester https://regex101.com/

In [None]:
import re
print(dir(re))

In [None]:
import re
re.search??

In [None]:
import re
re.match??

In [None]:
import re
re.findall??

In [None]:
import re
re.finditer??

In [None]:
import re
re.split??

In [None]:
import re
re.sub??

In [None]:
import re
re.subn??

## Basic syntax and quantifiers used in regular expressions

| **Syntax** | **Meaning**                                                                     |
|:-----------|:--------------------------------------------------------------------------------|
| `.`        | Match any character except newline (`\n`)                                       |
| `^`        | Match *start* of string (if `MULTILINE`, also match right after newline (`\n`)) |
| `$`        | Match *end* of string (if `MULTILINE`, also match right before newline (`\n`))  |
| `*`        | Match 0 or more cases of the previous RE; greedy (as many as possible)          |
| `+`        | Match 1 or more cases of the previous RE; greedy (as many as possible)          |
| `?`        | Match 0 or 1 case of the previous RE; greedy (match one if possible)            |
| `*?`       | Non-greedy version of `*` (match as few as possible)                            |
| `+?`       | Non-greedy version of `+` (match as few as possible)                            |
| `??`       | Non-greedy version of `?` (match as few as possible)                            |
| `{m}`      | Match exactly `m` cases of the previous RE (greedy)                             |
| `{m,n}`    | Match between `m` and `n` cases of the previous RE (greedy)                     |
| `{m,n}?`   | Match between `m` and `n` cases of the previous RE (non-greedy)                 |
| `[...]`    | Match any **one** of a set of characters contained within the brackets          |
| `[^...]`   | Match one character *not* contained within the brackets after the caret (`^`)   |
| `\|`       | Match either the preceding RE or the following RE                               |


| **Pattern**    | **Meaning**                                                                  |
|:---------------|:-----------------------------------------------------------------------------|
| `a`            | Match the character `a`                                                      |
| `ab`           | Match the string `ab`                                                        |
| `a\|b`         | Match `a` or `b`                                                             |
| `a*`           | Match 0 or more `a`'s                                                        |
| `a+`           | Match 1 or more `a`'s                                                        |
| `a?`           | Match 0 or 1 `a`                                                             |
| `a{2}`         | Match exactly 2 `a`'s                                                        |
| `a{2,5}`       | Match between 2 and 5 `a`'s                                                  |
| `a{2,}`        | Match 2 or more `a`'s                                                        |
| `a{,5}`        | Match up to 5 `a`'s                                                          |

In [None]:
import re

text = 'aaaa'
quantifier_patterns = ('a', 'a*', 'a+', 'a?')

# Run every pattern through search(), match() and findall() in turn;
# a blank line separates the three groups of results.
for group_index, regex_func in enumerate((re.search, re.match, re.findall)):
    if group_index:
        print()
    for quantifier in quantifier_patterns:
        print(regex_func(quantifier, text))

<span class='ex'>Example: <span class='mn'>re.findall()</span></span>

In [None]:
import re

text ='a bcada'
print(re.findall('.', text))  # Find all occurrences of any character except newline
print(re.findall('a', text))  # Find all occurrences of "a"
print(re.findall('^a', text)) # Find "a" only at the very beginning of the text
print(re.findall('a$', text)) # Find "a" only at the very end of the text

In [None]:
import re

text ='$1.00 $'
print(re.findall(r'\$$', text))   # "\$" is a literal dollar sign; the final "$" anchors it to the end of the string
print(re.findall(r'^\$', text))   # Find a literal "$" at the beginning of the string
print(re.findall(r'\$', text))    # Find every literal "$" in the string

In [None]:
import re

text = 'aabc aaaabb aaabbaaaa'

# Greedy {2,3}: each match takes three a's whenever three are available.
greedy_runs = re.findall(r'a{2,3}', text)
print(greedy_runs)

# Non-greedy {2,3}?: each match stops as soon as it has two a's.
lazy_runs = re.findall(r'a{2,3}?', text)
print(lazy_runs)

<span class='ex'>Example: <span class='mn'>re.search()</span></span>

In [None]:
import re

# Each case is (subject, pattern, show_text). When show_text is True only the
# matched text is printed; otherwise the Match object (or None) is printed.
search_cases = [
    (r'a',    'aa*', True),   # a* is greedy
    (r'aa',   'aa*', True),   # a* is greedy
    (r'aaaa', 'aa*', True),   # a* is greedy
    (r'a',    'aa+', False),
    (r'a',    'a?',  False),
    (r'a',    'a',   False),
]
for text, regex, show_text in search_cases:
    found = re.search(regex, text)
    print(found[0] if show_text else found)

## Regular Expression Examples on Basic and Quantifiers

<span class='ex'>Example: Quantifiers</span>

In [None]:
import re
m = re.search(r'aa', 'aaaa')  # Find the first match only
print(m)     # The Match object
print(m[0])  # m[0] is the whole matched text

In [None]:
import re

text = 'aaaa'

def print_search(pattern):
    """Print the first match of `pattern` in the module-level `text`.

    The pattern is shown right-aligned in 8 columns, followed by the matched
    text, or by ``No match`` when ``re.search`` finds nothing.
    """
    found = re.search(pattern, text)
    if found is not None:
        print(f'{pattern:>8} -> {found[0]}')
        return
    print(f'{pattern:>8} -> No match')

print_search('a')      # Find exactly 'a'
print_search('b')      # Find exactly 'b' (not in text -> No match)
print_search('a?')     # Find 0 or 1 'a'    (Greedy)
print_search('a+')     # Find 1 or more 'a' (Greedy)
print_search('a*')     # Find 0 or more 'a' (Greedy)
print_search('a+?')    # Find 1 or more 'a' (Non-greedy)
print_search('a*?')    # Find 0 or more 'a' (Non-greedy -> matches the empty string)
print_search('a|b')    # Find 'a' or 'b'
print_search('a{2}')   # Find 'aa'
print_search('a{2,5}') # Find 2 to 5 'a'
print_search('a{,4}')  # Find up to 4 'a'
print_search('a{2,}')  # Find at least 2 'a'
print_search('a{5}')   # Find exactly 5 'a' (text has only 4 -> No match)

## `Match` vs `Search`

-   With `re.match`, the pattern `r'box'` matches strings such as `'box'` and `'boxes'` but not `'inbox'`.  
    In other words, `re.match` is implicitly anchored at the start of the target string, as if the pattern started with `\A`. `re.search`, in contrast, looks for the pattern anywhere in the string.

<span class='ex'>Example: <span class='mn'>re.match()</span> vs <span class='mn'>re.search()</span></span>

In [None]:
import re
# Each entry is (regex function, pattern, subject); results are printed in order.
box_checks = (
    (re.match,  r'box',  'boxes'),
    (re.match,  r'box',  'inbox'),
    (re.search, r'box',  'boxes'),
    (re.search, r'box',  'inbox'),
    (re.search, r'^box', 'inbox'),
)
for regex_func, regex, subject in box_checks:
    print(regex_func(regex, subject))

In [None]:
import re

print('match  "box"  in "inbox" -> ', end='')
if re.match(r'box', 'inbox'): # match() only tries at position 0, where 'inbox' starts with 'i'
    print('succeeds')
else:
    print('fails') # prints: fails

print('search "box"  in "inbox" -> ', end='')
if re.search(r'box', 'inbox'): # search() scans the whole string for 'box'
    print('succeeds') # prints: succeeds
else:
    print('fails')

print('search "^box" in "inbox" -> ', end='')
if re.search(r'^box', 'inbox'): # '^' anchors the search to the start, so this behaves like match()
    print('succeeds')
else:
    print('fails') # prints: fails

## Cheatsheet - Character Classes

| **Syntax** | **Meaning**                                                      |
|:-----------|:-----------------------------------------------------------------|
| `[a-e]`    | Match one character of a, b, c, d and e                          |
| `[^a-e]`   | Match one character except a, b, c, d and e                      |
| `[\b]`     | Match backspace character (ASCII 8)                              |
| `[\t]`     | Match tab character (ASCII 9)                                    |
| `[\n]`     | Match newline character (ASCII 10)                               |
| `[\r]`     | Match carriage return character (ASCII 13)                       |
| `\A`       | Match an empty string, but only at the start of the whole string |
| `\b`       | Match an empty string, but only at the start or end of a word    |
| `\B`       | Match an empty string, but not at the start or end of a word     |
| `\d`       | Match one digit (equal to [0-9])                                 |
| `\D`       | Match one non-digit (equal to [^0-9])                            |
| `\s`       | Match one whitespace (equal to [\r\n\t\f\v ])                    |
| `\S`       | Match one non-whitespace (equal to [^\r\n\t\f\v ])               |
| `\w`       | Match one word character (equal to [a-zA-Z0-9_])                 |
| `\W`       | Match one non-word character (equal to [^a-zA-Z0-9_])            |
| `\Z`       | Match an empty string, but only at the end of the whole string   |
| `\\`       | Match one backslash character                                     |

<span class='ex'>Example: Character Classes</span>

In [None]:
import re

def print_search(pattern):
    """Print every non-overlapping match of `pattern` in the module-level `text`.

    Args:
        pattern: Regular expression handed to ``re.findall``.

    The pattern is printed right-aligned in 8 columns, followed by the list
    returned by ``re.findall``, or by ``No match`` when that list is empty.
    """
    m = re.findall(pattern, text)
    if not m:
        print('{:>8} -> No match'.format(pattern))
    else:
        print('{:>8} -> {}'.format(pattern, m))

text = 'Abc abc   1234 To-be Hello01 __doc__'

# Patterns are raw strings so that backslash classes such as \s and \d reach
# the regex engine untouched; in a normal string literal they are invalid
# escape sequences and trigger a warning on recent Python versions.
print_search(r'[abc]+')
print_search(r'[1]+')
print_search(r'\s+')
print_search(r'\S+')
print_search(r'\w+')
print_search(r'\W+')
print_search(r'\d+')
print_search(r'\d{2}')
print_search(r'0\d{1}')
print_search(r'\D+')

## Exercises

Retrieve valid mobile phone numbers from a text. A valid phone number consists of 8 digits, optionally separated in the middle by a hyphen (-), and starts with '9' or '8'.

text = "9812-1234 88882323 8800=2323 00001234 12345678"

In [None]:
import re

# [89]      first digit must be 8 or 9
# [0-9]{3}  three more digits
# -?        optional hyphen in the middle
# [0-9]{4}  last four digits
# The \b anchors on both sides stop the pattern from matching 8 digits that
# are merely part of a longer run of digits (e.g. '198121234').
pat = r'\b[89][0-9]{3}-?[0-9]{4}\b'

text = "9812-1234 88882323 8800=2323 00001234 12345678"

mm = re.findall(pat, text)
for m in mm:
    print(m)

## Exercises

Retrieve valid IPv4 addresses from a text.

text = "255.255.255.0 300.129.182.0 0.0.0.0 192.168.9.0 -192.11.341.2"

In [None]:
import re

text = "255.255.255.0 300.129.182.0 0.0.0.0 192.168.9.0 -192.11.341.2"

# Shape check only: exactly four dot-separated groups of 1-3 ASCII digits.
# The numeric range of each group is verified separately below.
ipv4_shape = re.compile(r'[0-9]{1,3}(?:\.[0-9]{1,3}){3}')


def is_valid_ipv4(candidate):
    """Return True if `candidate` is a dotted-quad IPv4 address.

    Args:
        candidate: The string to test.

    Returns:
        True when the whole string consists of exactly four groups of 1-3
        digits separated by dots and every group is in the range 0-255;
        False otherwise (signs, letters, and too few or too many groups are
        all rejected rather than raising).
    """
    if not ipv4_shape.fullmatch(candidate):
        return False
    return all(0 <= int(octet) <= 255 for octet in candidate.split('.'))


ips = text.split()
valid_ips = [ip for ip in ips if is_valid_ipv4(ip)]
for ip in valid_ips:
    print(ip, end=' - ')

## Cheatsheet - Capturing Groups

| **Syntax**   | **Meaning**                                                        |
|:-------------|:-------------------------------------------------------------------|
| `(...)`      | Matches the RE within the parentheses and indicates a group        |
| `(?iLmsux)`  | Alternate way to set optional flags; no effect on match<sup>1</sup>|
| `(?:...)`    | Like `(...)` but does not indicate a group                         |
| `(?P<id>...)`| Like `(...)` but the group also gets the name `id`                 |
| `(?P=id)`    | Matches whatever was previously matched by group named `id`        |
| `(?#...)`    | Content of parentheses is just a comment; no effect on match       |
| `(?=...)`    | Positive lookahead assertion: matches if `...` matches what comes next, but does not consume any part of the string|
| `(?!...)`    | Negative lookahead assertion: matches if `...` does not match what comes next, and does not consume any part of the string |
| `(?<=...)`   | Positive lookbehind assertion: matches if there is a match ending at the current position for `...` (`...` must match a fixed length) |
| `(?<!...)`   | Negative lookbehind assertion: matches if there is no match ending at the current position for `...` (`...` must match a fixed length) |

<sup>1</sup>:  Always place the `(?...)` construct, if any, at the start of the pattern, for readability; since 3.6, placing it elsewhere causes a `DeprecationWarning`.
-  **i**:  Ignore case
-  **L**:  Cause `\w`, `\W`, `\b`, `\B` matches to depend on what the current locale deems alphanumeric; in Python 3 it can be used only with bytes patterns and its use is discouraged.
-  **m**:  Make the special characters `^` and `$` match at the start and end of each line (i.e., right after/before a newline), as well as at the start and end of the whole string (`\A` and `\Z` always match only the start and end of the whole string)
-  **s**:  Cause the special character `.` to match any character, including a newline
-  **u**:  Make `\w`, `\W`, `\b` and `\B` matches depend on what Unicode deems alphanumeric; redundant in Python 3, where this is already the default for `str` patterns.
-  **x**:  Cause whitespace in the pattern to be ignored, except when escaped or in a character set, and make a `#` character in the pattern begin a comment that lasts until the end of the line

<span class='ex'>Example: Retrieve "destination network" and "gateway" from "route add" commands</span>

In [None]:
import re

texts = [
    "route add 192.168.99.0 mask 255.255.255.0 192.168.10.11",  # This is valid
    "route add 0.0.0.0      mask 0.0.0.0",                      # This is invalid
    "route add 192.168.10.0 mask",                              # This is invalid
    "          192.168.90.0      255.255.255.0 192.168.10.11",  # This is invalid
    ]

# Dotted-quad fragment reused for the destination, the netmask and the gateway.
dotted_quad = r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}'

# Group 1 captures the destination network, group 2 the gateway;
# the netmask is matched but not captured.
pattern = ''.join([
    r'route\s+add\s+(', dotted_quad, r')\s+',
    r'mask\s+', dotted_quad, r'\s+',
    r'(', dotted_quad, r')',
])

r1 = re.compile(pattern)
for t in texts:
    m = r1.findall(t)
    if not m:
        continue
    print(m)
    destination, gateway = m[0]
    print(f'Destination={destination}, Gateway={gateway}')

<span class='ex'>Example: Search based on lookahead condition</span>

In [None]:
import re

texts = '\n'.join([
    'Isaac Isaac Isaac',
    'Isaac Newton',
    'Isaac Asimov',
])

def print_search(pattern):
    """Apply ``re.findall(pattern, line)`` to each line of `texts`.

    One row is printed per line showing the line, the pattern and either the
    list of matches or ``No match``; a 70-character dashed rule closes the
    table.
    """
    for line in texts.split('\n'):
        hits = re.findall(pattern, line)
        if hits:
            print('"{:<20}" - {:<20} -> {}'.format(line, pattern, hits))
        else:
            print('"{:<20}" - {:<20} -> No match'.format(line, pattern))
    print('-'*70)

for lookahead_pattern in (
    '(Isaac)',            # capturing group
    '(?:Isaac)',          # non-capturing group
    'Isaac (?=Asimov)',   # positive lookahead
    'Isaac (?!Asimov)',   # negative lookahead
):
    print_search(lookahead_pattern)

<span class='ex'>Example: Search based on lookbehind condition</span>

In [None]:
import re

texts = '''\
12   Isaac
222  Newton
2122 Asimov\
'''

def print_search(pattern):
    """Apply ``re.findall(pattern, line)`` to each line of `texts`.

    Args:
        pattern: Regular expression handed to ``re.findall``.

    One row is printed per line showing the line, the pattern and either the
    list of matches or ``No match``; a 70-character dashed rule closes the
    table.
    """
    for t in texts.split('\n'):
        m = re.findall(pattern, t)
        if not m:
            print('"{:<15}" - {:<20} -> No match'.format(t, pattern))
        else:
            print('"{:<15}" - {:<20} -> {}'.format(t, pattern, m))
    print('-'*70)

# Raw string: \s and \w are regex classes, not Python string escapes.
# The lookbehind requires the three characters before the whitespace to be '222'.
print_search(r'(?<=[2]{3})\s+(\w+)')

## Exercise

Retrieve DIDs from a text with the central number **`6460`**. For example, retrieve `6717` from 64606717. All phone numbers have 8 digits and are optionally separated by a hyphen in the middle:

text = "6460-6717 64603232 66667777 4312-0000"

In [None]:
import re

text = "6460-6717 64603232 66667777 4312-0000"

# 6460        the central number
# -?          optional hyphen in the middle
# ([0-9]{4})  the DID, captured as group 1
# The \b anchors stop the pattern from matching inside a longer run of digits
# (e.g. '164606717' or '6460-67170').
patt = r'\b6460-?([0-9]{4})\b'

r1 = re.compile(patt)

mm = r1.findall(text)
for m in mm:
    print(m)

<span class='ex'>Example: Netmiko with regular expression</span>

In [None]:
%%file ./cisco_devices.txt
192.168.99.3
192.168.99.4

In [None]:
from netmiko import Netmiko
import time
import re

ipfile = './cisco_devices.txt'

pattern = re.compile(r'hostname\s+(\w+)')

my_device = {
    "ip": "192.168.99.2",
    "username": "admin",
    "password": "class",
    "device_type": "cisco_ios",
    "secret": "class",
    "global_delay_factor": 2    
}

####################################################
start_time = time.time()
####################################################
with open(ipfile) as f:
    # One IP per line. Blank lines are dropped so that no connection is
    # attempted with an empty host.
    ips = [line.strip() for line in f if line.strip()]

for ip in ips:
    # Reuse the shared connection settings, changing only the target address.
    my_device['ip'] = ip
    device = Netmiko(**my_device)
    try:
        device.enable()
        output = device.send_command("show run")
    finally:
        # Close the connection even if enable() or send_command() raises.
        device.disconnect()
    # Group 1 of `pattern` is the word following "hostname".
    m = pattern.search(output)
    if m:
        print(f'{ip} - {m[1]}')
####################################################
end_time = time.time()
####################################################
print(f'Time taken: {end_time - start_time:.2f} seconds')

<span class='ex'>Example: Find out network device ports with errors</span>

In [None]:
from netmiko import Netmiko
import time
import re

ipfile = './cisco_devices.txt'

pattern = re.compile(r'error?\s+(\w+)')

my_device = {
    "ip": "192.168.99.2",
    "username": "admin",
    "password": "class",
    "device_type": "cisco_ios",
    "secret": "class"
}

####################################################
start_time = time.time()
####################################################
with open(ipfile) as f:
    # One IP per line. Blank lines are dropped so that no connection is
    # attempted with an empty host.
    ips = [line.strip() for line in f if line.strip()]

# NOTE(review): `tofile` accumulates every device's output but is never
# written anywhere in this cell - confirm whether it should be saved.
tofile = ''
for ip in ips:
    # Reuse the shared connection settings, changing only the target address.
    my_device['ip'] = ip
    device = Netmiko(**my_device)
    try:
        device.enable()
        output = device.send_command("show int")
    finally:
        # Close the connection even if enable() or send_command() raises.
        device.disconnect()
    tofile += output
    # NOTE(review): in `pattern`, 'error?' makes only the final 'r' optional -
    # confirm whether 'errors?' was intended.
    m = pattern.search(output)
    if m:
        print(f'{ip} - {m[1]}')
####################################################
end_time = time.time()
####################################################
print(f'Time taken: {end_time - start_time:.2f} seconds')

<span class='ex'>Example: Using Multithreading to find out network device ports with errors</span>

In [None]:
from netmiko import Netmiko
import threading
import time
import re

ipfile = './cisco_devices.txt'

pattern = re.compile(r'error?\s+(\w+)')

my_device = {
    "ip": "192.168.99.2",
    "username": "admin",
    "password": "class",
    "device_type": "cisco_ios",
    "secret": "class"
}

def gethostname(devdict):
    """Fetch "show int" from one device, save it, and record the first pattern match.

    Args:
        devdict: Netmiko connection parameters for a single device; must
            contain an ``"ip"`` key. Each thread must be given its own dict.

    Side effects:
        Writes the command output to ``<ip>.txt`` in the current directory
        and, when the module-level `pattern` matches, appends
        ``[ip, m[1]]`` to the module-level `data` list.
    """
    # Take the address from this thread's own parameters; the global loop
    # variable may already point at another device by the time this runs.
    ip = devdict['ip']
    device = Netmiko(**devdict)
    try:
        device.enable()
        output = device.send_command("show int")
    finally:
        # Close the connection even if enable() or send_command() raises.
        device.disconnect()
    with open(ip + '.txt', 'w') as f:
        f.write(output)
    m = pattern.search(output)
    if m:
        data.append([ip, m[1]])

####################################################
start_time = time.time()
####################################################
with open(ipfile) as f:
    # One IP per line; blank lines are skipped.
    ips = [line.strip() for line in f if line.strip()]

threads = []
data = []
for ip in ips:
    # Each thread gets a private copy of the settings with its own IP, so the
    # threads never observe one another's target address.
    t = threading.Thread(target=gethostname, args=({**my_device, 'ip': ip},))
    threads.append(t)
    t.start()

for t in threads:
    t.join()

for d in data:
    print(d)

####################################################
end_time = time.time()
####################################################
print(f'Time taken: {end_time - start_time:.2f} seconds')