## 2.5 Search and replace text - replace(), re.sub()

In [3]:
# simple str.replace(a,b)
text = 'yeah, but no, but yeah, but no, but yeah'
text.replace('yeah', 'yep')


'yep, but no, but yep, but no, but yep'

In [4]:
# re
import re
text = 'Today is 11/27/2012. PyCon starts 3/13/2013.'
datepat = re.compile(r'(\d+)/(\d+)/(\d+)')
datepat.sub(r'\3-\1-\2', text)


'Today is 2012-11-27. PyCon starts 2013-3-13.'

In [5]:
# pattern.sub(callback, text): the callback receives each match object and returns its replacement
from calendar import month_abbr
def change_date(m):
    """Format a month/day/year match as 'day Mon year'.

    m is a match object whose groups 1, 2 and 3 hold the month number,
    the day and the year. The month number is used as an index into
    calendar.month_abbr to obtain the abbreviated month name.
    """
    month, day, year = m.group(1, 2, 3)
    return '{} {} {}'.format(day, month_abbr[int(month)], year)
datepat.sub(change_date, text)

'Today is 27 Nov 2012. PyCon starts 13 Mar 2013.'

## 2.6 Search and replace case-insensitive text - flags=re.IGNORECASE

In [6]:
text = 'UPPER PYTHON, lower python, Mixed Python'
import re
re.findall('python', text, flags=re.IGNORECASE)
# ['PYTHON', 'python', 'Python']
re.sub('python', 'snake', text, flags=re.IGNORECASE)
# 'UPPER snake, lower snake, Mixed snake'

'UPPER snake, lower snake, Mixed snake'

In [8]:
# re.sub() with a callback that mirrors the case of each matched word
def matchcase(word):
    """Return a re.sub() callback that inserts word in the case of the match.

    The callback inspects the matched text: all-uppercase matches get
    word.upper(), all-lowercase matches get word.lower(), matches that
    start with an uppercase letter get word.capitalize(), and anything
    else gets word unchanged.
    """
    def replace(m):
        matched = m.group()
        if matched.isupper():
            return word.upper()
        if matched.islower():
            return word.lower()
        # Mixed case: only the first character decides.
        return word.capitalize() if matched[0].isupper() else word
    return replace

re.sub('python', matchcase('snake'), text, flags=re.IGNORECASE)
# 'UPPER SNAKE, lower snake, Mixed Snake'

'UPPER SNAKE, lower snake, Mixed Snake'

## 2.9 Unicode text to a standard form - unicodedata

In [9]:
# fully composed
s1 = 'Spicy Jalape\u00f1o'

# fully decomposed
s2 = 'Spicy Jalapen\u0303o'
s1 == s2	# False

import unicodedata
t1 = unicodedata.normalize('NFC', s1)
t2 = unicodedata.normalize('NFC', s2)
t1 == t2	# True

True

In [14]:
# NFKC NFKD
s = '\ufb01'	# A single character
print(unicodedata.normalize('NFD', s))
print(unicodedata.normalize('NFKD', s))
print(unicodedata.normalize('NFKC', s))

ﬁ
fi
fi


In [15]:
# decompose with NFD, then drop the combining marks (this strips the accents)
t1 = unicodedata.normalize('NFD', s1)
''.join(c for c in t1 if not unicodedata.combining(c))

'Spicy Jalapeno'

In [22]:
t1 = unicodedata.normalize('NFC', s1)
''.join(c for c in t1 if not unicodedata.combining(c))

'Spicy Jalapeño'

## 2.12 Sanitize and clean up text - translate(), normalize() (see 2.9), upper(), lower()

In [24]:
s = 'pýthöñ\fis\tawesome\r\n'
# Translation table keyed by code point: tab and form feed become a space,
# carriage return maps to None, which translate() deletes.
remap = dict.fromkeys(map(ord, '\t\f'), ' ')
remap[ord('\r')] = None
a = s.translate(remap)  # 'pýthöñ is awesome\n'
a

'pýthöñ is awesome\n'

In [25]:
# remove combining char
import unicodedata
import sys
cmb_chrs = dict.fromkeys(c for c in range(sys.maxunicode)
						if unicodedata.combining(chr(c)))
b = unicodedata.normalize('NFD', a)
b.translate(cmb_chrs)	# 'python is awesome\n'

'python is awesome\n'

In [26]:
# map every Unicode decimal digit (category Nd), e.g. Arabic-Indic digits, to its ASCII digit
digitmap = { c: ord('0') + unicodedata.digit(chr(c))
		for c in range(sys.maxunicode)
		if unicodedata.category(chr(c)) == 'Nd' }
x = '\u0661\u0662\u0663'
x.translate(digitmap)

'123'

In [27]:
# simple way
b = unicodedata.normalize('NFD', a)
b.encode('ascii', 'ignore').decode('ascii')

'python is awesome\n'

In [28]:
a = 'Is Chicago'

## 2.14 Combine and concatenate strings - join() + '{} {}'.format(a,b)

In [32]:
def sample():
    """Yield the four example fragments, one at a time."""
    for fragment in ('Is', 'Chicago', 'Not', 'Chicago?'):
        yield fragment
    
text = ' '.join(sample())
text

'Is Chicago Not Chicago?'

In [35]:
def combine(source, maxlen):
    """Join string fragments from source into space-separated chunks.

    Fragments are buffered until their combined length (separators not
    counted) exceeds maxlen; the buffer is then yielded as one string and
    reset. Whatever is still buffered once source is exhausted is yielded
    last, so the final item may be an empty string.

    Args:
        source: iterable of strings.
        maxlen: length threshold that triggers a flush of the buffer.

    Yields:
        str: the buffered fragments joined with a single space.
    """
    parts = []
    size = 0
    for part in source:
        # append(), not ``parts += part``: += would extend the list with the
        # individual characters of the fragment instead of the fragment itself.
        parts.append(part)
        size += len(part)
        if size > maxlen:
            yield ' '.join(parts)
            parts = []
            size = 0
    yield ' '.join(parts)
        

## 2.15 Interpolate variables in strings - str.format(name=value), format_map(vars())

In [60]:
# normal case
s = '{name} has {n} messages.'
s.format(name='Guido', n=37)

'Guido has 37 messages.'

In [61]:
# use format_map(vars())
name = 'Guido'
n = 37
s.format_map(vars())
# 'Guido has 37 messages.'

'Guido has 37 messages.'

In [62]:
# use format_map(vars(class_obj))
class Info:
    """Simple holder exposing the two attributes name and n."""

    def __init__(self, name, n):
        # Assigned in this order so vars(instance) lists name before n.
        self.name, self.n = name, n
a = Info('Guido',37)
s.format_map(vars(a))
# 'Guido has 37 messages.'


'Guido has 37 messages.'

In [63]:
# aware of missing variables
class safesub(dict):
    """dict that returns the '{key}' placeholder text for any missing key."""

    def __missing__(self, key):
        # Re-emit the field as written so format_map() leaves it visible.
        return ''.join(('{', key, '}'))
del n  # remove n so the {n} field has no value; re-running this cell raises NameError because n is already gone
s.format_map(safesub(vars()))

'Guido has {n} messages.'

In [64]:
import sys
def sub(text):
    """Fill the {name} fields of text from the caller's local variables.

    Fields with no matching local are left as written, because the
    locals are wrapped in safesub.
    """
    # Frame 1 is the caller of sub(); this lookup has to stay directly in
    # this function body, otherwise the frame depth would be off by one.
    caller_locals = sys._getframe(1).f_locals
    return text.format_map(safesub(caller_locals))

In [67]:
# frame hack
import sys
def sub(text):
	# Same behavior as the sub() defined in the previous cell; this later
	# definition is the one bound to the name once both cells have run.
	# sys._getframe(1) is the caller's frame, so the {name} fields are
	# filled from the caller's local variables; safesub leaves fields that
	# have no matching local untouched.
	return text.format_map(safesub(sys._getframe(1).f_locals))

## 2.16 Reformat text to a fixed number of columns - textwrap

In [68]:
s = "Look into my eyes, look into my eyes, the eyes, the eyes, \
the eyes, not around the eyes, don't look around the eyes, \
look into my eyes, you're under."

In [71]:
import textwrap
print(textwrap.fill(s,40))


Look into my eyes, look into my eyes,
the eyes, the eyes, the eyes, not around
the eyes, don't look around the eyes,
look into my eyes, you're under.


In [77]:
import os
#os.get_terminal_size().columns

## 2.18 Tokenize text - re, r'(?P<TOKENNAME>match_rule)', scanner()

In [96]:
import re 
NAME = r'(?P<NAME>[a-zA-Z_][a-zA-Z_0-9]*)' 
NUM = r'(?P<NUM>\d+)' 
PLUS = r'(?P<PLUS>\+)' 
TIMES = r'(?P<TIMES>\*)' 
EQ = r'(?P<EQ>=)' 
WS = r'(?P<WS>\s+)'

master_pat = re.compile('|'.join([NAME, NUM, PLUS, TIMES, EQ, WS]))
scanner = master_pat.scanner('foo = 42')
scanner.match() # <_sre.SRE_Match object at 0x100677738>


<_sre.SRE_Match object; span=(0, 3), match='foo'>

In [97]:
# Re-run the match explicitly instead of reading IPython's `_` (the previous
# cell's output), which only holds the match if no other cell ran in between.
first_match = master_pat.match('foo = 42')
first_match.lastgroup, first_match.group()  # ('NAME', 'foo')

('NAME', 'foo')

In [101]:
from collections import namedtuple
Token = namedtuple('Token', ('type', 'value'))

def generate_tokens(pat, text):
    """Yield a Token(type, value) for each consecutive match of pat in text.

    type is the name of the group that matched (match.lastgroup) and
    value is the matched text. Scanning stops at the first position
    where pat does not match.
    """
    scanner = pat.scanner(text)
    match = scanner.match()
    while match is not None:
        yield Token(match.lastgroup, match.group())
        match = scanner.match()

In [102]:
list(generate_tokens(master_pat, 'foo = 42'))

[Token(type='NAME', value='foo'),
 Token(type='WS', value=' '),
 Token(type='EQ', value='='),
 Token(type='WS', value=' '),
 Token(type='NUM', value='42')]

## 2.19 Write a recursive descent parser - one rule one function

In [144]:
import collections
import re
# Token specification: one named group per token type
NUM = r'(?P<NUM>\d+)'
PLUS = r'(?P<PLUS>\+)'
MINUS = r'(?P<MINUS>-)'
TIMES = r'(?P<TIMES>\*)'
DIVIDE = r'(?P<DIVIDE>/)'
LPAREN = r'(?P<LPAREN>\()'
RPAREN = r'(?P<RPAREN>\))'
WS = r'(?P<WS>\s+)'

# Alternation of all token patterns; on a match, lastgroup names the token type.
master_pat = re.compile('|'.join((
    NUM, PLUS, MINUS, TIMES, DIVIDE, LPAREN, RPAREN, WS,
)))

# Tokenizer
Token = collections.namedtuple('Token', ('type', 'value'))


def generate_tokens(text):
    """Yield the non-whitespace tokens of text as Token(type, value) tuples.

    Scanning stops at the first position where master_pat does not match.
    """
    scanner = master_pat.scanner(text)
    # scanner.match() returns None once nothing matches at the current position.
    match = scanner.match()
    while match is not None:
        if match.lastgroup != 'WS':
            yield Token(match.lastgroup, match.group())
        match = scanner.match()



In [149]:
class ExpressionEvaluator:
    """Recursive descent evaluator for integer arithmetic expressions.

    Each grammar rule is implemented by one method:

        expr   ::= term   { ('+' | '-') term }
        term   ::= factor { ('*' | '/') factor }
        factor ::= NUM | '(' expr ')'

    The parser keeps one token of lookahead in self.nexttok; self.tok is
    the token most recently consumed.
    """

    def parse(self, text):
        """Tokenize text with generate_tokens() and return the expression's value.

        Tokens left over after a complete expression are not checked.
        Raises SyntaxError when a rule cannot be matched.
        """
        self.tokens = generate_tokens(text)  # see 2.18
        self.tok = None      # last symbol consumed
        self.nexttok = None  # next symbol tokenized (lookahead)
        self._advance()      # load first lookahead token
        return self.expr()

    def _advance(self):
        'Advance one token ahead'
        # nexttok becomes None once the token stream is exhausted.
        self.tok, self.nexttok = self.nexttok, next(self.tokens, None)

    def _accept(self, toktype):
        'Test and consume the next token if it matches toktype'
        if self.nexttok and self.nexttok.type == toktype:
            self._advance()
            return True
        else:
            return False

    def _expect(self, toktype):
        'Consume the next token if it matches toktype, else raise SyntaxError'
        if not self._accept(toktype):
            raise SyntaxError('Expected '+toktype)

    # Grammar rules
    def expr(self):
        '''expression ::= term {('+'|'-') term}'''
        exprval = self.term()
        while self._accept('PLUS') or self._accept('MINUS'):
            op = self.tok.type
            right = self.term()
            if op == 'PLUS':
                exprval += right
            elif op == 'MINUS':
                exprval -= right
        return exprval

    def term(self):
        '''term ::= factor {('*'|'/') factor}'''

        termval = self.factor()
        while self._accept('TIMES') or self._accept('DIVIDE'):
            op = self.tok.type
            # The right operand is a factor, per the grammar. Recursing into
            # term() here would make * and / right-associative, so that
            # 8/4/2 evaluated as 8/(4/2).
            right = self.factor()
            if op == 'TIMES':
                termval *= right
            elif op == 'DIVIDE':
                termval /= right
        return termval

    def factor(self):
        'factor ::= NUM | (expr)'

        if self._accept('NUM'):
            return int(self.tok.value)
        elif self._accept('LPAREN'):
            exprval = self.expr()
            self._expect('RPAREN')
            return exprval
        else:
            raise SyntaxError('Expected NUMBER or LPAREN')

In [154]:
e = ExpressionEvaluator()
print(e.parse('2*(2+2)'))

8


## 2.20 Operations on byte strings

In [156]:
# match pattern
data = b'Hello World'
data.startswith(b'Hello')
import re
# Raw bytes literal: in a plain b'...' literal, \s is an invalid escape
# sequence that newer Python versions warn about. The pattern bytes are unchanged.
re.split(br'[\s]', data)

[b'Hello', b'World']