# Strings and Text
 - Strings are immutable sequences of Unicode code points.
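 - Immutable means a string cannot be modified in place: every operation that appears to change one (for example `str.replace` or `re.sub`) returns a new string instead.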

### Splitting Text

In [7]:
import re

line = 'asdf fjdk; afed, fjek,asdf, foo'

# Delimiter: one of ';', ',' or a whitespace character, followed by any
# number of extra whitespace characters.
delimiter_class = r'[;,\s]\s*'
list1 = re.split(delimiter_class, line)
print(list1)
print()

# Putting the delimiter inside a capture group (parentheses) makes re.split
# keep the text matched by the group as separate items of the result.
print("Using capture groups")
delimiter_captured = r'(;|,|\s)\s*'
list2 = re.split(delimiter_captured, line)
print(list2)
print()

# When parentheses are needed for grouping only, a non-capturing group (?:...)
# keeps the delimiters out of the result.
print("Using non-capturing groups")
delimiter_grouped = r'(?:;|,|\s)\s*'
list3 = re.split(delimiter_grouped, line)
print(list3)


['asdf', 'fjdk', 'afed', 'fjek', 'asdf', 'foo']

Using capture groups
['asdf', ' ', 'fjdk', ';', 'afed', ',', 'fjek', ',', 'asdf', ',', 'foo']

Using non-capturing groups
['asdf', 'fjdk', 'afed', 'fjek', 'asdf', 'foo']


### Matching Text

In [21]:
# Simple prefix / suffix / substring checks need only str methods, no regex.
line = 'http://www.python.org'
print(line.startswith('https'))
print(line.endswith('.org'))
print(line.find('python'))  # index of the first occurrence
print()

# True as soon as at least one file name carries the wanted extension.
filenames = [ 'Makefile', 'foo.c', 'bar.py', 'spam.c', 'spam.h' ]
has_python_file = any(fname.endswith('.py') for fname in filenames)
print(has_python_file)
print()


print("Find all dates in the text")
text = 'Today is 11/27/2012. PyCon starts 3/13/2013.'

# This pattern contains no groups at all, so findall returns every full match
# as a plain string.
print("Using findall, non capture group, as a list of matched strings")
date_pattern = re.compile(r'\d+/\d+/\d+')
print(date_pattern.findall(text))
print()

# With capture groups, findall returns one tuple of group texts per match.
print("Using findall, capture group, as list of tuple of matched groups")
date_pattern = re.compile(r'(\d+)/(\d+)/(\d+)')
date_parts = date_pattern.findall(text)
print(date_parts)
for month, day, year in date_parts:
    print('{}-{}-{}'.format(year, month, day))
print()

# The three-group date_pattern compiled above is reused for match(): it only
# anchors at the start of the string, so trailing text after the date is fine.
print("Using match, capture group")
print("'match' only search at the beggining of string. If you want exact match, use '^' and '$' at the beginning and end of the pattern")
m = date_pattern.match('11/27/2012 is the dat')
print(m)
print(m.groups())
# group(0) is the whole match; groups 1-3 are month, day, year.
print(*m.group(0, 1, 2, 3))
print()


# finditer yields one match object per occurrence, scanning the whole text.
print("If you want a findall functionallity with match, use finditer")
for m in date_pattern.finditer(text):
    print(m.groups())

print()



False
True
11

True

Find all dates in the text
Using findall, non capture group, as a list of matched strings
['11/27/2012', '3/13/2013']

Using findall, capture group, as list of tuple of matched groups
[('11', '27', '2012'), ('3', '13', '2013')]
2012-11-27
2013-3-13

Using match, capture group
'match' only search at the beggining of string
<re.Match object; span=(0, 10), match='11/27/2012'>
('11', '27', '2012')
11/27/2012 11 27 2012

If you want a findall functionallity with match, use finditer
('11', '27', '2012')
('3', '13', '2013')



### Search and Replace


In [26]:
# The same sample sentence and date pattern serve all three demonstrations.
text = 'Today is 11/27/2012. PyCon starts 3/13/2013.'
date_pattern = re.compile(r'(\d+)/(\d+)/(\d+)')

# str.replace substitutes a literal substring; it knows nothing about patterns.
print("Use replace function to replace the exact substring, and not pattern")
print(text)
print(text.replace('2012', '2014'))
print()

# sub with a fixed replacement string overwrites every match of the pattern.
print("Use sub function to replace the pattern")
print("Replace all dates with '2014-09-18'")
print(text)
print(date_pattern.sub('2014-09-18', text))
print()

# \1, \2, \3 in the replacement refer to the captured month, day and year,
# so each date is rewritten as year-month-day.
print("Use sub function to rewrite the dates")
print(text)
print(date_pattern.sub(r'\3-\1-\2', text))


Use replace function to replace the exact substring, and not pattern
Today is 11/27/2012. PyCon starts 3/13/2013.
Today is 11/27/2014. PyCon starts 3/13/2013.

Use sub function to replace the pattern
Replace all dates with '2014-09-18'
Today is 11/27/2012. PyCon starts 3/13/2013.
Today is 2014-09-18. PyCon starts 2014-09-18.

Use sub function to rewrite the dates
Today is 11/27/2012. PyCon starts 3/13/2013.
Today is 2012-11-27. PyCon starts 2013-3-13.


In [29]:
text = 'UPPER PYTHON, lower python, Mixed Python'

print("Match and replace string disregarding the case, but retain the case of the matched string")
print(text)
# With a plain replacement string, IGNORECASE finds every casing of 'python'
# but always inserts 'snake' exactly as written, so the original casing is lost.
case_blind_result = re.sub('python', 'snake', text, flags=re.IGNORECASE)
print(case_blind_result)
print()

print("Use a function to replace the matched string")
def match_case(word):
    """Build a replacement callback for re.sub that mimics the casing of each match.

    Args:
        word: The replacement text to insert.

    Returns:
        A function taking a match object and returning `word` converted to
        upper case if the matched text is all upper case, to lower case if it
        is all lower case, capitalized if only its first character is upper
        case, and unchanged otherwise (including for a zero-length match).
    """
    def replace(match):
        text = match.group()
        if text.isupper():
            return word.upper()
        if text.islower():
            return word.lower()
        # Slice instead of indexing: a zero-length match (e.g. from a pattern
        # such as 'x*') would make text[0] raise IndexError.
        if text[:1].isupper():
            return word.capitalize()
        return word
    return replace

# match_case hands back a function; printing it shows the callback object that
# re.sub will invoke once per match.
snake_replacer = match_case("snake")
print(snake_replacer)
print(re.sub('python', snake_replacer, text, flags=re.IGNORECASE))
print()


Match and replace string disregarding the case, but retain the case of the matched string
UPPER PYTHON, lower python, Mixed Python
UPPER snake, lower snake, Mixed snake

Use a function to replace the matched string
<function match_case.<locals>.replace at 0x10e46c8b0>
UPPER SNAKE, lower snake, Mixed Snake



e