### 你需要将一个字符串分割为多个字段，但是分隔符(还有周围的空格)并不是固定的

In [4]:
import re

# Sample record whose fields are separated by a mix of ';', ',' and
# whitespace, each optionally followed by more whitespace.
line = 'asdf fjdk; afed, fjek,asdf, foo'

# Character-class form: the delimiters are consumed, only fields remain.
l = re.compile(r'[;,\s]\s*').split(line)
print(l)

# With a capturing group the matched delimiter is kept in the result,
# interleaved with the fields: field, delimiter, field, delimiter, ...
fields = re.compile(r'(;|,|\s)\s*').split(line)
print(fields)

# Even positions hold the field values.
values = [token for position, token in enumerate(fields) if position % 2 == 0]
print(values)

# Odd positions hold the delimiters; pad with '' so that both lists
# end up with the same length.
delimiters = [token for position, token in enumerate(fields) if position % 2 == 1]
delimiters.append('')
print(delimiters)

# A non-capturing group (?:...) allows grouping without keeping the
# delimiters in the output.
t = re.compile(r'(?:,|;|\s)\s*').split(line)
print(t)

['asdf', 'fjdk', 'afed', 'fjek', 'asdf', 'foo']
['asdf', ' ', 'fjdk', ';', 'afed', ',', 'fjek', ',', 'asdf', ',', 'foo']
['asdf', 'fjdk', 'afed', 'fjek', 'asdf', 'foo']
[' ', ';', ',', ',', ',', '']
['asdf', 'fjdk', 'afed', 'fjek', 'asdf', 'foo']


### 你需要通过指定的文本模式去检查字符串的开头或者结尾，比如文件名后缀，URL Scheme等等

In [7]:
import os

# Prefix / suffix tests on a file name: one result per output line.
filename = 'spam.txt'
print(filename.endswith('.txt'), filename.startswith('file:'), sep='\n')

# The same check works for a URL scheme.
url = 'http://www.python.org'
print(url.startswith('http:'))

# endswith() accepts a tuple of alternatives, so several suffixes can be
# tested in a single call while filtering the current directory listing.
filenames = os.listdir('.')
ll = list(filter(lambda entry: entry.endswith(('.ipynb', '.h')), filenames))
print(ll)

# Is there at least one Python source file in the directory?
print(any(map(lambda entry: entry.endswith('.py'), filenames)))


True
False
True
['audiEx1.ipynb', 'CategoryEncode.ipynb', 'checkDevice.ipynb', 'crm.ipynb', 'CRM2.ipynb', 'CRM5.ipynb', 'FuzzyMatch.ipynb', 'geopy.ipynb', 'JsonEx.ipynb', 'oracle-example - Copy.ipynb', 'oracle-example.ipynb', 'PythonCrashNotes-1.ipynb', 'PythonCrashNotes-2.ipynb', 'PythonCrashNotes-3.ipynb', 'PythonCrashNotes-4.ipynb', 'PythonCrashNotes-Date.ipynb', 'PythonCrashNotes-split-text1.ipynb', 'PythonCrashNotes5.ipynb', 'SCM-Prediction1.ipynb', 'scm-rec.ipynb', 'scm1.ipynb', 'scm2.ipynb', 'SplitByRowCol.ipynb', 'Untitled.ipynb', 'Untitled1.ipynb']
True


### 你想使用 Unix Shell 中常用的通配符(比如 *.py , Dat[0-9]*.csv 等)去匹配文本字符串
fnmatch() 函数的匹配能力介于简单的字符串方法和强大的正则表达式之间；下面的示例使用 fnmatchcase()，它始终区分大小写进行匹配

In [8]:
from fnmatch import fnmatchcase

# Street addresses used as plain text to match shell-style wildcards against.
addresses = [
    '5412 N CLARK ST',
    '1060 W ADDISON ST',
    '1039 W GRANVILLE AVE',
    '2122 N CLARK ST',
    '4802 N BROADWAY',
]

# Every address that ends in " ST".
st = list(filter(lambda street: fnmatchcase(street, '* ST'), addresses))
print(st)

# Addresses numbered 5400-5499 that mention CLARK.
myloc = list(
    filter(lambda street: fnmatchcase(street, '54[0-9][0-9] *CLARK*'), addresses)
)
print(myloc)

['5412 N CLARK ST', '1060 W ADDISON ST', '2122 N CLARK ST']
['5412 N CLARK ST']


### 你想匹配或者搜索特定模式的文本
使用re模块进行匹配和搜索文本的最基本方法。 核心步骤就是先使用 re.compile() 编译正则表达式字符串， 然后使用 match() , findall() 或者 finditer() 等方法。如果你打算做大量的匹配和搜索操作的话，最好先编译正则表达式，然后再重复使用它。 模块级别的函数会将最近编译过的模式缓存起来，因此并不会消耗太多的性能， 但是如果使用预编译模式的话，你将会减少查找和一些额外的处理损耗。

In [9]:
import re

text = 'yeah, but no, but yeah, but no, but yeah'
# Literal-text checks need nothing more than plain string methods:
# prefix test, suffix test, and index of the first occurrence.
print(text.startswith('yeah'), text.endswith('no'), text.find('no'), sep='\n')

text1 = '11/27/2012'
text2 = 'Nov 27, 2012'

# Simple matching: \d+ means match one or more digits.
# re.match() only succeeds when the pattern matches at the start of the string.
print('yes' if re.match(r'\d+/\d+/\d+', text1) else 'no')
print('yes' if re.match(r'\d+/\d+/\d+', text2) else 'no')
        
        

True
False
10
yes
no


In [12]:
# Compile the date pattern once; the three groups capture the three
# slash-separated numbers.
datepat = re.compile(r'(\d+)/(\d+)/(\d+)')

text = 'Today is 11/27/2012. PyCon starts 3/13/2013.'
# Collect the captured groups of every date found anywhere in the text.
print([found.groups() for found in datepat.finditer(text)])

[('11', '27', '2012'), ('3', '13', '2013')]


### 你想在字符串中搜索并替换指定的文本模式

In [14]:
text = 'yeah, but no, but yeah, but no, but yeah'
# Literal substitution: cut the text at every 'yeah' and glue the pieces
# back together with 'yep' in between.
text1 = 'yep'.join(text.split('yeah'))
print(text1)

yep, but no, but yep, but no, but yep


In [15]:
import re

text = 'Today is 11/27/2012. PyCon starts 3/13/2013.'
# Reorder each date: the backreferences \3, \1 and \2 in the replacement
# refer to the third, first and second captured group of the match.
text2 = re.compile(r'(\d+)/(\d+)/(\d+)').sub(r'\3-\1-\2', text)
print(text2)

Today is 2012-11-27. PyCon starts 2013-3-13.


In [16]:
# If the same pattern is going to be used for many substitutions,
# consider compiling it first for better performance.
datepat = re.compile(r'(\d+)/(\d+)/(\d+)')
text3 = datepat.sub(repl=r'\3-\1-\2', string=text)
print(text3)

Today is 2012-11-27. PyCon starts 2013-3-13.


In [17]:
# For more complex substitutions, pass a replacement callback function instead
from calendar import month_abbr

def change_date(m):
    """Reformat one ``M/D/YYYY`` match as ``D Mon YYYY``.

    Intended as the replacement callback for ``re.sub`` / ``Pattern.sub``.

    Args:
        m: Match object whose groups 1, 2 and 3 hold the month number,
            the day and the year as digit strings.

    Returns:
        The reformatted date, or the matched text unchanged when the
        month number is outside 1..12.
    """
    month = int(m.group(1))
    # month_abbr[0] is an empty placeholder and any index above 12 raises
    # IndexError, so a match with an out-of-range month is left as it was
    # instead of aborting the whole substitution or losing the month.
    if not 1 <= month <= 12:
        return m.group(0)
    return '{} {} {}'.format(m.group(2), month_abbr[month], m.group(3))

# The callback is invoked once per match and its return value becomes
# the replacement text for that match.
text4 = datepat.sub(change_date, text)
print(text4)

Today is 27 Nov 2012. PyCon starts 13 Mar 2013.
