In [1]:
# 字符串操作
import numpy as np
import pandas as pd
from pandas import DataFrame
from pandas import Series
import re

In [2]:
# 字符串对象方法

In [3]:
# Sample comma-separated string; note the extra spaces in front of 'guido'.
val = 'a,b,  guido'
# split(',') cuts on every comma and keeps surrounding whitespace in the pieces.
val.split(',')

['a', 'b', '  guido']

In [4]:
# strip() removes the leading and trailing whitespace from each piece.
pieces = [x.strip() for x in val.split(',')]
pieces

['a', 'b', 'guido']

In [5]:
# Unpack the three pieces, then glue them together with '::' by plain concatenation.
first, second, third = pieces
first + '::' + second + '::' + third

'a::b::guido'

In [6]:
# join() is called on the separator and builds the delimited string in one call.
'::'.join(pieces)

'a::b::guido'

In [7]:
# `in` tests whether the substring occurs anywhere in the string.
'guido' in val

True

In [8]:
# index() returns the position of the first occurrence; unlike find(), it
# raises an exception when the substring is not present.
val.index(',')

1

In [9]:
# find() returns -1 when the substring is not found.
val.find(':')

-1

In [10]:
 val.count(',') # 计算某个字符出现次数

2

In [11]:
# replace() substitutes every occurrence of the first argument with the second;
# an empty replacement string deletes the occurrences.
print(val.replace(',', '::'))
print(val.replace(',', ''))

a::b::  guido
ab  guido


In [12]:
# Built-in Python string methods (names only; see the Python documentation for details)
# count
# startswith/endswith
# join
# index
# find
# rfind
# replace
# strip/lstrip/rstrip
# split
# lower/upper
# ljust/rjust

In [13]:
# 正则表达式

In [14]:
# Sample text whose words are separated by mixed runs of spaces and tabs.
text = "foo   bar\t baz  \tqux"
# Raw string so the backslash in \s reaches the regex engine untouched; in a
# plain literal '\s' is an invalid string escape that Python warns about.
re.split(r'\s+', text)

['foo', 'bar', 'baz', 'qux']

In [15]:
# Compile the whitespace pattern once so the pattern object can be reused.
# Raw string so the backslash in \s reaches the regex engine untouched; in a
# plain literal '\s' is an invalid string escape that Python warns about.
regex = re.compile(r'\s+')
regex.split(text)

['foo', 'bar', 'baz', 'qux']

In [16]:
# findall() returns every substring that matches the pattern (here '\s+').
regex.findall(text)

['   ', '\t ', '  \t']

In [17]:
# Sample text: one name followed by an e-mail address per line.
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""
# Address pattern: local part, '@', domain, a literal dot, then a 2-4 letter suffix.
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'
# IGNORECASE makes matching case-insensitive, so the upper-case character
# classes also accept the lower-case addresses in the sample text.
regex = re.compile(pattern, flags=re.IGNORECASE)
# All addresses found in the text.
regex.findall(text)

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

In [18]:
# search() scans the whole text and returns a match object for the first hit.
m = regex.search(text)
print(m)
# span() yields the (start, end) offsets of the match; slicing the text with
# them recovers the matched address.
match_start, match_end = m.span()
text[match_start:match_end]

<_sre.SRE_Match object; span=(5, 20), match='dave@google.com'>


'dave@google.com'

In [19]:
# match() only matches a pattern at the very start of the string, so it
# returns None for this text.
print(regex.match(text))

None


In [20]:
# sub() returns a new string in which every match is replaced by the given text.
print(regex.sub('REDACTED', text))

Dave REDACTED
Steve REDACTED
Rob REDACTED
Ryan REDACTED



In [21]:
# Parentheses split the pattern into three capture groups:
# user name, domain name and domain suffix.
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex = re.compile(pattern, flags=re.IGNORECASE)
m = regex.match('wesm@bright.net')
# groups() returns the captured pieces as a tuple.
m.groups()

('wesm', 'bright', 'net')

In [22]:
# With capture groups in the pattern, findall() returns one tuple of groups per match.
regex.findall(text)

[('dave', 'google', 'com'),
 ('steve', 'gmail', 'com'),
 ('rob', 'gmail', 'com'),
 ('ryan', 'yahoo', 'com')]

In [23]:
# \1, \2 and \3 in the replacement refer to the first, second and third capture groups.
print(regex.sub(r'Username: \1, Domain: \2, Suffix: \3', text))

Dave Username: dave, Domain: google, Suffix: com
Steve Username: steve, Domain: gmail, Suffix: com
Rob Username: rob, Domain: gmail, Suffix: com
Ryan Username: ryan, Domain: yahoo, Suffix: com



In [24]:
# VERBOSE lets the pattern be laid out over several lines (whitespace inside
# it is ignored); (?P<name>...) gives each capture group a name.
regex = re.compile(r"""
(?P<username>[A-Z0-9._%+-]+)
@
(?P<domain>[A-Z0-9.-]+)
\.
(?P<suffix>[A-Z]{2,4})""", flags=re.IGNORECASE|re.VERBOSE)
m = regex.match('wesm@bright.net')
# groupdict() maps each group name to the text it captured.
m.groupdict()

{'domain': 'bright', 'suffix': 'net', 'username': 'wesm'}

In [25]:
# Regular expression methods (not the focus here; see the `re` module documentation)
# findall/finditer
# match
# search
# split
# sub/subn


In [26]:
# pandas中矢量化的字符串函数

In [27]:
# Build the Series directly from the mapping so that `data` only ever names
# the Series; np.nan marks the missing address for 'Wes'.
data = Series({'Dave': 'dave@google.com',
               'Steve': 'steve@gmail.com',
               'Rob': 'rob@gmail.com',
               'Wes': np.nan})
# Vectorized substring test through the .str accessor; the missing entry
# yields NaN instead of raising.
data.str.contains('gmail')

Dave     False
Rob       True
Steve     True
Wes        NaN
dtype: object

In [28]:
# Three-group e-mail pattern written as a raw string, so the literal dot is
# spelled \. and the backslash does not need doubling.
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
# findall() on the .str accessor returns, for each element, the list of group
# tuples found in it; missing values stay NaN.
data.str.findall(pattern, flags=re.IGNORECASE)

Dave     [(dave, google, com)]
Rob        [(rob, gmail, com)]
Steve    [(steve, gmail, com)]
Wes                        NaN
dtype: object

In [29]:
matches = data.str.match(pattern, flags=re.IGNORECASE)
# Differs from the original textbook: here match() reports, for each key,
# whether its value matches the pattern.
matches

Dave     True
Rob      True
Steve    True
Wes       NaN
dtype: object

In [30]:
# Slicing through .str applies the same slice to every string in the Series.
data.str[:5]

Dave     dave@
Rob      rob@g
Steve    steve
Wes        NaN
dtype: object

In [31]:
# 矢量化字符串方法（参考普通字符串...）
# cat
# contains
# count
# endswith/startswith
# findall
# get
# join
# len
# lower/upper
# match
# pad
# center
# repeat
# replace
# slice
# split
# strip/lstrip/rstrip