数据清洗与准备

In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not just the last one.
# The trait is spelled `ast_node_interactivity`; the misspelled name only created
# an unused attribute, so the setting never took effect.
InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline

import os
import sys
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import matplotlib.pyplot as plt

1. 处理缺失值

————过滤缺失值

————补全缺失值

2. 数据转换

————删除重复值

————使用函数或映射进行数据转换

————替代值

————重命名轴索引

————离散化和分箱

————检测和过滤异常值

————置换和随机抽样

————计算指标/虚拟变量

3. 字符串操作

————字符串对象方法

————pandas向量化字符串函数


1. 处理缺失值

In [4]:
#过滤缺失值（pandas对象所有描述性统计信息默认情况下自动排除缺失值）

string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [5]:
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [6]:
#None在对象数组中也按照NA（not available）处理
string_data[0] = None
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

In [7]:
string_data.copy().dropna()

1    artichoke
3      avocado
dtype: object

In [8]:
string_data.copy().fillna('No data')

0      No data
1    artichoke
2      No data
3      avocado
dtype: object

In [9]:
string_data.notnull()

0    False
1     True
2    False
3     True
dtype: bool

综上，处理缺失值的函数主要为：dropna，fillna，isnull，notnull

In [11]:
#过滤缺失值
from numpy import nan as NA
data = pd.Series([1, NA, 3.5, NA, 7])
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [12]:
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [13]:
data = pd.DataFrame([[1., 6.5, 3.],
                    [1., NA, NA],
                    [NA, NA, NA],
                    [NA, 6.5, 3.]])

In [14]:
cleaned = data.dropna()
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [15]:
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [16]:
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [17]:
data[4] = NA
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [18]:
data.dropna(axis=1, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [19]:
df = pd.DataFrame(np.random.randn(7, 3))
df

Unnamed: 0,0,1,2
0,1.118981,0.339911,0.042321
1,-0.929033,0.113959,1.640502
2,0.669614,1.229927,0.996007
3,1.191934,0.43601,-0.667041
4,-0.94554,2.22922,-0.178714
5,-0.356147,-1.243495,1.051312
6,-0.616833,-0.106167,-0.617333


In [22]:
df.iloc[:4, 1] = NA

In [23]:
df.iloc[:2, 2] = NA

In [24]:
df

Unnamed: 0,0,1,2
0,1.118981,,
1,-0.929033,,
2,0.669614,,0.996007
3,1.191934,,-0.667041
4,-0.94554,2.22922,-0.178714
5,-0.356147,-1.243495,1.051312
6,-0.616833,-0.106167,-0.617333


In [25]:
df.dropna()

Unnamed: 0,0,1,2
4,-0.94554,2.22922,-0.178714
5,-0.356147,-1.243495,1.051312
6,-0.616833,-0.106167,-0.617333


In [26]:
df.dropna(thresh=2)
#thresh参数为保留一行所需的非缺失值的最少个数；非缺失值个数少于该阈值的行会被删除。

Unnamed: 0,0,1,2
2,0.669614,,0.996007
3,1.191934,,-0.667041
4,-0.94554,2.22922,-0.178714
5,-0.356147,-1.243495,1.051312
6,-0.616833,-0.106167,-0.617333


In [28]:
#补全缺失值
df.fillna(0)

Unnamed: 0,0,1,2
0,1.118981,0.0,0.0
1,-0.929033,0.0,0.0
2,0.669614,0.0,0.996007
3,1.191934,0.0,-0.667041
4,-0.94554,2.22922,-0.178714
5,-0.356147,-1.243495,1.051312
6,-0.616833,-0.106167,-0.617333


In [29]:
#可以利用传入的字典映射来对不同的列进行不同的缺省值填充
df.fillna({1: 0.5, 2:0})

Unnamed: 0,0,1,2
0,1.118981,0.5,0.0
1,-0.929033,0.5,0.0
2,0.669614,0.5,0.996007
3,1.191934,0.5,-0.667041
4,-0.94554,2.22922,-0.178714
5,-0.356147,-1.243495,1.051312
6,-0.616833,-0.106167,-0.617333


In [34]:
#inplace就地修改（还记得drop吗？）
_ = df.fillna(0, inplace=True)

In [35]:
df

Unnamed: 0,0,1,2
0,1.118981,0.0,0.0
1,-0.929033,0.0,0.0
2,0.669614,0.0,0.996007
3,1.191934,0.0,-0.667041
4,-0.94554,2.22922,-0.178714
5,-0.356147,-1.243495,1.051312
6,-0.616833,-0.106167,-0.617333


In [36]:
df = pd.DataFrame(np.random.randn(6, 3))
df.iloc[2:, 1] = NA
df.iloc[4:, 2] = NA

In [37]:
df

Unnamed: 0,0,1,2
0,0.23928,-1.194649,-0.867324
1,-1.532862,-0.175263,-0.21854
2,-0.176559,,0.907853
3,-0.481783,,0.238462
4,1.06149,,
5,1.110777,,


In [38]:
# Forward-fill: propagate the last valid observation downwards.
# Equivalent to fillna(method='ffill'), whose `method` keyword is deprecated in current pandas.
df.ffill()

Unnamed: 0,0,1,2
0,0.23928,-1.194649,-0.867324
1,-1.532862,-0.175263,-0.21854
2,-0.176559,-0.175263,0.907853
3,-0.481783,-0.175263,0.238462
4,1.06149,-0.175263,0.238462
5,1.110777,-0.175263,0.238462


In [39]:
# Backward-fill: propagate the next valid observation upwards.
# Trailing NaNs have no later value to copy from, so they stay NaN.
# Equivalent to fillna(method='bfill'), whose `method` keyword is deprecated in current pandas.
df.bfill()

Unnamed: 0,0,1,2
0,0.23928,-1.194649,-0.867324
1,-1.532862,-0.175263,-0.21854
2,-0.176559,,0.907853
3,-0.481783,,0.238462
4,1.06149,,
5,1.110777,,


In [40]:
# Forward-fill, but fill at most 2 consecutive missing values per gap.
# Equivalent to fillna(method='ffill', limit=2), whose `method` keyword is deprecated in current pandas.
df.ffill(limit=2)

Unnamed: 0,0,1,2
0,0.23928,-1.194649,-0.867324
1,-1.532862,-0.175263,-0.21854
2,-0.176559,-0.175263,0.907853
3,-0.481783,-0.175263,0.238462
4,1.06149,,0.238462
5,1.110777,,0.238462


In [41]:
#常见操作，利用均值或中位数来进行缺省值的插值工作

data = pd.Series([1., NA, 3.5, NA, 7])
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [42]:
data.fillna(data.median())

0    1.0
1    3.5
2    3.5
3    3.5
4    7.0
dtype: float64

In [43]:
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

fillna的参数

value用于填充缺失值的标量值或字典型对象

method插值方法

axis需要填充的轴

inplace是否就地修改

limit用于向前或向后填充时的最大填充范围

2. 数据转换

In [44]:
#删除重复值
data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'],
                    'k2': [1, 1, 2, 3, 3, 4, 4]})

In [45]:
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [46]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [47]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [48]:
data['v1'] = range(7)

In [49]:
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [53]:
data.drop_duplicates(['k1'])
#保留了第一个观测值
#其实房价的那个没有聚合版本的数据就可以用这个函数，依照小区名字列进行去重，可以得到每一个小区的第一个交易记录

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [69]:
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon',
                             'Pastrami', 'corned beef', 'Bacon',
                             'pastrami', 'honey ham', 'nova lox'],
                    'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})

In [70]:
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [71]:
meat_to_animal = {
    'bacon':'pig',
    'pulled pork':'pig',
    'corned beef':'cow',
    'honey ham': 'pig',
    'nova lox': 'salmon',
    'pastrami': 'cow'
}

In [72]:
lowercased = data['food'].str.lower()
lowercased

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

In [73]:
#利用map函数进行字典映射
data['animal'] = lowercased.map(meat_to_animal)

In [74]:
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [75]:
data['food'].map(lambda x: meat_to_animal[x.lower()])

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

利用map和applymap是一种可以便捷执行按元素转换及其他清洗相关操作的手段。

In [76]:
#替代值（利用replace更容易实现）
data = pd.Series([1., -999., 2., -999, -1000, 3.])
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [77]:
data.replace(-999, np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [78]:
data.replace([-999, -1000], np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [79]:
#用不同的值替代不同的值
data.replace([-999, -1000], [np.nan, 0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [80]:
data.replace({-999: np.nan, -1000: 0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [81]:
#重命名轴索引
data = pd.DataFrame(np.arange(12).reshape(3, 4),
                   index=['Ohio', 'Colorado', 'New York'],
                   columns=['one', 'two', 'three', 'four'])

In [82]:
transform = lambda x: x[:4].upper()

In [83]:
data.index.map(transform)

array(['OHIO', 'COLO', 'NEW '], dtype=object)

In [84]:
data.index = data.index.map(transform)

In [85]:
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [86]:
data.rename(index=str.title, columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colo,4,5,6,7
New,8,9,10,11


In [88]:
data.rename(index={'OHIO': 'INDIANA'},
                   columns={'three': 'peekaboo'})

Unnamed: 0,one,two,peekaboo,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [91]:
data.rename(index={'OHIO': 'INDIANA'}, inplace=True)
data

Unnamed: 0,one,two,three,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [93]:
#离散化和分箱操作
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]
cats = pd.cut(ages, bins)

In [94]:
cats
#返回的对象是Categorical的特殊对象。

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, object): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [95]:
cats.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [96]:
cats.categories

Index(['(18, 25]', '(25, 35]', '(35, 60]', '(60, 100]'], dtype='object')

In [98]:
# Count how many values fall into each bin, most frequent bin first.
# The top-level pd.value_counts function is deprecated; wrapping the Categorical
# in a Series keeps the same count-sorted result.
pd.Series(cats).value_counts()

(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

In [99]:
pd.cut(ages, [18, 26, 36, 61, 100], right=False)

[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, object): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

In [100]:
#利用labels可选参数，为分箱序列提供箱名
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
pd.cut(ages, bins, labels=group_names)

[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]

In [103]:
#等宽分箱（传入整数时，按数据的最小值和最大值划分为等长的区间）

data = np.random.rand(20)
pd.cut(data, 4, precision=2)

[(0.038, 0.26], (0.26, 0.48], (0.038, 0.26], (0.26, 0.48], (0.038, 0.26], ..., (0.48, 0.71], (0.038, 0.26], (0.48, 0.71], (0.038, 0.26], (0.038, 0.26]]
Length: 20
Categories (4, object): [(0.038, 0.26] < (0.26, 0.48] < (0.48, 0.71] < (0.71, 0.93]]

In [104]:
#分位数分箱

data = np.random.randn(1000)
cats = pd.qcut(data, 4)

In [105]:
cats

[(0.69, 2.94], (0.0169, 0.69], (0.69, 2.94], [-2.813, -0.687], (0.0169, 0.69], ..., (0.0169, 0.69], [-2.813, -0.687], [-2.813, -0.687], [-2.813, -0.687], (0.69, 2.94]]
Length: 1000
Categories (4, object): [[-2.813, -0.687] < (-0.687, 0.0169] < (0.0169, 0.69] < (0.69, 2.94]]

In [106]:
# Count the observations per quantile bin.
# The top-level pd.value_counts function is deprecated; the Series method is the supported form.
pd.Series(cats).value_counts()

(0.69, 2.94]        250
(0.0169, 0.69]      250
(-0.687, 0.0169]    250
[-2.813, -0.687]    250
dtype: int64

In [107]:
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])

[(0.0169, 1.284], (0.0169, 1.284], (0.0169, 1.284], [-2.813, -1.243], (0.0169, 1.284], ..., (0.0169, 1.284], [-2.813, -1.243], (-1.243, 0.0169], [-2.813, -1.243], (0.0169, 1.284]]
Length: 1000
Categories (4, object): [[-2.813, -1.243] < (-1.243, 0.0169] < (0.0169, 1.284] < (1.284, 2.94]]

In [108]:
#在聚合和分组一节中还会用到

In [109]:
#检测和过滤异常值

data = pd.DataFrame(np.random.randn(1000, 4))
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.05175,0.026522,-0.010548,0.024063
std,1.028166,0.99218,0.99043,1.02243
min,-3.107349,-3.236035,-3.467668,-3.013903
25%,-0.726977,-0.642596,-0.701092,-0.647641
50%,-0.024417,-0.006613,-0.005592,0.013307
75%,0.631496,0.655364,0.684449,0.717323
max,3.020092,3.590144,2.719713,3.891951


In [111]:
col = data[2]
col[np.abs(col) > 3]

421   -3.138447
721   -3.467668
954   -3.258485
Name: 2, dtype: float64

In [112]:
# Select every row that has at least one value whose absolute value exceeds 3.
# `axis` is passed by keyword: recent pandas no longer accepts it positionally in any().
data[(np.abs(data) > 3).any(axis=1)]

Unnamed: 0,0,1,2,3
53,1.512596,-3.236035,-0.51617,0.129803
73,-0.719833,3.590144,0.256781,0.403781
183,-3.042407,-0.716864,-1.26238,1.083967
328,0.281265,0.687234,-0.45969,3.891951
421,-0.012748,1.586749,-3.138447,1.842142
468,-0.316503,-1.027646,-0.555831,-3.013903
519,0.980986,3.305336,-0.74795,-0.773725
721,-0.808868,1.88317,-3.467668,-0.631259
764,-3.107349,-0.276815,0.666304,1.672259
830,3.020092,-1.152042,-0.304113,-1.035824


In [121]:
# Number of rows containing at least one value beyond +/-3.
# `axis` is passed by keyword: recent pandas no longer accepts it positionally in any().
len(data[(np.abs(data) > 3).any(axis=1)])

11

In [122]:
data[np.abs(data) > 3] = np.sign(data) * 3

In [123]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.051621,0.025863,-0.009683,0.023185
std,1.027669,0.988515,0.98766,1.019395
min,-3.0,-3.0,-3.0,-3.0
25%,-0.726977,-0.642596,-0.701092,-0.647641
50%,-0.024417,-0.006613,-0.005592,0.013307
75%,0.631496,0.655364,0.684449,0.717323
max,3.0,3.0,2.719713,3.0


In [127]:
np.sign(data)[:5] # sign根据数据的正负分别生成1和-1

Unnamed: 0,0,1,2,3
0,1.0,-1.0,-1.0,1.0
1,1.0,1.0,-1.0,1.0
2,-1.0,1.0,-1.0,1.0
3,-1.0,1.0,-1.0,1.0
4,1.0,1.0,1.0,-1.0


In [128]:
#置换与随机抽样
#permutation可以对DF中的Series或行进行置换操作（随机重排序）
#根据用户需要的轴长度可以产生一个表示新顺序的整数数组
df = pd.DataFrame(np.arange(5 * 4).reshape((5, 4)))

In [129]:
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [131]:
sampler = np.random.permutation(5)
sampler

array([0, 3, 2, 4, 1])

In [132]:
#随机重排序后的索引可以用在iloc或者take函数中
df.take(sampler)

Unnamed: 0,0,1,2,3
0,0,1,2,3
3,12,13,14,15
2,8,9,10,11
4,16,17,18,19
1,4,5,6,7


In [133]:
#利用sample方法可以返回一个无重复的随机样本

df.sample(n=3)

Unnamed: 0,0,1,2,3
3,12,13,14,15
0,0,1,2,3
4,16,17,18,19


In [134]:
df.sample(n=3)

Unnamed: 0,0,1,2,3
1,4,5,6,7
0,0,1,2,3
4,16,17,18,19


In [136]:
#有放回抽样（允许重复选择同一个值），需要传入replace=True
choices = pd.Series([5, 7, -1, 6, 4])
draws = choices.sample(n=10, replace=True)
print(choices)
draws

0    5
1    7
2   -1
3    6
4    4
dtype: int64


3    6
3    6
4    4
1    7
3    6
3    6
0    5
2   -1
4    4
0    5
dtype: int64

In [137]:
#计算指标/虚拟变量

In [138]:
df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                   'data1': range(6)})

In [140]:
#注意get_dummies是pandas的函数
pd.get_dummies(df['key'])

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [142]:
#用prefix可选参数在数据列中增加前缀
dummies = pd.get_dummies(df['key'], prefix='key')

In [144]:
df_with_dummy = df[['data1']].join(dummies)

In [145]:
df_with_dummy

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


In [146]:
mnames = ['movie_id', 'title', 'genres']

In [149]:
# MovieLens 1M dataset
# The '::' separator is longer than one character, so pandas interprets it as a
# regular expression. Only the Python parsing engine supports that; requesting it
# explicitly avoids the ParserWarning raised when pandas falls back from the C engine.
movies = pd.read_table('code/datasets/movielens/movies.dat', sep='::',
                       header=None, names=mnames, engine='python')

  from ipykernel import kernelapp as app


In [150]:
movies.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [161]:
# Flatten the pipe-separated genre field of every movie into one list of
# genre names (duplicates are kept; they are removed in a later step).
all_genres = [
    genre
    for genre_field in movies.genres
    for genre in genre_field.split('|')
]

In [162]:
genres = pd.unique(all_genres)

In [163]:
genres

array(['Animation', "Children's", 'Comedy', 'Adventure', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Sci-Fi', 'Documentary', 'War', 'Musical', 'Mystery', 'Film-Noir',
       'Western'], dtype=object)

In [172]:
zero_matrix = np.zeros((len(movies), len(genres)))
zero_matrix[:5]

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.]])

In [170]:
dummies = pd.DataFrame(zero_matrix, columns=genres)
dummies.head()

Unnamed: 0,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [166]:
gen = movies.genres[0]
gen.split('|')

['Animation', "Children's", 'Comedy']

In [167]:
dummies.columns.get_indexer(gen.split('|'))

array([0, 1, 2], dtype=int64)

In [173]:
# Fill the indicator matrix: for each movie (row i), mark every genre it belongs to with 1.
for i,gen in enumerate(movies.genres):
    # get_indexer translates the movie's genre labels into integer column positions of dummies
    indices = dummies.columns.get_indexer(gen.split('|'))
    # positional assignment: row i, the columns found above
    dummies.iloc[i, indices] = 1

In [174]:
movies_windic = movies.join(dummies.add_prefix('Genre_'))
movies_windic.iloc[0]

movie_id                                       1
title                           Toy Story (1995)
genres               Animation|Children's|Comedy
Genre_Animation                                1
Genre_Children's                               1
Genre_Comedy                                   1
Genre_Adventure                                0
Genre_Fantasy                                  0
Genre_Romance                                  0
Genre_Drama                                    0
Genre_Action                                   0
Genre_Crime                                    0
Genre_Thriller                                 0
Genre_Horror                                   0
Genre_Sci-Fi                                   0
Genre_Documentary                              0
Genre_War                                      0
Genre_Musical                                  0
Genre_Mystery                                  0
Genre_Film-Noir                                0
Genre_Western       

In [175]:
#将get_dummies与cut等离散化函数结合使用是统计应用的一个常用方法
np.random.seed(12345)
values = np.random.rand(10)
values

array([0.92961609, 0.31637555, 0.18391881, 0.20456028, 0.56772503,
       0.5955447 , 0.96451452, 0.6531771 , 0.74890664, 0.65356987])

In [176]:
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]
pd.get_dummies(pd.cut(values, bins))

Unnamed: 0,"(0, 0.2]","(0.2, 0.4]","(0.4, 0.6]","(0.6, 0.8]","(0.8, 1]"
0,0,0,0,0,1
1,0,1,0,0,0
2,1,0,0,0,0
3,0,1,0,0,0
4,0,0,1,0,0
5,0,0,1,0,0
6,0,0,0,0,1
7,0,0,0,1,0
8,0,0,0,1,0
9,0,0,0,1,0


In [177]:
type(pd.cut(values, bins))

pandas.core.categorical.Categorical

In [183]:
pd.cut(values, bins)

[(0.8, 1], (0.2, 0.4], (0, 0.2], (0.2, 0.4], (0.4, 0.6], (0.4, 0.6], (0.8, 1], (0.6, 0.8], (0.6, 0.8], (0.6, 0.8]]
Categories (5, object): [(0, 0.2] < (0.2, 0.4] < (0.4, 0.6] < (0.6, 0.8] < (0.8, 1]]

3. 字符串操作

字符串对象方法：

count返回子字符串在字符串中非重叠出现的次数

endswith如果字符串以指定后缀结尾则返回True

startswith如果字符串以指定前缀开头则返回True

join使用字符串作为间隔符号，用于粘合其他字符串序列

index如果在字符串中找到，则返回字符串中第一个字符的位置；没发现则ValueError

find返回字符串中第一个出现子字符的第一个字符的位置；没发现-1

rfind返回子字符串在字符串中最后一次出现时第一个字符的位置；-1

replace替换

strip, rstrip, lstrip裁剪空白、换行符

split依照指定的分隔符对字符串进行拆分，得到一个子字符串列表

lower将大写字母转换为小写

upper将小写字母转换为大写

casefold将字符转换为小写

ljust, rjust左对齐或右对齐；用空格（或其他字符）填充字符串的相反侧以返回具有最小宽度的字符串

In [184]:
val = 'a, b,   guido'
val.split(',')

['a', ' b', '   guido']

In [185]:
pieces = [x.strip() for x in val.split(',')]

In [186]:
pieces

['a', 'b', 'guido']

In [187]:
first, second, third = pieces

In [190]:
first + '::' + second + '::' + third

'a::b::guido'

In [191]:
#in是检测子字符串的最佳方法
'guido' in val

True

In [192]:
val.index(',')

1

In [193]:
val.find(':')

-1

In [196]:
#find和index的区别在于：找不到子字符串时，index会抛出异常（ValueError），而find返回-1
val.index(':')

ValueError: substring not found

In [197]:
val.count(',')

2

In [202]:
val.count(',')

2

In [205]:
val.replace(',', '::::')

'a:::: b::::   guido'

In [206]:
val.replace(',', '')

'a b   guido'

正则表达式REGEX

方法：

findall

finditer

match

search

split

sub, subn

步骤：

1. import re

2. pattern = r'xxxx'

3. regex = re.compile(pattern)

4. regex.findall/finditer.....


In [207]:
# Regular expressions (regex)
import re
text = "foo   bar\t baz   \tqux"
# Split on runs of whitespace. The pattern is a raw string so the backslash in \s
# reaches the regex engine unchanged; '\s' in a normal string literal is an invalid
# escape sequence and triggers a warning on recent Python versions.
re.split(r'\s+', text)

['foo', 'bar', 'baz', 'qux']

In [208]:
# Compile the whitespace pattern once so it can be reused; a raw string avoids the
# invalid '\s' escape sequence in a normal string literal.
regex = re.compile(r'\s+')
regex.split(text)

['foo', 'bar', 'baz', 'qux']

In [209]:
#获取所有匹配正则表达式的模式的列表

regex.findall(text)

['   ', '\t ', '   \t']

原生字符串语法：r'c:\xxx'与'c:\\xxx'是一样滴

In [210]:
text = """
Steve steve@gmail.com
Rob rob@gmail.com
Ryan rayn@yahoo.com
"""

In [211]:
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'

In [212]:
#re.IGNORECASE使正则表达式不区分大小写
regex = re.compile(pattern, flags=re.IGNORECASE)

In [213]:
#search会返回第一个匹配到的字符串
m = regex.search(text)

In [214]:
#匹配对象只能告诉用户模式在字符串中起始和结束的位置（索引）
m

<_sre.SRE_Match object; span=(7, 22), match='steve@gmail.com'>

In [216]:
text[m.start():m.end()]

'steve@gmail.com'

In [218]:
text[:]

'\nSteve steve@gmail.com\nRob rob@gmail.com\nRyan rayn@yahoo.com\n'

In [219]:
#regex.match只在模式出现于字符串起始位置时进行匹配，如果没有匹配到，则返回None
print(regex.match(text))

None


In [220]:
#sub非常常用，会返回一个新的字符串，原字符串中的模式会被一个新的字符串所替代
print(regex.sub('REDACTED', text))


Steve REDACTED
Rob REDACTED
Ryan REDACTED



In [221]:
#用括号括起来感兴趣域（用户名，域名，域名后缀）
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex = re.compile(pattern, flags=re.IGNORECASE)

In [222]:
m = regex.match('wesm@bright.net')

In [223]:
# 返回模式组间的元组
m.groups()

('wesm', 'bright', 'net')

In [224]:
#返回元组列表
regex.findall(text)

[('steve', 'gmail', 'com'), ('rob', 'gmail', 'com'), ('rayn', 'yahoo', 'com')]

In [225]:
#sub可使用\1和\2，访问每一个匹配对象中的分组。例如，符号\1代表的是第一个匹配了的分组

print(regex.sub(r'Username: \1, Domain: \2, Suffix: \3', text))


Steve Username: steve, Domain: gmail, Suffix: com
Rob Username: rob, Domain: gmail, Suffix: com
Ryan Username: rayn, Domain: yahoo, Suffix: com



向量化字符串数组

*非常有用且高效的字符串处理套路*

In [226]:
#清理杂乱的数据集用于分析通常需要大量的字符串处理和正则化

In [227]:
data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com',
        'Rob': 'rob@gmail.com', 'Wes': np.nan}

In [228]:
data = pd.Series(data)

In [229]:
data

Dave     dave@google.com
Rob        rob@gmail.com
Steve    steve@gmail.com
Wes                  NaN
dtype: object

In [230]:
data.isnull()

Dave     False
Rob      False
Steve    False
Wes       True
dtype: bool

In [231]:
data.str.contains('gmail')

Dave     False
Rob       True
Steve     True
Wes        NaN
dtype: object

In [232]:
pattern

'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})'

In [233]:
data.str.findall(pattern, flags=re.IGNORECASE)

Dave     [(dave, google, com)]
Rob        [(rob, gmail, com)]
Steve    [(steve, gmail, com)]
Wes                        NaN
dtype: object

In [234]:
# Build one tuple of capture groups (user, domain, suffix) per row.
# Series.str.match returns a boolean Series in current pandas (the old
# tuple-of-groups return value was deprecated and then removed), so take the
# first hit from findall instead. Missing values stay NaN, and the result still
# supports matches.str.get(1) / matches.str[0].
matches = data.str.findall(pattern, flags=re.IGNORECASE).str[0]
matches

  if __name__ == '__main__':


Dave     (dave, google, com)
Rob        (rob, gmail, com)
Steve    (steve, gmail, com)
Wes                      NaN
dtype: object

In [235]:
matches.str.get(1)

Dave     google
Rob       gmail
Steve     gmail
Wes         NaN
dtype: object

In [237]:
matches.str[0]

Dave      dave
Rob        rob
Steve    steve
Wes        NaN
dtype: object

In [238]:
data.str[:5]

Dave     dave@
Rob      rob@g
Steve    steve
Wes        NaN
dtype: object

部分向量化字符串方法列表

cat

contains

count

extract

endswith

startswith

findall

get

isalnum

isalpha

isdecimal

isdigit

islower

isnumeric

isupper

join

len

lower, upper

match

pad

center

repeat

replace

slice

split

strip,rstrip,lstrip