# 第7章数据清洗与准备

## 7.1处理缺失值

In [2]:
import numpy as np
import pandas as pd

#### pandas使用浮点数 NaN(not a number )来表示缺失值

In [9]:
string_data=pd.Series(['aardvark','artichoke',np.nan,'avocado'])

In [10]:
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

#### python 内建的none值在对象组中也被当做NA处理

In [16]:
string_data

0         None
1    artichoke
2          NaN
3      avocado
dtype: object

In [None]:
string_data[0]=None

In [17]:
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

### 过滤缺失值
#### 在Series上使用dropna,它会返回Series中所有非空数据及其索引值

In [7]:
from numpy import nan as NA

In [19]:
data=pd.Series([1,NA,3.5,NA,7])

In [20]:
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

#### 以上例子与下面的代码是等价的：

In [21]:
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [23]:
data=pd.DataFrame([[1.,6.5,3.],[1.,NA,NA],[NA,NA,NA],[NA,6.5,3.]])

#### 当处理dataframe的时候，dropna默认情况下会删除包含缺失值的行 

In [24]:
cleaned=data.dropna()

In [25]:
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [26]:
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


#### 当传入how='all'时，将删除所有值均为NA的行 

In [27]:
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


####  如果要用同样的方式去删除列，传入参数axis=1 

In [28]:
data[4]=NA

In [29]:
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [30]:
data.dropna(axis=1,how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


#### 过滤 DataFrame的行的相关方法涉及时间序列数据。假设你只想保留一定数量的观察值和行。可以用thresh参数来表示

In [32]:
df=pd.DataFrame(np.random.randn(7,3))

In [34]:
df.iloc[:4,1]=NA
df

Unnamed: 0,0,1,2
0,-0.214706,,1.226505
1,-0.140284,,-0.520167
2,0.052916,,1.21727
3,-0.928509,,-0.099105
4,-1.073584,0.25431,-0.740994
5,-0.332966,0.1588,0.262657
6,-0.146998,-0.396423,-1.262556


In [36]:
df.iloc[:2,2]=NA
df

Unnamed: 0,0,1,2
0,-0.214706,,
1,-0.140284,,
2,0.052916,,1.21727
3,-0.928509,,-0.099105
4,-1.073584,0.25431,-0.740994
5,-0.332966,0.1588,0.262657
6,-0.146998,-0.396423,-1.262556


In [37]:
df.dropna()

Unnamed: 0,0,1,2
4,-1.073584,0.25431,-0.740994
5,-0.332966,0.1588,0.262657
6,-0.146998,-0.396423,-1.262556


#### thresh=N要求一行至少具有N非NaN才能存活 

In [39]:
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,0.052916,,1.21727
3,-0.928509,,-0.099105
4,-1.073584,0.25431,-0.740994
5,-0.332966,0.1588,0.262657
6,-0.146998,-0.396423,-1.262556


### 7.1.2补全缺失值 

#### 大多数情况下 用fillna方法补全缺失值。调用fillna可以使用一个常数来代替缺失值。 

In [40]:
df.fillna(0)

Unnamed: 0,0,1,2
0,-0.214706,0.0,0.0
1,-0.140284,0.0,0.0
2,0.052916,0.0,1.21727
3,-0.928509,0.0,-0.099105
4,-1.073584,0.25431,-0.740994
5,-0.332966,0.1588,0.262657
6,-0.146998,-0.396423,-1.262556


#### 在调用fillna时使用字典，你可以为不同列设定不同的填充值。 

In [41]:
df.fillna({1:0.5,2:0})

Unnamed: 0,0,1,2
0,-0.214706,0.5,0.0
1,-0.140284,0.5,0.0
2,0.052916,0.5,1.21727
3,-0.928509,0.5,-0.099105
4,-1.073584,0.25431,-0.740994
5,-0.332966,0.1588,0.262657
6,-0.146998,-0.396423,-1.262556


#### fillna 返回的是一个新的对象，但你也可以修改已经存在的对象：

In [44]:
_=df.fillna(0,inplace=True)

In [45]:
df

Unnamed: 0,0,1,2
0,-0.214706,0.0,0.0
1,-0.140284,0.0,0.0
2,0.052916,0.0,1.21727
3,-0.928509,0.0,-0.099105
4,-1.073584,0.25431,-0.740994
5,-0.332966,0.1588,0.262657
6,-0.146998,-0.396423,-1.262556


####  用于重建索引的相同的插值方法也可以用于fillna:

In [3]:
df=pd.DataFrame(np.random.randn(6,3))

In [4]:
df

Unnamed: 0,0,1,2
0,1.705175,0.070796,-0.597105
1,0.072877,0.712839,0.221495
2,-0.469859,0.384731,-0.286309
3,3.017468,0.778424,1.678712
4,-1.121435,-0.966743,1.365751
5,0.089673,-0.809988,-1.353512


#### df.iloc[行索引，列索引]=NA 

In [9]:
df.iloc[2:,1]=NA

In [10]:
df

Unnamed: 0,0,1,2
0,1.705175,0.070796,-0.597105
1,0.072877,0.712839,0.221495
2,-0.469859,,-0.286309
3,3.017468,,1.678712
4,-1.121435,,1.365751
5,0.089673,,-1.353512


In [12]:
df.iloc[4:,2]=NA

In [13]:
df

Unnamed: 0,0,1,2
0,1.705175,0.070796,-0.597105
1,0.072877,0.712839,0.221495
2,-0.469859,,-0.286309
3,3.017468,,1.678712
4,-1.121435,,
5,0.089673,,


#### method='ffill'  表示后向填充，limit表示可以连续填充的最大数量，axis表示填充的方向,value 表示填充值

#### axis=1 表示横向填充  axis=0或者不指定此参数表示 纵向填充 

In [22]:
df.fillna(method='ffill',axis=1)

Unnamed: 0,0,1,2
0,1.705175,0.070796,-0.597105
1,0.072877,0.712839,0.221495
2,-0.469859,-0.469859,-0.286309
3,3.017468,3.017468,1.678712
4,-1.121435,-1.121435,-1.121435
5,0.089673,0.089673,0.089673


In [24]:
df.fillna(value=2,limit=2)

Unnamed: 0,0,1,2
0,1.705175,0.070796,-0.597105
1,0.072877,0.712839,0.221495
2,-0.469859,2.0,-0.286309
3,3.017468,2.0,1.678712
4,-1.121435,,2.0
5,0.089673,,2.0


#### 使用fillna 用平均值或中位数代替Series的缺失值

In [16]:
data=pd.Series([1,NA,3.5,NA,7])

In [17]:
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

### 7.2.2使用函数或映射进行数据转换 

#### 收集到关于肉类的假设数据

In [28]:
data=pd.DataFrame({'food':['bacon','pulled pork','bacon','Pastrami','corned beef','Bacon','pastrami','honey ham','nova lox'],'ounces':[4,3,12,6,7.5,8,3,5,6]})

In [29]:
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


#### 如果你想要添加一列用于表明每种食物的动物肉类型。让我们先写下一个食物和肉类的映射： 

In [30]:
meat_to_animal={'bacon':'pig','pulled pork':'pig','pastrami':'cow','corned beef':'cow','honey ham':'pig','nova lox':'salmon'}

#### 因为food里面的肉类大小写不一致，而 meat_to_animal里面的肉类都是小写，所以把food里面的肉类都小写化 

In [34]:
lowercased=data['food'].str.lower()

In [33]:
lowercased

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

In [35]:
data['animal']=lowercased.map(meat_to_animal)

In [36]:
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


#### 我们也可以传入一个能够完成所有工作的函数： 

In [37]:
data['food'].map(lambda x:meat_to_animal[x.lower()])

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

###  7.2.3替代值 

####  使用fillna 填充缺失值是通用值替换的特殊案例。map可以用来修改一个对象中的子集的值，但是replace提供了更为简单林火的实现。 

In [38]:
data=pd.Series([1.0,-999.,2.,-999.,-1000.,3.])

In [39]:
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

#### -999可能是缺失值，如果要使用NA来代替这些值，我们可以用replace 方法生成新的Series

In [40]:
data.replace(-999,np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

#### 如果你想要一次替代多个值，你可以传入一个列表和替代值 

In [41]:
data.replace([-999,-1000],np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

####  要将不用的值替换为不同的值，可以传入替代值的列表 

In [42]:
data.replace([-999,-1000],[np.nan,0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

####  参数也可以通过字典传递 

In [43]:
data.replace({-999:np.nan,-1000:0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

####  data.replace方法与data.str.replace方法是不同的，data.str.replace是对字符串进行按元素替代的。

### 7.2.4重命名轴索引 

In [45]:
data=pd.DataFrame(np.arange(12).reshape((3,4)),index=['Ohio','Colorado','New York'],columns=['one','two','three','four'])

In [46]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [47]:
transform=lambda x:x[:4].upper()

In [48]:
data.index.map(transform)

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

#### 你可以赋值给index,修改DataFrame： 

In [49]:
data.index=data.index.map(transform)

In [50]:
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


#### 如果你想要创建数据集转换后的版本，并且不修改原有的数据集，一个有用的方法是rename

In [51]:
data.rename(index=str.title,columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colo,4,5,6,7
New,8,9,10,11


#### 值得注意的是，rename可以结合字典型对象使用，为轴标签的子集提供新的值： 

In [52]:
data.rename(index={'OHIO':'INDIANA'},columns={'three':'peekaboo'})

Unnamed: 0,one,two,peekaboo,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


#### 如果想要修改原数据集，传入inplace=True:

In [54]:
data.rename(index={'OHIO':'INDIANA'},inplace=True)

In [55]:
data

Unnamed: 0,one,two,three,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


### 7.2.5离散化和分箱 

#### 连续值经常需要离散化，或者分离成‘箱子’进行分析。假设你有某项研究中的一组人群的数据，想将他们分组，放入离散的年龄组中： 

In [57]:
ages=[20,22,25,27,21,23,37,31,61,45,41,32]

#### 分为18-25，26-35，36-60以及60-100及以上等若干组。可以用pandas中的cut实现 

In [56]:
bins=[18,25,35,60,100]

In [58]:
cats=pd.cut(ages,bins)

In [59]:
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [60]:
cats.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [61]:
cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]]
              closed='right',
              dtype='interval[int64]')

#### pd.value_counts(cats)是对pandas.cut的结果中的箱数量的计数 

In [62]:
pd.value_counts(cats)

(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

####  可以通过传递right=False 来改变哪一边是封闭的 

In [63]:
pd.cut(ages,[18,26,36,61,100],right=False)

[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

#### 可以通过向labels选项传递一个列表或数组来传入自定义的箱名

In [64]:
group_names=['Youth','YoungAdult','MiddleAged','Senior']

In [65]:
pd.cut(ages,bins,labels=group_names)

[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]

#### 如果传给cut 整数个的箱来替代显式的箱边，pandas将根据数据中的最小值和最大值计算出等长的箱

In [66]:
data=np.random.rand(20)

In [67]:
data

array([0.10924928, 0.40305634, 0.60792752, 0.34685506, 0.23476601,
       0.19956664, 0.7156894 , 0.28474229, 0.45325005, 0.75795576,
       0.34538502, 0.82051962, 0.22478254, 0.6374462 , 0.23633133,
       0.16861522, 0.93790786, 0.42946921, 0.03585738, 0.50328933])

In [68]:
pd.cut(data,4,precision=2)

[(0.035, 0.26], (0.26, 0.49], (0.49, 0.71], (0.26, 0.49], (0.035, 0.26], ..., (0.035, 0.26], (0.71, 0.94], (0.26, 0.49], (0.035, 0.26], (0.49, 0.71]]
Length: 20
Categories (4, interval[float64]): [(0.035, 0.26] < (0.26, 0.49] < (0.49, 0.71] < (0.71, 0.94]]

#### precision=2将精度限制在2位 

#### qcut是一个与分箱密切相关的函数，它基于样本分位数进行分箱。

In [69]:
data=np.random.randn(1000)#正太分布

In [70]:
cats=pd.qcut(data,4) #切成4份

In [71]:
cats

[(0.636, 2.878], (-0.686, -0.0249], (-3.565, -0.686], (-0.686, -0.0249], (-0.686, -0.0249], ..., (-0.0249, 0.636], (-0.686, -0.0249], (0.636, 2.878], (-3.565, -0.686], (-0.686, -0.0249]]
Length: 1000
Categories (4, interval[float64]): [(-3.565, -0.686] < (-0.686, -0.0249] < (-0.0249, 0.636] < (0.636, 2.878]]

In [72]:
pd.value_counts(cats)

(0.636, 2.878]       250
(-0.0249, 0.636]     250
(-0.686, -0.0249]    250
(-3.565, -0.686]     250
dtype: int64

#### 与cut类似，你可以传入自定义的分位数（0和1之间的数据，包括边）: 

In [73]:
pd.qcut(data,[0,0.1,0.5,0.9,1.])

[(-0.0249, 1.195], (-1.265, -0.0249], (-1.265, -0.0249], (-1.265, -0.0249], (-1.265, -0.0249], ..., (-0.0249, 1.195], (-1.265, -0.0249], (-0.0249, 1.195], (-3.565, -1.265], (-1.265, -0.0249]]
Length: 1000
Categories (4, interval[float64]): [(-3.565, -1.265] < (-1.265, -0.0249] < (-0.0249, 1.195] < (1.195, 2.878]]

### 7.2.6 检测和过滤异常值 

In [74]:
data=pd.DataFrame(np.random.randn(1000,4))

In [75]:
data

Unnamed: 0,0,1,2,3
0,0.237595,-0.952749,-0.183755,-0.595207
1,0.501177,-0.693966,0.601899,-0.330949
2,0.787904,-1.003360,-1.970080,-0.141888
3,-1.069470,-0.289300,0.008187,0.534372
4,-1.033908,0.101948,0.015775,-0.436921
5,0.828267,0.400554,-0.275419,0.801575
6,0.525023,-0.378540,0.762093,-1.135581
7,-0.692721,0.061683,-0.013919,-1.367555
8,-0.965001,-1.042707,0.778273,-0.601600
9,-0.616847,1.877856,-1.522180,-0.262311


In [76]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.02733,0.055325,-0.027794,-0.003556
std,0.967312,1.019618,1.016715,1.027004
min,-3.085806,-3.56307,-3.199745,-3.385891
25%,-0.681144,-0.573675,-0.69487,-0.674571
50%,-0.050574,0.045847,-0.01206,-0.009809
75%,0.655568,0.719054,0.666825,0.716565
max,3.449425,3.375728,3.233909,3.468359


#### 假设你想要找出一列中绝对值大于三的值： 

In [77]:
col=data[2]

In [78]:
col

0     -0.183755
1      0.601899
2     -1.970080
3      0.008187
4      0.015775
5     -0.275419
6      0.762093
7     -0.013919
8      0.778273
9     -1.522180
10    -0.824416
11     1.619702
12    -0.354753
13    -0.309694
14    -1.676501
15     0.483332
16    -1.048424
17    -1.059163
18     0.824231
19     0.942088
20    -0.637421
21     0.125252
22     0.757524
23     1.302039
24    -1.069443
25    -1.204435
26     0.905616
27     1.065850
28     1.221691
29     2.235359
         ...   
970   -1.197976
971   -1.088212
972   -2.897270
973    0.671809
974    0.060687
975    0.747891
976   -0.640617
977   -1.318945
978   -0.069668
979   -0.499158
980    1.449029
981   -0.025576
982    0.679560
983   -1.599167
984   -0.515663
985   -2.614987
986   -0.717066
987    2.239222
988    2.375305
989    0.099504
990   -1.929953
991   -0.189680
992    0.616739
993    0.144779
994    0.059675
995   -0.722635
996   -0.070526
997   -1.994986
998   -0.774899
999    1.967135
Name: 2, Length: 1000, d

In [79]:
col[np.abs(col)>3]

148    3.233909
204   -3.199745
325   -3.182376
336   -3.127073
838   -3.071142
Name: 2, dtype: float64

#### 要选出所有值大于3或小于-3的行，你可以对布尔值DataFrame 使用any方法： 

In [81]:
data[(np.abs(data)>3).any(1)]

Unnamed: 0,0,1,2,3
71,-0.789513,-0.368595,-1.908175,-3.049222
108,-0.381105,1.368769,0.264706,-3.019973
148,0.096456,0.314705,3.233909,0.14637
204,0.899791,-1.412371,-3.199745,-1.221594
307,3.449425,0.178626,-0.683471,-1.298938
325,0.843395,0.237342,-3.182376,0.932748
334,3.182153,1.062768,0.35653,-1.486814
336,0.547223,-1.886818,-3.127073,-0.790015
453,-3.002803,1.355459,-0.003439,-0.734677
460,-3.085806,1.228171,0.21913,0.669501


#### 值可以根据这些标准来设置，下面代码限制了-3到3之间的数值： 

In [82]:
data[np.abs(data)>3]=np.sign(data)*3

In [83]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.027873,0.055512,-0.027448,-0.003637
std,0.964932,1.016614,1.014243,1.02241
min,-3.0,-3.0,-3.0,-3.0
25%,-0.681144,-0.573675,-0.69487,-0.674571
50%,-0.050574,0.045847,-0.01206,-0.009809
75%,0.655568,0.719054,0.666825,0.716565
max,3.0,3.0,3.0,3.0


In [85]:
np.sign(data).head()

Unnamed: 0,0,1,2,3
0,1.0,-1.0,-1.0,-1.0
1,1.0,-1.0,1.0,-1.0
2,1.0,-1.0,-1.0,-1.0
3,-1.0,-1.0,1.0,1.0
4,-1.0,1.0,1.0,-1.0


### 置换和随机抽样 

#### 使用numpy.random.permutation 对DataFrame中的Series 或行进行置换是很方便的，可以根据想要的轴长度产生一个表示新顺序的整数数组： 

In [87]:
np.arange(20)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19])

In [88]:
df=pd.DataFrame(np.arange(5*4).reshape((5,4)))

In [89]:
sampler=np.random.permutation(5)

In [90]:
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [91]:
sampler

array([2, 0, 4, 3, 1])

####  使用df.take(sampler)置换行顺序 

In [94]:
df.take(sampler)

Unnamed: 0,0,1,2,3
2,8,9,10,11
0,0,1,2,3
4,16,17,18,19
3,12,13,14,15
1,4,5,6,7


#### 要选出一个不含有替代值的随机子集，你可以使用Series和DataFrame的sample方法 

In [96]:
df.sample(n=3)

Unnamed: 0,0,1,2,3
0,0,1,2,3
3,12,13,14,15
4,16,17,18,19


#### 要生成一个带有替代值的样本（允许有重复选择），将replace=True传入sample方法： 

In [97]:
choices=pd.Series([5,7,-1,6,4])

In [98]:
draws=choices.sample(n=10,replace=True)

In [100]:
draws

3    6
0    5
0    5
3    6
1    7
4    4
0    5
0    5
1    7
2   -1
dtype: int64

### 7.2.8计算指标/虚拟变量 

#### 如果DataFrame中有一列k个不同的值，则可以衍生一个k列的值为1和0的矩阵。pandas 有一个get_dummies 函数用于实现该功能。

In [101]:
df=pd.DataFrame({'key':['b','b','a','c','a','b'],'data1':range(6)})

In [102]:
pd.get_dummies(df['key'])

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


#### 在某些情况下可以给列标签加前缀，然后与其他数据合并 

In [103]:
dummies=pd.get_dummies(df['key'],prefix='key')

In [104]:
df_with_dummy=df[['data1']].join(dummies)

In [105]:
df_with_dummy

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


In [106]:
mnames=['movie_id','title','genres']

In [109]:
movies=pd.read_table('D:/data_analysis/bookcode/pydata-book/datasets/movielens/movies.dat',encoding="unicode_escape",sep='::'',header=None,names=mnames)

  """Entry point for launching an IPython kernel.


In [108]:
movies[:10]

Unnamed: 0,1::Toy Story (1995)::Animation|Children's|Comedy
0,2::Jumanji (1995)::Adventure|Children's|Fantasy
1,3::Grumpier Old Men (1995)::Comedy|Romance
2,4::Waiting to Exhale (1995)::Comedy|Drama
3,5::Father of the Bride Part II (1995)::Comedy
4,6::Heat (1995)::Action|Crime|Thriller
5,7::Sabrina (1995)::Comedy|Romance
6,8::Tom and Huck (1995)::Adventure|Children's
7,9::Sudden Death (1995)::Action
8,10::GoldenEye (1995)::Action|Adventure|Thriller
9,"11::American President, The (1995)::Comedy|Dra..."


In [110]:
movies[:10]

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children's
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [111]:
all_genres=[]

In [122]:
movies.genres

0          Animation|Children's|Comedy
1         Adventure|Children's|Fantasy
2                       Comedy|Romance
3                         Comedy|Drama
4                               Comedy
5                Action|Crime|Thriller
6                       Comedy|Romance
7                 Adventure|Children's
8                               Action
9            Action|Adventure|Thriller
10                Comedy|Drama|Romance
11                       Comedy|Horror
12                Animation|Children's
13                               Drama
14            Action|Adventure|Romance
15                      Drama|Thriller
16                       Drama|Romance
17                            Thriller
18                              Comedy
19                              Action
20                 Action|Comedy|Drama
21                Crime|Drama|Thriller
22                            Thriller
23                        Drama|Sci-Fi
24                       Drama|Romance
25                       

In [124]:
type(movies.genres)

pandas.core.series.Series

In [None]:
#  extend 和append 的区别l
#list.append(object) 向列表中添加一个对象object
#list.extend(sequence) 把一个序列seq的内容添加到列表中

music_media = ['compact disc', '8-track tape', 'long playing record']
new_media = ['DVD Audio disc', 'Super Audio CD']
music_media.append(new_media)
print music_media
>>>['compact disc', '8-track tape', 'long playing record', ['DVD Audio disc', 'Super Audio CD']]
使用append的时候，是将new_media看作一个对象，整体打包添加到music_media对象中。

music_media = ['compact disc', '8-track tape', 'long playing record']
new_media = ['DVD Audio disc', 'Super Audio CD']
music_media.extend(new_media)
print music_media
>>>['compact disc', '8-track tape', 'long playing record', 'DVD Audio disc', 'Super Audio CD']
#使用extend的时候，是将new_media看作一个序列，将这个序列和music_media序列合并，并放在其后面。



In [112]:
for x in movies.genres:
    all_genres.extend(x.split('|'))

In [113]:
genres=pd.unique(all_genres)

In [114]:
genres

array(['Animation', "Children's", 'Comedy', 'Adventure', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Sci-Fi', 'Documentary', 'War', 'Musical', 'Mystery', 'Film-Noir',
       'Western'], dtype=object)

#### 使用全0的dataframe是构造指标dataframe的一种方式 

In [117]:
zero_matrix=np.zeros((len(movies),len(genres)))

In [118]:
np.zeros(5)

array([0., 0., 0., 0., 0.])

In [119]:
np.zeros((5,4))

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [120]:
dummies=pd.DataFrame(zero_matrix,columns=genres)

In [121]:
dummies

Unnamed: 0,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### 现在遍历每一部电影，将dummies每一行的条目设置为1.为了实现该功能，我们使用dummies.colums来计算每一个流派的列指标

In [125]:
gen=movies.genres[0]

In [126]:
gen.split('|')

['Animation', "Children's", 'Comedy']

In [127]:
dummies.columns.get_indexer(gen.split('|'))

array([0, 1, 2], dtype=int64)

####  之后，使用.loc根据这些指标来设置值

In [128]:
for i,gen in enumerate(movies.genres):
    indices=dummies.columns.get_indexer(gen.split('|'))
    dummies.iloc[i,indices]=1

In [129]:
movies_windic=movies.join(dummies.add_prefix('Genre_'))

In [130]:
movies_windic.iloc[0] #第一行

movie_id                                       1
title                           Toy Story (1995)
genres               Animation|Children's|Comedy
Genre_Animation                                1
Genre_Children's                               1
Genre_Comedy                                   1
Genre_Adventure                                0
Genre_Fantasy                                  0
Genre_Romance                                  0
Genre_Drama                                    0
Genre_Action                                   0
Genre_Crime                                    0
Genre_Thriller                                 0
Genre_Horror                                   0
Genre_Sci-Fi                                   0
Genre_Documentary                              0
Genre_War                                      0
Genre_Musical                                  0
Genre_Mystery                                  0
Genre_Film-Noir                                0
Genre_Western       

In [131]:
movies_windic

Unnamed: 0,movie_id,title,genres,Genre_Animation,Genre_Children's,Genre_Comedy,Genre_Adventure,Genre_Fantasy,Genre_Romance,Genre_Drama,...,Genre_Crime,Genre_Thriller,Genre_Horror,Genre_Sci-Fi,Genre_Documentary,Genre_War,Genre_Musical,Genre_Mystery,Genre_Film-Noir,Genre_Western
0,1,Toy Story (1995),Animation|Children's|Comedy,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji (1995),Adventure|Children's|Fantasy,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men (1995),Comedy|Romance,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale (1995),Comedy|Drama,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II (1995),Comedy,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,6,Heat (1995),Action|Crime|Thriller,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,7,Sabrina (1995),Comedy|Romance,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,8,Tom and Huck (1995),Adventure|Children's,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,9,Sudden Death (1995),Action,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,10,GoldenEye (1995),Action|Adventure|Thriller,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### 对于更大的数据，以上这种使用多成员构建指标变量并不是很快速，更好的方法是写一个直接将数据写为numpy数组的底层函数，然后将结果封装近DataFrame

####  将get_dummies 与cut 等离散化函数结合使用是有用的方法

In [132]:
np.random.seed(12345)

In [133]:
values=np.random.rand(10)

In [134]:
values

array([0.92961609, 0.31637555, 0.18391881, 0.20456028, 0.56772503,
       0.5955447 , 0.96451452, 0.6531771 , 0.74890664, 0.65356987])

In [135]:
bin=[0,0.2,0.4,0.6,0.8,1]

In [136]:
pd.get_dummies(pd.cut(values,bins))

Unnamed: 0,"(18, 25]","(25, 35]","(35, 60]","(60, 100]"
0,0,0,0,0
1,0,0,0,0
2,0,0,0,0
3,0,0,0,0
4,0,0,0,0
5,0,0,0,0
6,0,0,0,0
7,0,0,0,0
8,0,0,0,0
9,0,0,0,0


## 7.3字符串操作

#### 7.3.1 字符串对象方法 

In [137]:
val='a,b, guido'

In [138]:
val.split(',')

['a', 'b', ' guido']

In [139]:
pieces=[x.strip() for x in val.split(',')]

In [140]:
pieces

['a', 'b', 'guido']

In [141]:
first,second,third=pieces

In [142]:
first+'::'+second+'::'+third

'a::b::guido'

In [143]:
'::'.join(pieces)

'a::b::guido'

In [144]:
'guido' in val

True

In [145]:
val.index(',')

1

In [146]:
val.find(':')

-1

In [147]:
val.count(',')

2

In [148]:
val.replace(',','::')

'a::b:: guido'

In [149]:
val.replace(',','')

'ab guido'

### 7.3.2正则表达式 

In [150]:
import re

In [151]:
text='foo    bar\t  bax   \tqux'

In [152]:
re.split('\s',text)

['foo', '', '', '', 'bar', '', '', 'bax', '', '', '', 'qux']

In [153]:
regex=re.compile('\s+')

In [154]:
regex.split(text)

['foo', 'bar', 'bax', 'qux']

In [155]:
regex.findall(text)

['    ', '\t  ', '   \t']

In [160]:
text="""Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com"""

In [163]:
regex=re.compile(pattern,flags=re.IGNORECASE)

In [164]:
regex.findall(text)

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

In [165]:
pattern=r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'

In [162]:
text

'Dave dave@google.com\nSteve steve@gmail.com\nRob rob@gmail.com\nRyan ryan@yahoo.com'

In [166]:
m=regex.search(text)

In [167]:
m

<re.Match object; span=(5, 20), match='dave@google.com'>

In [168]:
print(regex.match(text))

None


In [169]:
m=regex.match('wesm@bright.net')

In [170]:
m.groups()

()

### pandas中向量化字符串函数

In [171]:
data={'Dave':'dave@google.com','Steve':'steve@gmail.com','Rob':'rob@gmail.com','Wes':np.nan}

In [172]:
data=pd.Series(data)

In [173]:
data

Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                  NaN
dtype: object

In [174]:
data.isnull()

Dave     False
Steve    False
Rob      False
Wes       True
dtype: bool

In [175]:
data.str.contains('gmail')

Dave     False
Steve     True
Rob       True
Wes        NaN
dtype: object

In [177]:
pattern=r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'

In [178]:
regex=re.compile(pattern,flags=re.IGNORECASE)

In [179]:
m=regex.match('wesm@bright.net')

In [180]:
m.groups()

('wesm', 'bright', 'net')

In [181]:
regex.findall(text)

[('dave', 'google', 'com'),
 ('steve', 'gmail', 'com'),
 ('rob', 'gmail', 'com'),
 ('ryan', 'yahoo', 'com')]

In [182]:
print(regex.sub(r'Username: \1,Domain: \2,Suffix: \3',text))

Dave Username: dave,Domain: google,Suffix: com
Steve Username: steve,Domain: gmail,Suffix: com
Rob Username: rob,Domain: gmail,Suffix: com
Ryan Username: ryan,Domain: yahoo,Suffix: com
