# 7 数据清洗与准备
重点讨论缺失值、重复值、字符串操作和其他数据转换工具

## 7.1 处理缺失值

In [1]:
# By default, all descriptive statistics on pandas objects exclude missing values.
import pandas as pd
import numpy as np

# A Series of strings with one missing entry (np.nan at index 2).
string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [2]:
# Boolean mask that is True wherever the value is missing (only index 2 here).
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [4]:
# NA stands for "not available".
# When cleaning data, it is often important to analyze the missing data itself
# in order to identify data-collection problems or biases caused by the missing data.
# None is also treated as NA in object arrays.
string_data[0] = None
string_data

0         None
1    artichoke
2          NaN
3      avocado
dtype: object

In [5]:
# Index 0 (None) is now flagged as missing as well, alongside index 2 (NaN).
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

### 7.1.1 过滤缺失值

In [6]:
from numpy import nan as NA

data = pd.Series([NA, 1, 2, NA, 4])
# On a Series, dropna returns only the non-NA values; the surviving entries
# keep their original index labels.
data.dropna()

1    1.0
2    2.0
4    4.0
dtype: float64

In [8]:
# Same result as data.dropna(): boolean-index the Series with its non-null mask.
data[data.notnull()]

1    1.0
2    2.0
4    4.0
dtype: float64

In [11]:
# By default, DataFrame.dropna drops every row that contains at least one NA.
data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA], [NA, NA, NA], [NA, 6.55, 3.]])
# dropna returns a new frame; `data` itself still holds the NA entries.
cleaned = data.dropna()
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.55,3.0


In [12]:
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [13]:
# With how='all', only rows in which every value is NA are dropped.
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.55,3.0


In [15]:
# Add a new column labelled 4 whose values are all NA (used by the axis=1 example).
data[4] = NA

In [16]:
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.55,3.0,


In [18]:
# Pass axis=1 to drop columns instead of rows; combined with how='all',
# only columns that are entirely NA are removed.
data.dropna(axis=1, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.55,3.0


In [19]:
# To keep only rows that contain at least a certain number of non-NA values,
# use the thresh argument of dropna.
# Demo data: a 7x3 frame of random normals (no seed is set in this section,
# so the numbers vary between runs) with NA written into parts of columns 1 and 2.
df = pd.DataFrame(np.random.randn(7, 3))
df.iloc[:4, 1] = NA  # rows 0-3 of column 1
df.iloc[:2, 2] = NA  # rows 0-1 of column 2
df

Unnamed: 0,0,1,2
0,2.220267,,
1,1.648893,,
2,1.502815,,-1.576268
3,-0.791778,,-0.098838
4,0.206316,0.207775,0.431345
5,0.552498,-1.905871,0.377752
6,-0.794606,-2.39731,-1.736436


In [20]:
# Default behaviour: every row containing an NA is dropped, leaving rows 4-6.
df.dropna()

Unnamed: 0,0,1,2
4,0.206316,0.207775,0.431345
5,0.552498,-1.905871,0.377752
6,-0.794606,-2.39731,-1.736436


In [21]:
# Keep rows that have at least 2 non-NA values; rows 0 and 1 have only one
# non-NA value each, so they are dropped.
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,1.502815,,-1.576268
3,-0.791778,,-0.098838
4,0.206316,0.207775,0.431345
5,0.552498,-1.905871,0.377752
6,-0.794606,-2.39731,-1.736436


### 7.1.2 补全缺失值

In [23]:
# In most cases, fillna is the method to use for filling in missing values.
# Here every NA is replaced with the constant 0.
df.fillna(0)

Unnamed: 0,0,1,2
0,2.220267,0.0,0.0
1,1.648893,0.0,0.0
2,1.502815,0.0,-1.576268
3,-0.791778,0.0,-0.098838
4,0.206316,0.207775,0.431345
5,0.552498,-1.905871,0.377752
6,-0.794606,-2.39731,-1.736436


In [24]:
# A dict can be passed to fill each column with a different value
# (keys are column labels, values are the fill values).
df.fillna({1: 0.5, 2: 1})

Unnamed: 0,0,1,2
0,2.220267,0.5,1.0
1,1.648893,0.5,1.0
2,1.502815,0.5,-1.576268
3,-0.791778,0.5,-0.098838
4,0.206316,0.207775,0.431345
5,0.552498,-1.905871,0.377752
6,-0.794606,-2.39731,-1.736436


In [27]:
# fillna does not modify the original object by default; inplace=True
# modifies df in place. The call's return value is discarded into `_`.
_ = df.fillna({1: 0.5, 2: 1}, inplace=True)
df

Unnamed: 0,0,1,2
0,2.220267,0.5,1.0
1,1.648893,0.5,1.0
2,1.502815,0.5,-1.576268
3,-0.791778,0.5,-0.098838
4,0.206316,0.207775,0.431345
5,0.552498,-1.905871,0.377752
6,-0.794606,-2.39731,-1.736436


In [28]:
# Fresh 6x3 frame of random normals with NA in the tail of columns 1 and 2.
df = pd.DataFrame(np.random.randn(6, 3))
df.iloc[2:, 1] = NA  # rows 2-5 of column 1
df.iloc[4:, 2] = NA  # rows 4-5 of column 2
df

Unnamed: 0,0,1,2
0,0.365008,0.284544,-0.677766
1,0.592913,-1.810273,-2.835809
2,0.256803,,-0.688223
3,-1.758211,,1.070885
4,-0.309773,,
5,-0.772189,,


In [30]:
# Forward-fill: propagate the last valid value in each column down into the
# NA slots that follow it.
# DataFrame.ffill() replaces fillna(method='ffill'): the `method` keyword of
# fillna is deprecated in recent pandas releases, and ffill() gives the same result.
df.ffill()

Unnamed: 0,0,1,2
0,0.365008,0.284544,-0.677766
1,0.592913,-1.810273,-2.835809
2,0.256803,-1.810273,-0.688223
3,-1.758211,-1.810273,1.070885
4,-0.309773,-1.810273,1.070885
5,-0.772189,-1.810273,1.070885


In [31]:
# Missing values can also be filled with a statistic such as the mean:
# df.mean() yields one mean per column, and each column's NA slots receive
# that column's mean.
df.fillna(df.mean())

Unnamed: 0,0,1,2
0,0.365008,0.284544,-0.677766
1,0.592913,-1.810273,-2.835809
2,0.256803,-0.762864,-0.688223
3,-1.758211,-0.762864,1.070885
4,-0.309773,-0.762864,-0.782728
5,-0.772189,-0.762864,-0.782728
