In [21]:
import pandas as pd
import numpy as np

In [13]:
# Load the dog-name counts from CSV, then print a structural summary of the
# frame (columns, non-null counts, dtypes, memory usage).
df = pd.read_csv('data/dogNames2.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16220 entries, 0 to 16219
Data columns (total 2 columns):
Row_Labels          16217 non-null object
Count_AnimalName    16220 non-null int64
dtypes: int64(1), object(1)
memory usage: 253.5+ KB


## `DataFrame` 排序

In [14]:
# Sort by count in descending order and show the five most common names.
df.sort_values(by='Count_AnimalName', ascending=False).head(5)

Unnamed: 0,Row_Labels,Count_AnimalName
1156,BELLA,1195
9140,MAX,1153
2660,CHARLIE,856
3251,COCO,852
12368,ROCKY,823


In [15]:
# Sort by the name column in descending order and show the first five rows.
# Strings compare by Unicode code point, so accented capitals such as 'Ü'
# sort after 'Z' and appear first here.
df.sort_values(by='Row_Labels', ascending=False).head(5)

Unnamed: 0,Row_Labels,Count_AnimalName
15200,Ü,1
10679,ÔISHI,1
16211,ZZ,2
16210,ZYU,1
16208,ZUZU,16


## `DataFrame` 取行或者列的注意事项
+ 方括号中写切片（如 `df[:10]`），表示取行，对行进行操作（返回的是一个 `DataFrame`）
+ 方括号中写字符串（列名，如 `df['Row_Labels']`），表示取列，对列进行操作（返回的是一个 `Series`）

In [17]:
# A slice inside [] selects rows by position: the first ten rows, as a DataFrame.
df[:10] 

Unnamed: 0,Row_Labels,Count_AnimalName
0,1,1
1,2,2
2,40804,1
3,90201,1
4,90203,1
5,102201,1
6,3010271,1
7,MARCH,2
8,APRIL,51
9,AUGUST,14


In [20]:
# A column name inside [] selects that column as a Series; show its first ten values.
df['Row_Labels'].head(10)

0          1
1          2
2      40804
3      90201
4      90203
5     102201
6    3010271
7      MARCH
8      APRIL
9     AUGUST
Name: Row_Labels, dtype: object

In [18]:
# Row slice first, then column selection: yields the same ten values as
# df['Row_Labels'].head(10).
df[:10]['Row_Labels']

0          1
1          2
2      40804
3      90201
4      90203
5     102201
6    3010271
7      MARCH
8      APRIL
9     AUGUST
Name: Row_Labels, dtype: object

## 布尔索引

In [42]:
# Boolean indexing: keep only the rows whose count exceeds 800.
df[df['Count_AnimalName'] > 800]

Unnamed: 0,Row_Labels,Count_AnimalName
1156,BELLA,1195
2660,CHARLIE,856
3251,COCO,852
9140,MAX,1153
12368,ROCKY,823


In [43]:
# Combine conditions with & (element-wise AND). Each condition needs its own
# parentheses because & binds more tightly than the comparison operators.
df[(df['Count_AnimalName'] > 800) & (df['Count_AnimalName'] < 1000)]

Unnamed: 0,Row_Labels,Count_AnimalName
2660,CHARLIE,856
3251,COCO,852
12368,ROCKY,823


In [44]:
# Names longer than 4 characters that were also counted more than 700 times.
# .str.len() computes the length of each name element-wise.
df[(df['Row_Labels'].str.len() > 4) & (df['Count_AnimalName'] > 700)]

Unnamed: 0,Row_Labels,Count_AnimalName
1156,BELLA,1195
2660,CHARLIE,856
8552,LUCKY,723
12368,ROCKY,823


## `pandas` 之 `loc` 与 `iloc`
+ `df.loc` 通过 **标签**（行索引、列名，本例中均为字符串）获取数据；标签切片（如 `'a':'c'`）包含结束标签
+ `df.iloc` 通过 **位置**（从 0 开始的整数）获取数据；位置切片不包含结束位置

In [23]:
# 3x4 demo frame holding the integers 0..11, with row labels a-c and
# column labels W-Z.
t3 = pd.DataFrame(np.arange(12).reshape(3,4), index=list('abc'), columns=list('WXYZ'))
t3

Unnamed: 0,W,X,Y,Z
a,0,1,2,3
b,4,5,6,7
c,8,9,10,11


In [25]:
# A single label passed to .loc selects that row; the result is a Series
# indexed by column name.
t3.loc['a']

W    0
X    1
Y    2
Z    3
Name: a, dtype: int64

In [26]:
# Same row as t3.loc['a'], with the column axis written explicitly as ':' (all columns).
t3.loc['a', :]

W    0
X    1
Y    2
Z    3
Name: a, dtype: int64

In [27]:
# All rows, single column 'Z': a Series indexed by row label.
t3.loc[:, 'Z']

a     3
b     7
c    11
Name: Z, dtype: int64

In [28]:
# Deliberate failure case: 'Z' is a column label, but a lone key given to .loc
# is looked up in the row index (a, b, c), so the lookup raises KeyError.
# The error is caught and printed so that "Restart & Run All" continues past
# this cell instead of halting here.
try:
    t3.loc['Z']
except KeyError as err:
    print('KeyError: {}'.format(err))

KeyError: 'the label [Z] is not in the [index]'

In [24]:
# A row label and a column label together select a single scalar value.
t3.loc['a', 'Z']

3

### 一次取多行、多列

In [29]:
# A list of row labels returns those rows as a DataFrame.
t3.loc[['a', 'c']]

Unnamed: 0,W,X,Y,Z
a,0,1,2,3
c,8,9,10,11


In [30]:
# Same rows as t3.loc[['a', 'c']], with the column axis written explicitly.
t3.loc[['a', 'c'], :]

Unnamed: 0,W,X,Y,Z
a,0,1,2,3
c,8,9,10,11


In [31]:
# All rows, restricted to columns W and Z.
t3.loc[:, ['W', 'Z']]

Unnamed: 0,W,Z
a,0,3
b,4,7
c,8,11


In [32]:
# Rows a and b crossed with columns W and Z.
t3.loc[['a', 'b'], ['W', 'Z']]

Unnamed: 0,W,Z
a,0,3
b,4,7


In [36]:
# Label slices in .loc include both endpoints: rows a through c and
# columns X through Z.
t3.loc['a':'c', 'X':'Z']

Unnamed: 0,X,Y,Z
a,1,2,3
b,5,6,7
c,9,10,11


### 通过位置获取数据

In [37]:
# Row at position 1 (label 'b'), all columns.
t3.iloc[1, :]

W    4
X    5
Y    6
Z    7
Name: b, dtype: int64

In [38]:
# Column at position 2 (label 'Y'), all rows.
t3.iloc[:, 2]

a     2
b     6
c    10
Name: Y, dtype: int64

In [39]:
# Columns at positions 2 and 1, returned in the order requested (Y, then X).
t3.iloc[:, [2, 1]]

Unnamed: 0,Y,X
a,2,1
b,6,5
c,10,9


In [40]:
# Rows at positions 0 and 2, columns at positions 2 and 1.
t3.iloc[[0, 2], [2, 1]]

Unnamed: 0,Y,X
a,2,1
c,10,9


In [41]:
# Positional slices exclude the stop position: rows from position 1 onward,
# and the first two columns.
t3.iloc[1:, :2]

Unnamed: 0,W,X
b,4,5
c,8,9


## 缺失数据的处理

+ 判断数据是否为`NaN`
```python
pd.isnull(df)
pd.notnull(df)
```

+ 处理方法1: 删除`NaN`所在的行或列
```python
t.dropna(axis=0, how='any', inplace=False)
```

+ 处理方法2：填充数据
```python
t.fillna(t.mean())     # 计算平均值等情况时，nan不参与计算，但是0会
t.fillna(t.median())
t.fillna(0)
```

+ 处理值为`0`的数据：如果`0`实际代表缺失，先把它替换为`NaN`，再按上面的方法处理
```python
t[t==0] = np.nan
```

In [46]:
# Blank out rows b and c of the first two columns (W, X) to create missing data.
# NaN is a float, so W and X are cast to float64 explicitly first: assigning NaN
# straight into int64 columns relies on silent upcasting, which newer pandas
# releases deprecate. The displayed result is unchanged (W and X float, Y and Z int).
t3 = t3.astype({'W': 'float64', 'X': 'float64'})
t3.iloc[1:, :2] = np.nan
t3

Unnamed: 0,W,X,Y,Z
a,0.0,1.0,2,3
b,,,6,7
c,,,10,11


In [47]:
# Element-wise mask: True where a value is NaN.
pd.isnull(t3)

Unnamed: 0,W,X,Y,Z
a,False,False,False,False
b,True,True,False,False
c,True,True,False,False


In [48]:
# Inverse mask: True where a value is present (not NaN).
pd.notnull(t3)

Unnamed: 0,W,X,Y,Z
a,True,True,True,True
b,False,False,True,True
c,False,False,True,True


In [49]:
# Boolean indexing with the mask for column W: keep only the rows whose W
# value is not NaN.
t3[pd.notnull(t3['W'])]

Unnamed: 0,W,X,Y,Z
a,0.0,1.0,2,3


In [65]:
# Drop every row that contains at least one NaN; only row 'a' remains.
t3.dropna(axis=0)

Unnamed: 0,W,X,Y,Z
a,0.0,1.0,2,3


In [66]:
# how='all' drops a row only when every value in it is NaN, so no row is
# removed here.
t3.dropna(axis=0, how='all')      # how accepts 'all' or 'any'

Unnamed: 0,W,X,Y,Z
a,0.0,1.0,2,3
b,,,6,7
c,,,10,11


In [67]:
# axis=1 operates on columns: every column containing a NaN (W and X) is dropped.
t3.dropna(axis=1, how='any')

Unnamed: 0,Y,Z
a,2,3
b,6,7
c,10,11


In [75]:
# Fill each column's NaNs with that column's mean. mean() skips NaN, so the
# fill values are 0.0 for W and 1.0 for X.
t3.fillna(t3.mean())

Unnamed: 0,W,X,Y,Z
a,0.0,1.0,2,3
b,0.0,1.0,6,7
c,0.0,1.0,10,11


In [77]:
# Same idea for a single column: fill X's NaNs with the mean of X.
t3['X'].fillna(t3['X'].mean())

a    1.0
b    1.0
c    1.0
Name: X, dtype: float64