In [1]:
import numpy as np
import pandas as pd

# 01.函数

## 1.1 直接使用numpy函数

In [50]:
# Wrap a 3x2 array of standard-normal draws in a DataFrame (default integer labels).
df = pd.DataFrame(data=np.random.randn(3, 2))
df  # bare last expression: the notebook renders the frame

Unnamed: 0,0,1
0,-0.2244,0.806205
1,-0.156604,0.078354
2,0.622751,0.796878


In [51]:
# A NumPy function can be called on the DataFrame directly; np.abs is applied element-wise.
np.abs(df)

Unnamed: 0,0,1
0,0.2244,0.806205
1,0.156604,0.078354
2,0.622751,0.796878


## 1.2 使用自定义函数

### 1.2.1 通过apply方法将函数应用到列或行

In [5]:
# 4x3 frame of uniform [0, 1) samples; rows are labelled a-d and columns A-C.
df = pd.DataFrame(
    data=np.random.rand(4, 3),
    index=['a', 'b', 'c', 'd'],
    columns=['A', 'B', 'C'],
)
df

Unnamed: 0,A,B,C
a,0.850124,0.444995,0.733747
b,0.880202,0.839141,0.485449
c,0.197179,0.076783,0.819077
d,0.086722,0.274434,0.482028


In [6]:
def f_max(x):
    """Return the largest value of ``x`` by calling its ``max()`` method.

    Written as a named ``def`` rather than a lambda bound to a name so that
    tracebacks and reprs show ``f_max`` instead of ``<lambda>``.

    Parameters
    ----------
    x : any object exposing ``max()``
        When passed to ``DataFrame.apply`` this is one column or one row.

    Returns
    -------
    Whatever ``x.max()`` returns.
    """
    return x.max()
df.apply(f_max)  # default axis=0: f_max is evaluated once per column

A    0.880202
B    0.839141
C    0.819077
dtype: float64

In [7]:
# axis=1: f_max is evaluated once per row, giving one maximum per row label.
df.apply(f_max,axis=1)

a    0.850124
b    0.880202
c    0.819077
d    0.482028
dtype: float64

### 1.2.2 通过applymap方法将函数应用到每一个数据

In [9]:
def f_fmt(x):
    """Format ``x`` as a string with exactly two digits after the decimal point.

    Written as a named ``def`` rather than a lambda bound to a name so that
    tracebacks and reprs show ``f_fmt`` instead of ``<lambda>``.

    Parameters
    ----------
    x : number
        Value to format with the ``'%.2f'`` printf-style specifier.

    Returns
    -------
    str
        The formatted text, e.g. ``'0.85'``.
    """
    return '%.2f' % x
# applymap calls f_fmt on every individual cell of the frame.
# NOTE(review): newer pandas releases (2.1+) reportedly deprecate applymap in
# favour of DataFrame.map — confirm against the installed pandas version.
df.applymap(f_fmt)

Unnamed: 0,A,B,C
a,0.85,0.44,0.73
b,0.88,0.84,0.49
c,0.2,0.08,0.82
d,0.09,0.27,0.48


# 02.方法

## 2.1 排序

### 2.1.1 索引排序——`sort_index()`方法

**Series**

In [52]:
# Integers 0..3 attached to a deliberately unsorted index.
s = pd.Series(data=np.arange(4), index=['d', 'b', 'c', 'a'])
s

d    0
b    1
c    2
a    3
dtype: int32

In [15]:
s.sort_index()  # ascending order by default

a    3
b    1
c    2
d    0
dtype: int32

In [16]:
s.sort_index(ascending=False)  # explicitly request descending order

d    0
c    2
b    1
a    3
dtype: int32

**DataFrame**

In [19]:
# 4x3 grid holding 0..11; both the row and the column labels are out of order.
df = pd.DataFrame(
    data=np.arange(12).reshape(4, 3),
    index=['b', 'd', 'c', 'a'],
    columns=['C', 'B', 'A'],
)
df

Unnamed: 0,C,B,A
b,0,1,2
d,3,4,5
c,6,7,8
a,9,10,11


In [22]:
df.sort_index()  # sort the rows by their index labels

Unnamed: 0,C,B,A
a,9,10,11
b,0,1,2
c,6,7,8
d,3,4,5


In [23]:
df.sort_index(axis=1)  # sort the columns by their labels

Unnamed: 0,A,B,C
b,2,1,0
d,5,4,3
c,8,7,6
a,11,10,9


### 2.1.2 值排序——`sort_values()`方法

**Series**

In [25]:
# Series with two missing entries (positions 1 and 4).
s = pd.Series(data=[5, np.nan, 6, 9, np.nan])
s

0    5.0
1    NaN
2    6.0
3    9.0
4    NaN
dtype: float64

In [27]:
s.sort_values()  # ascending by default; NaN values are placed last

0    5.0
2    6.0
3    9.0
1    NaN
4    NaN
dtype: float64

In [29]:
s.sort_values(ascending=False)  # descending order; NaN values are still placed last

3    9.0
2    6.0
0    5.0
1    NaN
4    NaN
dtype: float64

**DataFrame**

In [32]:
# 4x3 frame of random integers drawn from [0, 20).
df = pd.DataFrame(
    data=np.random.randint(20, size=(4, 3)),
    index=['a', 'b', 'c', 'd'],
    columns=['A', 'B', 'C'],
)
df

Unnamed: 0,A,B,C
a,0,8,12
b,15,0,1
c,8,12,18
d,15,19,11


In [34]:
# Order the rows by the values found in column 'A'.
df.sort_values(by=['A'])

Unnamed: 0,A,B,C
a,0,8,12
c,8,12,18
b,15,0,1
d,15,19,11


In [37]:
# With axis=1 the `by` key names a row label, not a column:
# the columns are reordered according to the values in row 'b'.
df.sort_values(by='b', axis=1)

Unnamed: 0,B,C,A
a,8,12,0
b,0,1,15
c,12,18,8
d,19,11,15


## 2.2 唯一值、值计数和成员资格

In [38]:
# Seven values with repeats, under a non-unique index (two 'a' labels, five 'c' labels).
s = pd.Series(data=[2, 6, 8, 9, 8, 3, 6], index=list('aaccccc'))
s

a    2
a    6
c    8
c    9
c    8
c    3
c    6
dtype: int64

In [39]:
# Distinct values of the series, returned as a NumPy array.
s.unique()

array([2, 6, 8, 9, 3], dtype=int64)

In [40]:
# Whether every index label occurs only once; here 'a' and 'c' are repeated.
s.index.is_unique

False

In [41]:
# Number of occurrences of each distinct value.
s.value_counts()

6    2
8    2
3    1
2    1
9    1
dtype: int64

In [42]:
# Membership test: boolean mask that is True where the value is 2 or 8.
s.isin([2,8])

a     True
a    False
c     True
c    False
c     True
c    False
c    False
dtype: bool

## 2.3 缺失值处理

In [43]:
# Four rows: a random row with no gaps, two rows containing NaN, and one complete row.
df = pd.DataFrame(
    data=[
        np.random.randn(3),
        [1., 2., np.nan],
        [np.nan, 4., np.nan],
        [1., 2., 3.],
    ]
)
df

Unnamed: 0,0,1,2
0,-0.017182,0.825946,-0.51278
1,1.0,2.0,
2,,4.0,
3,1.0,2.0,3.0


In [44]:
# Boolean frame of the same shape: True marks a missing (NaN) entry.
df.isnull()

Unnamed: 0,0,1,2
0,False,False,False
1,False,False,True
2,True,False,True
3,False,False,False


In [45]:
df.dropna()  # by default, drops every row that contains a missing value

Unnamed: 0,0,1,2
0,-0.017182,0.825946,-0.51278
3,1.0,2.0,3.0


In [46]:
df.dropna(axis=1)  # drop the columns that contain missing values instead

Unnamed: 0,1
0,0.825946
1,2.0
2,4.0
3,2.0


In [49]:
df.fillna(100)  # replace every missing value with 100

Unnamed: 0,0,1,2
0,-0.017182,0.825946,-0.51278
1,1.0,2.0,100.0
2,100.0,4.0,100.0
3,1.0,2.0,3.0
