In [1]:
import pandas as pd
import numpy as np

### 汇总和计算描述统计

In [2]:
# Small demo frame with missing values: row 'c' is entirely NaN and
# row 'a' is missing its 'two' entry.
df = pd.DataFrame(
    {
        'one': [1.4, 7.1, np.nan, 0.75],
        'two': [np.nan, -4.5, np.nan, -1.3],
    },
    index=list('abcd'),
)
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [3]:
# Column-wise totals; missing values are skipped.
df.sum(axis=0)

one    9.25
two   -5.80
dtype: float64

In [4]:
# Row-wise totals; the all-NaN row 'c' sums to 0.
df.sum(axis='columns')

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [5]:
# skipna=False stops missing values from being skipped, so any row that
# contains a NaN gets a NaN mean.
df.mean(axis='columns', skipna=False)

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

In [6]:
# Index label at which each column reaches its maximum.
df.idxmax(axis=0)

one    b
two    d
dtype: object

In [7]:
# Running (cumulative) sum down each column.
df.cumsum(axis=0)

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [8]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [9]:
# Non-numeric Series: the pattern a, a, b, c repeated four times.
obj = pd.Series(list('aabc') * 4)
obj

0     a
1     a
2     b
3     c
4     a
5     a
6     b
7     c
8     a
9     a
10    b
11    c
12    a
13    a
14    b
15    c
dtype: object

In [10]:
# For non-numeric data describe() reports count, unique, top and freq instead.
obj.describe()

count     16
unique     3
top        a
freq       8
dtype: object

### 唯一值、值计数和成员资格

In [11]:
# Rebind obj to a Series with repeated labels, then collect its distinct values.
obj = pd.Series(list('cadabbcc'))
uniques = obj.unique()

In [12]:
# Distinct values, listed in order of first appearance.
uniques

array(['c', 'a', 'd', 'b'], dtype=object)

In [13]:
# Occurrences of each value, most frequent first.
obj.value_counts(sort=True, ascending=False)

c    3
a    2
b    2
d    1
dtype: int64

In [16]:
# Count values of a raw array without sorting by frequency.
# The top-level pd.value_counts() is deprecated (pandas >= 2.1); wrapping the
# array in a Series and calling the method is the supported equivalent.
pd.Series(obj.values).value_counts(sort=False)

b    2
c    3
d    1
a    2
dtype: int64

### 处理缺失值

In [18]:
# String Series with a single missing entry (np.nan) at position 2.
string_data = pd.Series(
    ['aardvark', 'sadfsafasd', np.nan, 'avoca']
)
string_data

0      aardvark
1    sadfsafasd
2           NaN
3         avoca
dtype: object

In [19]:
# Boolean mask: True where the entry is missing.
string_data.isna()

0    False
1    False
2     True
3    False
dtype: bool

In [20]:
# Python's None is also treated as missing: after this assignment,
# position 0 is reported as null alongside the existing NaN.
string_data[0] = None
string_data.isna()

0     True
1    False
2     True
3    False
dtype: bool

### 滤出缺失数据

In [21]:
# Short alias for the missing-value marker. numpy is already imported as np at
# the top of the notebook, so no second import is needed mid-notebook.
NA = np.nan
data = pd.Series([1, NA, 3.5, NA, 7])

In [22]:
# Display the Series; the missing entries show up as NaN.
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [23]:
# Keep only the non-missing entries; their original index labels are preserved.
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [24]:
# Same result as dropna(), written as a boolean-mask selection.
data.loc[data.notna()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [25]:
# 4x3 frame in which rows 1-3 each contain at least one missing value.
data = pd.DataFrame(
    [
        [1.0, 6.5, 3.0],
        [1.0, NA, NA],
        [NA, NA, NA],
        [NA, 6.5, 3.0],
    ]
)
# Default row-wise drop: any row containing a missing value is removed.
cleaned = data.dropna(axis=0, how='any')
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [26]:
# Only row 0 survives: it is the only row without a missing value.
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [27]:
# how='all': drop a row only when every value in it is missing.
data.dropna(axis=0, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [28]:
# Same rule applied to columns; no column here is entirely missing,
# so the frame comes back unchanged.
data.dropna(axis='columns', how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


### 填充缺失数据

In [33]:
# Replace every missing value with the constant 10 (returns a new frame).
df.fillna(value=10)

Unnamed: 0,one,two
a,1.4,10.0
b,7.1,-4.5
c,10.0,10.0
d,0.75,-1.3


### 层次化索引

In [38]:
# Series with a two-level (hierarchical) index built from two parallel label
# lists. A seeded generator is used so the values are identical on every
# Restart & Run All; the original unseeded draw changed on each run, so any
# previously saved output will differ until the cell is re-executed.
rng = np.random.default_rng(seed=0)
data = pd.Series(
    rng.standard_normal(10),
    index=[
        ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd'],  # outer level
        [1, 2, 3, 1, 2, 3, 1, 2, 2, 3],                      # inner level
    ],
)
data

a  1    0.082136
   2    0.098847
   3    1.135527
b  1   -1.419518
   2   -1.217968
   3   -0.669480
c  1   -0.791502
   2   -0.816943
d  2   -1.526589
   3    0.946669
dtype: float64

In [39]:
# The index is a MultiIndex made of (outer, inner) label tuples.
data.index

MultiIndex([('a', 1),
            ('a', 2),
            ('a', 3),
            ('b', 1),
            ('b', 2),
            ('b', 3),
            ('c', 1),
            ('c', 2),
            ('d', 2),
            ('d', 3)],
           )

In [40]:
# Partial indexing: select everything under outer label 'b'.
data.loc['b']

1   -1.419518
2   -1.217968
3   -0.669480
dtype: float64

In [41]:
# Label slice on the outer level; both endpoints are included.
data.loc['b':'c']

b  1   -1.419518
   2   -1.217968
   3   -0.669480
c  1   -0.791502
   2   -0.816943
dtype: float64

In [43]:
# Pass a list of outer-level labels to select several groups at once.
data.loc[['b','d']]

b  1   -1.419518
   2   -1.217968
   3   -0.669480
d  2   -1.526589
   3    0.946669
dtype: float64

In [44]:
# Pivot the inner index level into columns; (outer, inner) pairs that do not
# exist in the Series become missing cells.
data.unstack(level=-1)

Unnamed: 0,1,2,3
a,0.082136,0.098847,1.135527
b,-1.419518,-1.217968,-0.66948
c,-0.791502,-0.816943,
d,,-1.526589,0.946669


In [45]:
# stack() reverses unstack(), moving the columns back into the inner index
# level. In the output recorded below, the blank cells introduced by
# unstack() do not reappear, so the original Series is recovered.
data.unstack().stack()

a  1    0.082136
   2    0.098847
   3    1.135527
b  1   -1.419518
   2   -1.217968
   3   -0.669480
c  1   -0.791502
   2   -0.816943
d  2   -1.526589
   3    0.946669
dtype: float64

### 使用DataFrame的列

In [47]:
# Flat frame whose 'c' and 'd' columns will later serve as index levels.
frame = pd.DataFrame(
    {
        'a': range(7),
        'b': range(7, 0, -1),
        'c': ['one'] * 3 + ['two'] * 4,
        'd': [0, 1, 2, 0, 1, 2, 3],
    }
)
frame

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,two,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


In [48]:
# Move columns 'c' and 'd' into a two-level row index; they are removed
# from the regular columns.
frame2 = frame.set_index(keys=['c', 'd'])
frame2

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


In [50]:
# drop=False keeps 'c' and 'd' as ordinary columns in addition to using
# them as index levels.
frame.set_index(keys=['c', 'd'], drop=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c,d
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,0,0,7,one,0
one,1,1,6,one,1
one,2,2,5,one,2
two,0,3,4,two,0
two,1,4,3,two,1
two,2,5,2,two,2
two,3,6,1,two,3
