In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# One-dimensional Series built from a list; the np.nan entry marks a missing
# value (the resulting dtype shown below is float64).
s = pd.Series([1, 3, 5, np.nan, 6, 8])

In [3]:
# Display the Series: integer index, values and dtype.
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [4]:
# Time series index: six consecutive calendar days starting 2013-01-01.
dates = pd.date_range('20130101', periods=6)

In [5]:
# Display the DatetimeIndex created above.
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [6]:
# Two-dimensional DataFrame: 6 rows x 4 columns of standard-normal draws,
# with `dates` as the row index and A-D as the column labels.
# NOTE(review): no random seed is set, so the values change on every run.
df = pd.DataFrame(
    np.random.randn(6, 4),
    index=dates,
    columns=list('ABCD'),
)

In [7]:
# Display the whole frame (only 6 rows, so no head() is needed).
df

Unnamed: 0,A,B,C,D
2013-01-01,1.30548,0.729849,-0.59539,0.14677
2013-01-02,-0.433928,0.268882,-0.697556,-0.212038
2013-01-03,0.774532,-1.647604,-2.593172,-0.23182
2013-01-04,1.526009,-0.279923,0.431776,-1.460632
2013-01-05,0.550472,-0.663538,-1.655078,1.313903
2013-01-06,0.565596,0.152469,1.186626,0.327792


In [8]:
# DataFrame built from a dict of columns with mixed dtypes. Scalar entries
# (A, B, F) are repeated on every one of the four rows.
df2 = pd.DataFrame(
    {
        'A': 1.,
        'B': pd.Timestamp('20130102'),
        'C': pd.Series(1, index=list(range(4)), dtype='float32'),
        'D': np.array([3] * 4, dtype='int32'),
        'E': pd.Categorical(["test", "train", "test", "train"]),
        'F': 'foo',
    }
)
                                       

In [9]:
# Display the mixed-dtype frame.
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [11]:
# Per-column dtypes of df2.
df2.dtypes

A          float64
B    datetime64[s]
C          float32
D            int32
E         category
F           object
dtype: object

In [13]:
# First two rows.
df2.head(2)

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo


In [15]:
# Last three rows.
df2.tail(3)

Unnamed: 0,A,B,C,D,E,F
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [16]:
# Row labels of df (the DatetimeIndex passed in as `dates`).
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [17]:
# Row labels of df2 (integer labels 0-3).
df2.index

Index([0, 1, 2, 3], dtype='int64')

In [18]:
# Underlying data as a NumPy array. DataFrame.to_numpy() is the accessor the
# pandas documentation recommends over the legacy `.values` attribute; for
# this all-float frame both give the same 6x4 float64 array.
df.to_numpy()

array([[ 1.30547954,  0.729849  , -0.59538988,  0.14676968],
       [-0.43392772,  0.26888161, -0.69755648, -0.21203836],
       [ 0.77453166, -1.6476037 , -2.59317197, -0.2318202 ],
       [ 1.52600863, -0.27992342,  0.43177579, -1.46063156],
       [ 0.55047228, -0.66353778, -1.6550776 ,  1.31390281],
       [ 0.56559605,  0.15246919,  1.18662565,  0.3277922 ]])

In [19]:
# Transpose: column labels become the row index and the dates become columns.
df.T

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,1.30548,-0.433928,0.774532,1.526009,0.550472,0.565596
B,0.729849,0.268882,-1.647604,-0.279923,-0.663538,0.152469
C,-0.59539,-0.697556,-2.593172,0.431776,-1.655078,1.186626
D,0.14677,-0.212038,-0.23182,-1.460632,1.313903,0.327792


In [21]:
# Sort the rows by the values of a single column (ascending by default).
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2013-01-03,0.774532,-1.647604,-2.593172,-0.23182
2013-01-05,0.550472,-0.663538,-1.655078,1.313903
2013-01-04,1.526009,-0.279923,0.431776,-1.460632
2013-01-06,0.565596,0.152469,1.186626,0.327792
2013-01-02,-0.433928,0.268882,-0.697556,-0.212038
2013-01-01,1.30548,0.729849,-0.59539,0.14677


In [22]:
# Projection: select the single column 'A', returned as a Series.
df['A']

2013-01-01    1.305480
2013-01-02   -0.433928
2013-01-03    0.774532
2013-01-04    1.526009
2013-01-05    0.550472
2013-01-06    0.565596
Freq: D, Name: A, dtype: float64

In [23]:
# Slice rows by integer position: [0:2] returns the first two rows.
df[0:2]

Unnamed: 0,A,B,C,D
2013-01-01,1.30548,0.729849,-0.59539,0.14677
2013-01-02,-0.433928,0.268882,-0.697556,-0.212038


In [24]:
# Slice rows by index label; unlike positional slicing, both endpoints
# (2013-01-02 and 2013-01-04) are included in the result.
df['20130102':'20130104']

Unnamed: 0,A,B,C,D
2013-01-02,-0.433928,0.268882,-0.697556,-0.212038
2013-01-03,0.774532,-1.647604,-2.593172,-0.23182
2013-01-04,1.526009,-0.279923,0.431776,-1.460632


In [26]:
# Select columns by label: all rows, columns A and B only.
df.loc[:, ['A','B']]

Unnamed: 0,A,B
2013-01-01,1.30548,0.729849
2013-01-02,-0.433928,0.268882
2013-01-03,0.774532,-1.647604
2013-01-04,1.526009,-0.279923
2013-01-05,0.550472,-0.663538
2013-01-06,0.565596,0.152469


In [27]:
# Select rows and columns together: a label slice on the index plus a list
# of column labels, in a single .loc call.
df.loc['20130102':'20130104', ['A', 'B']]

Unnamed: 0,A,B
2013-01-02,-0.433928,0.268882
2013-01-03,0.774532,-1.647604
2013-01-04,1.526009,-0.279923


In [28]:
# Scalar lookup of row '20130102', column 'A', done in one .loc call.
# Chained indexing (df.loc[row][col]) builds an intermediate row Series first
# and is discouraged by pandas; the single call returns the same value.
df.loc['20130102', 'A']

np.float64(-0.4339277206482249)

In [29]:
# Positional slicing: rows at positions 3-4 and columns at positions 0-1
# (the end positions 5 and 2 are excluded).
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-04,1.526009,-0.279923
2013-01-05,0.550472,-0.663538


In [30]:
# Positional selection with explicit lists: rows 1, 2, 4 and columns 0, 2.
df.iloc[[1, 2, 4], [0, 2]]

Unnamed: 0,A,C
2013-01-02,-0.433928,-0.697556
2013-01-03,0.774532,-2.593172
2013-01-05,0.550472,-1.655078


In [31]:
# Boolean filtering: keep only the rows where column A is positive.
df[df['A'] > 0]

Unnamed: 0,A,B,C,D
2013-01-01,1.30548,0.729849,-0.59539,0.14677
2013-01-03,0.774532,-1.647604,-2.593172,-0.23182
2013-01-04,1.526009,-0.279923,0.431776,-1.460632
2013-01-05,0.550472,-0.663538,-1.655078,1.313903
2013-01-06,0.565596,0.152469,1.186626,0.327792


In [32]:
# Aggregation: group by column A and sum the remaining columns.
# NOTE(review): A holds distinct random floats, so every group is a single
# row and the sums just repeat the original values (see output below).
df.groupby('A').sum()

Unnamed: 0_level_0,B,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-0.433928,0.268882,-0.697556,-0.212038
0.550472,-0.663538,-1.655078,1.313903
0.565596,0.152469,1.186626,0.327792
0.774532,-1.647604,-2.593172,-0.23182
1.30548,0.729849,-0.59539,0.14677
1.526009,-0.279923,0.431776,-1.460632


In [33]:
# Group by two key columns; the result is indexed by the (A, B) pairs and
# sums the remaining columns C and D.
df.groupby(['A', 'B']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
-0.433928,0.268882,-0.697556,-0.212038
0.550472,-0.663538,-1.655078,1.313903
0.565596,0.152469,1.186626,0.327792
0.774532,-1.647604,-2.593172,-0.23182
1.30548,0.729849,-0.59539,0.14677
1.526009,-0.279923,0.431776,-1.460632
