In [4]:
import pandas as pd
import numpy as np

In [7]:
s = pd.Series([1, 3, 5, np.nan, 6, 8])

In [11]:
# Show the Series through the cell's last expression instead of print().
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64


### 用含日期时间索引与标签的 NumPy 数组生成 DataFrame

In [9]:
dates = pd.date_range('20130101', periods=6)

In [10]:
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [12]:
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))

In [14]:
df

Unnamed: 0,A,B,C,D
2013-01-01,1.356362,-0.776626,-1.361947,-1.012874
2013-01-02,1.259089,0.866277,2.209152,-0.431029
2013-01-03,0.893994,-0.088249,0.353929,0.565249
2013-01-04,-1.154422,-0.595723,1.122238,-0.027927
2013-01-05,0.585272,0.717404,1.634748,1.534494
2013-01-06,-0.958328,0.624115,0.337157,-0.271554


### 用 Series 字典对象生成 DataFrame:

In [15]:
# Build a DataFrame from a dict of column values. The scalar entries
# ('A', 'B', 'F') are repeated for every row of the array-like columns.
df2 = pd.DataFrame({
    'A': 1.,
    'B': pd.Timestamp('20130102'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(["test", "train", "test", "train"]),
    'F': 'foo',
})

In [16]:
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [22]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

### 查看数据

#### 查看DataFrame头部和尾部数据

In [24]:
df.head(3)

Unnamed: 0,A,B,C,D
2013-01-01,1.356362,-0.776626,-1.361947,-1.012874
2013-01-02,1.259089,0.866277,2.209152,-0.431029
2013-01-03,0.893994,-0.088249,0.353929,0.565249


In [23]:
df.tail(3)

Unnamed: 0,A,B,C,D
2013-01-04,-1.154422,-0.595723,1.122238,-0.027927
2013-01-05,0.585272,0.717404,1.634748,1.534494
2013-01-06,-0.958328,0.624115,0.337157,-0.271554


#### 显示索引与列名

In [27]:
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [28]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [35]:
df.values

array([[ 1.35636191, -0.7766259 , -1.36194733, -1.01287401],
       [ 1.25908882,  0.86627711,  2.20915225, -0.43102899],
       [ 0.8939943 , -0.08824916,  0.35392881,  0.56524891],
       [-1.15442204, -0.5957228 ,  1.1222383 , -0.02792669],
       [ 0.58527165,  0.7174043 ,  1.63474769,  1.53449424],
       [-0.9583283 ,  0.62411547,  0.33715729, -0.27155382]])

In [36]:
type(df.values)

numpy.ndarray

In [37]:
df2.values

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [38]:
type(df2.values)

numpy.ndarray

In [39]:
type(df2.values[:, 1])

numpy.ndarray

In [40]:
df2.values[:, 1]

array([Timestamp('2013-01-02 00:00:00'), Timestamp('2013-01-02 00:00:00'),
       Timestamp('2013-01-02 00:00:00'), Timestamp('2013-01-02 00:00:00')],
      dtype=object)

#### describe() 可以快速查看数据的统计摘要：

In [41]:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.330328,0.124533,0.71588,0.059393
std,1.110241,0.710971,1.251488,0.887607
min,-1.154422,-0.776626,-1.361947,-1.012874
25%,-0.572428,-0.468854,0.34135,-0.39116
50%,0.739633,0.267933,0.738084,-0.14974
75%,1.167815,0.694082,1.50662,0.416955
max,1.356362,0.866277,2.209152,1.534494


#### 转置数据

In [44]:
df.T

Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00
A,1.356362,1.259089,0.893994,-1.154422,0.585272,-0.958328
B,-0.776626,0.866277,-0.088249,-0.595723,0.717404,0.624115
C,-1.361947,2.209152,0.353929,1.122238,1.634748,0.337157
D,-1.012874,-0.431029,0.565249,-0.027927,1.534494,-0.271554


In [45]:
df

Unnamed: 0,A,B,C,D
2013-01-01,1.356362,-0.776626,-1.361947,-1.012874
2013-01-02,1.259089,0.866277,2.209152,-0.431029
2013-01-03,0.893994,-0.088249,0.353929,0.565249
2013-01-04,-1.154422,-0.595723,1.122238,-0.027927
2013-01-05,0.585272,0.717404,1.634748,1.534494
2013-01-06,-0.958328,0.624115,0.337157,-0.271554


#### 按轴排序

In [47]:
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,-1.012874,-1.361947,-0.776626,1.356362
2013-01-02,-0.431029,2.209152,0.866277,1.259089
2013-01-03,0.565249,0.353929,-0.088249,0.893994
2013-01-04,-0.027927,1.122238,-0.595723,-1.154422
2013-01-05,1.534494,1.634748,0.717404,0.585272
2013-01-06,-0.271554,0.337157,0.624115,-0.958328


####  按值排序

In [48]:
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2013-01-01,1.356362,-0.776626,-1.361947,-1.012874
2013-01-04,-1.154422,-0.595723,1.122238,-0.027927
2013-01-03,0.893994,-0.088249,0.353929,0.565249
2013-01-06,-0.958328,0.624115,0.337157,-0.271554
2013-01-05,0.585272,0.717404,1.634748,1.534494
2013-01-02,1.259089,0.866277,2.209152,-0.431029


### 选择

#### 优化过的 pandas 数据访问方法：.at、.iat、.loc、.iloc

### 获取数据

#### 选择单列，产生series，与df.A等效

In [50]:
df['A']

2013-01-01    1.356362
2013-01-02    1.259089
2013-01-03    0.893994
2013-01-04   -1.154422
2013-01-05    0.585272
2013-01-06   -0.958328
Freq: D, Name: A, dtype: float64

#### 切片

In [51]:
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,1.356362,-0.776626,-1.361947,-1.012874
2013-01-02,1.259089,0.866277,2.209152,-0.431029
2013-01-03,0.893994,-0.088249,0.353929,0.565249


In [52]:
df['20130102': '20130104']

Unnamed: 0,A,B,C,D
2013-01-02,1.259089,0.866277,2.209152,-0.431029
2013-01-03,0.893994,-0.088249,0.353929,0.565249
2013-01-04,-1.154422,-0.595723,1.122238,-0.027927


### 按标签选择

#### 用标签提取一行数据

In [53]:
df.loc[dates[0]]

A    1.356362
B   -0.776626
C   -1.361947
D   -1.012874
Name: 2013-01-01 00:00:00, dtype: float64

In [54]:
# df[key] with a scalar key looks up a column, so passing a row label raises
# KeyError (use df.loc[key] for rows). Catch and show the error so the
# notebook still runs from top to bottom.
try:
    df[dates[0]]
except KeyError as err:
    print(f"KeyError: {err}")

KeyError: Timestamp('2013-01-01 00:00:00', freq='D')

#### 用标签提取多列数据

In [55]:
df.loc[:, ['A', 'B']]

Unnamed: 0,A,B
2013-01-01,1.356362,-0.776626
2013-01-02,1.259089,0.866277
2013-01-03,0.893994,-0.088249
2013-01-04,-1.154422,-0.595723
2013-01-05,0.585272,0.717404
2013-01-06,-0.958328,0.624115


#### 用标签切片，包含行与列结束点

In [56]:
df.loc['20130102':'20130104', ['A', 'B']]

Unnamed: 0,A,B
2013-01-02,1.259089,0.866277
2013-01-03,0.893994,-0.088249
2013-01-04,-1.154422,-0.595723


#### 返回对象降维

In [57]:
df.loc['20130102', ['A', 'B']]

A    1.259089
B    0.866277
Name: 2013-01-02 00:00:00, dtype: float64

#### 提取标量值

In [58]:
df.loc[dates[0], 'A']

1.35636190673495

In [60]:
type(df.loc[dates[0], 'A'])

numpy.float64

#### 快速访问标量

In [62]:
df.at[dates[0], 'A']

1.35636190673495

### 按位置选择

#### 用整数位置选择

In [63]:
df.iloc[3]

A   -1.154422
B   -0.595723
C    1.122238
D   -0.027927
Name: 2013-01-04 00:00:00, dtype: float64

#### 用整数切片

In [65]:
df.iloc[0:3, 3:5]

Unnamed: 0,D
2013-01-01,-1.012874
2013-01-02,-0.431029
2013-01-03,0.565249


#### 用整数列表按位置切片

In [66]:
df.iloc[[1,2,4], [0,2]]

Unnamed: 0,A,C
2013-01-02,1.259089,2.209152
2013-01-03,0.893994,0.353929
2013-01-05,0.585272,1.634748


#### 显式提取值

In [67]:
df.iloc[1, 1]

0.866277105311734

#### 快速访问标量

In [69]:
df.iat[1,1]

0.866277105311734

### 布尔索引

In [70]:
df[df.A > 0]

Unnamed: 0,A,B,C,D
2013-01-01,1.356362,-0.776626,-1.361947,-1.012874
2013-01-02,1.259089,0.866277,2.209152,-0.431029
2013-01-03,0.893994,-0.088249,0.353929,0.565249
2013-01-05,0.585272,0.717404,1.634748,1.534494


#### 用isin()筛选

In [71]:
# Copy df and attach a label column 'E' in one step; df itself is left untouched.
labels = ['one', 'one', 'two', 'three', 'four', 'three']
df2 = df.assign(E=labels)

In [72]:
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,1.356362,-0.776626,-1.361947,-1.012874,one
2013-01-02,1.259089,0.866277,2.209152,-0.431029,one
2013-01-03,0.893994,-0.088249,0.353929,0.565249,two
2013-01-04,-1.154422,-0.595723,1.122238,-0.027927,three
2013-01-05,0.585272,0.717404,1.634748,1.534494,four
2013-01-06,-0.958328,0.624115,0.337157,-0.271554,three


In [73]:
df2[df2['E'].isin(['two', 'four'])]

Unnamed: 0,A,B,C,D,E
2013-01-03,0.893994,-0.088249,0.353929,0.565249,two
2013-01-05,0.585272,0.717404,1.634748,1.534494,four


### 赋值

In [74]:
# Set a single cell by integer position: row 0, column 1 (column 'B').
df.iat[0, 1] = 0

In [75]:
df

Unnamed: 0,A,B,C,D
2013-01-01,1.356362,0.0,-1.361947,-1.012874
2013-01-02,1.259089,0.866277,2.209152,-0.431029
2013-01-03,0.893994,-0.088249,0.353929,0.565249
2013-01-04,-1.154422,-0.595723,1.122238,-0.027927
2013-01-05,0.585272,0.717404,1.634748,1.534494
2013-01-06,-0.958328,0.624115,0.337157,-0.271554


### 缺失值

#### 重建索引（reindex）可以更改、添加、删除指定轴的索引，并返回数据副本，即不更改原数据。

In [76]:
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])

In [77]:
df1.loc[dates[0]:dates[1], 'E'] = 1

In [78]:
df1

Unnamed: 0,A,B,C,D,E
2013-01-01,1.356362,0.0,-1.361947,-1.012874,1.0
2013-01-02,1.259089,0.866277,2.209152,-0.431029,1.0
2013-01-03,0.893994,-0.088249,0.353929,0.565249,
2013-01-04,-1.154422,-0.595723,1.122238,-0.027927,


#### 删除所有含缺失值的行

In [80]:
df1.dropna(how='any')

Unnamed: 0,A,B,C,D,E
2013-01-01,1.356362,0.0,-1.361947,-1.012874,1.0
2013-01-02,1.259089,0.866277,2.209152,-0.431029,1.0


#### 填充缺失值

In [81]:
df1.fillna(value=5)

Unnamed: 0,A,B,C,D,E
2013-01-01,1.356362,0.0,-1.361947,-1.012874,1.0
2013-01-02,1.259089,0.866277,2.209152,-0.431029,1.0
2013-01-03,0.893994,-0.088249,0.353929,0.565249,5.0
2013-01-04,-1.154422,-0.595723,1.122238,-0.027927,5.0


#### 提取nan值的布尔掩码

In [83]:
pd.isna(df1)

Unnamed: 0,A,B,C,D,E
2013-01-01,False,False,False,False,False
2013-01-02,False,False,False,False,False
2013-01-03,False,False,False,False,True
2013-01-04,False,False,False,False,True


## 运算

#### 不同维度对象运算时，要先对齐。 此外，Pandas 自动沿指定维度广播。

In [92]:
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates).shift(1)

In [101]:
s

2013-01-01    NaN
2013-01-02    1.0
2013-01-03    3.0
2013-01-04    5.0
2013-01-05    NaN
2013-01-06    6.0
Freq: D, dtype: float64

### Apply函数

In [100]:
df.apply(lambda x: x.max() - x.min())

A    2.510784
B    1.462000
C    3.571100
D    2.547368
dtype: float64