# 10 Minutes to pandas


通常，我们导入如下：

In [1]:
import numpy as np
import pandas as pd

## 1）对象创建

### 1.1）创建一个 `Series` 对象
通过传递一个 list 参数；pandas 会默认创建一个整数索引：


In [2]:
# Build a Series from a list; no index is passed, so a default integer index is used.
# np.nan marks a missing value.
s = pd.Series([1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

### 1.2）创建一个 `DataFrame` 对象

#### 1.2.1）通过传递带有日期时间索引和标记列的NumPy数组来创建：

In [8]:
dates = pd.date_range('20130101', periods=6)  # 6-entry DatetimeIndex starting at 2013-01-01
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [9]:
# 6x4 frame of random normal values, indexed by `dates`; columns=list('ABCD') labels the columns A-D.
# NOTE(review): the random generator is not seeded here, so the values differ on every run.
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2013-01-01,-2.575608,0.717251,-2.230937,0.979498
2013-01-02,-1.671455,-0.310758,-2.345234,-0.13226
2013-01-03,0.349076,1.392867,0.141277,-1.549281
2013-01-04,0.650115,-0.757204,0.811333,-0.898107
2013-01-05,-0.262156,1.284219,0.332757,0.072827
2013-01-06,-0.912691,0.681862,-2.108169,0.277249


---

#### 1.2.2）DataFrame通过传递一个 dict 来创建。

In [10]:
# Build a DataFrame from a dict: each key becomes a column name.
# The scalar entries ('A', 'B', 'F') are repeated on every one of the 4 rows (see the output below).
df2 = pd.DataFrame({'A': 1.,
                    'B': pd.Timestamp('20130102'),
                    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
                    'D': np.array([3] * 4, dtype='int32'),
                    'E': pd.Categorical(["test", "train", "test", "train"]),
                    'F': 'foo'})

df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


上述创建的 DataFrame 的每列都具有不同的 dtypes（可以理解为类型）。

In [6]:
df2.dtypes  # dtype of each column

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

>小技巧：如果您正在使用 IPython，列名（以及公共属性）的 Tab 键自动补全会自动启用。以下是可补全属性的一部分：  
>`df2.<TAB>  # 按 TAB 键`


## 2）查看数据

### 2.1）查看框架的顶行和底行

In [14]:
df.head(2)  # First 2 rows (the default is 5)

Unnamed: 0,A,B,C,D
2013-01-01,-2.575608,0.717251,-2.230937,0.979498
2013-01-02,-1.671455,-0.310758,-2.345234,-0.13226


In [16]:
df.tail(3)  # Last 3 rows (the default is 5)

Unnamed: 0,A,B,C,D
2013-01-04,0.650115,-0.757204,0.811333,-0.898107
2013-01-05,-0.262156,1.284219,0.332757,0.072827
2013-01-06,-0.912691,0.681862,-2.108169,0.277249


### 2.2）显示索引 和 列

In [17]:
df.index  # Row labels (the DatetimeIndex built above)

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [18]:
df.columns  # Column labels

Index(['A', 'B', 'C', 'D'], dtype='object')

### 2.3）DataFrame.to_numpy() 给出基础数据的 NumPy 表示

请注意，当您的 DataFrame 拥有不同数据类型的列时，此操作的开销可能很大。  

这可归结为pandas和NumPy之间的根本差异：
1. NumPy数组对整个数组只有一个dtype，
2. 而 pandas DataFrames 每列有一个dtype。

#### 2.3.1）因为 df 的所有列类型均为浮点值，所以 DataFrame.to_numpy()很快，不需要复制数据。

In [19]:
df.to_numpy()  # The frame's values as a NumPy array (index and column labels are not included)

array([[-2.57560769,  0.71725113, -2.2309373 ,  0.97949774],
       [-1.67145514, -0.31075815, -2.3452338 , -0.13226038],
       [ 0.34907616,  1.39286725,  0.14127714, -1.54928148],
       [ 0.65011491, -0.75720392,  0.8113333 , -0.89810743],
       [-0.2621556 ,  1.28421949,  0.33275659,  0.07282717],
       [-0.91269059,  0.68186199, -2.10816891,  0.27724943]])

#### 2.3.2）因为 df2 这个 DataFrame 具有多个 dtypes，DataFrame.to_numpy() 的开销相对较大。

In [20]:
df2.to_numpy()  # Mixed column dtypes, so the result is a single object-dtype array

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

>__注意: DataFrame.to_numpy()不输出索引或列标签__

### 2.4）describe() 显示数据的快速统计摘要 


In [21]:
df.describe()  # Per-column summary: count, mean, std, min, quartiles, max

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.73712,0.501373,-0.899829,-0.208346
std,1.233992,0.863755,1.473253,0.895042
min,-2.575608,-0.757204,-2.345234,-1.549281
25%,-1.481764,-0.062603,-2.200245,-0.706646
50%,-0.587423,0.699557,-0.983446,-0.029717
75%,0.196268,1.142477,0.284887,0.226144
max,0.650115,1.392867,0.811333,0.979498


### 2.5）转置数据


In [22]:
df.T  # Transpose: row labels become columns and column labels become rows

Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00
A,-2.575608,-1.671455,0.349076,0.650115,-0.262156,-0.912691
B,0.717251,-0.310758,1.392867,-0.757204,1.284219,0.681862
C,-2.230937,-2.345234,0.141277,0.811333,0.332757,-2.108169
D,0.979498,-0.13226,-1.549281,-0.898107,0.072827,0.277249


### 2.6）排序

#### 2.6.1）按轴排序

In [29]:
df.sort_index(axis=1, ascending=False)  # axis=0 is the index (rows), axis=1 is the columns; ascending=False sorts descending (ascending is the default)

Unnamed: 0,D,C,B,A
2013-01-01,0.979498,-2.230937,0.717251,-2.575608
2013-01-02,-0.13226,-2.345234,-0.310758,-1.671455
2013-01-03,-1.549281,0.141277,1.392867,0.349076
2013-01-04,-0.898107,0.811333,-0.757204,0.650115
2013-01-05,0.072827,0.332757,1.284219,-0.262156
2013-01-06,0.277249,-2.108169,0.681862,-0.912691


#### 2.6.2）按值排序

In [30]:
df.sort_values(by='B')  # Sort the rows by the values in column B (ascending)

Unnamed: 0,A,B,C,D
2013-01-04,0.650115,-0.757204,0.811333,-0.898107
2013-01-02,-1.671455,-0.310758,-2.345234,-0.13226
2013-01-06,-0.912691,0.681862,-2.108169,0.277249
2013-01-01,-2.575608,0.717251,-2.230937,0.979498
2013-01-05,-0.262156,1.284219,0.332757,0.072827
2013-01-03,0.349076,1.392867,0.141277,-1.549281


In [32]:
# NOTE(review): evaluating df.at without a key only displays the indexer object's repr;
# this looks like a leftover exploratory cell — confirm whether it belongs in the tutorial.
df.at

<pandas.core.indexing._AtIndexer at 0x11eda5db8>

## 3）选择

注意：虽然用于选择和设置的标准 Python / NumPy 表达式很直观，在交互式工作中也很方便，但对于生产代码，我们建议使用经过优化的 pandas 数据访问方法：.at、.iat、.loc 和 .iloc。
请参阅索引相关文档：“索引和选择数据”以及“MultiIndex / 高级索引”。

### 3.1）获得

#### 3.1.1）选择一个列，产生一个Series

In [36]:
df['A']  # Equivalent to df.A

2013-01-01   -2.575608
2013-01-02   -1.671455
2013-01-03    0.349076
2013-01-04    0.650115
2013-01-05   -0.262156
2013-01-06   -0.912691
Freq: D, Name: A, dtype: float64

#### 3.1.2）对行进行切片

In [35]:
df[0:3]  # A slice inside [] selects rows: here the first 3

Unnamed: 0,A,B,C,D
2013-01-01,-2.575608,0.717251,-2.230937,0.979498
2013-01-02,-1.671455,-0.310758,-2.345234,-0.13226
2013-01-03,0.349076,1.392867,0.141277,-1.549281


### 3.2）按标签（索引名 和 列名）选择：`.loc` 的使用


In [56]:
df  # Show the full frame again for reference

Unnamed: 0,A,B,C,D
2013-01-01,-2.575608,0.717251,-2.230937,0.979498
2013-01-02,-1.671455,-0.310758,-2.345234,-0.13226
2013-01-03,0.349076,1.392867,0.141277,-1.549281
2013-01-04,0.650115,-0.757204,0.811333,-0.898107
2013-01-05,-0.262156,1.284219,0.332757,0.072827
2013-01-06,-0.912691,0.681862,-2.108169,0.277249


In [57]:
dates  # Show the index labels again for reference

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

---

In [58]:
df.loc[dates[0]]  # First row, selected by its label

A   -2.575608
B    0.717251
C   -2.230937
D    0.979498
Name: 2013-01-01 00:00:00, dtype: float64

In [59]:
df.loc[:, ['A', 'B']]  # All rows, columns A and B

Unnamed: 0,A,B
2013-01-01,-2.575608,0.717251
2013-01-02,-1.671455,-0.310758
2013-01-03,0.349076,1.392867
2013-01-04,0.650115,-0.757204
2013-01-05,-0.262156,1.284219
2013-01-06,-0.912691,0.681862


---

In [60]:
df.loc['20130102':'20130104', ['A', 'B']]  # Rows '20130102' through '20130104' (both endpoints included), columns A and B

Unnamed: 0,A,B
2013-01-02,-1.671455,-0.310758
2013-01-03,0.349076,1.392867
2013-01-04,0.650115,-0.757204


---

In [61]:
df.loc['20130102', ['A', 'B']]  # Row '20130102', columns A and B

A   -1.671455
B   -0.310758
Name: 2013-01-02 00:00:00, dtype: float64

---

In [62]:
df.loc[dates[0], 'A']  # First row, column A; equivalent to df.at[dates[0], 'A']

-2.5756076928104554

---

### 3.3）按位置选择，`.iloc` 的使用

#### 3.3.1）通过传递的整数的位置选择

In [46]:
df  # Show the full frame again for reference

Unnamed: 0,A,B,C,D
2013-01-01,-2.575608,0.717251,-2.230937,0.979498
2013-01-02,-1.671455,-0.310758,-2.345234,-0.13226
2013-01-03,0.349076,1.392867,0.141277,-1.549281
2013-01-04,0.650115,-0.757204,0.811333,-0.898107
2013-01-05,-0.262156,1.284219,0.332757,0.072827
2013-01-06,-0.912691,0.681862,-2.108169,0.277249


In [48]:
df.iloc[3]  # 4th row (positions are 0-based)

A    0.650115
B   -0.757204
C    0.811333
D   -0.898107
Name: 2013-01-04 00:00:00, dtype: float64

#### 3.3.2）通过整数切片，类似于numpy / python：

In [49]:
df.iloc[3:5, 0:2]  # Half-open slices: 4th and 5th rows, 1st and 2nd columns (positions are 0-based, so 0 is the 1st)

Unnamed: 0,A,B
2013-01-04,0.650115,-0.757204
2013-01-05,-0.262156,1.284219


#### 3.3.3）通过整数位置列表，类似于numpy / python样式：

In [50]:
 df.iloc[[1, 2, 4], [0, 2]]  # 2nd, 3rd and 5th rows; 1st and 3rd columns (A and C)

Unnamed: 0,A,C
2013-01-02,-1.671455,-2.345234
2013-01-03,0.349076,0.141277
2013-01-05,-0.262156,0.332757


#### 3.3.4）显式地对行切片、对列切片或获取单个值

In [51]:
df.iloc[1:3, :]  # 2nd and 3rd rows, all columns

Unnamed: 0,A,B,C,D
2013-01-02,-1.671455,-0.310758,-2.345234,-0.13226
2013-01-03,0.349076,1.392867,0.141277,-1.549281


In [52]:
df.iloc[:, 1:3]  # All rows, 2nd and 3rd columns (B and C)

Unnamed: 0,B,C
2013-01-01,0.717251,-2.230937
2013-01-02,-0.310758,-2.345234
2013-01-03,1.392867,0.141277
2013-01-04,-0.757204,0.811333
2013-01-05,1.284219,0.332757
2013-01-06,0.681862,-2.108169


In [55]:
df.iloc[1, 1] # 2nd row, 2nd column; equivalent to df.iat[1, 1]

-0.31075815368684456