# 10 Minutes to pandas


通常，我们导入如下：

In [1]:
import numpy as np
import pandas as pd

## 1）对象创建

### 1.1）创建一个 `Series` 对象
通过传递一个 list 参数；pandas 会默认创建一个整数索引：


In [2]:
s = pd.Series([1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

### 1.2）创建一个 `DataFrame` 对象

#### 1.2.1）通过传递带有日期时间索引和标记列的NumPy数组来创建：

In [3]:
dates = pd.date_range('20130101', periods=6)  # 带有日期时间索引
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [4]:
# Build a 6x4 frame of random values, indexed by `dates`, with columns labelled 'A'..'D'.
# NOTE(review): no random seed is set in the visible cells, so the values shown in the
# outputs will presumably differ on a re-run — consider seeding NumPy beforehand.
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))  # columns=list('ABCD') labels the columns
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.02849,0.100338,-1.628902,-0.775521
2013-01-02,-0.821663,-0.812546,0.661892,0.143146
2013-01-03,1.307401,1.79947,1.323906,0.372228
2013-01-04,-0.385808,-1.052066,0.113939,-1.533184
2013-01-05,0.107048,-0.064178,-1.239068,-0.519345
2013-01-06,-1.892185,-0.496382,-0.627257,1.034214


---

#### 1.2.2）DataFrame通过传递一个 dict 来创建。

In [5]:
# Build a DataFrame column by column: every keyword below becomes one column.
# Scalar values ('A', 'B', 'F') are repeated for each row, while the array-like
# values ('C', 'D', 'E') each supply four entries.
df2 = pd.DataFrame(dict(
    A=1.,
    B=pd.Timestamp('20130102'),
    C=pd.Series(1, index=list(range(4)), dtype='float32'),
    D=np.array([3] * 4, dtype='int32'),
    E=pd.Categorical(["test", "train", "test", "train"]),
    F='foo',
))

df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


上述创建的 DataFrame 的每列都具有不同的 dtypes（可以理解为类型）。

In [6]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

>小技巧：如果您正在使用 IPython，则会自动启用列名（以及公共属性）的 Tab 键补全。以下是可被补全的属性的一个子集：  
>`df2.<TAB>  # 按 TAB 键`


## 2）查看数据

### 2.1）查看 DataFrame 的顶部和底部几行

In [7]:
df.head(2)  # 看顶2行，默认值为 5

Unnamed: 0,A,B,C,D
2013-01-01,-0.02849,0.100338,-1.628902,-0.775521
2013-01-02,-0.821663,-0.812546,0.661892,0.143146


In [8]:
df.tail(3)  # 看底3行，默认值为 5

Unnamed: 0,A,B,C,D
2013-01-04,-0.385808,-1.052066,0.113939,-1.533184
2013-01-05,0.107048,-0.064178,-1.239068,-0.519345
2013-01-06,-1.892185,-0.496382,-0.627257,1.034214


### 2.2）显示索引 和 列

In [9]:
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [10]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

### 2.3）DataFrame.to_numpy() 给出基础数据的 NumPy 表示

请注意，当您的 DataFrame 拥有不同数据类型的列时，此操作的开销可能很大。  

这可归结为pandas和NumPy之间的根本差异：
1. NumPy数组对整个数组只有一个dtype，
2. 而 pandas DataFrames 每列有一个dtype。

#### 2.3.1）因为 df 的所有列类型均为浮点值，所以 DataFrame.to_numpy()很快，不需要复制数据。

In [11]:
df.to_numpy()

array([[-0.02848982,  0.10033819, -1.62890227, -0.7755215 ],
       [-0.82166294, -0.81254571,  0.6618915 ,  0.14314569],
       [ 1.30740088,  1.79946954,  1.32390612,  0.37222779],
       [-0.38580761, -1.05206638,  0.11393888, -1.53318409],
       [ 0.10704784, -0.06417839, -1.23906752, -0.51934487],
       [-1.89218493, -0.49638195, -0.62725726,  1.03421425]])

#### 2.3.2）因为 df2 的各列具有多种 dtypes，DataFrame.to_numpy() 的开销相对较大。

In [12]:
df2.to_numpy()

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

>__注意: DataFrame.to_numpy()不输出索引或列标签__

### 2.4）describe() 显示数据的快速统计摘要 


In [13]:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.285616,-0.087561,-0.232582,-0.213077
std,1.061739,1.021738,1.136777,0.914116
min,-1.892185,-1.052066,-1.628902,-1.533184
25%,-0.712699,-0.733505,-1.086115,-0.711477
50%,-0.207149,-0.28028,-0.256659,-0.1881
75%,0.073163,0.059209,0.524903,0.314957
max,1.307401,1.79947,1.323906,1.034214


### 2.5）转置数据


In [14]:
df.T

Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00
A,-0.02849,-0.821663,1.307401,-0.385808,0.107048,-1.892185
B,0.100338,-0.812546,1.79947,-1.052066,-0.064178,-0.496382
C,-1.628902,0.661892,1.323906,0.113939,-1.239068,-0.627257
D,-0.775521,0.143146,0.372228,-1.533184,-0.519345,1.034214


### 2.6）排序

#### 2.6.1）按轴排序

In [15]:
df.sort_index(axis=1, ascending=False)  # axis = 0，索引轴；axis = 1，列轴。 ascending=False，降序；默认为升序。 

Unnamed: 0,D,C,B,A
2013-01-01,-0.775521,-1.628902,0.100338,-0.02849
2013-01-02,0.143146,0.661892,-0.812546,-0.821663
2013-01-03,0.372228,1.323906,1.79947,1.307401
2013-01-04,-1.533184,0.113939,-1.052066,-0.385808
2013-01-05,-0.519345,-1.239068,-0.064178,0.107048
2013-01-06,1.034214,-0.627257,-0.496382,-1.892185


#### 2.6.2）按值排序

In [16]:
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2013-01-04,-0.385808,-1.052066,0.113939,-1.533184
2013-01-02,-0.821663,-0.812546,0.661892,0.143146
2013-01-06,-1.892185,-0.496382,-0.627257,1.034214
2013-01-05,0.107048,-0.064178,-1.239068,-0.519345
2013-01-01,-0.02849,0.100338,-1.628902,-0.775521
2013-01-03,1.307401,1.79947,1.323906,0.372228


In [17]:
# NOTE(review): this cell only displays the repr of the `.at` indexer object and sits
# under the sorting section — it looks like a leftover exploratory cell; confirm whether
# it is still needed.
df.at

<pandas.core.indexing._AtIndexer at 0x1156f8408>

## 3）选择

注意：虽然用于选择和赋值的标准 Python / NumPy 表达式很直观，在交互式工作中也很方便，但对于生产代码，我们建议使用经过优化的 pandas 数据访问方法：`.at`、`.iat`、`.loc` 和 `.iloc`。
请参阅索引文档《索引和选择数据》以及《MultiIndex / 高级索引》。

### 3.1）获得

#### 3.1.1）选择一个列，产生一个Series

In [18]:
df['A']  # 相当于 df.A

2013-01-01   -0.028490
2013-01-02   -0.821663
2013-01-03    1.307401
2013-01-04   -0.385808
2013-01-05    0.107048
2013-01-06   -1.892185
Freq: D, Name: A, dtype: float64

#### 3.1.2）对行进行切片

In [19]:
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,-0.02849,0.100338,-1.628902,-0.775521
2013-01-02,-0.821663,-0.812546,0.661892,0.143146
2013-01-03,1.307401,1.79947,1.323906,0.372228


### 3.2）按标签（索引名 和 列名）选择：`.loc` 的使用


In [20]:
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.02849,0.100338,-1.628902,-0.775521
2013-01-02,-0.821663,-0.812546,0.661892,0.143146
2013-01-03,1.307401,1.79947,1.323906,0.372228
2013-01-04,-0.385808,-1.052066,0.113939,-1.533184
2013-01-05,0.107048,-0.064178,-1.239068,-0.519345
2013-01-06,-1.892185,-0.496382,-0.627257,1.034214


In [21]:
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

---

In [22]:
df.loc[dates[0]]  # 第1行

A   -0.028490
B    0.100338
C   -1.628902
D   -0.775521
Name: 2013-01-01 00:00:00, dtype: float64

In [23]:
df.loc[:, ['A', 'B']]  # 所有行，A列和B列

Unnamed: 0,A,B
2013-01-01,-0.02849,0.100338
2013-01-02,-0.821663,-0.812546
2013-01-03,1.307401,1.79947
2013-01-04,-0.385808,-1.052066
2013-01-05,0.107048,-0.064178
2013-01-06,-1.892185,-0.496382


In [24]:
df.loc['20130102':'20130104', ['A', 'B']]  # '20130102':'20130104'（闭区间），A列和B列

Unnamed: 0,A,B
2013-01-02,-0.821663,-0.812546
2013-01-03,1.307401,1.79947
2013-01-04,-0.385808,-1.052066


In [25]:
df.loc['20130102', ['A', 'B']]  # '20130102'行，A列和B列

A   -0.821663
B   -0.812546
Name: 2013-01-02 00:00:00, dtype: float64

In [26]:
df.loc[dates[0], 'A']  # 第1行，A列，等效于 df.at[dates[0], 'A']

-0.028489818275668315

---

### 3.3）按位置（整数）选择，`.iloc` 的使用

In [27]:
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.02849,0.100338,-1.628902,-0.775521
2013-01-02,-0.821663,-0.812546,0.661892,0.143146
2013-01-03,1.307401,1.79947,1.323906,0.372228
2013-01-04,-0.385808,-1.052066,0.113939,-1.533184
2013-01-05,0.107048,-0.064178,-1.239068,-0.519345
2013-01-06,-1.892185,-0.496382,-0.627257,1.034214


In [28]:
df.iloc[3]  # 第4行

A   -0.385808
B   -1.052066
C    0.113939
D   -1.533184
Name: 2013-01-04 00:00:00, dtype: float64

In [29]:
df.iloc[3:5, 0:2]  # 左闭右开；第4行和第5行，第1列和第2列

Unnamed: 0,A,B
2013-01-04,-0.385808,-1.052066
2013-01-05,0.107048,-0.064178


In [30]:
 df.iloc[[1, 2, 4], [0, 2]]  # rows 2, 3 and 5; columns 1 and 3 (positions 0 and 2, i.e. 'A' and 'C')

Unnamed: 0,A,C
2013-01-02,-0.821663,0.661892
2013-01-03,1.307401,1.323906
2013-01-05,0.107048,-1.239068


In [31]:
df.iloc[1:3, :]  # 第2，3行

Unnamed: 0,A,B,C,D
2013-01-02,-0.821663,-0.812546,0.661892,0.143146
2013-01-03,1.307401,1.79947,1.323906,0.372228


In [32]:
df.iloc[:, 1:3]  # 第2，3列

Unnamed: 0,B,C
2013-01-01,0.100338,-1.628902
2013-01-02,-0.812546,0.661892
2013-01-03,1.79947,1.323906
2013-01-04,-1.052066,0.113939
2013-01-05,-0.064178,-1.239068
2013-01-06,-0.496382,-0.627257


In [33]:
df.iloc[1, 1] # 第2行第2列，相当于  df.iat[1, 1]

-0.8125457126973459

---

### 3.4）布尔索引


In [34]:
df[df.A > 0]  # 所有A列值大于0的行

Unnamed: 0,A,B,C,D
2013-01-03,1.307401,1.79947,1.323906,0.372228
2013-01-05,0.107048,-0.064178,-1.239068,-0.519345


In [35]:
df[df > 0]  # 所有大于0的值，不满足的置为 NaN

Unnamed: 0,A,B,C,D
2013-01-01,,0.100338,,
2013-01-02,,,0.661892,0.143146
2013-01-03,1.307401,1.79947,1.323906,0.372228
2013-01-04,,,0.113939,
2013-01-05,0.107048,,,
2013-01-06,,,,1.034214


In [36]:
df[df < 0]

Unnamed: 0,A,B,C,D
2013-01-01,-0.02849,,-1.628902,-0.775521
2013-01-02,-0.821663,-0.812546,,
2013-01-03,,,,
2013-01-04,-0.385808,-1.052066,,-1.533184
2013-01-05,,-0.064178,-1.239068,-0.519345
2013-01-06,-1.892185,-0.496382,-0.627257,


In [37]:
# Derive a new frame from `df` carrying an extra string column 'E'; `df` itself is not
# modified. Note that this rebinds the name `df2`.
df2 = df.assign(E=['one', 'one', 'two', 'three', 'four', 'three'])
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,-0.02849,0.100338,-1.628902,-0.775521,one
2013-01-02,-0.821663,-0.812546,0.661892,0.143146,one
2013-01-03,1.307401,1.79947,1.323906,0.372228,two
2013-01-04,-0.385808,-1.052066,0.113939,-1.533184,three
2013-01-05,0.107048,-0.064178,-1.239068,-0.519345,four
2013-01-06,-1.892185,-0.496382,-0.627257,1.034214,three


In [38]:
df2[df2['E'].isin(['two', 'four'])]  # 所有E列的值为 'two' 或'four' 的行

Unnamed: 0,A,B,C,D,E
2013-01-03,1.307401,1.79947,1.323906,0.372228,two
2013-01-05,0.107048,-0.064178,-1.239068,-0.519345,four
