## TOC

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


## 1. 数据生成 - Generate Data
### 1.1 通过 `pd.Series` 函数，依据 `list` 的内容构建数据

Creating a `Series` by passing a list of values

In [3]:
# NaN is a float, so a list that contains np.nan produces a float64 Series.
s = pd.Series([1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

### 1.2 通过 `DataFrame`，依据 `NumPy` 的内容构建数据
Creating a `DataFrame` by passing a NumPy array
- `index` is generated by `pd.date_range`
- `columns` is defined by `list('ABCD')`

In [4]:
# Six consecutive daily timestamps starting on 2013-01-01; used as the row index of `df`.
dates = pd.date_range(start='20130101', periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [5]:
# 6x4 matrix of standard-normal samples, labelled by `dates` (rows) and A-D (columns).
# NOTE(review): no random seed is set in the visible cells, so these values
# will differ from the recorded output on every re-run.
df = pd.DataFrame(
    np.random.randn(6, 4),
    index=dates,
    columns=list('ABCD'),
)
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.457535,-1.583202,-0.477675,-0.227034
2013-01-02,0.541023,-0.432149,-0.682263,-0.863532
2013-01-03,0.082854,-1.478781,-0.581225,-0.160362
2013-01-04,0.580847,-0.692669,-1.201673,1.265452
2013-01-05,-1.967342,1.299409,-0.49987,2.903616
2013-01-06,1.440159,-0.474128,2.118362,1.278324


### 1.3 通过 `DataFrame`，依据 `dict` 内容构建数据
Creating a `DataFrame` by passing a dict of objects, which can hold different types of data

In [6]:
# Each dict entry becomes one column. Scalar values ('A', 'B', 'F') are
# broadcast to the length of the array-like values (4 rows here), and every
# column keeps its own dtype.
df2 = pd.DataFrame(
    {
        'A': 1.,
        'B': pd.Timestamp('20130102'),
        'C': pd.Series(1, index=list(range(4)), dtype='float32'),
        'D': np.array([3] * 4, dtype='int32'),
        'E': pd.Categorical(["test", "train", "test", "train"]),
        'F': 'foo',
    }
)
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


### 1.4 构建相同内容的列数据

`pd.Series` builds a column from a single value (here `1`) and an `index` (here ranging from 0 to 3); the value is repeated for every index entry.

In [7]:
# The scalar 1 is repeated for every label in the index (0..3); dtype is forced to float32.
pd.Series(1,index=list(range(4)),dtype='float32')

0    1.0
1    1.0
2    1.0
3    1.0
dtype: float32

### 1.5 查看每一列数据的具体类型
Check the `dtypes` of each column.

In [8]:
# One dtype per column; the result is itself a Series indexed by column name.
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

## 2. 数据显示 - Viewing Data

### 2.1 显示头五行数据
`df.head()` shows the first five rows of the dataset.

In [9]:
# With no argument, head() returns the first 5 rows.
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,-0.457535,-1.583202,-0.477675,-0.227034
2013-01-02,0.541023,-0.432149,-0.682263,-0.863532
2013-01-03,0.082854,-1.478781,-0.581225,-0.160362
2013-01-04,0.580847,-0.692669,-1.201673,1.265452
2013-01-05,-1.967342,1.299409,-0.49987,2.903616


### 2.2 显示最后几行数据

In [10]:
# tail(n) returns the last n rows - here the final 3.
df.tail(3)

Unnamed: 0,A,B,C,D
2013-01-04,0.580847,-0.692669,-1.201673,1.265452
2013-01-05,-1.967342,1.299409,-0.49987,2.903616
2013-01-06,1.440159,-0.474128,2.118362,1.278324


### 2.3 分别显示`index`，`column` 和 `value` 信息

In [11]:
# Row labels: the DatetimeIndex that was passed as `index` when df was built.
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [12]:
# Column labels, returned as an Index object.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [13]:
# The underlying data as a 2-D NumPy array, without the index or column labels.
# NOTE(review): recent pandas releases recommend df.to_numpy() for this -
# confirm the pandas version in use before switching.
df.values

array([[-0.45753481, -1.58320176, -0.47767498, -0.2270338 ],
       [ 0.54102269, -0.43214862, -0.6822631 , -0.86353242],
       [ 0.08285384, -1.47878063, -0.58122455, -0.16036248],
       [ 0.58084715, -0.69266912, -1.20167287,  1.26545216],
       [-1.96734189,  1.29940854, -0.49986951,  2.90361642],
       [ 1.44015921, -0.47412753,  2.11836212,  1.2783241 ]])

### 2.4 数据信息的快速统计

In [14]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.036668,-0.560253,-0.220724,0.699411
std,1.164707,1.038251,1.176479,1.382343
min,-1.967342,-1.583202,-1.201673,-0.863532
25%,-0.322438,-1.282253,-0.657003,-0.210366
50%,0.311938,-0.583398,-0.540547,0.552545
75%,0.570891,-0.442643,-0.483224,1.275106
max,1.440159,1.299409,2.118362,2.903616


### 2.4 对数据进行转置 

In [15]:
# Transpose: rows and columns swap, so the dates become the column labels.
df.T

Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00
A,-0.457535,0.541023,0.082854,0.580847,-1.967342,1.440159
B,-1.583202,-0.432149,-1.478781,-0.692669,1.299409,-0.474128
C,-0.477675,-0.682263,-0.581225,-1.201673,-0.49987,2.118362
D,-0.227034,-0.863532,-0.160362,1.265452,2.903616,1.278324


### 2.5 按某个坐标轴进行排序

In [20]:
# axis=1 sorts by the column labels (not by values); ascending=False yields D, C, B, A.
# Returns a new frame; df itself is left unchanged.
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,-0.227034,-0.477675,-1.583202,-0.457535
2013-01-02,-0.863532,-0.682263,-0.432149,0.541023
2013-01-03,-0.160362,-0.581225,-1.478781,0.082854
2013-01-04,1.265452,-1.201673,-0.692669,0.580847
2013-01-05,2.903616,-0.49987,1.299409,-1.967342
2013-01-06,1.278324,2.118362,-0.474128,1.440159


### 2.6 按某一列的数据进行排序

In [21]:
# Reorder the rows by the values in column 'B' (ascending by default).
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2013-01-01,-0.457535,-1.583202,-0.477675,-0.227034
2013-01-03,0.082854,-1.478781,-0.581225,-0.160362
2013-01-04,0.580847,-0.692669,-1.201673,1.265452
2013-01-06,1.440159,-0.474128,2.118362,1.278324
2013-01-02,0.541023,-0.432149,-0.682263,-0.863532
2013-01-05,-1.967342,1.299409,-0.49987,2.903616


## 3. 选择想要的数据

### 3.1 选择某一列的数据

In [24]:
# Bracket selection with a single column label returns that column as a Series.
df['A']

2013-01-01   -0.457535
2013-01-02    0.541023
2013-01-03    0.082854
2013-01-04    0.580847
2013-01-05   -1.967342
2013-01-06    1.440159
Freq: D, Name: A, dtype: float64

In [25]:
# Attribute access gives the same Series as df['A'].
# It only works when the column name is a valid Python identifier that does
# not clash with an existing DataFrame attribute or method.
df.A

2013-01-01   -0.457535
2013-01-02    0.541023
2013-01-03    0.082854
2013-01-04    0.580847
2013-01-05   -1.967342
2013-01-06    1.440159
Freq: D, Name: A, dtype: float64

### 3.2 选择其中几行

In [26]:
# Integer slicing inside [] selects rows by position; the end position (3) is excluded.
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,-0.457535,-1.583202,-0.477675,-0.227034
2013-01-02,0.541023,-0.432149,-0.682263,-0.863532
2013-01-03,0.082854,-1.478781,-0.581225,-0.160362


In [27]:
# Slicing with date strings selects rows by label; both endpoints are included.
df['20130102':'20130104']

Unnamed: 0,A,B,C,D
2013-01-02,0.541023,-0.432149,-0.682263,-0.863532
2013-01-03,0.082854,-1.478781,-0.581225,-0.160362
2013-01-04,0.580847,-0.692669,-1.201673,1.265452
