In [3]:
import numpy as np
import pandas as pd
from pandas import Series
from pandas import DataFrame

### DataFrame定义
- DataFrame是一个表格型的数据结构，含有一组有序的列，每列可以是不同的值类型
- DataFrame既有行索引也有列索引，可以被看做由Series组成的字典
- DataFrame中的数据是以一个或多个二维块存放的

### DataFrame创建

#### 传入等长列表或Numpy数组组成的字典

In [10]:
# Column-oriented input: each key is a column label and each list holds that
# column's values (all lists have the same length).
data = {
    'state': ['Ohio'] * 3 + ['Nevada'] * 2,
    'year': [2000, 2001, 2002, 2001, 2002],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9],
}
frame = DataFrame(data=data)
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


注：DataFrame会自动加上索引，且全部列会被有序排列

In [8]:
# Passing columns= fixes the column order: the DataFrame's columns are
# arranged in exactly the order given.

DataFrame(data, columns=['year', 'state', 'pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9


In [14]:
# 'debt' is not a key of data, so that column is filled with NA values;
# index= supplies the row labels.

frame2 = DataFrame(
    data,
    index=['one', 'two', 'three', 'four', 'five'],
    columns=['year', 'state', 'pop', 'debt'],
)
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,


In [15]:
# Column labels of frame2, returned as an Index object.
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

#### 嵌套字典
外层字典的键作为列，内层键作为行索引

In [38]:
# Nested dict: the outer keys become column labels and the inner keys become
# row labels; combinations missing from the inner dicts show up as NaN.
pop = {
    'Nevada': {2001: 2.4, 2002: 2.9},
    'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6},
}
frame3 = DataFrame(data=pop)
frame3

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [43]:
# Explicit index: only the listed row labels are kept. 2003 does not occur
# in pop, so its row is entirely NaN.

DataFrame(pop, index = [2001, 2002, 2003])

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2003,,


#### 由Series组成的字典

In [44]:
# Build a DataFrame from a dict of Series; the result holds only the row
# labels that the sliced Series carry.
# .iloc makes the slicing explicitly positional: frame3's index holds integer
# labels (years), so a bare [] slice is easy to misread as label-based.
pdata = {'Ohio': frame3['Ohio'].iloc[:-1],
         'Nevada': frame3['Nevada'].iloc[:2]}
DataFrame(pdata)

Unnamed: 0,Ohio,Nevada
2001,1.7,2.4
2002,3.6,2.9


### 索引列
通过字典标记或属性的方式，将DataFrame的列获取为一个Series

In [16]:
# Dict-style access: returns the 'state' column as a Series.
frame2['state']

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
Name: state, dtype: object

In [17]:
# Attribute-style access: returns the same 'state' column as a Series.
frame2.state

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
Name: state, dtype: object

### 索引行

In [19]:
# Label-based row lookup: returns the row labelled 'three' as a Series whose
# index is the column labels.
frame2.loc['three']

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object

### 修改列中值

#### 直接赋值

In [21]:
# Assigning a scalar to a column sets that value in every row.
frame2['debt'] = 16.5
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,16.5
two,2001,Ohio,1.7,16.5
three,2002,Ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5


In [22]:
# Assigning an array with one element per row (5 here) fills the column
# element by element.
# 'debt' already exists as a column of frame2, so attribute syntax targets it.
# NOTE(review): for a name that is not yet a column, pandas is documented to
# set a plain attribute instead of creating a column - confirm before reusing
# this pattern; frame2['debt'] = ... is the unambiguous form.
frame2.debt = np.arange(5)
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0
two,2001,Ohio,1.7,1
three,2002,Ohio,3.6,2
four,2001,Nevada,2.4,3
five,2002,Nevada,2.9,4


#### 通过Series赋值

In [27]:
# A list or array can be assigned to a column directly. When a Series is
# assigned instead, it is matched to the DataFrame's index by label: rows
# whose label is missing from the Series ('one', 'three') get NaN.

val = Series([-1.2, -1.5, -1.7], index = ['two', 'four', 'five'])
frame2['debt'] = val
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7


#### 为不存在的列赋值

In [32]:
# Assigning to a column label that does not exist yet creates a new column.
# Here 'eastern' holds the boolean result of comparing state with 'Ohio'.

frame2['eastern'] = frame2.state == 'Ohio'
frame2

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,-1.2,True
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,-1.5,False
five,2002,Nevada,2.9,-1.7,False


#### 删除列

In [33]:
# The del keyword removes a column from the DataFrame.

del frame2['eastern']

In [34]:
# Display frame2 again: the 'eastern' column is gone.
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7


### 将DataFrame转置

In [40]:
# Show frame3 as it is before transposing, for comparison.
frame3

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [41]:
# .T swaps rows and columns: the states become the row labels and the years
# become the column labels.
frame3.T

Unnamed: 0,2001,2002,2000
Nevada,2.4,2.9,
Ohio,1.7,3.6,1.5


### name属性
DataFrame的index和columns都有name属性

In [46]:
# Give the row index and the column index a name; both names appear in the
# displayed frame.
frame3.index.name = 'year'
frame3.columns.name = 'state'
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


### values属性

In [47]:
# .values returns the frame's data as a two-dimensional NumPy array.
frame3.values

array([[2.4, 1.7],
       [2.9, 3.6],
       [nan, 1.5]])

In [48]:
# frame2's columns hold different types (ints, strings, floats), so the
# resulting array has dtype=object.
frame2.values

array([[2000, 'Ohio', 1.5, nan],
       [2001, 'Ohio', 1.7, -1.2],
       [2002, 'Ohio', 3.6, nan],
       [2001, 'Nevada', 2.4, -1.5],
       [2002, 'Nevada', 2.9, -1.7]], dtype=object)

### 索引对象
pandas的**索引**对象负责管理轴标签和其他元数据（比如轴名称）。构建Series或DataFrame时，所用到的任何数组或其他序列的标签都会被转换成一个index（索引对象）。

#### 索引对象类型

In [4]:
# Build a small labelled Series and keep a reference to its Index object.
obj = Series(index=['a', 'b', 'c'], data=range(3))
index = obj.index
index

Index(['a', 'b', 'c'], dtype='object')

#### 索引对象不可修改

In [7]:
# A Series' index can be replaced wholesale with a new set of labels, but the
# contents of an existing Index object cannot be modified.

obj.index = ['x', 'y', 'z']
obj

x    0
y    1
z    2
dtype: int64

In [6]:
# Index objects are immutable: item assignment raises TypeError.
# The error is caught and printed so that this demonstration does not abort
# a "Restart Kernel -> Run All" of the notebook.

try:
    index[1] = 'd'
except TypeError as err:
    print('TypeError: {}'.format(err))

TypeError: Index does not support mutable operations

#### 不可修改性使index可在多个数据结构之间安全共享

In [53]:
# The integer array [0, 1, 2] that serves as index labels in the next cell.
np.arange(3)

array([0, 1, 2])

In [56]:
# Build an Index object directly from an integer array.
# Note: this rebinds the name index, which earlier referred to obj.index.
index = pd.Index(np.arange(3))
index

Int64Index([0, 1, 2], dtype='int64')

In [60]:
# Pass the existing Index object to the constructor, then check whether the
# Series' index is that very same object (identity, not just equality).
obj2 = Series([1.5, -2.5, 0], index = index)
obj2.index is index

True

#### 索引的方法及属性

In [61]:
# Show frame3 again as a reference for the membership checks that follow.
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [64]:
# Membership test against the row labels.
2000 in frame3.index

True

In [65]:
# Membership test against the column labels.
'Ohio' in frame3.columns

True