In [1]:
#引入pandas
from pandas import Series,DataFrame
import pandas as pd
import numpy as np

In [2]:
data = {'state':['Ohio','Ohio','Ohio','Neveda','Neveda'],
       'year':[2000,2001,2002,2003,2004],
        'pop':[1.5,1.7,3.6,2.4,2.9] }
frame = DataFrame(data)
frame

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000
1,1.7,Ohio,2001
2,3.6,Ohio,2002
3,2.4,Neveda,2003
4,2.9,Neveda,2004


In [3]:
#指定了列序列，就会列就会按照指定的顺序排列
DataFrame(data,columns = ['year','state','pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2003,Neveda,2.4
4,2004,Neveda,2.9


In [4]:
#传入的列在数据中找不到，就会产生NA值
frame2 = DataFrame(data,columns = ['year','state','pop','debt'],index = ['one','two','three','four','five'])
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2003,Neveda,2.4,
five,2004,Neveda,2.9,


In [5]:
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [6]:
frame2['state']

one        Ohio
two        Ohio
three      Ohio
four     Neveda
five     Neveda
Name: state, dtype: object

In [7]:
frame2.year

one      2000
two      2001
three    2002
four     2003
five     2004
Name: year, dtype: int64

In [8]:
#赋值方式修改列
frame2['dept']= 16.5
frame2

Unnamed: 0,year,state,pop,debt,dept
one,2000,Ohio,1.5,,16.5
two,2001,Ohio,1.7,,16.5
three,2002,Ohio,3.6,,16.5
four,2003,Neveda,2.4,,16.5
five,2004,Neveda,2.9,,16.5


In [9]:
frame2['dept'] = np.arange(5.)
frame2

Unnamed: 0,year,state,pop,debt,dept
one,2000,Ohio,1.5,,0.0
two,2001,Ohio,1.7,,1.0
three,2002,Ohio,3.6,,2.0
four,2003,Neveda,2.4,,3.0
five,2004,Neveda,2.9,,4.0


In [10]:
#列表或者数组赋值给某个列，长度必须和DataFrame的长度匹配。
#如果赋值的是一个Series,就会精确匹配DataFrame的索引,所有的空位都将被填上缺失值
val= Series([-1.2,-1.5,-1.7],index = ['two','four','five'])
frame2['dept'] = val
frame2

Unnamed: 0,year,state,pop,debt,dept
one,2000,Ohio,1.5,,
two,2001,Ohio,1.7,,-1.2
three,2002,Ohio,3.6,,
four,2003,Neveda,2.4,,-1.5
five,2004,Neveda,2.9,,-1.7


In [11]:
#关键字dele用于删除列
frame2['eastern'] = frame2.state=='Ohio'
frame2

Unnamed: 0,year,state,pop,debt,dept,eastern
one,2000,Ohio,1.5,,,True
two,2001,Ohio,1.7,,-1.2,True
three,2002,Ohio,3.6,,,True
four,2003,Neveda,2.4,,-1.5,False
five,2004,Neveda,2.9,,-1.7,False


In [12]:
del frame2['eastern']
frame2.columns
#通过索引方式返回的是相应数据的试图，不是副本。所以，对返回的Series所做的任何就地的修改全都反应在源DataFrame上。通过Series的copy方法可以显示的复制列

Index(['year', 'state', 'pop', 'debt', 'dept'], dtype='object')

In [13]:
#嵌套字典,如果把它传给DataFrame,它就会解释为：外层字典的键是列，内层键作为索引
pop = {'Nevada':{2001:2.4,2002:2.9},
        'Ohio':{2000:1.5,2001:3.6}}
frame3 = DataFrame(pop)
frame3

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,3.6
2002,2.9,


In [14]:
#结果置换
frame3.T

Unnamed: 0,2000,2001,2002
Nevada,,2.4,2.9
Ohio,1.5,3.6,


In [15]:
frame3['Ohio'][:-1]

2000    1.5
2001    3.6
Name: Ohio, dtype: float64

In [17]:
#Series组成的字典
pdata = {'Ohio':frame3['Ohio'][-1:],'Nevada':frame3['Nevada'][:2]}
DataFrame(pdata)

Unnamed: 0,Nevada,Ohio
2000,,
2001,2.4,
2002,,


In [19]:
#设置DataFrame的index,columns 的name属性
frame3.index.name = 'year'
frame3.columns.name = 'state'
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,,1.5
2001,2.4,3.6
2002,2.9,


In [20]:
#values属性：和Series一样，会以二维ndarray的形式返回DataFrame数据
frame3.values

array([[ nan,  1.5],
       [ 2.4,  3.6],
       [ 2.9,  nan]])

In [21]:
##################索引对象：负责管理轴标签和其他元数据（比如轴名称）。

In [23]:
obj = Series(range(3),index = ['a','b','c'])
index = obj.index
index

Index(['a', 'b', 'c'], dtype='object')

In [25]:
#index对象不能修改
index = pd.Index(np.arange(3))
obj2 = Series([1.5,-2.5,0],index = index)
obj2.index is index

True

In [30]:
#重新索引reindex
#调用Series的reindex 会根据新的索引进行重排。如果某个索引值当前不存在，就会引入缺失值NaN
obj = Series([4.5,7.2,-5.3,3.6],index = ['d','b','a','c'])
obj2 = obj.reindex(['a','b','c','d','e'])
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [31]:
#fill_value=0 设置缺失值为0
obj.reindex(['a','b','c','d','e'],fill_value = 0)

a   -5.3
b    7.2
c    3.6
d    4.5
e    0.0
dtype: float64

In [32]:
#reindex的(插值)methond方法

In [41]:
#对于DataFrame，reindex可以修改（行）索引、列，或者两个都可以修改，如果仅仅传入一个序列，那么久默认是重新索引行
frame = DataFrame(np.arange(9).reshape(3,3),index = ['a','b','c'],columns = ['Ohio','Texas','California'])
frame2 = frame.reindex(['a','b','c','d'])
frame2
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
b,3,4,5
c,6,7,8


In [43]:
#使用columns关键字可以重新索引列:
states = ['Texas','Utah','California']
frame.reindex(columns = states)

Unnamed: 0,Texas,Utah,California
a,1,,2
b,4,,5
c,7,,8


In [46]:
#同时对列和行索引，但是插值只能按照行应用
frame.reindex(index = ['a','b','c','d'],columns = states,method = 'ffill')

ValueError: index must be monotonic increasing or decreasing