## Ch05 pandas入门


In [None]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np

### pandas的数据结构介绍
#### Series

In [None]:
# Build a Series from a plain list; no explicit index is supplied.
obj = Series(data=[4, 7, -5, 3])
obj

In [None]:
# The data held by obj, without its index labels.
obj.values

In [None]:
# The index (labels) attached to obj.
obj.index

In [None]:
# Assigning to .index replaces the labels of the existing elements.
obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']
obj

In [None]:
# Same values as before, this time with an explicit label per element.
obj2 = Series(
    data=[4, 7, -5, 3],
    index=['d', 'b', 'a', 'c'],
)
obj2

In [None]:
# The labels of obj2.
obj2.index

In [None]:
# Select a single element by its label.
obj2['a']

In [None]:
# Boolean filtering: keep only the elements greater than zero.
obj2[obj2>0]

In [None]:
# Multiply obj2 by the scalar 2.
obj2*2

In [None]:
# Apply NumPy's exp function to obj2.
np.exp(obj2)

In [None]:
# Membership tests on obj2. Both results are printed explicitly: as bare
# expressions only the last one in a cell would be displayed, so the first
# check was silently discarded.
print('b' in obj2)
print('e' in obj2)

In [None]:
# dict -> Series: the dict's keys become the labels, its values the data.
sdata = {
    'Ohio': 3500,
    'Texas': 71000,
    'Oregon': 16000,
    'Utah': 5000,
}
obj3 = Series(data=sdata)
obj3
# obj3.index can be inspected to see the resulting labels.

In [None]:
# Build a Series from sdata, passing an explicit list of labels as index.
states = [
    'California',
    'Ohio',
    'Oregon',
    'Texas',
]
obj4 = Series(data=sdata, index=states)
obj4

In [None]:
# Element-wise missing-value test on obj4.
pd.isnull(obj4)  
# pd.notnull(obj4) is the complementary check.

In [None]:
# Keep the result of the module-level isnull function and inspect its type.
r = pd.isnull(obj4)
type(r)

In [None]:
# Same check via the method form on the object itself, then inspect the type.
r = obj4.isnull()
type(r) 

In [None]:
# Give a name to the object itself and to its index.
obj4.name = 'population'
obj4.index.name = 'state'
obj4

#### DataFrame
DataFrame是一个表格型的数据结构，它含有一组有序的列，每列可以是不同的值类型（数值、字符串、布尔值等）。**DataFrame既有行索引也有列索引，它可以被看作由Series组成的字典**。

可以输入给DataFrame构造器的数据：

- 二维ndarray
- 由数组、列表、或者元组组成的字典
- NumPy的结构化/记录数组
- 由Series组成的字典
- 由字典组成的字典
- 字典或者Series的列表
- 由列表或者元组组成的列表
- 另一个DataFrame
- NumPy的MaskedArray


#### 构建DataFrame的最常用方法是：直接传入一个由等长列表或者NumPy数组组成的字典。

In [None]:
# Column-oriented input: each key becomes a column, each list its values.
# The last state was misspelled 'Neveda', which created a bogus third state.
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9],
}
frame = DataFrame(data)
frame

In [None]:
# Pass columns= to request the columns in the listed order.
DataFrame(data, columns=['year', 'state', 'pop'])

In [None]:
# Request a 'debt' column in addition to the three keys present in data.
frame2 = DataFrame(
    data=data,
    columns=['year', 'state', 'pop', 'debt'],
)
frame2

In [None]:
# Same frame as above, now with explicit row labels.
frame2 = DataFrame(
    data=data,
    columns=['year', 'state', 'pop', 'debt'],
    index=['one', 'two', 'three', 'four', 'five'],
)
frame2

In [None]:
# Compare dict-style and attribute-style column access element by element.
frame2['state'] == frame2.state

In [None]:
# Look up a row by its index label.
frame2.loc['three']

In [None]:
# Assign one scalar to the whole 'debt' column.
# assumes frame2 already has a 'debt' column (created in an earlier cell)
frame2['debt'] = 16.5
frame2

In [None]:
# Assign a Series to the 'debt' column; val carries labels for only three rows.
# assumes frame2 already has a 'debt' column (created in an earlier cell)
val = Series(
    data=[-1.2, -1.5, -1.7],
    index=['two', 'four', 'five'],
)
frame2['debt'] = val
frame2

In [None]:
# Store a boolean column that is True where the state equals 'Ohio'.
is_ohio = frame2['state'] == 'Ohio'
frame2['eastern'] = is_ohio
frame2

In [None]:
# Print the column labels before and after deleting the 'eastern' column.
print(frame2.columns)
del frame2['eastern']
print(frame2.columns)

#### 另一种常见的数据形式是嵌套字典

In [None]:
# Nested dict: outer keys become columns, inner keys become row labels.
pop = {
    'Nevada': {2001: 2.4, 2002: 2.9},
    'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6},
}

frame3 = DataFrame(data=pop)
# Print the row labels first, then the column labels.
for axis_labels in (frame3.index, frame3.columns):
    print(axis_labels)
frame3


In [None]:
# Transpose of frame3.
frame3.T

In [None]:
# Select a single column by label.
frame3['Ohio']

In [None]:
# Build a frame from a dict whose values are slices of frame3's columns.
ohio_part = frame3['Ohio'][:-1]
nevada_part = frame3['Nevada'][:3]
pdata = {'Ohio': ohio_part, 'Nevada': nevada_part}
DataFrame(data=pdata)

In [None]:
# Name the row axis and the column axis.
frame3.index.name = 'year'
frame3.columns.name = 'state'
frame3

In [None]:
# The data of frame3 via its .values attribute.
frame3.values

In [None]:
# The data of frame2 via its .values attribute.
frame2.values

### 索引对象
pandas的索引对象负责管理轴标签和其他元数据（比如轴名称）。构建Series或者DataFrame时，所用到的任何数组或者其他序列的标签都会被转化成一个Index

**pandas中主要的Index对象**

- Index
- Int64Index
- MultiIndex
- DatetimeIndex
- PeriodIndex

**Index的方法和属性**
- append
- difference
- intersection
- union
- isin
- delete
- drop
- insert
- is_monotonic
- is_unique
- unique

In [None]:
# Create a labelled Series and keep a reference to its index object.
obj = Series(data=range(3), index=['a', 'b', 'c'])
index = obj.index
index

In [None]:
# Slice the index from its second element onward.
index[1:]

In [None]:
# Attempt item assignment on index; a TypeError is caught and reported.
try:
    index[1] = 'd'
except TypeError:
    print('pd.index object should not be changed')

In [None]:
# Build an Index up front and hand it to the Series constructor.
index = pd.Index(np.arange(3))
obj2 = Series(data=[1.5, -2.5, 0], index=index)
# Element-wise label comparison, then an object-identity check.
labels_match = obj2.index == index
print(labels_match)
is_same_object = obj2.index is index
print(is_same_object)

**除了长得像数组， Index的功能也类似一个固定大小的集合**

In [None]:
# Print frame3, then test label membership in its columns and in its index.
print(  frame3  )
print( 'Ohio' in frame3.columns  )
print(  2003 in frame3.index)

### 基本功能

#### 重新索引

In [None]:
# Starting Series for the reindex examples; labels are deliberately unsorted.
obj = Series(
    data=[4.5, 7.2, -5.3, 3.6],
    index=['d', 'b', 'a', 'c'],
)
obj

In [None]:
# Reindex obj against a new label list that includes the extra label 'e'.
new_labels = ['a', 'b', 'c', 'd', 'e']
obj2 = obj.reindex(index=new_labels)
obj2

In [None]:
# Same reindex, but pass fill_value=0 for labels absent from obj.
new_labels = ['a', 'b', 'c', 'd', 'e']
obj2 = obj.reindex(index=new_labels, fill_value=0)
obj2

**使用ffill可以实现前向值填充**

reindex的插值方式，method选项：
- ffill/pad    前向填充
- bfill/backfill    后向填充

In [None]:
# Sparse integer labels; reindex onto 0..8 with method='ffill'.
obj3 = Series(
    data=['blue', 'purple', 'yellow'],
    index=[0, 2, 4],
)
obj3.reindex(index=range(9), method='ffill')

**reindex可以修改行和列**

In [None]:
# 3x3 frame with unsorted row labels, used by the reindex examples.
frame = DataFrame(
    data=np.arange(9).reshape((3, 3)),
    index=['a', 'c', 'b'],
    columns=['Ohio', 'Texas', 'California'],
)
frame

In [None]:
# Reindex the rows of frame against a label list that includes 'd'.
row_labels = ['a', 'b', 'c', 'd']
frame2 = frame.reindex(index=row_labels)
frame2

In [None]:
# Reindex the columns of frame against the labels listed in states.
states = [
    'Texas',
    'Ohio',
    'California',
    'Utah',
]
frame.reindex(columns=states)

In [None]:
# Reindex rows and columns in one call, then forward-fill the result.
# (The single-call form with method='ffill' was left commented out
# in the original cell.)
states = ['Texas', 'Utah', 'California']
reindexed = frame.reindex(index=['a', 'b', 'c', 'd'], columns=states)
reindexed.ffill()

In [None]:
# Label-based selection of rows 'a'..'d' and the columns listed in states.
# .loc raises KeyError when a requested label is not present in the axis, so
# reindex is used instead: it returns NaN for labels that are missing.
# (The original .ix spelling: frame.ix[['a'], states].)
frame.reindex(index=['a', 'b', 'c', 'd'], columns=states)

In [None]:
# Bare string literal listing the parameter names of reindex as a reminder.
'''
注意reindex的参数：
index,columns
method
fill_value
limit
level
copy
'''

# help(df.reindex) can be used to read the full documentation.


#### 丢弃指定轴上的项

In [None]:
# drop returns a new object; printing both shows obj itself is unchanged.
obj = Series(data=np.arange(5), index=['a', 'b', 'c', 'd', 'e'])
new_obj = obj.drop(labels='c')
print(new_obj)
print(obj)

# Several labels can be dropped at once by passing a list.
obj.drop(labels=['d', 'c'])

In [None]:
# 4x4 frame of 0..15; drop two rows by label.
data = DataFrame(
    data=np.arange(16).reshape((4, 4)),
    index=['Ohio', 'Colorado', 'Utah', 'NewYork'],
    columns=['one', 'two', 'three', 'four'],
)
data.drop(labels=['Colorado', 'Ohio'])

In [None]:
# Drop along the column axis: first one label, then a list of labels.
without_two = data.drop('two', axis='columns')
print(without_two)
without_two_four = data.drop(['two', 'four'], axis='columns')
print(without_two_four)

#### 索引、选取和过滤
**Series索引的工作方式类似于NumPy数组的索引，只不过Series的索引值不只是整数**

In [None]:
# Tour of Series selection. Positional access goes through .iloc: plain
# obj[[1, 3]] on a string-labelled Series relies on an integer-position
# fallback of [] that newer pandas deprecates and later removes.
obj = Series(np.arange(4, 8), index=['a', 'b', 'c', 'd'])
rule = '-' * 16

print(obj)                      # the whole Series
print(rule)
print(obj['b'])                 # single label
print(rule)
print(obj.iloc[2:4])            # positional slice
print(rule)
print(obj[['b', 'a', 'd']])     # list of labels
print(rule)
print(obj.iloc[[1, 3]])         # list of positions
print(rule)
print(obj[obj > 5])             # boolean mask
print(rule)

In [None]:
# Assign a scalar through a label slice from 'b' to 'd'.
obj['b':'d'] = 666
obj

In [None]:
# 4x4 frame of 0..15 and the basic [] selections on it.
data = DataFrame(
    data=np.arange(16).reshape((4, 4)),
    index=['Ohio', 'Colorado', 'Utah', 'NewYork'],
    columns=['one', 'two', 'three', 'four'],
)
rule = '-' * 16
print(data)                      # whole frame
print(rule)
print(data['two'])               # one column
print(rule)
print(data[['three', 'one']])    # list of columns
print(rule)
print(data[:2])                  # slice of rows
print(rule)


In [None]:
# Element-wise comparison of data against 5.
data < 5

In [None]:
# Set every element selected by the mask data < 5 to zero.
data[data<5] = 0
data

In [None]:
# Select one row label and a list of column labels with .loc
# (original .ix spelling: data.ix['Colorado', ['two','three']]).
data.loc['Colorado', ['two','three']]

In [None]:
# Three .loc selections on data. The result is bound to `subset` rather than
# `slice`, which would shadow the built-in slice type for the rest of the
# session.
rule = '-' * 32

# Rows and columns both given as label lists.
subset = data.loc[
    ['Colorado', 'Utah'],
    ['four', 'one', 'two'],
]
print(subset)
print(rule)

# Rows only.
subset = data.loc[['Colorado', 'Utah']]
print(subset)
print(rule)

# Boolean row mask, then at most the first three matching rows.
subset = data.loc[data.three > 5][:3]
print(subset)
print(rule)

#### 算术运算和数据对齐

- 算术运算
    - +
    - -
    - *
    - /
    - add
    - sub
    - div
    - mul
- NB    
**自动的数据对齐操作在不重叠的索引处引入了NA值**


In [None]:
# Two Series with partly overlapping labels, and their sum.
s1 = Series(data=[7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s2 = Series(data=[-2.1, 3.6, -1.5, 4, 3.1], index=['a', 'c', 'e', 'f', 'g'])

rule = '-' * 32
print(s1)
print(rule)
print(s2)
print(rule)
print(s1 + s2)
print(rule)


In [None]:
# Two frames whose row and column labels only partly overlap.
df1 = DataFrame(
    data=np.arange(9).reshape((3, 3)),
    columns=list('bcd'),
    index=['Ohio', 'Texas', 'Colorado'],
)
df2 = DataFrame(
    data=np.arange(12).reshape((4, 3)),
    columns=list('bde'),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
)

rule = '-' * 32
print(df1)
print(rule)
print(df2)
print(rule)
print(df1 + df2)
print(rule)
print(df1 - df2)
print(rule)

#### 算术方法中填充值

In [None]:
# Arithmetic with a fill value. dtype uses the builtin int: the np.int alias
# was removed from NumPy and raises AttributeError on current releases.
df1 = DataFrame(np.arange(12).reshape((3, 4)), columns=list('abcd'), dtype=int)
df2 = DataFrame(np.arange(20).reshape((4, 5)), columns=list('abcde'), dtype=int)
print(df1)
print('1-' * 32)
print(df2)
print('2-' * 32)

# Plain + on the two differently shaped frames.
print(df1 + df2)
print('3-' * 32)

# Same addition with fill_value=0, called through the class ...
d3 = DataFrame.add(df1, df2, fill_value=0)
print(d3)
print('4-' * 32)

# ... and as a bound method.
d4 = df1.add(df2, fill_value=0)
print(d4)
print('5-' * 32)

# Element-wise comparison of the two results, then a per-column any().
r = d3 == d4
print(r)
print('6-' * 32)
print(r.any())
print('7-' * 32)


####  DataFrame和Series之间的运算

**广播**

In [None]:
# Subtract the first row of a 3x4 array from the whole array.
arr = np.arange(12).reshape((3, 4))
first_row = arr[0]
print(arr)
print('1-' * 16)
print(first_row)
print('2-' * 16)
print(arr - first_row)
print('3-' * 16)

In [None]:
# Subtract one row of a frame (taken as a Series) from the frame.
frame = DataFrame(
    data=np.arange(12).reshape((4, 3)),
    columns=list('bde'),
    index=['Utah', 'Texas', 'Ohio', 'Oregon'],
)
# Row picked by label; the positional .ix[0] form was left commented out.
series = frame.loc['Utah']

rule = '-' * 32
print(frame)
print(rule)
print(series)
print(rule)

print(frame - series)
print(rule)

In [None]:
# Add frame and a Series labelled 'b', 'e', 'f'.
series2 = Series(range(3), index=['b','e','f'] )
frame + series2

In [None]:
# Subtract column 'd' from the frame, matching on the row axis.
series3 = frame['d']
rule = '-' * 32
print(frame)
print(rule)
print(series3)
print(rule)
print(frame.sub(series3, axis='index'))
print(rule)

#### 函数应用和映射

In [None]:
# 4x3 frame of 0..11; apply NumPy's element-wise abs to it.
frame = DataFrame(
    data=np.arange(12).reshape((4, 3)),
    columns=list('bde'),
    index=['Utah', 'Texas', 'Ohio', 'Oregon'],
)
np.abs(frame)

In [None]:
def f(x):
    """Return the spread of x: its maximum minus its minimum."""
    return x.max() - x.min()
# Apply f with the default axis, then explicitly with axis=0 and with axis=1.
print ( frame.apply(f) )
print ( frame.apply(f, axis=0) )
print ( frame.apply(f, axis=1) )

In [None]:
def f(x):
    """Return the minimum and maximum of x as a Series labelled 'min'/'max'."""
    lowest, highest = x.min(), x.max()
    return Series([lowest, highest], index=['min', 'max'])
# Apply f to frame; f returns a Series, so each application yields two values.
frame.apply(f)

In [None]:
def format(x):
    """Render x as a string with two decimal places."""
    return '%.2f' % x
# Apply format to every element of frame.
# NOTE(review): newer pandas releases appear to deprecate applymap in favour
# of DataFrame.map - confirm against the installed version.
frame.applymap(format)

之所以叫做applymap，是因为Series有一个应用元素级函数的map方法

In [None]:
# Apply format to each element of column 'e' via map.
frame['e'].map(format)

#### 排序和排名
要对行或列索引进行排序，可以使用sort_index方法，它将返回一个**已经排序的新对象**

In [None]:
# Labels are out of order on purpose; sort_index orders by label.
obj = Series(data=range(4), index=list('dabc'))
print(obj)
print('-' * 32)
by_label = obj.sort_index()
print(by_label)

**而对于DataFrame，则可以根据任意一个轴上的索引进行排序**

In [None]:
# 2x4 frame with unsorted labels on both axes; sort by row label.
frame = DataFrame(
    data=np.arange(8).reshape((2, 4)),
    index=['three', 'one'],
    columns=['d', 'a', 'b', 'c'],
)
frame.sort_index()

In [None]:
# Sort along axis 1 (the column labels) in descending order.
frame.sort_index(axis=1, ascending=False)

In [None]:
# Order a Series by its values instead of by its labels.
obj = Series(data=[4, 7, -3, 2])
obj.sort_values()

In [None]:
# Sort by value on a Series that contains missing entries.
obj = Series(data=[4, np.nan, 7, np.nan, -3, 2])
obj.sort_values()

In [None]:
# Sort a frame by one column, then by another, then by both.
frame = DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})

rule = '-' * 32
print(frame)
print(rule)
for sort_keys in ('a', 'b', ['a', 'b']):
    print(frame.sort_values(by=sort_keys))
    print(rule)

# The original cell also kept the same three calls spelled sort_index(by=...)
# as commented-out code.

In [None]:
# Rank a Series containing ties, with three different settings.
obj = Series(data=[7, -5, 7, 4, 2, 0, 4])
print(obj)
default_rank = obj.rank()
print(default_rank)
first_rank = obj.rank(method='first')
print(first_rank)
max_desc_rank = obj.rank(ascending=False, method='max')
print(max_desc_rank)

In [None]:
# Rank the values of a three-column frame along axis=1.
frame = DataFrame({
    'b': [4.3, 7, -3, 2],
    'a': [0, 1, 0, 1],
    'c': [-2, 5, 8, -2.5],
})

print(frame)
ranked = frame.rank(axis=1)
print(ranked)

**排名时用于打破平级关系（并列值）的method选项**
- 'average'
- 'min'
- 'max'
- 'first'

#### 带有重复值的轴索引

In [None]:
# A Series whose index holds repeated labels.
obj = Series(data=range(5), index=list('aabbc'))
print(obj)
print(obj.index.is_unique)
# Print what each lookup returns and its type: 'a' appears twice in the
# index, 'c' only once.
for label in ('a', 'c'):
    picked = obj[label]
    print(picked)
    print(type(picked))

In [None]:
# Frame of random values with repeated row labels; select label 'b'.
df = DataFrame(data=np.random.randn(4, 3), index=list('aabb'))
print(df)
rows_b = df.loc['b']
print(rows_b)

### 汇总和计算描述统计

### 处理缺失数据

### 层次化索引

### 其他有关Pandas的话题