# 第05章 pandas入门

In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

## 5.1 pandas的数据结构介绍

### Series

In [2]:
# A Series pairs a 1-D array of values with an index of labels, much like a
# dict; when no index is supplied, a default integer index 0..n-1 is used.
obj = pd.Series(data=[4, 7, -5, 3])
(obj, obj.values, obj.index)

(0    4
 1    7
 2   -5
 3    3
 dtype: int64,
 array([ 4,  7, -5,  3], dtype=int64),
 RangeIndex(start=0, stop=4, step=1))

In [3]:
# Supply explicit labels: one index entry per value, in the same order.
obj2 = pd.Series(data=[4, 7, -5, 3], index=list('dbac'))
(obj2, obj2.index)

(d    4
 b    7
 a   -5
 c    3
 dtype: int64,
 Index(['d', 'b', 'a', 'c'], dtype='object'))

In [4]:
# Assign to and read from the Series by index label
obj2['d'] = 6  # overwrite the value stored under label 'd'
# A single label returns a scalar; a list of labels returns a sub-Series in that order
obj2['a'], obj2[['c', 'a', 'd']]

(-5,
 c    3
 a   -5
 d    6
 dtype: int64)

In [5]:
# Element-wise comparison yields a boolean Series that keeps the same index
obj2 > 0

d     True
b     True
a    False
c     True
dtype: bool

In [6]:
# Boolean-mask filtering, scalar multiplication and NumPy ufuncs such as
# np.exp each return a new Series whose values stay attached to their labels.
# (numpy is imported as np together with the other imports in the first cell.)
obj2[obj2 > 0], obj2 * 2, np.exp(obj2)

(d    6
 b    7
 c    3
 dtype: int64,
 d    12
 b    14
 a   -10
 c     6
 dtype: int64,
 d     403.428793
 b    1096.633158
 a       0.006738
 c      20.085537
 dtype: float64)

In [7]:
# `in` tests membership among the index labels (as with dict keys), not among the values
'b' in obj2

True

In [8]:
# Building a Series from a dict: the keys become the index labels.
sdata = {
    'Ohio': 35000,
    'Texas': 71000,
    'Oregon': 16000,
    'Utah': 5000,
}
obj3 = pd.Series(data=sdata)
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [9]:
# Passing an explicit index orders the dict entries to match it: labels that are
# missing from the dict ('California') get NaN, and dict keys that are not in
# the index ('Utah') are left out of the result.
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = pd.Series(sdata, index=states)
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [10]:
# Detect missing data: pd.isnull / pd.notnull are the function forms, .isnull() is the method form
pd.isnull(obj4), pd.notnull(obj4), obj4.isnull()

(California     True
 Ohio          False
 Oregon        False
 Texas         False
 dtype: bool,
 California    False
 Ohio           True
 Oregon         True
 Texas          True
 dtype: bool,
 California     True
 Ohio          False
 Oregon        False
 Texas         False
 dtype: bool)

In [11]:
# Arithmetic between Series aligns on index labels automatically; a label that
# exists in only one operand ('California', 'Utah') produces NaN in the result.
(obj3, obj4, obj3 + obj4)

(Ohio      35000
 Texas     71000
 Oregon    16000
 Utah       5000
 dtype: int64,
 California        NaN
 Ohio          35000.0
 Oregon        16000.0
 Texas         71000.0
 dtype: float64,
 California         NaN
 Ohio           70000.0
 Oregon         32000.0
 Texas         142000.0
 Utah               NaN
 dtype: float64)

In [12]:
# Both the Series itself and its index can be given a name; both are shown when it is displayed
obj4.name = 'population'
obj4.index.name = 'state'
obj4

state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

In [13]:
# Changing the index: first display obj with its default integer index
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [14]:
# Assigning to .index replaces the labels in place; the list supplies one label per value
obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']
obj

Bob      4
Steve    7
Jeff    -5
Ryan     3
dtype: int64

### DataFrame

In [20]:
# Build a DataFrame from a dict of equal-length lists: each key becomes a column.
# The columns come out in the dict's insertion order (state, year, pop), as the
# output shows -- the order is not random.
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002, 2003],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
frame = pd.DataFrame(data)
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [18]:
# head() shows only the first five rows by default
frame.head()

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [21]:
# Passing columns= fixes the order in which the columns are arranged
pd.DataFrame(data, columns=['year', 'state', 'pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [23]:
# A requested column that is absent from the data ('debt') is created and
# filled with NaN; index= supplies the row labels.
frame2 = pd.DataFrame(
    data,
    columns=['year', 'state', 'pop', 'debt'],
    index=['one', 'two', 'three', 'four', 'five', 'six'],
)
(frame2, frame2.columns)

(       year   state  pop debt
 one    2000    Ohio  1.5  NaN
 two    2001    Ohio  1.7  NaN
 three  2002    Ohio  3.6  NaN
 four   2001  Nevada  2.4  NaN
 five   2002  Nevada  2.9  NaN
 six    2003  Nevada  3.2  NaN,
 Index(['year', 'state', 'pop', 'debt'], dtype='object'))

In [24]:
# Retrieve a column as a Series, either dict-style or as an attribute.
# NOTE: attribute access only works when the column name is a valid Python
# identifier that does not clash with a DataFrame attribute (for example,
# frame2.pop is the DataFrame.pop method, not the 'pop' column).
frame2['state'], frame2.state

(one        Ohio
 two        Ohio
 three      Ohio
 four     Nevada
 five     Nevada
 six      Nevada
 Name: state, dtype: object,
 one        Ohio
 two        Ohio
 three      Ohio
 four     Nevada
 five     Nevada
 six      Nevada
 Name: state, dtype: object)

In [25]:
# Retrieve a row by its index label with .loc; the result is a Series indexed by the column names
frame2.loc['three']

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object

In [26]:
# Assigning a scalar to a column broadcasts it to every row
frame2['debt'] = 16.5
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,16.5
two,2001,Ohio,1.7,16.5
three,2002,Ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5
six,2003,Nevada,3.2,16.5


In [28]:
# Assigning an array fills the column element by element; its length (6) matches the number of rows
frame2['debt'] = np.arange(6.)
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0.0
two,2001,Ohio,1.7,1.0
three,2002,Ohio,3.6,2.0
four,2001,Nevada,2.4,3.0
five,2002,Nevada,2.9,4.0
six,2003,Nevada,3.2,5.0


In [29]:
# Assigning a Series aligns it on the frame's index; rows with no matching label get NaN
frame2['debt'] = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7
six,2003,Nevada,3.2,


In [33]:
# Assigning to a column name that does not exist yet adds a new column
# (here a boolean flag; it is removed again with `del` in the following cell)
frame2['eastern'] = frame2.state == 'Ohio'
frame2

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,-1.2,True
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,-1.5,False
five,2002,Nevada,2.9,-1.7,False
six,2003,Nevada,3.2,,False


In [34]:
# del removes a column, just as it would remove a key from a dict
del frame2['eastern']
frame2, frame2.columns

(       year   state  pop  debt
 one    2000    Ohio  1.5   NaN
 two    2001    Ohio  1.7  -1.2
 three  2002    Ohio  3.6   NaN
 four   2001  Nevada  2.4  -1.5
 five   2002  Nevada  2.9  -1.7
 six    2003  Nevada  3.2   NaN,
 Index(['year', 'state', 'pop', 'debt'], dtype='object'))

In [38]:
# Build from a dict of dicts: outer keys become the columns, inner keys become
# the row labels; combinations with no data (Nevada, 2000) are NaN.
pop = {
    'Nevada': {2001: 2.4, 2002: 2.9},
    'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6},
}
frame3 = pd.DataFrame(data=pop)
frame3

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [39]:
# Transpose: swap rows and columns
frame3.T

Unnamed: 0,2001,2002,2000
Nevada,2.4,2.9,
Ohio,1.7,3.6,1.5


In [40]:
# With an explicit index only those row labels are used; 2003 is in neither inner dict, so its row is all NaN
pd.DataFrame(pop, index=[2001, 2002, 2003])

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2003,,


In [43]:
# A dict of Series also works as input. The slices are positional (.iloc):
# Ohio without its last row, and the first two rows of Nevada.
pdata = {
    'Ohio': frame3['Ohio'].iloc[:-1],
    'Nevada': frame3['Nevada'].iloc[:2],
}
pd.DataFrame(pdata)

Unnamed: 0,Ohio,Nevada
2001,1.7,2.4
2002,3.6,2.9


In [44]:
# Both axes can carry a name; the names are shown when the frame is displayed.
frame3.index.name = 'year'
frame3.columns.name = 'state'
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [47]:
# .values returns the data as a 2-D ndarray; when the columns have different
# dtypes, a dtype able to hold every column is chosen (object for frame2)
frame3.values, frame2.values

(array([[2.4, 1.7],
        [2.9, 3.6],
        [nan, 1.5]]),
 array([[2000, 'Ohio', 1.5, nan],
        [2001, 'Ohio', 1.7, -1.2],
        [2002, 'Ohio', 3.6, nan],
        [2001, 'Nevada', 2.4, -1.5],
        [2002, 'Nevada', 2.9, -1.7],
        [2003, 'Nevada', 3.2, nan]], dtype=object))

### 索引对象