In [5]:
# Import pandas and display the installed version.
import pandas as pd
pd.__version__

'0.22.0'

In [6]:
# 3.2.1: The Series object
# Built from a plain list, the Series gets a default integer index (0..3).
data = pd.Series([0.25, 0.5, 0.75, 1.0])
data

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [7]:
# The underlying array and the index object are exposed through the
# `values` and `index` attributes; print them one per line.
print(data.values, data.index, sep='\n')

[0.25 0.5  0.75 1.  ]
RangeIndex(start=0, stop=4, step=1)


In [8]:
# Square brackets accept a slice; with integer bounds the slice is positional
# and end-exclusive, so this returns the entries at positions 1 and 2.
data[1:3]

1    0.50
2    0.75
dtype: float64

In [9]:
# A Series with an explicitly supplied (string) index.
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=['a', 'b', 'c', 'd'])
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [10]:
# A Series behaves like a specialized dictionary: here it is built from a
# dict mapping state name -> population.
population_dict = dict([
    ('California', 38332521),
    ('Texas', 26448193),
    ('New York', 19651127),
    ('Florida', 19552860),
    ('Illinois', 12882135),
])
population = pd.Series(population_dict)
population

California    38332521
Florida       19552860
Illinois      12882135
New York      19651127
Texas         26448193
dtype: int64

In [11]:
# Dictionary-style lookup of a single value by its label.
population['California']

38332521

In [12]:
# Unlike a dict, a Series also supports array-style operations such as slicing.
# A label slice follows the order of the index and includes the end label.
population['California':'Illinois']

California    38332521
Florida       19552860
Illinois      12882135
dtype: int64

In [13]:
# 3.2.2: The pandas DataFrame object
# A second Series (state name -> area) sharing the same state labels.
area_dict = dict([
    ('California', 423967),
    ('Texas', 695662),
    ('New York', 141297),
    ('Florida', 170312),
    ('Illinois', 149995),
])
area = pd.Series(area_dict)
area

California    423967
Florida       170312
Illinois      149995
New York      141297
Texas         695662
dtype: int64

In [14]:
# Combine the two Series into one DataFrame: each dict key becomes a column
# and the rows are aligned on the shared state labels.
states = pd.DataFrame({'population': population, 'area': area})
states

Unnamed: 0,area,population
California,423967,38332521
Florida,170312,19552860
Illinois,149995,12882135
New York,141297,19651127
Texas,695662,26448193


In [15]:
# Row labels of the DataFrame.
states.index

Index(['California', 'Florida', 'Illinois', 'New York', 'Texas'], dtype='object')

In [16]:
# Column labels of the DataFrame (also an Index object).
states.columns

Index(['area', 'population'], dtype='object')

In [17]:
# Create a single-column DataFrame from a Series.
pd.DataFrame(population, columns=['population'])

Unnamed: 0,population
California,38332521
Florida,19552860
Illinois,12882135
New York,19651127
Texas,26448193


In [18]:
# A list of dicts (one per row) can be turned into a DataFrame; the dict keys
# become the column labels.
data = [dict(a=n, b=2 * n) for n in range(3)]
pd.DataFrame(data)

Unnamed: 0,a,b
0,0,0
1,1,2
2,2,4


In [19]:
# Keys missing from some of the dicts are filled with NaN by pandas.
pd.DataFrame([{'a': 1, 'b': 2}, {'b':3, 'c':4}])

Unnamed: 0,a,b,c
0,1.0,2,
1,,3,4.0


In [20]:
# Build a DataFrame from a two-dimensional NumPy array. Row and column labels
# can be given explicitly; when they are omitted, both default to integer
# indices. The values are drawn from the unseeded global NumPy generator, so
# they differ on every run.
import numpy as np
pd.DataFrame(np.random.rand(3, 2), columns=['foo', 'bar'], index=['a', 'b', 'c'])

Unnamed: 0,foo,bar
"a,",0.094115,0.657008
b,0.87216,0.475412
c,0.797566,0.997065


In [21]:
# Derive `data` from `states` with the population column renamed to 'pop'.
# The columns are selected by name before renaming, so the result does not
# depend on the order in which the DataFrame constructor arranged the dict
# keys, and `states` itself is left unmodified (rename returns a new frame).
data = states[['area', 'population']].rename(columns={'population': 'pop'})
data

Unnamed: 0,area,pop
California,423967,38332521
Florida,170312,19552860
Illinois,149995,12882135
New York,141297,19651127
Texas,695662,26448193


In [22]:
# Add a derived column: density is population divided by area, computed
# element-wise per row.
data['density'] = data['pop'] / data['area']
data

Unnamed: 0,area,pop,density
California,423967,38332521,90.413926
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763
New York,141297,19651127,139.076746
Texas,695662,26448193,38.01874


In [23]:
# View the DataFrame as a two-dimensional array.
data.values

array([[4.23967000e+05, 3.83325210e+07, 9.04139261e+01],
       [1.70312000e+05, 1.95528600e+07, 1.14806121e+02],
       [1.49995000e+05, 1.28821350e+07, 8.58837628e+01],
       [1.41297000e+05, 1.96511270e+07, 1.39076746e+02],
       [6.95662000e+05, 2.64481930e+07, 3.80187404e+01]])

In [24]:
# Transpose: rows become columns and columns become rows.
data.T

Unnamed: 0,California,Florida,Illinois,New York,Texas
area,423967.0,170312.0,149995.0,141297.0,695662.0
pop,38332520.0,19552860.0,12882140.0,19651130.0,26448190.0
density,90.41393,114.8061,85.88376,139.0767,38.01874


In [25]:
# Positional selection: the first three rows and the first two columns.
data.iloc[:3, :2]

Unnamed: 0,area,pop
California,423967,38332521
Florida,170312,19552860
Illinois,149995,12882135


In [26]:
# Label-based selection: rows whose density exceeds 100, restricted to the
# 'pop' and 'density' columns.
data.loc[data.density > 100, ['pop', 'density']]

Unnamed: 0,pop,density
Florida,19552860,114.806121
New York,19651127,139.076746


In [27]:
# A label slice inside plain brackets selects rows; both end labels are
# included. Both labels must be spelled exactly as they appear in the index.
data['Florida':'Illinois']

Unnamed: 0,area,pop,density
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


In [28]:
# An integer slice inside plain brackets selects rows by position
# (end-exclusive): here the rows at positions 1 and 2.
data[1:3]

Unnamed: 0,area,pop,density
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


In [29]:
# A boolean mask inside plain brackets filters rows.
data[data.density > 100]

Unnamed: 0,area,pop,density
Florida,170312,19552860,114.806121
New York,141297,19651127,139.076746


In [30]:
# 3.4: Operating on data in pandas
import numpy as np
import pandas as pd

# Seeded generator, so the random integers below are the same on every run.
rng = np.random.RandomState(42)
ser = pd.Series(rng.randint(low=0, high=10, size=4))
ser

0    6
1    3
2    7
3    4
dtype: int32

In [51]:
# Three rows by four columns of random integers in [0, 10), drawn from the
# shared `rng`, with columns labelled A through D.
df = pd.DataFrame(rng.randint(0, 10, size=(3, 4)), columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
0,1,9,8,9
1,4,1,3,6
2,7,2,0,3


In [32]:
# Applying a NumPy ufunc to a Series yields a Series that keeps the index.
np.exp(ser)

0     403.428793
1      20.085537
2    1096.633158
3      54.598150
dtype: float64

In [33]:
# Arithmetic and ufuncs on a DataFrame work element-wise and keep the row and
# column labels.
np.sin(df * np.pi / 4)

Unnamed: 0,A,B,C,D
0,-1.0,0.7071068,1.0,-1.0
1,-0.707107,1.224647e-16,0.707107,-0.7071068
2,-0.707107,1.0,-0.707107,1.224647e-16


In [34]:
# Universal functions: index alignment
# 1: Alignment between Series. The two Series share only some labels, so the
# division produces NaN wherever a label is missing from either operand.
area = pd.Series(
    {'Alaska': 1723337, 'Texas': 695662, 'California': 423967},
    name='area',
)
population = pd.Series(
    {'California': 38332521, 'Texas': 26448193, 'New York': 19651127},
    name='population',
)
population / area

Alaska              NaN
California    90.413926
New York            NaN
Texas         38.018740
dtype: float64

In [35]:
# Union of the two indexes: the set of labels the result of
# `population / area` is aligned on. Index.union is used rather than the `|`
# operator, whose set-operation meaning on Index objects was deprecated by
# pandas in favour of the explicit method.
area.index.union(population.index)

Index(['Alaska', 'California', 'New York', 'Texas'], dtype='object')

In [36]:
# Two Series whose integer indexes only partly overlap; labels present in just
# one operand yield NaN in the sum.
A = pd.Series([2, 4, 6], index=[0, 1, 2])
B = pd.Series([1, 3, 5], index=[1, 2, 3])
A + B

0    NaN
1    5.0
2    9.0
3    NaN
dtype: float64

In [37]:
# The method form accepts fill_value, which stands in for entries that are
# missing from A or from B before the addition is performed.
A.add(B, fill_value=0)

0    2.0
1    5.0
2    9.0
3    5.0
dtype: float64

In [38]:
# 2: Index alignment in DataFrames
# A 2x2 frame of random integers in [0, 20); the bounds are given as integers,
# matching what randint expects.
A = pd.DataFrame(rng.randint(0, 20, (2, 2)), columns=list('AB'))
A

Unnamed: 0,A,B
0,1,11
1,5,1


In [39]:
# A 3x3 frame of random integers in [0, 10) whose columns (B, A, C) are in a
# different order from A's and include an extra column.
B = pd.DataFrame(rng.randint(0, 10, size=(3, 3)), columns=list('BAC'))
B

Unnamed: 0,B,A,C
0,4,0,9
1,5,8,0
2,9,2,6


In [40]:
# Rows and columns are aligned by label regardless of their order; positions
# that exist in only one frame become NaN.
A + B

Unnamed: 0,A,B,C
0,1.0,15.0,
1,13.0,6.0,
2,,,


In [41]:
# Fill the missing entries with the mean of all values in A.
# (stack() collapses the two-dimensional frame into a one-dimensional Series
# so that a single overall mean can be taken.)
fill = A.stack().mean()
A.add(B, fill_value=fill)

Unnamed: 0,A,B,C
0,1.0,15.0,13.5
1,13.0,6.0,4.5
2,6.5,13.5,10.5


In [42]:
# 3.6: Hierarchical indexing
import numpy as np

# One (state, year) tuple per observation, with the state varying slowest.
index = [(state, year)
         for state in ('California', 'New York', 'Texas')
         for year in (2000, 2010)]
populations = [
    33871648, 37253956,
    18976457, 19378102,
    20851820, 25145561,
]
pop = pd.Series(populations, index=index)
pop

(California, 2000)    33871648
(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
(Texas, 2010)         25145561
dtype: int64

In [43]:
# Slice between two tuple keys; both end keys are included in the result.
pop[('California', 2010): ('Texas',2000)]

(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
dtype: int64

In [44]:
# With plain tuple keys, selecting every 2010 entry requires scanning the
# index by hand and keeping the keys whose second element is 2010.
pop[[key for key in pop.index if key[1] == 2010]]

(California, 2010)    37253956
(New York, 2010)      19378102
(Texas, 2010)         25145561
dtype: int64

In [45]:
# Convert the list of (state, year) tuples into a MultiIndex.
index = pd.MultiIndex.from_tuples(index)
index

MultiIndex(levels=[['California', 'New York', 'Texas'], [2000, 2010]],
           labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]])

In [46]:
# Re-index the Series with the MultiIndex so that it is displayed and
# addressed hierarchically (state, then year).
pop = pop.reindex(index)
pop

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [47]:
# Select every state for the year 2010 by indexing on the second level.
pop[:, 2010]

California    37253956
New York      19378102
Texas         25145561
dtype: int64

In [48]:
# unstack() converts a multiply indexed Series into a conventionally indexed
# DataFrame.
pop_df = pop.unstack()
pop_df

Unnamed: 0,2000,2010
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


In [49]:
# stack() performs the opposite operation, giving back the multiply indexed
# Series.
pop_df.stack()

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [52]:
# Add a second column to the multiply indexed data. `pop` supplies the
# hierarchical row index; the plain list is matched to those rows by position.
pop_df = pd.DataFrame({'total':pop,
                       'under18': [9267089, 9284094,
                                   4687374, 4318033,
                                   5906301, 6879014]})
pop_df

Unnamed: 0,Unnamed: 1,total,under18
California,2000,33871648,9267089
California,2010,37253956,9284094
New York,2000,18976457,4687374
New York,2010,19378102,4318033
Texas,2000,20851820,5906301
Texas,2010,25145561,6879014


In [53]:
# Fraction of the population under 18, per state and year, shown as a
# state-by-year table.
f_u18 = pop_df['under18'] / pop_df['total']
f_u18.unstack()

Unnamed: 0,2000,2010
California,0.273594,0.249211
New York,0.24701,0.222831
Texas,0.283251,0.273568


In [56]:
# Partial indexing on the first level returns a Series keyed by the remaining
# level (year).
pop['California']

2000    33871648
2010    37253956
dtype: int64

In [57]:
# Partial slicing on the first level; both end labels are included.
pop.loc['California':'New York']

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
dtype: int64

In [58]:
# Boolean masks work on a multiply indexed Series as well.
pop[pop > 22000000]

California  2000    33871648
            2010    37253956
Texas       2010    25145561
dtype: int64

In [63]:
# Sorted and unsorted indices
# Most slicing operations fail when the MultiIndex is not sorted. The first
# level here is deliberately out of order ('a', 'c', 'b').
index = pd.MultiIndex.from_product(
    [['a', 'c', 'b'], [1, 2]],
    names=['char', 'int'],
)
data = pd.Series(np.random.randn(6), index=index)
data

char  int
a     1     -0.431403
      2      0.207093
c     1      0.811206
      2      0.989417
b     1     -0.035659
      2      0.700539
dtype: float64

In [65]:
# Attempt a partial slice on the unsorted MultiIndex and report the failure
# instead of letting the cell raise. The recorded output shows the exception
# is pandas.errors.UnsortedIndexError, which this KeyError handler catches.
try:
    data['a':'b']
except KeyError as e:
    print(type(e))
    print(e)

<class 'pandas.errors.UnsortedIndexError'>
'Key length (1) was greater than MultiIndex lexsort depth (0)'


In [66]:
# Sort the index so that the levels are in lexicographic order.
data = data.sort_index()
data

char  int
a     1     -0.431403
      2      0.207093
b     1     -0.035659
      2      0.700539
c     1      0.811206
      2      0.989417
dtype: float64