# 4 Pandas的索引操作

In [2]:
import pandas as pd
import numpy as np

In [3]:

dict_data = {'A': 1,
             'B': pd.Timestamp('20190926'),
             'C': pd.Series(1, index=list(range(4)),dtype='float32'),
             'D': np.array([1,2,3,4],dtype='int32'),
             'E': ["Python","Java","C++","C"],
             'F': 'wangdao' }
df_obj2 = pd.DataFrame(dict_data)
print(df_obj2.index)

Index([0, 1, 2, 3], dtype='int64')


In [4]:
# 索引对象的值不可变（上面代码增加）
# df_obj2.index[0] = 2

# 3 常见的Index种类
•Index，索引，可以是各种类型
•Int64Index，整数索引
•MultiIndex，层级索引，难度较大
•DatetimeIndex，时间戳类型

In [5]:
ser_obj = pd.Series(range(5), index = list("abcde"))
print(ser_obj)
ser_obj.index

a    0
b    1
c    2
d    3
e    4
dtype: int64


Index(['a', 'b', 'c', 'd', 'e'], dtype='str')

In [6]:
# 行索引，不仅可以用索引名，也可以用索引位置来获取
print(ser_obj['b']) #索引名
print(ser_obj[2]) #位置索引

1


KeyError: 2

In [None]:
print(ser_obj.loc['b']) #索引名
print(ser_obj.iloc[2]) #位置索引

In [None]:
# 切片索引
print(ser_obj.iloc[1:3])  #索引位置取数据，左闭右开
print(ser_obj.loc['b':'d'])  #记住索引名  左闭右闭

In [None]:
# 不连续索引
print(ser_obj.iloc[[0, 2, 4]])
print(ser_obj.loc[['a', 'e']])

In [9]:
# 布尔索引
ser_bool = ser_obj > 2
print(ser_obj)
print(ser_bool)


a    0
b    1
c    2
d    3
e    4
dtype: int64
a    False
b    False
c    False
d     True
e     True
dtype: bool


In [12]:
print('-'*50)
print(ser_obj[ser_bool])

print(ser_obj[ser_obj > 2]) #取出大于2的元素

--------------------------------------------------
d    3
e    4
dtype: int64
d    3
e    4
dtype: int64


## 4.4 DataFrame索引

In [13]:
import numpy as np
df_obj = pd.DataFrame(np.random.randn(5,4),
                      columns = ['a', 'b', 'c', 'd'])
print(df_obj.head())

          a         b         c         d
0  0.294651  0.368222 -0.235332  0.512946
1 -2.373416 -0.284899  0.727536 -0.802664
2 -0.771550 -0.550840 -0.690039  1.284229
3 -1.126341 -1.071084 -0.112994 -0.457058
4  0.975439 -1.988087 -0.222836  1.250679


In [16]:
# 列索引
print(df_obj['a']) # 返回Series类型
print('-'*50)
print(df_obj[['a']]) # 返回DataFrame类型
print('-'*50)
print(type(df_obj[['a']])) # 打印类型：DataFrame

"""
df_obj['a'] vs df_obj[['a']]
表达式              返回类型	        维度	                        说明
df_obj['a']	        Series	        一维	            单独一列，带有行索引，类似一维数组
df_obj[['a']]	    DataFrame	    二维	            仍然是一个表（只有一列），保留了DataFrame的属性和方法
"""

0    0.294651
1   -2.373416
2   -0.771550
3   -1.126341
4    0.975439
Name: a, dtype: float64
--------------------------------------------------
          a
0  0.294651
1 -2.373416
2 -0.771550
3 -1.126341
4  0.975439
--------------------------------------------------
<class 'pandas.DataFrame'>


"\ndf_obj['a'] vs df_obj[['a']]\n表达式              返回类型\t        维度\t                        说明\ndf_obj['a']\t        Series\t        一维\t            单独一列，带有行索引，类似一维数组\ndf_obj[['a']]\t    DataFrame\t    二维\t            仍然是一个表（只有一列），保留了DataFrame的属性和方法\n"

# loc 标签索引(通过索引标签值获取数据)————基于标签的索引（label-based），使用行标签和列标签进行选择。切片是闭区间，即包含起始和结束标签。可以接受布尔数组、可调用函数等

In [17]:
# 标签索引 loc，建议使用loc，效率更高
# Series
print(ser_obj)
print(ser_obj['b':'d'])
print(ser_obj.loc['b':'d']) #前闭后闭
print('-'*50)


a    0
b    1
c    2
d    3
e    4
dtype: int64
b    1
c    2
d    3
dtype: int64
b    1
c    2
d    3
dtype: int64
--------------------------------------------------


In [19]:
# DataFrame
df_obj = pd.DataFrame(np.random.randn(5,4),
                      columns = list('abcd'),#列索引
                      index=list('abcde'))#行索引
print(df_obj)
print('-'*50)
print(df_obj['a'])  #建议不用,拿的是列
print('-'*50)
print(df_obj.loc['a'])  #拿的是行
print('-'*50)


          a         b         c         d
a  0.847275  0.524755  1.137014  0.629950
b  0.173780 -0.287387  0.970651 -0.091902
c -0.645819 -2.038054  1.432194  0.392428
d -1.133024 -0.440666  0.127903 -1.276678
e -0.031939 -1.634777 -1.706990 -0.316842
--------------------------------------------------
a    0.847275
b    0.173780
c   -0.645819
d   -1.133024
e   -0.031939
Name: a, dtype: float64
--------------------------------------------------
a    0.847275
b    0.524755
c    1.137014
d    0.629950
Name: a, dtype: float64
--------------------------------------------------


In [20]:
# 第一个参数是行索引，第二个参数是列,loc或者iloc效率高于直接用取下标的方式，前闭后闭
print(df_obj.loc['a':'c', 'b':'d']) #连续索引
print(df_obj.loc[['a','c'], ['b','d']]) #不连续索引
print(df_obj.loc[['c'],['b']]) #取一个值,返回的是DataFrame类型
print(df_obj.loc['c','b'])  #取一个值

          b         c         d
a  0.524755  1.137014  0.629950
b -0.287387  0.970651 -0.091902
c -2.038054  1.432194  0.392428
          b         d
a  0.524755  0.629950
c -2.038054  0.392428
          b
c -2.038054
-2.038054430409086


## iloc 位置索引(推荐使用)——基于整数位置的索引（integer position-based）使用从 0 开始的整数位置进行选择

In [22]:
ser_obj
print('-'*50)
# Series
print(ser_obj[1:3])
print('-'*50)
print(ser_obj.iloc[1:3]) # 前闭后开[)，效率高

--------------------------------------------------
b    1
c    2
dtype: int64
--------------------------------------------------
b    1
c    2
dtype: int64


In [23]:
df_obj

Unnamed: 0,a,b,c,d
a,0.847275,0.524755,1.137014,0.62995
b,0.17378,-0.287387,0.970651,-0.091902
c,-0.645819,-2.038054,1.432194,0.392428
d,-1.133024,-0.440666,0.127903,-1.276678
e,-0.031939,-1.634777,-1.70699,-0.316842


In [24]:
# DataFrame，iloc是前闭后开[)
print(df_obj)
print('-'*50)
print(df_obj.iloc[0:2, 0:2]) 
print('-'*50)
print(df_obj.iloc[[0,2], [0,2]]) # 不连续索引
print('-'*50)
print(df_obj.iloc[0,0]) # 取一个值

          a         b         c         d
a  0.847275  0.524755  1.137014  0.629950
b  0.173780 -0.287387  0.970651 -0.091902
c -0.645819 -2.038054  1.432194  0.392428
d -1.133024 -0.440666  0.127903 -1.276678
e -0.031939 -1.634777 -1.706990 -0.316842
--------------------------------------------------
          a         b
a  0.847275  0.524755
b  0.173780 -0.287387
--------------------------------------------------
          a         c
a  0.847275  1.137014
c -0.645819  1.432194
--------------------------------------------------
0.8472752653507258


In [25]:
#没有设置行和列索引的DataFrame，iloc和loc的区别
df_obj2 = pd.DataFrame(np.random.randn(5,4))
print(df_obj2)
print('-'*50)
print(df_obj2.iloc[0:2]) #左闭右开 2行
print('-'*50)
print(df_obj2.loc[0:2]) #左闭右闭 3行

          0         1         2         3
0 -0.081020  0.519439  0.922273  1.446927
1 -2.358938 -1.366378  0.660049  1.444101
2 -0.837307 -1.794773  0.515618  0.676264
3  0.373140 -1.436584 -0.390028  0.391201
4 -1.300621  0.966678  1.484387  0.449329
--------------------------------------------------
          0         1         2         3
0 -0.081020  0.519439  0.922273  1.446927
1 -2.358938 -1.366378  0.660049  1.444101
--------------------------------------------------
          0         1         2         3
0 -0.081020  0.519439  0.922273  1.446927
1 -2.358938 -1.366378  0.660049  1.444101
2 -0.837307 -1.794773  0.515618  0.676264


# 5.对齐运算

In [26]:
import pandas as pd
s1 = pd.Series(range(10, 20), index = range(10))
s2 = pd.Series(range(20, 25), index = range(5))
# Series 对齐运算
print('s1+s2: ')
s3=s1+s2
print(s3)  #缺失数据默认是NaN  np.nan

s1+s2: 
0    30.0
1    32.0
2    34.0
3    36.0
4    38.0
5     NaN
6     NaN
7     NaN
8     NaN
9     NaN
dtype: float64


In [27]:
#两个长度不同的一维ndarray相加
a1 = np.array([1,2,3,4,5])
a2 = np.array([1]) # 长度为1
print(a2.shape)
print(a1+a2)

(1,)
[2 3 4 5 6]


In [28]:
print(s2)
s1

0    20
1    21
2    22
3    23
4    24
dtype: int64


0    10
1    11
2    12
3    13
4    14
5    15
6    16
7    17
8    18
9    19
dtype: int64

In [32]:
print(np.isnan(s3[6]))
print('-'*50)
print(s2.add(s1, fill_value = 0))  #未对齐的数据将和填充值做运算
print(s2.sub(s1, fill_value = 0)) #s2-s1

True
--------------------------------------------------
0    30.0
1    32.0
2    34.0
3    36.0
4    38.0
5    15.0
6    16.0
7    17.0
8    18.0
9    19.0
dtype: float64
0    10.0
1    10.0
2    10.0
3    10.0
4    10.0
5   -15.0
6   -16.0
7   -17.0
8   -18.0
9   -19.0
dtype: float64


In [34]:
#df的对齐运算
import numpy as np
df1 = pd.DataFrame(np.ones((2,2)), columns = ['a', 'b'])#np.ones()用于创建一个指定形状的全 1 数组，这里是2行2列的数组
df2 = pd.DataFrame(np.ones((3,3)), columns = ['a', 'b', 'c'])
print(df1)
print(df2)
print('-'*50)
print(df2.dtypes)
print(df1-df2)
print(df2.sub(df1, fill_value = 2)) #未对齐的数据将和填充值做运算

     a    b
0  1.0  1.0
1  1.0  1.0
     a    b    c
0  1.0  1.0  1.0
1  1.0  1.0  1.0
2  1.0  1.0  1.0
--------------------------------------------------
a    float64
b    float64
c    float64
dtype: object
     a    b   c
0  0.0  0.0 NaN
1  0.0  0.0 NaN
2  NaN  NaN NaN
     a    b    c
0  0.0  0.0 -1.0
1  0.0  0.0 -1.0
2 -1.0 -1.0 -1.0


# 总结：没对齐的元素，默认填充NaN，对齐运算时，fill_value参数可以指定填充值。