# 資料的索引和選擇
複習存取、設定、以及修改NumPy陣列值的方法和工具
* 索引 (ex : arr[2,1])
* 切片 (ex : arr[:,1:5])
* 遮罩 (ex : arr[arr > 0])
* fancy索引 (ex : arr[0,[1,5]])
* 組合 (ex : arr[:,[1,5]])

In [1]:
# Selecting data from a Series: treat the Series like a dictionary.
import pandas as pd
# The dict keys become the index labels and the dict values become the data.
data = pd.Series({'a': 0.25, 'b': 0.5, 'c': 0.75, 'd': 1.0})
data['b']

0.5

In [2]:
# Dictionary-style membership test: checks whether 'a' is one of the index labels.
'a' in data


True

In [3]:
# keys() returns the index, mirroring dict.keys().
data.keys()

Index(['a', 'b', 'c', 'd'], dtype='object')

In [4]:
# items() yields (label, value) pairs, mirroring dict.items().
list(data.items())

[('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0)]

In [5]:
# Assigning to a label that does not exist yet extends the Series,
# just like adding a new key to a dictionary.
data['e'] = 1.25
data

a    0.25
b    0.50
c    0.75
d    1.00
e    1.25
dtype: float64

In [6]:
# Treat the Series as a one-dimensional array.
# Slicing: the label slice 'a':'c' includes its end label, whereas the
# integer slice 0:2 excludes its end position (compare the two results).
data['a':'c'], data[0:2]

(a    0.25
 b    0.50
 c    0.75
 dtype: float64,
 a    0.25
 b    0.50
 dtype: float64)

In [7]:
# Masking: keep only the values for which the boolean condition is True.
data[(data>0.3)&(data<0.8)]

b    0.50
c    0.75
dtype: float64

In [8]:
# Fancy indexing: pass a list of labels to select several entries at once.
data[['a','e']]

a    0.25
e    1.25
dtype: float64

## 重要!! : Indexer : loc、iloc、ix（注意：ix 自 pandas 0.20 起已棄用，並於 pandas 1.0 移除；請改用 loc 與 iloc）


In [9]:
# A Series whose explicit index is made of integers (1, 3, 5).
# Plain indexing data[1] looks up the explicit label 1, while the slice
# data[1:3] works on implicit positions 1 and 2 -- the source of confusion
# that loc / iloc are meant to remove.
data = pd.Series({1: 'a', 3: 'b', 5: 'c'})
data, data[1], data[1:3]

(1    a
 3    b
 5    c
 dtype: object,
 'a',
 3    b
 5    c
 dtype: object)

In [10]:
# The loc attribute indexes and slices by the explicit index labels.
data.loc[1], data.loc[0:5] # uses the index labels: every label from 0 through 5

('a',
 1    a
 3    b
 5    c
 dtype: object)

In [11]:
# The iloc attribute indexes and slices by the implicit Python-style position.
data.iloc[1], data.iloc[0:3] # ignores the index labels: positions 0 up to (not including) 3

('b',
 1    a
 3    b
 5    c
 dtype: object)

### 在 DataFrame 中選取資料


In [16]:
# Two Series that share the same state labels, one per future column.
area = pd.Series(
    [38332521, 16549845, 6549846, 1564987, 1234567],
    index=['California', 'Texas', 'New York', 'Florida', 'Illinois'],
)
pop = pd.Series(
    [50000000, 60000000, 70000000, 80000000, 90000000],
    index=area.index,
)

# Combine them into a DataFrame: dict keys become the column names and the
# shared state labels become the row index.
data = pd.DataFrame({'area': area, 'pop': pop})
data

Unnamed: 0,area,pop
California,38332521,50000000
Texas,16549845,60000000
New York,6549846,70000000
Florida,1564987,80000000
Illinois,1234567,90000000


In [17]:
# A column can be read dictionary-style (data['area']) or attribute-style (data.area).
data['area'], data.area

(California    38332521
 Texas         16549845
 New York       6549846
 Florida        1564987
 Illinois       1234567
 Name: area, dtype: int64,
 California    38332521
 Texas         16549845
 New York       6549846
 Florida        1564987
 Illinois       1234567
 Name: area, dtype: int64)

In [18]:
# Naming caveat: pop is also a DataFrame method, so data.pop refers to the
# method rather than the 'pop' column (hence the False below).
# Using data['pop'] is the safe way to read the column.
data.area is data['area'], data.pop is data['pop']

(True, False)

In [19]:
# Dictionary-style assignment adds a new column computed element-wise from existing ones.
data['density'] = data['pop'] / data['area']
data

Unnamed: 0,area,pop,density
California,38332521,50000000,1.304375
Texas,16549845,60000000,3.625412
New York,6549846,70000000,10.687274
Florida,1564987,80000000,51.118635
Illinois,1234567,90000000,72.900053


In [20]:
# Transpose the DataFrame: rows become columns and columns become rows.
data.T

Unnamed: 0,California,Texas,New York,Florida,Illinois
area,38332520.0,16549840.0,6549846.0,1564987.0,1234567.0
pop,50000000.0,60000000.0,70000000.0,80000000.0,90000000.0
density,1.304375,3.625412,10.68727,51.11864,72.90005


In [22]:
# Index the underlying array of values: [0] picks the first row (California).
data.values[0]

array([3.83325210e+07, 5.00000000e+07, 1.30437547e+00])

In [23]:
# Passing a single label to the DataFrame selects a column, not a row.
data['area']

California    38332521
Texas         16549845
New York       6549846
Florida        1564987
Illinois       1234567
Name: area, dtype: int64

In [24]:
data.iloc[:3,:2] # same as data.loc[:'New York', :'pop'] (label slices include their endpoint; 'New York' is the third row)

Unnamed: 0,area,pop
California,38332521,50000000
Texas,16549845,60000000
New York,6549846,70000000


In [27]:
# Masking combined with fancy indexing: rows where density > 50,
# restricted to the 'pop' and 'density' columns.
data.loc[data.density > 50 ,['pop', 'density']]

Unnamed: 0,pop,density
Florida,80000000,51.118635
Illinois,90000000,72.900053


In [28]:
# Indexers can also assign: overwrite row 0, column 2 (California's density) with 90.
# Note this modifies data in place, so later cells see the new value.
data.iloc[0,2] = 90
data

Unnamed: 0,area,pop,density
California,38332521,50000000,90.0
Texas,16549845,60000000,3.625412
New York,6549846,70000000,10.687274
Florida,1564987,80000000,51.118635
Illinois,1234567,90000000,72.900053


In [30]:
# Slicing the DataFrame directly selects rows (here positions 1 and 2), not columns.
data[1:3]

Unnamed: 0,area,pop,density
Texas,16549845,60000000,3.625412
New York,6549846,70000000,10.687274


In [31]:
# A boolean mask applied directly to the DataFrame filters rows.
# California is included because its density was overwritten with 90 earlier.
data[data.density > 50]

Unnamed: 0,area,pop,density
California,38332521,50000000,90.0
Florida,1564987,80000000,51.118635
Illinois,1234567,90000000,72.900053
