In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Limit DataFrame displays to 20 rows. The fully qualified key is used because the short
# pattern 'max_rows' matches more than one option in newer pandas releases and raises OptionError.
pd.set_option('display.max_rows', 20)

# Reset matplotlib to its stock style, then override figure size and font family in a single call
plt.style.use('default')
plt.rcParams.update({
    'figure.figsize': (12, 3),
    'font.family': 'sans-serif',
})

In [3]:
# Location of the CSV file: path plus file name, relative to the current directory
# (os.curdir is the path of the current directory)
Turbofan_FILEPATH = os.path.join(os.curdir, 'data', 'turbofan.csv')
# Echo the resolved location as a sanity check
print(Turbofan_FILEPATH)
# Read the CSV into a DataFrame named df, and keep a separate copy under df_bak
df = pd.read_csv(Turbofan_FILEPATH)
df_bak = df.copy(deep=True)

./data/turbofan.csv


In [4]:
# Peek at the first rows of the freshly loaded DataFrame
df.head()

Unnamed: 0,unit_number,time_in_cycles,op_setting_1,op_setting_2,op_setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_18,sensor_19,sensor_20,sensor_21,sensor_22,sensor_23,sensor_24,sensor_25,sensor_26,sensor_27
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,2388,100.0,39.06,23.419,,,,,,
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,2388,100.0,39.0,23.4236,,,,,,
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,2388,100.0,38.95,23.3442,,,,,,
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,2388,100.0,38.88,23.3739,,,,,,
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,2388,100.0,38.9,23.4044,,,,,,


In [5]:
# Read the raw, unprocessed data file
Turbofan_ORIG_FILEPATH = os.path.join(os.curdir, 'data', 'train_PM.txt')

# The file is parsed as space-delimited with no header row, so columns get integer labels;
# a separate copy is kept under df_orig_bak
df_orig = pd.read_csv(filepath_or_buffer=Turbofan_ORIG_FILEPATH, header=None, sep=' ')
df_orig_bak = df_orig.copy(deep=True)

In [6]:
# Peek at the first rows of the raw data (integer column labels, because it was read with header=None)
df_orig.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,20,21,22,23,24,25,26,27
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419,,
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236,,
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442,,
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739,,
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044,,


# 第 2 章：觀察資料

在 Pandas 中，資料的基本物件有兩種，分別是一維 (one-dimension) 的 `Series` 與二維 (two-dimension) 的 `DataFrame`。我們在上個章節已經將資料讀取成為 DataFrame 物件了，接著就帶大家操作相關的功能，來了解資料本身的樣態。

以下我們將介紹 Pandas 中，最基礎的兩個物件：`DataFrame` 和 `Series` 的相關操作，但會以 DataFrame 為主。大多數的操作都可以套用在 DataFrame 和 Series 上面，不妨自行嘗試看看。

參考文件：

* [pandas.DataFrame](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html)
* [pandas.Series](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.html)
* [10 Minutes to pandas | Object Creation](https://pandas.pydata.org/pandas-docs/stable/user_guide/10min.html)
* [Intro to Data Structures](https://pandas.pydata.org/pandas-docs/stable/user_guide/dsintro.html)

## 觀察資料

### `.head()` : 取得前 n Rows 的資料

`.head()` function 將回傳前 n 個 rows 的資料，預設為前 5 個 rows。

* 常用 Parameters : 
     * n：回傳 row 的數量，預設為 5
* 參考文件：
    * [pandas.DataFrame.head](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.head.html)
    * [pandas.Series.head](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.head.html)



In [7]:
# Get the first 5 rows (the default when n is omitted)
df.head()

Unnamed: 0,unit_number,time_in_cycles,op_setting_1,op_setting_2,op_setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_18,sensor_19,sensor_20,sensor_21,sensor_22,sensor_23,sensor_24,sensor_25,sensor_26,sensor_27
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,2388,100.0,39.06,23.419,,,,,,
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,2388,100.0,39.0,23.4236,,,,,,
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,2388,100.0,38.95,23.3442,,,,,,
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,2388,100.0,38.88,23.3739,,,,,,
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,2388,100.0,38.9,23.4044,,,,,,


In [8]:
# Get the first 10 rows
df.head(10)

Unnamed: 0,unit_number,time_in_cycles,op_setting_1,op_setting_2,op_setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_18,sensor_19,sensor_20,sensor_21,sensor_22,sensor_23,sensor_24,sensor_25,sensor_26,sensor_27
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,2388,100.0,39.06,23.419,,,,,,
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,2388,100.0,39.0,23.4236,,,,,,
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,2388,100.0,38.95,23.3442,,,,,,
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,2388,100.0,38.88,23.3739,,,,,,
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,2388,100.0,38.9,23.4044,,,,,,
5,1,6,-0.0043,-0.0001,100.0,518.67,642.1,1584.47,1398.37,14.62,...,2388,100.0,38.98,23.3669,,,,,,
6,1,7,0.001,0.0001,100.0,518.67,642.48,1592.32,1397.77,14.62,...,2388,100.0,39.1,23.3774,,,,,,
7,1,8,-0.0034,0.0003,100.0,518.67,642.56,1582.96,1400.97,14.62,...,2388,100.0,38.97,23.3106,,,,,,
8,1,9,0.0008,0.0001,100.0,518.67,642.12,1590.98,1394.8,14.62,...,2388,100.0,39.05,23.4066,,,,,,
9,1,10,-0.0033,0.0001,100.0,518.67,641.71,1591.24,1400.46,14.62,...,2388,100.0,38.95,23.4694,,,,,,


### `.tail()` : 取得倒數 n Rows 的資料

`.tail()` function 將回傳倒數 n 個 rows 的資料，預設為倒數 5 個 rows。

* 常用 Parameters : 
     * n：回傳 row 的數量，預設為 5
* 參考文件：
    * [pandas.DataFrame.tail](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.tail.html)
    * [pandas.Series.tail](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.tail.html)


In [9]:
# Get the last 5 rows (the default when n is omitted)
df.tail()

Unnamed: 0,unit_number,time_in_cycles,op_setting_1,op_setting_2,op_setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_18,sensor_19,sensor_20,sensor_21,sensor_22,sensor_23,sensor_24,sensor_25,sensor_26,sensor_27
20626,100,196,-0.0004,-0.0003,100.0,518.67,643.49,1597.98,1428.63,14.62,...,2388,100.0,38.49,22.9735,,,,,,
20627,100,197,-0.0016,-0.0005,100.0,518.67,643.54,1604.5,1433.58,14.62,...,2388,100.0,38.3,23.1594,,,,,,
20628,100,198,0.0004,0.0,100.0,518.67,643.42,1602.46,1428.18,14.62,...,2388,100.0,38.44,22.9333,,,,,,
20629,100,199,-0.0011,0.0003,100.0,518.67,643.23,1605.26,1426.53,14.62,...,2388,100.0,38.29,23.064,,,,,,
20630,100,200,-0.0032,-0.0005,100.0,518.67,643.85,1600.38,1432.14,14.62,...,2388,100.0,38.37,23.0522,,,,,,


In [10]:
# Get the last 10 rows
df.tail(10)

Unnamed: 0,unit_number,time_in_cycles,op_setting_1,op_setting_2,op_setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_18,sensor_19,sensor_20,sensor_21,sensor_22,sensor_23,sensor_24,sensor_25,sensor_26,sensor_27
20621,100,191,-0.0005,-0.0,100.0,518.67,643.69,1610.87,1427.19,14.62,...,2388,100.0,38.39,23.1218,,,,,,
20622,100,192,-0.0009,0.0001,100.0,518.67,643.53,1601.23,1419.48,14.62,...,2388,100.0,38.56,23.077,,,,,,
20623,100,193,-0.0001,0.0002,100.0,518.67,643.09,1599.81,1428.93,14.62,...,2388,100.0,38.47,23.023,,,,,,
20624,100,194,-0.0011,0.0003,100.0,518.67,643.72,1597.29,1427.41,14.62,...,2388,100.0,38.38,23.1324,,,,,,
20625,100,195,-0.0002,-0.0001,100.0,518.67,643.41,1600.04,1431.9,14.62,...,2388,100.0,38.14,23.1923,,,,,,
20626,100,196,-0.0004,-0.0003,100.0,518.67,643.49,1597.98,1428.63,14.62,...,2388,100.0,38.49,22.9735,,,,,,
20627,100,197,-0.0016,-0.0005,100.0,518.67,643.54,1604.5,1433.58,14.62,...,2388,100.0,38.3,23.1594,,,,,,
20628,100,198,0.0004,0.0,100.0,518.67,643.42,1602.46,1428.18,14.62,...,2388,100.0,38.44,22.9333,,,,,,
20629,100,199,-0.0011,0.0003,100.0,518.67,643.23,1605.26,1426.53,14.62,...,2388,100.0,38.29,23.064,,,,,,
20630,100,200,-0.0032,-0.0005,100.0,518.67,643.85,1600.38,1432.14,14.62,...,2388,100.0,38.37,23.0522,,,,,,


### `.shape`：回傳 rows 數量 和 columns 數量

回傳一個標示資料形狀的 tuple。

參考文件：

* [pandas.DataFrame.shape](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.shape.html)
* [pandas.Series.shape](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.shape.html)

In [11]:
df.shape
# How to read the result:
# the first element is the number of rows,
# the second element is the number of columns;
# both counts include numeric values, non-numeric values (NaN) and empty values alike

(20631, 32)

> 備註：
>
> 觀察一下底下兩種查詢資料的寫法，再觀察各自的 type 與 shape

In [12]:
# 1: double brackets (a list of column labels) select a DataFrame
type(df[['unit_number']])

pandas.core.frame.DataFrame

In [13]:
# The DataFrame selection keeps two dimensions: (rows, 1)
df[['unit_number']].shape

(20631, 1)

In [14]:
# 2: single brackets (one column label) select a Series
type(df['unit_number'])

pandas.core.series.Series

In [15]:
# The Series selection is one-dimensional: (rows,)
df['unit_number'].shape

(20631,)

### `.index`：回傳 Row Labels

回傳所有 index (row labels)。

參考文件：

* [pandas.DataFrame.index](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.index.html)
* [pandas.Series.index](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.index.html)

In [16]:
# Inspect the index (row labels) of the data
df.index

RangeIndex(start=0, stop=20631, step=1)

### `.columns` ： 回傳 Column Labels

透過這個屬性以**取得**、或**修改** Column 的名稱 (column labels)。

參考文件：

* [pandas.DataFrame.columns](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.columns.html)

In [17]:
# Inspect the column names of the data
df.columns

Index(['unit_number', 'time_in_cycles', 'op_setting_1', 'op_setting_2',
       'op_setting_3', 'sensor_1', 'sensor_2', 'sensor_3', 'sensor_4',
       'sensor_5', 'sensor_6', 'sensor_7', 'sensor_8', 'sensor_9', 'sensor_10',
       'sensor_11', 'sensor_12', 'sensor_13', 'sensor_14', 'sensor_15',
       'sensor_16', 'sensor_17', 'sensor_18', 'sensor_19', 'sensor_20',
       'sensor_21', 'sensor_22', 'sensor_23', 'sensor_24', 'sensor_25',
       'sensor_26', 'sensor_27'],
      dtype='object')

#### 同場加映：修改 Column 的內容

In [18]:
# Changing column names: start from a small two-column DataFrame
tmp = pd.DataFrame(data=dict(A=[1, 2], B=[3, 4]))
tmp

Unnamed: 0,A,B
0,1,3
1,2,4


In [19]:
# Rename the columns by assigning a new list of labels to .columns
tmp.columns = ['Col1', 'Col2']
tmp

Unnamed: 0,Col1,Col2
0,1,3
1,2,4


### `.info()` : 回傳 DataFrame 的彙整資訊

調用 DataFrame 的 `.info()` 這個 method 會回傳包含了

* `index dtype`：Index 的資料型態
* `column dtypes`：Column 的資料型態
* `non-null values`：非空值的個數
* `memory usage`：物件占用多少記憶體空間

等等的資訊。

參考文件：[pandas.DataFrame.info](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.info.html)

In [20]:
# Show the summary information of the data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20631 entries, 0 to 20630
Data columns (total 32 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   unit_number     20631 non-null  int64  
 1   time_in_cycles  20631 non-null  int64  
 2   op_setting_1    20631 non-null  float64
 3   op_setting_2    20631 non-null  float64
 4   op_setting_3    20631 non-null  float64
 5   sensor_1        20631 non-null  float64
 6   sensor_2        20631 non-null  float64
 7   sensor_3        20631 non-null  float64
 8   sensor_4        20631 non-null  float64
 9   sensor_5        20631 non-null  float64
 10  sensor_6        20631 non-null  float64
 11  sensor_7        20631 non-null  float64
 12  sensor_8        20631 non-null  float64
 13  sensor_9        20631 non-null  float64
 14  sensor_10       20631 non-null  float64
 15  sensor_11       20631 non-null  float64
 16  sensor_12       20631 non-null  float64
 17  sensor_13       20631 non-null 

### `.describe()` : 回傳敘述性統計分析的結果

調用 `describe()` method 以製作一份資料的敘述性統計分析，是用來觀察資料分布的技巧。

敘述性統計的分析結果通常還會搭配一些視覺化的工具來輔助觀察，這些工具將在後面的小節中提及。

* 常用 Parameter：
    * `include`：可以指定一連串 dtype 的白名單，讓敘述性統計分析包含這些型態的 column，或是指定字串 `'all'` 來包含所有的 column
* 參考文件：
    * [pandas.DataFrame.describe](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.describe.html)
    * [pandas.Series.describe](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.describe.html)
    * [Wikipedia | 描述統計學](https://zh.wikipedia.org/zh-tw/描述統計學)

In [21]:
# Use df to examine the distribution of every column in the DataFrame
# Note that some of the data is NaN; statistics that cannot be computed are shown as NaN
df.describe()

Unnamed: 0,unit_number,time_in_cycles,op_setting_1,op_setting_2,op_setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_18,sensor_19,sensor_20,sensor_21,sensor_22,sensor_23,sensor_24,sensor_25,sensor_26,sensor_27
count,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,...,20631.0,20631.0,20631.0,20631.0,0.0,0.0,0.0,0.0,0.0,0.0
mean,51.506568,108.807862,-9e-06,2e-06,100.0,518.67,642.680934,1590.523119,1408.933782,14.62,...,2388.0,100.0,38.816271,23.289705,,,,,,
std,29.227633,68.88099,0.002187,0.000293,0.0,0.0,0.500053,6.13115,9.000605,1.7764e-15,...,0.0,0.0,0.180746,0.108251,,,,,,
min,1.0,1.0,-0.0087,-0.0006,100.0,518.67,641.21,1571.04,1382.25,14.62,...,2388.0,100.0,38.14,22.8942,,,,,,
25%,26.0,52.0,-0.0015,-0.0002,100.0,518.67,642.325,1586.26,1402.36,14.62,...,2388.0,100.0,38.7,23.2218,,,,,,
50%,52.0,104.0,0.0,0.0,100.0,518.67,642.64,1590.1,1408.04,14.62,...,2388.0,100.0,38.83,23.2979,,,,,,
75%,77.0,156.0,0.0015,0.0003,100.0,518.67,643.0,1594.38,1414.555,14.62,...,2388.0,100.0,38.95,23.3668,,,,,,
max,100.0,362.0,0.0087,0.0006,100.0,518.67,644.53,1616.91,1441.49,14.62,...,2388.0,100.0,39.43,23.6184,,,,,,


#### 解釋 8 個統計量
* `count`：個數
* `mean`：平均值
* `std`：標準差
* `min`：最小值
* `25%`：第一四分位數數值
* `50%`：第二四分位數數值
* `75%`：第三四分位數數值
* `max`：最大值

In [22]:
# Summarise a ten-value toy column so the eight statistics can be checked by hand;
# the percentiles are spelled out explicitly and are the same as describe()'s defaults
tmp = pd.DataFrame(data={'A': [1, 2, 5, 7, 9, 3, 4, 8, 6, 10]}).describe(percentiles=[0.25, 0.5, 0.75])
tmp

Unnamed: 0,A
count,10.0
mean,5.5
std,3.02765
min,1.0
25%,3.25
50%,5.5
75%,7.75
max,10.0


In [23]:
# Rank positions n*q for n = 10 values at q = 0.25, 0.5 and 0.75
# NOTE(review): the 25%/50%/75% values reported by describe() for this data (3.25, 5.5, 7.75)
# correspond to linear interpolation at zero-based positions (n-1)*q = 2.25, 4.5, 6.75 of the
# sorted values, not n*q — confirm which formula this cell is meant to illustrate
10*0.25, 10*0.5, 10*0.75

(2.5, 5.0, 7.5)

In [24]:
# Sort the same ten values in ascending order so the quartile positions can be read off by eye
pd.DataFrame({'A': [1, 2, 5, 7, 9, 3, 4, 8, 6, 10]}).sort_values(by="A")

Unnamed: 0,A
0,1
1,2
5,3
6,4
2,5
8,6
3,7
7,8
4,9
9,10


### `.dtypes` : 回傳每一個 Column 的 Data type

若是調用 `DataFrame.dtypes`，將回傳一個帶有所有 column 的 data type 的 Series，調用 `Series.dtypes`（或是 `Series.dtype`），將回傳 data type object。調用這些 Attribute 所得到的結果與 `.info()` 中各 `columns dtype` 項目是相同的。

Data types 的說明則請參閱 [Numpy 的官方文件](https://numpy.org/doc/stable/user/basics.types.html)。

參考文件：

* [pandas.DataFrame.dtypes](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.dtypes.html)
* [pandas.Series.dtype](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.dtype.html)
* [Data types](https://numpy.org/doc/stable/user/basics.types.html)


In [25]:
# List the data type of every column
df.dtypes

unit_number         int64
time_in_cycles      int64
op_setting_1      float64
op_setting_2      float64
op_setting_3      float64
                   ...   
sensor_23         float64
sensor_24         float64
sensor_25         float64
sensor_26         float64
sensor_27         float64
Length: 32, dtype: object

### `.unique()`, `.nunique()`：檢視無重複的資料

觀察類別資料時，我們常常會想到要觀察「**不重複**的所有資料」，這時可以對 Series 調用 `unique()` method 取得所有不重複的值，或是調用 `nunique()` method 取得不重複值的個數（預設排除 `NaN` 值）。

如果還要計算出各類別的個數，還可以使用 `.value_counts()` method。

參考文件：

* [pandas.unique](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.unique.html)

In [26]:
# Calling the .unique() method directly on the Series is all that is needed
df.loc[:, 'unit_number'].unique()

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
        92,  93,  94,  95,  96,  97,  98,  99, 100])

In [27]:
# 或是調用 Numpy 的 set_printoptions() Function
# np.set_printoptions(suppress=True)

### `.value_counts()`：計算 series 中每一個數值的出現次數，並回傳由大到小排序的 series

`value_counts()` method 會回傳 Series 中，每個數值出現的次數的 Series。

預設的排序方式是以出現次數做降冪排序（出現次數最多的排在最前面），如果想要更改排序的方式，可以搭配 `sort_values()` 或是 `sort_index()` method。

* 參考文件：
    * [pandas.Series.value_counts](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.value_counts.html)
    * [pandas.Series.sort_index](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.sort_index.html)
    * [pandas.Series.sort_values](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.sort_values.html)

In [28]:
# Example: count how many times each value occurs in unit_number
df['unit_number'].value_counts()

69    362
92    341
96    336
67    313
83    293
     ... 
24    147
57    137
70    137
91    135
39    128
Name: unit_number, Length: 100, dtype: int64

In [29]:
# Show the ten units with the longest working cycles (the most rows per unit_number)
df.loc[:, 'unit_number'].value_counts().head(10)

69    362
92    341
96    336
67    313
83    293
2     287
95    283
64    283
86    278
17    276
Name: unit_number, dtype: int64

## 選擇資料

### `.loc` : 以 Row/Column Label 為查詢基礎來取得資料


調用 `.loc` attribute 以獲得想要觀察的資料。

`.loc` 是以 row/column 的 **Label** 作為查詢的基礎，如果調用的對象是 DataFrame，則可以傳入兩個維度的 Slice 標記，分別為 Row labels 和 Column labels，如果是 Series 則僅傳入 Row labels 就可以了。最後依據查詢的範圍不同，可能回傳 DataFrame, Series 或單一 object。

傳入的 labels 可以是 Iterable（可迭代的）object、用 slice 的標記格式表示的一個區間，當然也可以是單一物件。

* 參考文件：
    * [pandas.DataFrame.loc](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.loc.html)
    * [pandas.Series.loc](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.loc.html)
    * [10 Minutes to pandas | Selection](https://pandas.pydata.org/pandas-docs/stable/user_guide/10min.html#selection)
    * [Indexing and Selecting Data](https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html)


In [30]:
# Using df as the example:
# select op_setting_1, op_setting_2 and op_setting_3 (column labels 'op_setting_1', 'op_setting_2', 'op_setting_3') for row labels 3 through 7
df.loc[3:7, ['op_setting_1', 'op_setting_2', 'op_setting_3']]
# The following form is equivalent:
# df.loc[3:7, 'op_setting_1':'op_setting_3']

Unnamed: 0,op_setting_1,op_setting_2,op_setting_3
3,0.0007,0.0,100.0
4,-0.0019,-0.0002,100.0
5,-0.0043,-0.0001,100.0
6,0.001,0.0001,100.0
7,-0.0034,0.0003,100.0


> 備註：
>
> 與一般 Python 語法在操作 Slice 時稍有不同的地方是：`.loc[]` 的 Slice 是包含上界 label 的。
>
> 例：`df.loc[3:5, :]` 會回傳 Index label 為 `3, 4, 5` 的 rows。
>
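> 不過要注意：`.loc` 是以 label 查詢，負數會被當成 label，而不是「倒數第幾個」的位置。
>
> 例：在 index 從 `0` 開始的 `df` 上，`df.loc[-3:, :]` 無法改寫為 `df.loc[-3:0, :]`，後者只會回傳 label 介於 `-3` 與 `0` 之間的 rows（也就是 index = `0` 的那一個 row）；若要取得倒數 3 個 rows，請改用 `df.iloc[-3:, :]` 或 `df.tail(3)`。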

In [31]:
# Using df as the example:
# select every row and every column
# The two forms below are equivalent
df.loc[:]  # the column indexer may be omitted
# df.loc[:, :]

Unnamed: 0,unit_number,time_in_cycles,op_setting_1,op_setting_2,op_setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_18,sensor_19,sensor_20,sensor_21,sensor_22,sensor_23,sensor_24,sensor_25,sensor_26,sensor_27
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.70,1400.60,14.62,...,2388,100.0,39.06,23.4190,,,,,,
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,2388,100.0,39.00,23.4236,,,,,,
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.20,14.62,...,2388,100.0,38.95,23.3442,,,,,,
3,1,4,0.0007,0.0000,100.0,518.67,642.35,1582.79,1401.87,14.62,...,2388,100.0,38.88,23.3739,,,,,,
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,2388,100.0,38.90,23.4044,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20626,100,196,-0.0004,-0.0003,100.0,518.67,643.49,1597.98,1428.63,14.62,...,2388,100.0,38.49,22.9735,,,,,,
20627,100,197,-0.0016,-0.0005,100.0,518.67,643.54,1604.50,1433.58,14.62,...,2388,100.0,38.30,23.1594,,,,,,
20628,100,198,0.0004,0.0000,100.0,518.67,643.42,1602.46,1428.18,14.62,...,2388,100.0,38.44,22.9333,,,,,,
20629,100,199,-0.0011,0.0003,100.0,518.67,643.23,1605.26,1426.53,14.62,...,2388,100.0,38.29,23.0640,,,,,,


In [32]:
# Select op_setting_1 (column label 'op_setting_1')
# for row labels 1000 through 1100
# The forms below are not all equivalent, although the data they return means the same thing;
# only the last expression is displayed as the cell output
df.loc[1000:1100, 'op_setting_1']  # returns a Series
df.loc[1000:1100, ('op_setting_1')]  # also returns a Series: parentheses without a comma are just the same string label, not a tuple
df.loc[1000:1100, ['op_setting_1']]  # returns a DataFrame

Unnamed: 0,op_setting_1
1000,-0.0019
1001,0.0005
1002,-0.0012
1003,0.0006
1004,-0.0001
...,...
1096,0.0004
1097,-0.0031
1098,-0.0013
1099,0.0032


#### 如果 dataframe 的 index label 是『時間』...

> [NOTE]
>
> 以公司資料來說，dataframe 的 index label 是『時間』。
>
> 剛好渦輪風扇發動機資料集的index label不是時間，所以在這裏建立一個測試的 dataframe 解說

In [33]:
# Build a demo DataFrame indexed by hourly timestamps from 2020-01-01 00:00 through 2020-02-01 00:00.
# 'h' is the hourly frequency alias; the upper-case 'H' spelling is deprecated in recent pandas.
dates = pd.date_range(start='2020-01-01', end='2020-02-01', freq='h')
# Size the random matrix from the index instead of hard-coding 745 so the two cannot drift apart,
# and draw from a seeded RandomState so the displayed values are reproducible on every run
# without touching NumPy's global random state.
tmp = pd.DataFrame(np.random.RandomState(42).randn(len(dates), 4), index=dates, columns=list('ABCD'))
tmp

Unnamed: 0,A,B,C,D
2020-01-01 00:00:00,-1.245191,0.023987,0.449775,-1.120604
2020-01-01 01:00:00,1.709980,-0.205141,-0.135362,-1.268684
2020-01-01 02:00:00,0.239082,-1.087192,0.815949,-1.793432
2020-01-01 03:00:00,-0.588527,-1.485335,0.913325,0.369256
2020-01-01 04:00:00,0.251370,0.274383,-0.348956,-0.670142
...,...,...,...,...
2020-01-31 20:00:00,0.285163,-0.602996,-0.229520,-0.231623
2020-01-31 21:00:00,0.894860,1.167568,-1.149798,1.230772
2020-01-31 22:00:00,0.914468,0.988474,1.067244,-1.410097
2020-01-31 23:00:00,-0.626234,-1.623535,1.014417,0.787523


In [34]:
# Label-based slice on a datetime index: both end points are included, so this returns 25 hourly rows of column 'A'
tmp.loc['2020-01-01 00:00:00':'2020-01-02 00:00:00', 'A']

2020-01-01 00:00:00   -1.245191
2020-01-01 01:00:00    1.709980
2020-01-01 02:00:00    0.239082
2020-01-01 03:00:00   -0.588527
2020-01-01 04:00:00    0.251370
                         ...   
2020-01-01 20:00:00   -0.863072
2020-01-01 21:00:00   -0.923197
2020-01-01 22:00:00    0.503391
2020-01-01 23:00:00    0.512645
2020-01-02 00:00:00    0.976747
Freq: H, Name: A, Length: 25, dtype: float64

In [35]:
# Same time window, with a label slice over columns 'A' through 'C' (the upper bound 'C' is included)
tmp.loc['2020-01-01 00:00:00':'2020-01-02 00:00:00', 'A':'C']

Unnamed: 0,A,B,C
2020-01-01 00:00:00,-1.245191,0.023987,0.449775
2020-01-01 01:00:00,1.709980,-0.205141,-0.135362
2020-01-01 02:00:00,0.239082,-1.087192,0.815949
2020-01-01 03:00:00,-0.588527,-1.485335,0.913325
2020-01-01 04:00:00,0.251370,0.274383,-0.348956
...,...,...,...
2020-01-01 20:00:00,-0.863072,-0.015959,-1.503622
2020-01-01 21:00:00,-0.923197,-1.645438,0.320389
2020-01-01 22:00:00,0.503391,-0.679802,-0.323073
2020-01-01 23:00:00,0.512645,1.028167,-0.154549


### `.iloc` : 以 Row/Column 的數字位置索引值為查詢基礎以取得資料


調用 `.iloc` attribute 以取得想要觀察的資料。

`.iloc` 是以 Row/Column 的「數字位置索引值 (Integer-location based)」為查詢基礎，如果調用的對象是 DataFrame，則最多可以傳入兩個維度的 Slice 標記，分別為 Row indexes 和 Column indexes，如果是 Series 則僅傳入 Row indexes 就可以了。最後依據查詢的範圍不同，可能回傳 DataFrame, Series 或單一 object。

傳入的 indexes（數字位置索引值）可以是 Iterable（可迭代的）object、用 slice 的標記格式表示的一個區間，當然也可以是單一物件。

如果想要使用 Row/Column 中的某個 label 的數字位置索引值來搭配 `.iloc` 來查詢資料的話，可以配合 `.get_loc()` method 來查詢（只是有點多此一舉）。

* 參考文件：
    * [pandas.DataFrame.iloc](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.iloc.html)
    * [pandas.Series.iloc](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.iloc.html)
    * [10 Minutes to pandas | Selection](https://pandas.pydata.org/pandas-docs/stable/user_guide/10min.html#selection)
    * [Indexing and Selecting Data](https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-integer)
    * [pandas.Index.get_loc](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Index.get_loc.html)
    
> [NOTE]
>
> 以數字位置索引值為查詢基礎時，包含開始的數值，不包含結束的數值。

In [36]:
# Select the first 5 rows and the first 3 columns of the dataset
# The two forms below are equivalent
df.loc[0:4, ['unit_number', 'time_in_cycles', 'op_setting_1']]
# df.loc[:4, :'op_setting_1']

Unnamed: 0,unit_number,time_in_cycles,op_setting_1
0,1,1,-0.0007
1,1,2,0.0019
2,1,3,-0.0043
3,1,4,0.0007
4,1,5,-0.0019


In [37]:
# 同上的例子，但是我們這次改用 iloc 來執行
# 也就是取得 Row label = 0 ~ 4
# Column label = 'unit_number', 'time_in_cycles', 'op_setting_1' 的資料

In [38]:
# First look up the integer positions of 'unit_number', 'time_in_cycles' and 'op_setting_1'
column_indexes = [
    df.columns.get_loc(label)
    for label in ('unit_number', 'time_in_cycles', 'op_setting_1')
]
# The comprehension above is equivalent to the long-hand form:
# column_indexes = []
# column_indexes.append(df.columns.get_loc('unit_number'))
# column_indexes.append(df.columns.get_loc('time_in_cycles'))
# column_indexes.append(df.columns.get_loc('op_setting_1'))

In [39]:
# Try running pandas.Index.get_loc on its own:
# pass in a column label
df.columns.get_loc('unit_number')

0

In [40]:
# Next, look up the integer positions of row labels 0 and 4
row_index_start, row_index_end = df.index.get_loc(0), df.index.get_loc(4)

In [41]:
# Fetch the data (think about it: why does row_index_end need one added to it?)
df.iloc[row_index_start: row_index_end+1, column_indexes]

Unnamed: 0,unit_number,time_in_cycles,op_setting_1
0,1,1,-0.0007
1,1,2,0.0019
2,1,3,-0.0043
3,1,4,0.0007
4,1,5,-0.0019


In [42]:
# Without pandas.Index.get_loc: pass the integer positions directly
df.iloc[0:5, 0:3]

Unnamed: 0,unit_number,time_in_cycles,op_setting_1
0,1,1,-0.0007
1,1,2,0.0019
2,1,3,-0.0043
3,1,4,0.0007
4,1,5,-0.0019
