In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Cap DataFrame/Series reprs at 20 rows.
# The fully qualified option name is required: the short pattern 'max_rows'
# is ambiguous on newer pandas versions (it also matches
# 'styler.render.max_rows') and raises OptionError
# ("Pattern matched multiple keys").
pd.set_option('display.max_rows', 20)

# Reset to matplotlib's built-in "default" style first, then override the
# two settings this notebook changes in a single rcParams update.
plt.style.use('default')
plt.rcParams.update({
    'figure.figsize': (12, 3),
    'font.family': 'sans-serif',
})

In [3]:
# Location of the data file, built relative to the current directory
# (os.curdir is the string for "current directory").
Turbofan_FILEPATH = os.path.join(os.curdir, 'data', 'turbofan.csv')
# Echo the resolved location so a wrong working directory is easy to spot.
print(Turbofan_FILEPATH)

# Read the CSV file into a DataFrame named df, and keep a separate copy of
# the freshly loaded data in df_bak.
df = pd.read_csv(Turbofan_FILEPATH)
df_bak = df.copy()

./data/turbofan.csv


In [4]:
# Preview the first five rows of the loaded data.
df.head(n=5)

Unnamed: 0,unit_number,time_in_cycles,op_setting_1,op_setting_2,op_setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_18,sensor_19,sensor_20,sensor_21,sensor_22,sensor_23,sensor_24,sensor_25,sensor_26,sensor_27
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,2388,100.0,39.06,23.419,,,,,,
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,2388,100.0,39.0,23.4236,,,,,,
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,2388,100.0,38.95,23.3442,,,,,,
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,2388,100.0,38.88,23.3739,,,,,,
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,2388,100.0,38.9,23.4044,,,,,,


# 第 4 章：挑選要觀察的資料

看完資料的樣態以後，我們會開始從裡面擷取需要的資料，以便進入資料分析的階段。此時可能會需要比對特定的條件，或對部分資料執行運算，以產生原始資料中沒有提供的部分，好讓資料分析進行得更順利。


## Boolean Indexing：使用 Boolean Vector 取得特定條件下的資料

在選擇資料時，常常有很多情況是要選擇**符合一定條件**的資料。在 Pandas 裡面，我們必須先將這些條件組合成一個僅包含 `True` 或是 `False` 的布林向量 (Boolean vector)，再與原本的資料做比對，標示為 True 的 index 會被保留，而 False 則捨去，依此過濾出我們要查詢的資料。

參考文件：

* [10 Minutes to pandas | Selection | Boolean Indexing](https://pandas.pydata.org/pandas-docs/stable/user_guide/10min.html#boolean-indexing)
* [Indexing and selecting data | boolean-indexing](https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#boolean-indexing)

### 單一條件比對

如果只需過濾一個條件，則使用單一 boolean vector 與原本的資料做比對。

In [5]:
# Example: keep only the rows whose operational setting 1
# (column label: 'op_setting_1') is greater than 0.
# Step 1 - build the boolean vector (True where the condition holds).
more_than_zero = df.loc[:, 'op_setting_1'].gt(0)
# Step 2 - use it as the row selector; rows marked True are kept.
df.loc[more_than_zero, :]

Unnamed: 0,unit_number,time_in_cycles,op_setting_1,op_setting_2,op_setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_18,sensor_19,sensor_20,sensor_21,sensor_22,sensor_23,sensor_24,sensor_25,sensor_26,sensor_27
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,2388,100.0,39.00,23.4236,,,,,,
3,1,4,0.0007,0.0000,100.0,518.67,642.35,1582.79,1401.87,14.62,...,2388,100.0,38.88,23.3739,,,,,,
6,1,7,0.0010,0.0001,100.0,518.67,642.48,1592.32,1397.77,14.62,...,2388,100.0,39.10,23.3774,,,,,,
8,1,9,0.0008,0.0001,100.0,518.67,642.12,1590.98,1394.80,14.62,...,2388,100.0,39.05,23.4066,,,,,,
10,1,11,0.0018,-0.0003,100.0,518.67,642.28,1581.75,1400.64,14.62,...,2388,100.0,38.94,23.4787,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20614,100,184,0.0027,-0.0004,100.0,518.67,642.91,1598.88,1420.89,14.62,...,2388,100.0,38.44,23.1229,,,,,,
20616,100,186,0.0026,0.0004,100.0,518.67,643.61,1593.55,1425.32,14.62,...,2388,100.0,38.51,23.1173,,,,,,
20617,100,187,0.0015,0.0002,100.0,518.67,643.63,1596.96,1421.49,14.62,...,2388,100.0,38.67,23.2308,,,,,,
20619,100,189,0.0015,0.0001,100.0,518.67,643.69,1599.85,1423.15,14.62,...,2388,100.0,38.65,23.0591,,,,,,


In [6]:
# Optional: describe() can be used to inspect the op_setting_1 results
# df.describe()
# df.loc[more_than_zero].describe()

#### 使用 Boolean Vector 來檢視遺漏值數量

有一些 method 可以用來產生比對遺漏值的 boolean vector，例如

* `.isna()`, `.isnull()`（`.isna()` 的別名）：將帶有遺漏值的位置標記為 `True`
* `.notna()`, `.notnull()`（`.notna()` 的別名）：將不帶有遺漏值的位置標記為 `True`

透過這些 method，能用來觀察資料的遺漏值狀況，也可以過濾掉遺漏值或非遺漏值。

參考文件：

* [pandas.Series.isna](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.isna.html)
* [pandas.Series.notna](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.notna.html)
* [pandas.DataFrame.isna](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.isna.html)
* [pandas.DataFrame.notna](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.notna.html)
* [pandas.isna](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.isna.html)
* [pandas.notna](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.notna.html)

> 備註—有關遺漏值的兩三事：
>
> 所謂的**遺漏值，在文件中常稱為 NA (Not available)**，包含 `None` 或 `NaN` (`numpy.nan`)，
>
> 但是 `' '`（空白字元）或 `numpy.inf`（無限大數）則不屬於 **NA**。

In [7]:
# Take a first look at the missing values.
# Working assumption: it is already known that sensor 22
# (column label: 'sensor_22') has no monitoring data.
df.loc[:, 'sensor_22']

0       NaN
1       NaN
2       NaN
3       NaN
4       NaN
         ..
20626   NaN
20627   NaN
20628   NaN
20629   NaN
20630   NaN
Name: sensor_22, Length: 20631, dtype: float64

In [8]:
# To filter out the missing values of sensor 22, build a boolean vector with
# the .notna() method (True wherever a value is present) ...
sensor_22_notnull = df.loc[:, 'sensor_22'].notna()
# ... then select rows and column in one .loc call instead of chaining two
# indexing operations (df.loc[:, col][mask]); the selected values are the same.
df.loc[sensor_22_notnull, 'sensor_22']

# This is still rather verbose. Preview: the dropna() method handles this
# situation more quickly and is explained in a later chapter.
# df.loc[:, 'sensor_22'].dropna()

Series([], Name: sensor_22, dtype: float64)

### 多重條件比對

若有多個條件需要比對時，要使用 **operators（運算子）** 將多個條件組合成 boolean vector，而個別條件則建議**使用 `()`** 將其括起來。

> 備註：
>
> 不用括號將各個條件給集合起來，可能會遇到的問題是：程式或許不會如我們想像的方式來執行。
>
> 例：`df.A > 2 & df.B < 3` 沒有使用 `()` 將兩個條件先各自集合，程式就會解讀成 `df.A > (2 & df.B) < 3`，而不是解讀成 `(df.A > 2) & (df.B < 3)`。
>
> 這是因為 `&`、`|` 等位元運算子的優先級高於 `>`、`<` 等比較運算子，詳細請參考 [Python 官方文件的 Operator precedence 章節](https://docs.python.org/3/reference/expressions.html#operator-precedence)。

常見的運算子以及範例：

* `|`：代表 `or`（或）
    * 例：取得 column 為「op_setting_1 小於 -0.0015」**或**「大於 0.0015」的資料
    * Boolean vector: `(df['op_setting_1'] < -0.0015) | (df['op_setting_1'] > 0.0015)`
* `&`：代表 `and`（且）
    * 例：取得 column 為「op_setting_1 大於 0」**且**「小於等於 0.0015」的資料
    * Boolean vector: `(df['op_setting_1'] > 0) & (df['op_setting_1'] <= 0.0015)`
* `~`：代表 `not`（不是）
    * 例：取得 column 為 「op_setting_1 不等於 0」的資料
    * Boolean vector: `~(df['op_setting_1'] == 0)`


In [14]:
# Summary statistics per column (count, mean, std, min, quartiles, max).
# NOTE(review): the op_setting_1 quartiles reported here (-0.0015 / 0.0015)
# match the thresholds used in the operator examples; presumably that is why
# this overview is shown at this point. Confirm with the author.
df.describe()

Unnamed: 0,unit_number,time_in_cycles,op_setting_1,op_setting_2,op_setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_18,sensor_19,sensor_20,sensor_21,sensor_22,sensor_23,sensor_24,sensor_25,sensor_26,sensor_27
count,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,...,20631.0,20631.0,20631.0,20631.0,0.0,0.0,0.0,0.0,0.0,0.0
mean,51.506568,108.807862,-9e-06,2e-06,100.0,518.67,642.680934,1590.523119,1408.933782,14.62,...,2388.0,100.0,38.816271,23.289705,,,,,,
std,29.227633,68.88099,0.002187,0.000293,0.0,0.0,0.500053,6.13115,9.000605,1.7764e-15,...,0.0,0.0,0.180746,0.108251,,,,,,
min,1.0,1.0,-0.0087,-0.0006,100.0,518.67,641.21,1571.04,1382.25,14.62,...,2388.0,100.0,38.14,22.8942,,,,,,
25%,26.0,52.0,-0.0015,-0.0002,100.0,518.67,642.325,1586.26,1402.36,14.62,...,2388.0,100.0,38.7,23.2218,,,,,,
50%,52.0,104.0,0.0,0.0,100.0,518.67,642.64,1590.1,1408.04,14.62,...,2388.0,100.0,38.83,23.2979,,,,,,
75%,77.0,156.0,0.0015,0.0003,100.0,518.67,643.0,1594.38,1414.555,14.62,...,2388.0,100.0,38.95,23.3668,,,,,,
max,100.0,362.0,0.0087,0.0006,100.0,518.67,644.53,1616.91,1441.49,14.62,...,2388.0,100.0,39.43,23.6184,,,,,,


In [10]:
# Example: select the rows in which operational settings 1-3
# (column labels: 'op_setting_1', 'op_setting_2', 'op_setting_3')
# are all greater than 0.
# One boolean vector per condition ...
op_setting_1_bigger_than_0 = df.loc[:, 'op_setting_1'].gt(0)
op_setting_2_bigger_than_0 = df.loc[:, 'op_setting_2'].gt(0)
op_setting_3_bigger_than_0 = df.loc[:, 'op_setting_3'].gt(0)
# ... combined with & so that a row is kept only when all three are True.
all_op_settings_bigger_than_0 = (
    op_setting_1_bigger_than_0
    & op_setting_2_bigger_than_0
    & op_setting_3_bigger_than_0
)
df.loc[all_op_settings_bigger_than_0]
# The steps above are equivalent to the following one-liner:
# df.loc[(df.loc[:,'op_setting_1'] > 0) & (df.loc[:,'op_setting_2'] > 0) & (df.loc[:,'op_setting_3'] > 0)]

Unnamed: 0,unit_number,time_in_cycles,op_setting_1,op_setting_2,op_setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_18,sensor_19,sensor_20,sensor_21,sensor_22,sensor_23,sensor_24,sensor_25,sensor_26,sensor_27
6,1,7,0.0010,0.0001,100.0,518.67,642.48,1592.32,1397.77,14.62,...,2388,100.0,39.10,23.3774,,,,,,
8,1,9,0.0008,0.0001,100.0,518.67,642.12,1590.98,1394.80,14.62,...,2388,100.0,39.05,23.4066,,,,,,
11,1,12,0.0016,0.0002,100.0,518.67,642.06,1583.41,1400.15,14.62,...,2388,100.0,39.06,23.3660,,,,,,
15,1,16,0.0006,0.0005,100.0,518.67,642.13,1587.98,1404.50,14.62,...,2388,100.0,38.97,23.4550,,,,,,
16,1,17,0.0002,0.0002,100.0,518.67,642.58,1584.96,1399.95,14.62,...,2388,100.0,38.81,23.3319,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20604,100,174,0.0011,0.0002,100.0,518.67,642.91,1602.24,1425.52,14.62,...,2388,100.0,38.80,23.1784,,,,,,
20609,100,179,0.0020,0.0004,100.0,518.67,643.22,1599.36,1423.94,14.62,...,2388,100.0,38.62,23.1685,,,,,,
20616,100,186,0.0026,0.0004,100.0,518.67,643.61,1593.55,1425.32,14.62,...,2388,100.0,38.51,23.1173,,,,,,
20617,100,187,0.0015,0.0002,100.0,518.67,643.63,1596.96,1421.49,14.62,...,2388,100.0,38.67,23.2308,,,,,,
