In [1]:
# An outlier is a data point that deviates extremely from the distribution
# of the rest of the data. What counts as "extreme" can be defined in
# several different ways.

# Column names and sample rows are kept apart so the schema is easy to read.
sample_columns = ['id', 'weight', 'height', 'age']
sample_rows = [
    (1, 143.5, 5.3, 28),
    (2, 154.2, 5.5, 45),
    (3, 342.3, 5.1, 99),
    (4, 144.5, 5.5, 33),
    (5, 133.2, 5.4, 54),
    (6, 124.1, 5.1, 21),
    (7, 129.2, 5.3, 42),
]

df = spark.createDataFrame(sample_rows, sample_columns)
df.show()

+---+------+------+---+
| id|weight|height|age|
+---+------+------+---+
|  1| 143.5|   5.3| 28|
|  2| 154.2|   5.5| 45|
|  3| 342.3|   5.1| 99|
|  4| 144.5|   5.5| 33|
|  5| 133.2|   5.4| 54|
|  6| 124.1|   5.1| 21|
|  7| 129.2|   5.3| 42|
+---+------+------+---+



In [2]:
# Compute per-column bounds for the IQR rule: a value is treated as an outlier
# when it lies more than 1.5 * IQR (interquartile range) below the first
# quartile or above the third quartile.

cols = ['weight', 'height', 'age']

# Loop-invariant settings, defined once instead of on every iteration.
probabilities = [0.25, 0.75]  # first and third quartiles
relativeError = 0.05          # precision argument forwarded to approxQuantile
IQR_MULTIPLIER = 1.5          # how many IQRs away from Q1/Q3 counts as an outlier

# approxQuantile accepts a list of columns and returns one list of quantiles
# per column (in the same order), so all columns are handled in a single call
# rather than one Spark job per column.
quantiles_per_column = df.approxQuantile(cols, probabilities, relativeError)

bounds = {}
for column_name, quantiles in zip(cols, quantiles_per_column):
    print(quantiles)

    IQR = quantiles[1] - quantiles[0]
    lowerRange = quantiles[0] - IQR_MULTIPLIER * IQR
    upperRange = quantiles[1] + IQR_MULTIPLIER * IQR

    # Stored as [lower, upper]; later cells index this with [0] and [1].
    bounds[column_name] = [lowerRange, upperRange]
bounds

[129.2, 154.2]
[5.1, 5.5]
[28.0, 54.0]


{'weight': [91.69999999999999, 191.7],
 'height': [4.499999999999999, 6.1000000000000005],
 'age': [-11.0, 93.0]}

In [4]:
def _outlier_flag(name):
    """Return a boolean Column named '<name>_o' that is true when df[name]
    falls below the lower bound or above the upper bound stored in `bounds`."""
    lower, upper = bounds[name]
    too_low = df[name] < lower
    too_high = df[name] > upper
    return (too_low | too_high).alias(name + '_o')


# Keep the id so the flags can be joined back onto the original rows.
expressions = ['id'] + [_outlier_flag(name) for name in cols]
outliers = df.select(*expressions)
outliers.show()

+---+--------+--------+-----+
| id|weight_o|height_o|age_o|
+---+--------+--------+-----+
|  1|   false|   false|false|
|  2|   false|   false|false|
|  3|    true|   false| true|
|  4|   false|   false|false|
|  5|   false|   false|false|
|  6|   false|   false|false|
|  7|   false|   false|false|
+---+--------+--------+-----+



In [5]:
# Attach the outlier flags to the original values via the shared id column.
df_outliers = df.join(outliers, on='id')

# For each inspected column, list the rows whose flag column is true,
# showing the id next to the offending value.
for flagged_column in ('weight', 'age'):
    flagged_rows = df_outliers.filter(flagged_column + '_o')
    flagged_rows.select('id', flagged_column).show()

+---+------+
| id|weight|
+---+------+
|  3| 342.3|
+---+------+

+---+---+
| id|age|
+---+---+
|  3| 99|
+---+---+

