In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import scipy
from sklearn.neighbors import LocalOutlierFactor

In [2]:
# Load the DataFrame pickled at this path (relative to the working directory).
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
df = pd.read_pickle("../../data/interim/01_data_processed.pkl")

In [3]:
# Columns to screen for outliers: the first six columns of df, chosen by
# position (so this relies on the column order of the loaded frame).
outlierColumns = df.columns[:6].tolist()

In [4]:
def plot_binary_outliers(dataset, col, outlier_col, reset_index):
    """Plot a data column, drawing the points flagged as outliers in red.

    Here, col specifies the real data column and outlier_col the column with a
    binary value (outlier or not). Rows where either value is missing are not drawn.

    Adapted from:
    https://github.com/mhoogen/ML4QS/blob/master/Python3Code/util/VisualizeDataset.py

    Args:
        dataset (pd.DataFrame): The dataset
        col (string): Column that you want to plot
        outlier_col (string): Outlier column marked with true/false
        reset_index (bool): whether to reset the index for plotting
    """

    # Work on an explicit copy so the bool cast below neither touches the
    # caller's frame nor triggers pandas' SettingWithCopyWarning.
    dataset = dataset.dropna(axis=0, subset=[col, outlier_col]).copy()
    dataset[outlier_col] = dataset[outlier_col].astype("bool")

    if reset_index:
        dataset = dataset.reset_index()

    is_outlier = dataset[outlier_col]

    fig, ax = plt.subplots()
    ax.set_xlabel("samples")
    ax.set_ylabel("value")

    # Each series carries its own label. Passing a bare list to legend() pairs
    # labels with lines in plotting order, which previously labelled the
    # non-outliers as "outlier" and vice versa.
    # Non-outliers in the default color.
    ax.plot(
        dataset.index[~is_outlier],
        dataset[col][~is_outlier],
        "+",
        label="no outlier " + col,
    )
    # Outliers in red.
    ax.plot(
        dataset.index[is_outlier],
        dataset[col][is_outlier],
        "r+",
        label="outlier " + col,
    )

    ax.legend(
        loc="upper center",
        ncol=2,
        fancybox=True,
        shadow=True,
    )
    plt.show()

In [5]:
def mark_outliers_chauvenet(dataset, col, C=2):
    """Finds outliers in the specified column of datatable and adds a binary column with
    the same name extended with '_outlier' that expresses the result per data point.

    A point is flagged when the probability of observing a deviation at least
    as large as its own (assuming normally distributed data) is below
    1 / (C * N), with N the number of rows. Missing values are never flagged.

    Taken from: https://github.com/mhoogen/ML4QS/blob/master/Python3Code/Chapter3/OutlierDetection.py

    Args:
        dataset (pd.DataFrame): The dataset
        col (string): The column you want apply outlier detection to
        C (int, optional): Degree of certainty for the identification of outliers given the assumption
                           of a normal distribution, typically between 1 - 10. Defaults to 2.

    Returns:
        pd.DataFrame: A copy of the original dataframe with an extra boolean column
        indicating whether the value is an outlier or not.
    """
    # Imported locally so the function does not depend on ``scipy.special``
    # having been loaded as a side effect of a bare ``import scipy``.
    from scipy.special import erf

    dataset = dataset.copy()
    N = len(dataset.index)

    # Nothing to score; also avoids dividing by zero in the criterion below.
    if N == 0:
        dataset[col + "_outlier"] = np.zeros(0, dtype=bool)
        return dataset

    # Compute the mean and standard deviation.
    mean = dataset[col].mean()
    std = dataset[col].std()
    criterion = 1.0 / (C * N)

    # Consider the deviation for the data points. Converting to a plain array
    # makes everything below positional, so the result does not depend on the
    # index (DatetimeIndex, filtered or non-zero-based integer index, ...).
    deviation = ((dataset[col] - mean).abs() / std).to_numpy(
        dtype=float, na_value=np.nan
    )

    # Express the upper and lower bounds.
    # NOTE(review): the normal CDF is usually written with sqrt(2); sqrt(C) is
    # kept from the referenced implementation and coincides with it only for
    # C=2 - confirm this is intended before using other values of C.
    high = deviation / math.sqrt(C)
    low = -high

    # Probability of observing each point, for all rows at once.
    prob = 1.0 - 0.5 * (erf(high) - erf(low))

    # Mark as an outlier when the probability is below our criterion. NaN
    # probabilities (missing values, zero spread) compare as False.
    with np.errstate(invalid="ignore"):
        mask = prob < criterion

    dataset[col + "_outlier"] = mask
    return dataset

In [6]:
# Independent (deep) copy of the original frame; the outlier values are
# blanked out in this copy so that df itself stays untouched.
outlierRemovedDF = df.copy(deep=True)

In [7]:
# For every outlier column, run Chauvenet's criterion separately per label and
# replace the flagged values with NaN in outlierRemovedDF. The input df is
# only read, so re-running this cell gives the same result.
for perCol in outlierColumns:
    for perLabel in df["label"].unique():
        # The helper returns a copy of the label's rows with an extra boolean
        # "<column>_outlier" column.
        dataset = mark_outliers_chauvenet(
            df[df["label"] == perLabel], perCol
            )

        # Count the flagged rows directly. Counting NaNs after the replacement
        # would also include values that were already missing in the input.
        nOutliers = int(dataset[perCol + "_outlier"].sum())

        # Replace the flagged values of this column with NaN.
        dataset.loc[dataset[perCol + "_outlier"], perCol] = np.nan

        # Write the cleaned column back: the boolean mask selects this label's
        # rows in outlierRemovedDF, and pandas aligns the assigned Series to
        # those rows by index label.
        outlierRemovedDF.loc[(outlierRemovedDF["label"] == perLabel), perCol] = dataset[perCol]

        print(f"Remove {nOutliers} from {perCol} for {perLabel}")

Remove 0 from acce_x for bench
Remove 2 from acce_x for ohp
Remove 0 from acce_x for squat
Remove 2 from acce_x for dead
Remove 0 from acce_x for row
Remove 0 from acce_x for rest
Remove 5 from acce_y for bench
Remove 6 from acce_y for ohp
Remove 0 from acce_y for squat
Remove 0 from acce_y for dead
Remove 0 from acce_y for row
Remove 0 from acce_y for rest
Remove 3 from acce_z for bench
Remove 6 from acce_z for ohp
Remove 0 from acce_z for squat
Remove 1 from acce_z for dead
Remove 0 from acce_z for row
Remove 0 from acce_z for rest
Remove 2 from gyro_x for bench
Remove 4 from gyro_x for ohp
Remove 1 from gyro_x for squat
Remove 6 from gyro_x for dead
Remove 0 from gyro_x for row
Remove 12 from gyro_x for rest
Remove 14 from gyro_y for bench
Remove 15 from gyro_y for ohp
Remove 9 from gyro_y for squat
Remove 14 from gyro_y for dead
Remove 10 from gyro_y for row
Remove 9 from gyro_y for rest
Remove 13 from gyro_z for bench
Remove 1 from gyro_z for ohp
Remove 12 from gyro_z for squat
Re

In [8]:
# Summarise the cleaned frame; the per-column non-null counts show how many
# values are now NaN in each column.
outlierRemovedDF.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 9009 entries, 2019-01-11 15:08:05.200000 to 2019-01-20 17:33:27.800000
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   acce_x       9005 non-null   float64
 1   acce_y       8998 non-null   float64
 2   acce_z       8999 non-null   float64
 3   gyro_x       8984 non-null   float64
 4   gyro_y       8938 non-null   float64
 5   gyro_z       8945 non-null   float64
 6   participant  9009 non-null   object 
 7   label        9009 non-null   object 
 8   category     9009 non-null   object 
 9   set          9009 non-null   int32  
dtypes: float64(6), int32(1), object(3)
memory usage: 739.0+ KB


In [9]:
# This shows that the code I retained is correct, since it yields the same number of data points as the OLD Python files, which contain all the code because of the individual task.