
# PARLA

## Problem
Write a function `process_outliers` that:
- takes a DataFrame containing metric values
- takes a tuple with metric bounds
- processes outliers using the specified method (dropping or clipping)

## Action
I implemented two ways of handling outliers (dropping and clipping) and combined them into a single function.

## Result
I successfully implemented the function, and it passes all tests.

## Learning
- I revised relevant Python, NumPy, and SciPy functionality
- I learned (by implementing) several different ways of dealing with outliers

## Application
- I can apply relevant Python, NumPy, and SciPy functionality to similar data-related problems
- I can use the implemented function to handle outliers in different ways


In [5]:

from typing import Tuple

import pandas as pd


In [6]:

def process_outliers(
    metrics: pd.DataFrame,
    bounds: Tuple[float, float],
    outlier_process_type: str
) -> pd.DataFrame:
    """
    Returns a new DataFrame with processed outliers in metric values.

    The input DataFrame is never modified: no helper columns are added to it
    and its 'metric' column is left untouched.

    :param metrics: A DataFrame containing metric values with columns ['user_id', 'metric'].
    :param bounds: A tuple (lower_bound, upper_bound). Values outside this range are considered outliers.
        Both bounds are inclusive, i.e. a value equal to a bound is not an outlier.
    :param outlier_process_type: Method for handling outliers. Possible options:
        - 'drop': remove the measurement (rows with a NaN metric are removed as well,
          since NaN is not inside any range).
        - 'clip': replace the outlier with the nearest boundary value (lower_bound or upper_bound).
          NaN metrics are left as NaN.

    :return: A new DataFrame with the same columns as the input, with outliers handled.
        The original index labels of the kept rows are preserved.
    :raises ValueError: if outlier_process_type is neither 'drop' nor 'clip'.
    """

    # validate up front so an invalid call fails before any work is done
    if outlier_process_type not in ('drop', 'clip'):
        raise ValueError('Wrong "outlier_process_type" parameter!')

    lower_bound, upper_bound = bounds

    if outlier_process_type == 'drop':
        # between() is inclusive on both ends by default
        inside = metrics['metric'].between(lower_bound, upper_bound)
        # copy() so the result is an independent frame, not a view of the input
        return metrics[inside].copy()

    # 'clip': vectorized clipping; assign() builds a new frame instead of
    # writing into the caller's DataFrame
    clipped = metrics['metric'].clip(lower=lower_bound, upper=upper_bound)
    return metrics.assign(metric=clipped)


In [7]:

# testing the process_outliers() function
metrics = pd.DataFrame({'user_id': [1, 2, 3], 'metric': [1., 2, 3]})
bounds = (0.1, 2.2,)

# each test case: (label, outlier_process_type, expected result)
test_cases = [
    ('01', 'drop', pd.DataFrame({'user_id': [1, 2], 'metric': [1.0, 2.0]})),
    ('02', 'clip', pd.DataFrame({'user_id': [1, 2, 3], 'metric': [1.0, 2.0, 2.2]})),
]

for label, outlier_process_type, answer in test_cases:
    # pass a fresh copy so that no test case can influence the input of the next one
    result = process_outliers(metrics.copy(), bounds, outlier_process_type)
    status = 'passed' if answer.equals(result) else 'failed'
    print(f'test case {label}: {status}')


test case 01: passed
test case 02: passed
