
# Write a function that filters a dataframe using the provided parameters (PARLA)

## Problem
- Write a function that filters a given dataframe based on the following parameters:
    - begin_date: start of the date range (inclusive)
    - end_date: the end of the date range (exclusive)
    - user_ids: a list of user ids to filter the data by
    - columns: a list of column names to retain in the output
    - If any of these parameters is not provided, the function skips the corresponding filtering step

## Action
- I implemented the function using Python and pandas

## Result
- The function successfully passed all tests

## Learning
- I reviewed the relevant Python and pandas functions

## Application
- I can apply these Python and pandas functions to other data-related problems

In [3]:

from typing import Optional, List
import datetime as dt
import pandas as pd


def get_data_subset(
    df: pd.DataFrame,
    begin_date: Optional[dt.datetime] = None,
    end_date: Optional[dt.datetime] = None,
    user_ids: Optional[List[str]] = None,
    columns: Optional[List[str]] = None
) -> pd.DataFrame:
    """
    Return a filtered subset of the input DataFrame based on:
    - optional date range,
    - user ID list,
    - selected columns

    A filter is skipped only when its argument is None. An empty list is
    treated as a provided value: ``user_ids=[]`` matches no rows and
    ``columns=[]`` keeps no columns.

    :param df:
        input DataFrame
        must contain a 'date' column when a date filter is used and a
        'user_id' column when ``user_ids`` is used
    :param begin_date:
        start of the date range (inclusive)
        if provided, only rows where 'date' >= begin_date are kept
        if None, skip this filter
    :param end_date:
        the end of the date range (exclusive)
        if provided, only rows where 'date' < end_date are kept
        if None, skip this filter
    :param user_ids:
        a list of user ids to filter the data by
        if provided, only rows where 'user_id' is in this list are kept
        if None, skip this filter
    :param columns:
        a list of column names to retain in the output
        if provided, keep only these columns (in the given order)
        if None, return all columns
    :return:
        the rows of ``df`` that pass every provided filter, restricted to
        ``columns`` when given; the input DataFrame is not modified, and it
        is returned as-is when every argument is None
    """

    # Compare against None explicitly: truthiness checks would silently skip
    # the filter for an empty list and raise for array-like arguments.

    # Filter by 'begin_date' (inclusive lower bound)
    if begin_date is not None:
        df = df[df['date'] >= begin_date]

    # Filter by 'end_date' (exclusive upper bound)
    if end_date is not None:
        df = df[df['date'] < end_date]

    # Filter by 'user_ids'
    if user_ids is not None:
        df = df[df['user_id'].isin(user_ids)]

    # Keep only the requested columns
    if columns is not None:
        df = df[columns]

    return df



In [5]:

# Sanity check: two rows, of which only the first falls inside the
# half-open window [2022-01-01, 2022-01-06).
test = pd.DataFrame(
    {
        'date': [dt.datetime(2022, 1, 5), dt.datetime(2022, 1, 7)],
        'user_id': ['1', '2'],
    }
)

# Expected output: just the 2022-01-05 row.
answer = pd.DataFrame(
    {
        'date': [dt.datetime(2022, 1, 5)],
        'user_id': ['1'],
    }
)

result = get_data_subset(
    test,
    begin_date=dt.datetime(2022, 1, 1),
    end_date=dt.datetime(2022, 1, 6),
)

# Left as a bare expression so the notebook displays the comparison result.
result.equals(answer)


True