In [3]:
import os
# Switch the kernel's working directory to the repository checkout.
# NOTE(review): presumably needed so the local modules imported in the next
# cell (random_dataframe, query_tools) resolve — confirm.
os.chdir('/workspaces/work_utils2')

In [4]:
import pandas as pd
import random_dataframe as rd
import query_tools as qt

## Generate Sample Data

In [9]:
import json
# Read the JSON specification that is handed to the random-data generator.
with open('/workspaces/work_utils2/notebooks/creating_random_data/example_03.json', 'r') as f:
    specs = json.load(f)
# Build a 10,000-row frame from the specs; the seed is fixed at 42.
df = rd.create_dataframe(specs, n_rows=10000, random_seed=42)
# Preview five random rows (the sample itself is not seeded here).
df.sample(5)

Unnamed: 0,id,name,open_date,open_week,open_month,customer_id,score,active,value,category,product_category,quantity,price,is_discounted
1574,2575,yq0Gwi,2024-02-23,2024-02-09,2024-08-31,448,,False,91.881375,Low,Books,2,147.074161,False
9133,10134,kKVUqq,2024-02-09,2024-10-18,2024-12-31,280,82.236598,True,113.501735,Medium,Clothing,2,19.365444,False
4903,5904,9OF049EJyehqotSUCJXY3Bhn,2024-12-22,2024-10-04,2024-04-30,359,66.217815,False,107.599943,Low,Food,2,23.601668,False
7157,8158,ZGVNm1rFG599g,2024-03-10,2024-12-06,2024-10-31,884,91.947943,True,95.126278,Low,Food,1,10.0,True
3341,4342,hDJZPkUecobb4h1TD9OeDRjGxsDykW,2024-01-29,2024-03-29,2024-07-31,738,76.350755,True,71.724122,Low,Food,3,17.050712,False


## Load Named Filters from JSON

Now, let's load our named filter definitions from a JSON file. This approach allows you to:
- Separate filter definitions from code
- Share filters across multiple applications
- Modify filters without changing application code

In [7]:
# Read the filter definitions (filter name -> filter spec) from the JSON file
with open('/workspaces/work_utils2/notebooks/query_data/example_02.json', 'r') as f:
    named_filters = json.load(f)

# List the name of every filter defined in the file
print("Available filters:")
for filter_name in named_filters:
    print(f"- {filter_name}")

Available filters:
- active_customers
- high_value_items
- electronics_products
- q1_orders
- discounted_items
- high_quantity
- premium_products


## Apply Individual Named Filters

Let's apply a couple of our named filters individually to see what they do.

In [8]:
def apply_named_filter(df_, filter_name):
    """Return the rows of ``df_`` selected by one of the named filters.

    Args:
        df_: DataFrame to filter.
        filter_name: Key of the filter to look up in the notebook-level
            ``named_filters`` mapping.

    Returns:
        ``df_`` indexed by the mask that ``qt.filters.apply_filter`` produces
        for the looked-up spec. If ``filter_name`` is not a known filter, an
        error message is printed and ``None`` is returned, so callers that
        chain methods on the result should check for ``None`` first.
    """
    if filter_name in named_filters:
        row_mask = qt.filters.apply_filter(df_, named_filters[filter_name])
        return df_[row_mask]

    # Unknown name: report it and fall through to an explicit None result.
    print(f"Error: Filter '{filter_name}' not found!")
    return None

In [11]:
# Preview five random rows that pass the "active_customers" filter.
apply_named_filter(df, "active_customers").sample(5)

Unnamed: 0,id,name,open_date,open_week,open_month,customer_id,score,active,value,category,product_category,quantity,price,is_discounted
4409,5410,ZGfBj6JW,2024-05-20,2024-05-31,2024-05-31,498,,True,81.796849,Low,Electronics,2,10.0,True
6662,7663,dGJ16Imo8GBVHLa1EWqvFCaVBO,2024-08-13,2024-06-28,2024-04-30,362,,True,92.443101,Low,Clothing,3,10.0,True
8115,9116,jxLbg30IBNXzFoQvwj,2024-01-26,2024-10-18,2024-04-30,925,54.122834,True,109.308515,Medium,Books,1,17.415404,False
3120,4121,7XARMgGCjjl,2024-02-28,2024-02-02,2024-12-31,162,76.6121,True,99.052713,Low,Electronics,3,52.148432,False
810,1811,,2024-06-25,2024-05-03,2024-10-31,341,73.375933,True,120.013396,Low,Clothing,1,23.933754,True


In [12]:
# Preview five random rows that pass the "electronics_products" filter.
apply_named_filter(df, "electronics_products").sample(5)

Unnamed: 0,id,name,open_date,open_week,open_month,customer_id,score,active,value,category,product_category,quantity,price,is_discounted
4026,5027,,2024-06-15,2024-06-07,2024-08-31,667,75.926635,True,116.142683,Low,Electronics,3,103.971602,False
8780,9781,9G0WlDdMkRAVrnoWHwI,2024-02-25,2024-07-19,2024-10-31,710,81.600645,True,78.966329,Medium,Electronics,1,42.26031,True
2296,3297,pW9eRUBfmQDzqi4pWZ3rmCQSx5S,2024-01-16,2024-03-22,2024-06-30,893,91.042238,True,109.340864,Medium,Electronics,1,10.672962,False
6687,7688,Ul9a15Q,2024-01-07,2024-03-29,2024-11-30,44,,False,76.353743,High,Electronics,5,54.186152,False
5533,6534,DEBi6k,2024-04-23,2024-04-26,2024-05-31,969,63.8295,False,95.926724,Medium,Electronics,1,19.951125,False


## Combining Named Filters

In [13]:
def combine_named_filters(filter_names, logic="and"):
    """Build one filter spec out of several named filters.

    Args:
        filter_names: Iterable of keys to look up in the notebook-level
            ``named_filters`` mapping. Names that are not present are
            reported with a printed warning and skipped.
        logic: Operator name stored (lower-cased) as the ``"type"`` of the
            combined spec. Defaults to ``"and"``.

    Returns:
        ``None`` when no requested name was found; the single matching spec
        itself when exactly one was found; otherwise a dict of the form
        ``{"type": <logic>, "filters": [<spec>, ...]}``.
    """
    selected_specs = []
    for candidate in filter_names:
        if candidate not in named_filters:
            print(f"Warning: Filter '{candidate}' not found, skipping.")
            continue
        selected_specs.append(named_filters[candidate])

    # Nothing usable was requested.
    if len(selected_specs) == 0:
        return None

    # A lone filter needs no wrapping; hand it back as-is.
    if len(selected_specs) == 1:
        return selected_specs[0]

    return {"type": logic.lower(), "filters": selected_specs}


In [14]:
# Build a spec that requires both named filters to match, ...
combined_filter = combine_named_filters(["active_customers", "electronics_products"], "and")
# ... turn it into a mask over df, ...
mask = qt.filters.apply_filter(df, combined_filter)
# ... and preview five random rows selected by that mask.
df[mask].sample(5)

Unnamed: 0,id,name,open_date,open_week,open_month,customer_id,score,active,value,category,product_category,quantity,price,is_discounted
9604,10605,8DdeO8uttK5qiFWwtizqt2Eru,2024-05-01,2024-12-13,2024-02-29,560,76.356033,True,114.578489,High,Electronics,2,39.172795,False
7946,8947,w6Jyr6pbzH5lVjC3bpJP6,2024-03-07,2024-10-25,2024-05-31,818,84.221629,True,105.575948,Low,Electronics,5,12.390802,False
2572,3573,Yc9GoUe1xk,2024-01-11,2024-03-08,2024-04-30,960,67.326714,True,77.168549,Low,Electronics,6,41.916334,False
4690,5691,H8nsxNqbEqhTqLz8QNSNg3x,2024-02-07,2024-11-15,2024-09-30,665,72.065687,True,100.264984,Low,Electronics,2,132.807298,False
3151,4152,CiK73PU,2024-06-28,2024-04-26,2024-06-30,958,,True,94.903471,Medium,Electronics,3,30.575572,False


## Creating a Complex Filter Programmatically

In [16]:
def create_complex_filter(scenario="high_value_electronics_q1"):
    """Assemble the combined filter spec for a predefined scenario.

    Args:
        scenario: One of ``"high_value_electronics_q1"``,
            ``"premium_or_discounted"`` or
            ``"active_high_quantity_premium_buyer"``.

    Returns:
        A dict ``{"type": "and" | "or", "filters": [...]}`` whose ``filters``
        entries are taken from the notebook-level ``named_filters`` mapping.

    Raises:
        ValueError: If ``scenario`` is not one of the names listed above.
        KeyError: If a named filter required by the scenario is missing from
            ``named_filters``.
    """
    if scenario == "high_value_electronics_q1":
        # High-value Electronics purchased in Q1
        operator = "and"
        member_names = ("electronics_products", "high_value_items", "q1_orders")
    elif scenario == "premium_or_discounted":
        # Premium products OR any discounted item
        operator = "or"
        member_names = ("premium_products", "discounted_items")
    elif scenario == "active_high_quantity_premium_buyer":
        # Active customer buying multiple premium products
        operator = "and"
        member_names = ("active_customers", "high_quantity", "premium_products")
    else:
        raise ValueError(f"Unknown scenario: {scenario}")

    # Resolve the names only for the chosen scenario, in the listed order.
    return {
        "type": operator,
        "filters": [named_filters[member] for member in member_names],
    }


In [17]:
# Build the "premium_or_discounted" scenario spec (an OR of two named filters), ...
complex_filter = create_complex_filter("premium_or_discounted")
# ... turn it into a mask over df, ...
mask = qt.filters.apply_filter(df, complex_filter)
# ... and preview five random rows selected by that mask.
df[mask].sample(5)

Unnamed: 0,id,name,open_date,open_week,open_month,customer_id,score,active,value,category,product_category,quantity,price,is_discounted
4749,5750,,2024-11-22,2024-02-23,2024-11-30,964,76.666697,True,110.768984,Medium,Books,2,10.0,True
1560,2561,g525p6VeUwJ,2024-04-23,2024-03-08,2024-09-30,54,95.092795,False,114.129238,Low,Clothing,1,11.327931,True
2160,3161,gNmK7ArxObpn,2024-07-10,2024-01-26,2024-02-29,159,88.153612,True,88.227894,Low,Clothing,7,54.291888,True
9672,10673,,2024-10-08,2024-12-27,2024-08-31,771,69.804053,True,105.23383,Low,Electronics,1,82.741429,True
9747,10748,tHXnoJDrHzJURsPdPA2cWR6l,2024-07-14,2024-05-10,2024-09-30,958,66.513752,True,116.16313,Low,Books,2,10.031464,True


## Saving Filter Results

Finally, let's demonstrate how to save the results of applying a filter.

In [None]:
# Apply a complex filter to the notebook-level sample frame `df`
filter_name = "high_value_electronics_q1"
complex_filter = create_complex_filter(filter_name)
mask = qt.filters.apply_filter(df, complex_filter)
filtered_df = df[mask]

# Get filter statistics for the same frame and filter.
# NOTE(review): the keys read below ('count', 'percentage', 'columns' ->
# <column> -> 'mean') are assumed from how this cell uses the result —
# confirm against qt.filters.get_filter_stats.
stats = qt.filters.get_filter_stats(df, complex_filter,
                                    numeric_columns=["price", "quantity", "value"])

print(f"Filter '{filter_name}' Statistics:")
print(f"  - Records: {stats['count']} ({stats['percentage']:.1f}% of total)")
print("\nAverage price: ${:.2f}".format(stats['columns']['price']['mean']))
print("Average quantity: {:.1f}".format(stats['columns']['quantity']['mean']))
print("Average value: ${:.2f}".format(stats['columns']['value']['mean']))

# Example of saving the filtered data
output_file = f"{filter_name}_results.csv"
print(f"\nSaving results to {output_file}")
# filtered_df.to_csv(output_file, index=False)  # Uncomment to actually save the file
print("Done!")