In [1]:
import os
# Switch the working directory to the project checkout. Presumably this is what
# lets the project-local modules imported in the next cell resolve — TODO confirm.
PROJECT_ROOT = '/workspaces/work_utils2'
os.chdir(PROJECT_ROOT)

In [2]:
import json
import random_dataframe as rd
import query_tools as qt
import trends

# Generate Random Data

In [3]:
# Read the specification dict for the data generator from its JSON file.
# The encoding is pinned to UTF-8 so the read does not depend on the
# platform's locale default.
with open('/workspaces/work_utils2/notebooks/interestingness/random_data.json', 'r', encoding='utf-8') as f:
    specs = json.load(f)

# Generate a 1000-row DataFrame from the specs; a fixed seed is passed to the
# generator so the data is intended to be the same on every run.
df = rd.create_dataframe(specs, n_rows=1000, random_seed=42)

# Preview three rows. NOTE(review): sample() is called without random_state, so
# the rows shown may differ between runs unless the global RNG state is fixed.
df.sample(3)

Unnamed: 0,id,name,open_date,open_week,open_month,customer_id,score,active,value,category,product_category,quantity,price,is_discounted
312,1313,AjiGFVfO6J67bCdaszy,2024-05-14,2024-03-01,2024-03-31,348,80.772624,True,74.768991,Low,Electronics,2,40.020129,False
75,1076,,2024-12-18,2024-02-09,2024-08-31,624,74.013916,True,120.81769,Medium,Clothing,2,31.301804,False
2,1003,Gl5v8RyWA6PB7po99U9YR2Z4c,2024-09-14,2024-07-26,2024-06-30,56,74.505334,False,122.809963,Low,Clothing,3,47.137104,False


# Filter Data

In [4]:
# Load the named filters: a mapping from filter name to filter definition.
# UTF-8 is requested explicitly so decoding does not depend on the platform's
# locale default.
with open('/workspaces/work_utils2/notebooks/interestingness/filters.json', 'r', encoding='utf-8') as f:
    named_filters = json.load(f)

# Display available filter names (iterating a dict yields its keys directly).
print("Available filters:")
for filter_name in named_filters:
    print(f"- {filter_name}")

Available filters:
- active_customers
- high_value_items
- electronics_products
- q1_orders
- discounted_items
- high_quantity
- premium_products


In [5]:
# Pick one of the named filters loaded above and look up its definition.
filter_name = 'active_customers'
selected_filter = named_filters[filter_name]

# Evaluate the filter against the generated data, then keep the rows it selects.
mask = qt.filters.apply_filter(df, selected_filter)
filtered_df = df[mask]

# Preview three of the filtered rows.
filtered_df.sample(3)

Unnamed: 0,id,name,open_date,open_week,open_month,customer_id,score,active,value,category,product_category,quantity,price,is_discounted
591,1592,ZVTn6G,2024-01-02,2024-11-08,2024-04-30,457,88.351127,True,98.172316,Low,Electronics,3,48.68985,False
194,1195,,2024-11-15,2024-11-01,2024-06-30,963,66.819978,True,95.768844,Low,Electronics,5,40.0992,True
839,1840,74fahGYQuxyU24QsC0j3lG3gq88,2024-08-27,2024-07-26,2024-12-31,990,76.552461,True,102.072213,High,Electronics,2,25.087702,False


# Aggregate Data

In [6]:
# What to summarise: the price column, grouped by product category and month,
# with the listed statistics.
summary_spec = {
    "groupby": ["product_category", "open_month"],
    "columns": ["price"],
    "statistics": ["count", "mean", "var", "q1", "q3"],
}

# Wrap the spec in the aggregation config passed to process_aggregation.
summary_config = {"type": "summary", "spec": summary_spec}

# Run the aggregation on the filtered rows and preview three result rows.
summary_df = qt.aggregations.process_aggregation(filtered_df, summary_config)
summary_df.sample(3)

Unnamed: 0,product_category,open_month,price_count,price_mean,price_var,price_q1,price_q3
36,Food,2024-01-31,11,34.751077,1906.910664,10.0,38.027272
3,Books,2024-04-30,12,21.215265,166.378904,10.0,31.815388
20,Clothing,2024-09-30,12,22.517923,143.336894,12.787782,31.013401


In [7]:
# Prefix shared by the aggregated statistic columns (price_mean, price_count, price_var).
base_col = 'price'

# Evaluate the trend metrics on the summary table, one group per product
# category, using open_month as the time axis.
trends_results = trends.evaluate_all_trend_metrics(
    summary_df,
    group_col='product_category',
    time_col='open_month',
    mean_col=f'{base_col}_mean',
    count_col=f'{base_col}_count',
    var_col=f'{base_col}_var',
)
trends_results

Unnamed: 0,product_category,slope,overall_slope,divergence,direction_difference,direction_significance,volatility,relative_volatility,r_squared,consistency_score,...,has_outlier_periods,outlier_ratio,has_periodicity,period_length,periodicity_strength,periodicity_difference,is_stationary,p_value,stationarity_difference,advanced_trend_divergence_score
0,Books,-0.711413,-0.474176,0.237237,0.0,0.0,0.758166,1.274111,0.002477,0.002477,...,False,0.0,True,2,2.363538,1.0,True,2.370056e-05,0.0,0.736321
1,Clothing,-0.542734,-0.474176,0.068558,0.0,0.0,0.504886,0.848469,0.021831,0.021831,...,False,0.0,False,4,1.972307,0.0,True,2.172334e-09,0.0,0.347365
2,Electronics,-0.339463,-0.474176,0.134714,0.0,0.0,0.568645,0.955618,0.064787,0.064787,...,True,0.083333,False,2,1.733027,0.0,False,0.2492046,1.0,0.697437
3,Food,-0.004601,-0.474176,0.469576,0.0,0.0,0.503292,0.845791,0.027512,0.027512,...,False,0.0,False,2,1.892668,0.0,True,0.01065868,0.0,0.59707
4,Home,-0.990073,-0.474176,0.515897,0.0,0.0,0.640286,1.076011,0.117122,0.117122,...,False,0.0,False,2,1.752157,0.0,False,0.2034654,1.0,1.0
