In [None]:
# Change the process working directory to /workspaces/work_utils2 before anything else runs.
# NOTE(review): presumably this is what lets the project-local modules imported in the
# following cells resolve — confirm. The path is absolute, so os.chdir raises
# FileNotFoundError on any machine where this directory does not exist.
import os
os.chdir('/workspaces/work_utils2')

In [None]:
import json
import random_dataframe as rd
import query_tools as qt

# Generate Random Data

In [None]:
# Load the column specifications passed to the data generator.
# The encoding is given explicitly so reading the JSON file does not depend on
# the platform's default text encoding.
with open('/workspaces/work_utils2/notebooks/interestingness/random_data.json', 'r', encoding='utf-8') as f:
    specs = json.load(f)

# Generate the DataFrame from the specs with a fixed row count and seed.
df = rd.create_dataframe(specs, n_rows=1000, random_seed=42)

# Preview three rows. random_state pins the draw so the displayed rows stay the
# same across "Restart & Run All" instead of changing on every execution.
df.sample(3, random_state=42)

Unnamed: 0,id,name,open_date,open_week,open_month,customer_id,score,active,value,category,product_category,quantity,price,is_discounted
312,1313,AjiGFVfO6J67bCdaszy,2024-05-14,2024-03-01,2024-03-31,348,80.772624,True,74.768991,Low,Electronics,2,40.020129,False
75,1076,,2024-12-18,2024-02-09,2024-08-31,624,74.013916,True,120.81769,Medium,Clothing,2,31.301804,False
2,1003,Gl5v8RyWA6PB7po99U9YR2Z4c,2024-09-14,2024-07-26,2024-06-30,56,74.505334,False,122.809963,Low,Clothing,3,47.137104,False


# Filter Data

In [None]:
# Load the named filters: a JSON object whose keys are the filter names listed
# below. The encoding is explicit so the read does not depend on the platform default.
with open('/workspaces/work_utils2/notebooks/interestingness/filters.json', 'r', encoding='utf-8') as f:
    named_filters = json.load(f)

# Display available filter names.
# The loop uses its own variable name on purpose: loop variables persist in the
# notebook namespace, and reusing `filter_name` here would overwrite the filter
# selection made in a later cell whenever this cell is re-run.
print("Available filters:")
for available_name in named_filters:
    print(f"- {available_name}")

Available filters:
- active_customers
- high_value_items
- electronics_products
- q1_orders
- discounted_items
- high_quantity
- premium_products


In [None]:
# Pick one of the named filters and apply it to the generated data.
filter_name = 'active_customers'
# NOTE(review): apply_filter is assumed to return a boolean row mask aligned with df — confirm.
mask = qt.filters.apply_filter(df, named_filters[filter_name])
filtered_df = df[mask]

# Preview up to three matching rows. Capping n at the frame length avoids the
# ValueError that sample() raises when a filter matches fewer than three rows;
# random_state keeps the displayed rows stable across re-runs.
filtered_df.sample(n=min(3, len(filtered_df)), random_state=42)

Unnamed: 0,id,name,open_date,open_week,open_month,customer_id,score,active,value,category,product_category,quantity,price,is_discounted
591,1592,ZVTn6G,2024-01-02,2024-11-08,2024-04-30,457,88.351127,True,98.172316,Low,Electronics,3,48.68985,False
194,1195,,2024-11-15,2024-11-01,2024-06-30,963,66.819978,True,95.768844,Low,Electronics,5,40.0992,True
839,1840,74fahGYQuxyU24QsC0j3lG3gq88,2024-08-27,2024-07-26,2024-12-31,990,76.552461,True,102.072213,High,Electronics,2,25.087702,False


# Aggregate Data

In [None]:
# Aggregation request: a "summary" of the price column, grouped by product
# category, with the listed statistics computed per group.
summary_config = {"type": "summary"}
summary_config["spec"] = {
    "groupby": "product_category",
    "columns": ["price"],
    "statistics": ["count", "mean", "var", "q1", "q3"],
}

# Run the aggregation on the filtered rows and display the resulting table.
summary_df = qt.aggregations.process_aggregation(filtered_df, summary_config)
summary_df

Unnamed: 0,product_category,price_count,price_mean,price_var,price_q1,price_q3
0,Books,110,31.609234,873.876393,11.407419,40.652082
1,Clothing,207,35.133717,2461.301695,10.509025,40.820861
2,Electronics,241,30.558619,1060.246644,10.377798,36.531489
3,Food,163,34.112189,1672.623566,10.0,40.069067
4,Home,97,31.571118,760.258213,10.296982,44.399389


# Calculate Interestingness

In [None]:
import interestingness as it

In [None]:
# Column prefix shared by every summary statistic produced for the price column
# (e.g. price_mean, price_count).
base_col = 'price'

# Point each metric input at the matching column of the summary table.
interestingness_nested = it.evaluate_all_metrics(
    summary_df,
    mean_col=f'{base_col}_mean',
    count_col=f'{base_col}_count',
    var_col=f'{base_col}_var',
    percentile_25_col=f'{base_col}_q1',
    percentile_75_col=f'{base_col}_q3',
)

# Flatten the nested result into a single-level dict and display it.
interestingness_dict = it.flatten_dict(interestingness_nested)
interestingness_dict

{'group_variance': np.float64(3.5581370490487143),
 'coefficient_of_variation': np.float64(0.05771010162365721),
 'max_deviation_ratio': np.float64(0.07782136950804028),
 'range_to_mean_ratio': np.float64(0.1403534302612568),
 'gini_coefficient': np.float64(0.03159766596945057),
 'anova_f_statistic': np.float64(0.48990618508916794),
 'anova_p_value': np.float64(0.7431762356693123),
 'anova_significant': np.False_,
 'effect_size_f': np.float64(0.04909544311098586),
 'discriminative_power': np.float64(0.002410362534264049),
 'entropy_reduction': np.float64(0.9),
 'kruskal_wallis_h_statistic': np.float64(986.9971739128783),
 'kruskal_wallis_p_value': np.float64(2.3466694130556784e-212),
 'kruskal_wallis_significant': np.True_,
 'outlier_groups_outlier_count': 0,
 'outlier_groups_outlier_proportion': 0.0,
 'outlier_groups_max_z_score': np.float64(1.2977178203376354),
 'outlier_groups_outlier_indices': '[]',
 'group_separation': np.float64(0.5110826783221051)}