Step1: Load dataset

In [18]:
import pandas as pd
import numpy as np

# Path to extracted TXT file
file_path = 'household_power_consumption.txt'

# Load with ; separator, handle '?' as NaN
data = pd.read_csv(file_path, sep=';', na_values='?')

# Combine Date and Time to datetime
data['datetime'] = pd.to_datetime(data['Date'] + ' ' + data['Time'], format='%d/%m/%Y %H:%M:%S')

# Numerical timestamp: seconds since the earliest date
min_dt = data['datetime'].min()
data['timestamp'] = (data['datetime'] - min_dt).dt.total_seconds()

# Relevant columns (drop Date/Time/datetime, keep numerics)
cols = ['timestamp', 'Global_active_power', 'Global_reactive_power', 'Voltage', 
        'Global_intensity', 'Sub_metering_1', 'Sub_metering_2', 'Sub_metering_3']
data = data[cols].dropna()  # ~1.25% missing, drop for simplicity

print(f"Dataset loaded: {data.shape[0]} rows")

Dataset loaded: 2049280 rows


In [19]:
data.head()

Unnamed: 0,timestamp,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
0,0.0,4.216,0.418,234.84,18.4,0.0,1.0,17.0
1,60.0,5.36,0.436,233.63,23.0,0.0,1.0,16.0
2,120.0,5.374,0.498,233.29,23.0,0.0,2.0,17.0
3,180.0,5.388,0.502,233.74,23.0,0.0,1.0,17.0
4,240.0,3.666,0.528,235.68,15.8,0.0,1.0,17.0


Step2: Generate a Historical Query Log

In [23]:
import random

# Define dimensions (7D) and their min/max
dimensions = ['timestamp', 'Global_reactive_power', 'Voltage', 'Global_intensity', 
              'Sub_metering_1', 'Sub_metering_2', 'Sub_metering_3']
dim_ranges = {dim: (data[dim].min(), data[dim].max()) for dim in dimensions}

# Aggregate column
agg_col = 'Global_active_power'

# Add this after loading data
quantiles_low = data[dimensions].quantile(0.05)
quantiles_high = data[dimensions].quantile(0.95)

def generate_random_query_from_rows():
    # Sample a random row to anchor the query
    row = data.sample(1).iloc[0]
    predicates = {}
    for dim in dimensions:
        value = row[dim]
        low_min, low_max = quantiles_low[dim], quantiles_high[dim]
        # Choose a random width fraction (adjust 0.1-1.0 for larger ranges; higher = more points)
        width_fraction = random.uniform(0.1, 1.0)
        delta = width_fraction * (low_max - low_min) / 2
        lower = max(low_min, value - delta)
        upper = min(low_max, value + delta)
        predicates[dim] = (lower, upper)
    return predicates

# Function to compute exact SUM for a query
def exact_sum(query, df):
    mask = np.ones(len(df), dtype=bool)
    for dim, (lower, upper) in query.items():
        mask &= (df[dim] >= lower) & (df[dim] <= upper)
    return df.loc[mask, agg_col].sum()

# Generate query log (e.g., 2000 for training; adjust based on RAM/time)
# After data loading and quantile computation
query_log = []
num_queries = 2000
attempts = 0
max_attempts = 20000

while len(query_log) < num_queries and attempts < max_attempts:
    q = generate_random_query_from_rows()  # or _from_rows()
    exact_result = exact_sum(q, data)
    
    if exact_result > 0.01:  # small threshold to avoid floating-point zero
        estimate = sample_sum(q, sample, full_data_size)
        error = exact_result - estimate
        query_log.append({'query': q, 'exact': exact_result, 
                          'estimate': estimate, 'error': error})
    attempts += 1

print(f"Success! Generated {len(query_log)} valid queries in {attempts} attempts "
      f"({len(query_log)/attempts:.1%} success rate)")

Success! Generated 2000 valid queries in 2254 attempts (88.7% success rate)


In [24]:
query_log[0]

{'query': {'timestamp': (87824794.90096335, 102635965.09903665),
  'Global_reactive_power': (0.0, 0.13174039815164684),
  'Voltage': (243.0695629408365, 245.94),
  'Global_intensity': (0.8, 10.915215075500232),
  'Sub_metering_1': (0.0, 0.18648809148156936),
  'Sub_metering_2': (0.0, 0.7490313873441946),
  'Sub_metering_3': (13.707605818536464, 19.0)},
 'exact': 17540.63,
 'estimate': 19190.622079062956,
 'error': -1649.9920790629549}

Step 3: Create a Small Offline Sample

In [25]:
sample_size = int(0.001 * len(data))  # ~2000 rows
sample = data.sample(n=sample_size, random_state=42).copy()
print(f"Sample created: {sample.shape[0]} rows")

Sample created: 2049 rows


Step 4: Compute Sampling-Based Estimates and Errors for the Query Log

In [26]:
# Function for sampling-based approximate SUM (scaled)
def sample_sum(query, samp_df, full_size):
    mask = np.ones(len(samp_df), dtype=bool)
    for dim, (lower, upper) in query.items():
        mask &= (samp_df[dim] >= lower) & (samp_df[dim] <= upper)
    subset_sum = samp_df.loc[mask, agg_col].sum()
    scale = full_size / len(samp_df)
    return subset_sum * scale

# Add estimates and errors to query log
full_data_size = len(data)
for entry in query_log:
    entry['estimate'] = sample_sum(entry['query'], sample, full_data_size)
    entry['error'] = entry['exact'] - entry['estimate']

print("Estimates and errors computed for query log")

Estimates and errors computed for query log


In [27]:
query_log[200]

{'query': {'timestamp': (84910097.51229969, 118053362.99999999),
  'Global_reactive_power': (0.0, 0.2752887970714636),
  'Voltage': (240.53501883878465, 244.06498116121537),
  'Global_intensity': (1.7706663772775233, 10.229333622722477),
  'Sub_metering_1': (0.0, 0.19281273018858813),
  'Sub_metering_2': (0.0, 0.3745196392738447),
  'Sub_metering_3': (15.234280222606268, 19.0)},
 'exact': 63787.716,
 'estimate': 62012.47297218155,
 'error': 1775.2430278184474}

Step 5: Train the Error Prediction Model

In [28]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Prepare features (flatten: lower/upper per dim) and targets (errors)
X = []
y = []
for entry in query_log:
    vec = []
    for dim in dimensions:
        lower, upper = entry['query'][dim]
        vec.extend([lower, upper])
    X.append(vec)
    y.append(entry['error'])

X = np.array(X)
y = np.array(y)

# Normalize
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train (80/20 split for validation)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
model = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42)  # As in paper
model.fit(X_train, y_train)

print(f"Model trained. Test MSE: {np.mean((model.predict(X_test) - y_test)**2)}")

Model trained. Test MSE: 12228394.507754307


Step 6: Estimate a New Query

In [35]:
# Example new 7D query (adjust ranges to sensible values based on data mins/maxes)
new_query = {
    'timestamp': (0, 1e8),  # e.g., first ~few months in seconds
    'Global_reactive_power': (0.0, 0.5),
    'Voltage': (220, 250),
    'Global_intensity': (0, 20),
    'Sub_metering_1': (0, 10),
    'Sub_metering_2': (0, 5),
    'Sub_metering_3': (0, 15)
}

# Flatten and scale
new_vec = []
for dim in dimensions:
    lower, upper = new_query[dim]
    new_vec.extend([lower, upper])
new_vec = np.array([new_vec])
new_scaled = scaler.transform(new_vec)

# Predict error
predicted_error = model.predict(new_scaled)[0]

# Find error-similar historical query (closest error)
min_diff = float('inf')
opt_entry = None
for entry in query_log:
    error_diff = abs(entry['error'] - predicted_error)
    if error_diff < min_diff:
        min_diff = error_diff
        opt_entry = entry

# Compute final estimate
sample_new = sample_sum(new_query, sample, full_data_size)
sample_opt = opt_entry['estimate']
final_estimate = opt_entry['exact'] + (sample_new - sample_opt)

print(f"Final estimate for new query: {final_estimate}")

Final estimate for new query: 550735.0209702294


In [36]:
# After computing final_estimate
print(f"LAQP estimate: {final_estimate:.2f}")

# Compute exact for the same query (for debugging/small queries)
exact = exact_sum(new_query, data)
print(f"Exact sum: {exact:.2f}")
print(f"Relative error: {abs(final_estimate - exact) / exact:.4f}")

# Also see how many rows match
mask = np.ones(len(data), dtype=bool)
for dim, (l, u) in new_query.items():
    mask &= (data[dim] >= l) & (data[dim] <= u)
matched_rows = mask.sum()
print(f"Query matches {matched_rows:,} rows ({matched_rows / len(data):.1%} of dataset)")

LAQP estimate: 550735.02
Exact sum: 560494.03
Relative error: 0.0174
Query matches 1,059,647 rows (51.7% of dataset)


In [39]:
new_query

{'timestamp': (0, 100000000.0),
 'Global_reactive_power': (0.0, 0.5),
 'Voltage': (220, 250),
 'Global_intensity': (0, 20),
 'Sub_metering_1': (0, 10),
 'Sub_metering_2': (0, 5),
 'Sub_metering_3': (0, 15)}

In [41]:
def laqp_estimate_with_details(query):
    # Flatten and predict error (same as before)
    vec = []
    for dim in dimensions:
        l, u = query[dim]
        vec.extend([l, u])
    vec = np.array([vec])
    scaled = scaler.transform(vec)
    pred_error = model.predict(scaled)[0]
    
    # Find the most error-similar historical query
    best_index = -1
    best_error_diff = float('inf')
    best_entry = None

    for idx, entry in enumerate(query_log):
        error_diff = abs(entry['error'] - pred_error)
        if error_diff < best_error_diff:
            best_error_diff = error_diff
            best_index = idx
            best_entry = entry
    
    # Compute estimates
    sample_new = sample_sum(query, sample, full_data_size)
    sample_opt = best_entry['estimate']
    final_est = best_entry['exact'] + (sample_new - sample_opt)
    
    print(f"Selected optimal query index: {best_index} (out of  {len(query_log)})")
    print(f"Predicted error for new query: {pred_error:.2f}")
    print(f"Chosen historical query error: {best_entry['error']:.2f} (diff: {best_error_diff:.2f})")
    print(f"Exact result of chosen query: {best_entry['exact']:.2f}")
    print("Predicate ranges of chosen query:")
    for dim, (l, u) in best_entry['query'].items():
        print(f"  {dim}: [{l:.2f}, {u:.2f}]")
    print(f"\nFinal LAQP estimate: {estimate:.2f}")    

    return final_est, best_index, best_entry

# Use it
estimate, opt_idx, opt_entry = laqp_estimate_with_details(new_query)

Selected optimal query index: 1108 (out of  2000)
Predicted error for new query: 1115.54
Chosen historical query error: 1119.93 (diff: 4.38)
Exact result of chosen query: 5414.51
Predicate ranges of chosen query:
  timestamp: [87053530.53, 112156789.47]
  Global_reactive_power: [0.00, 0.25]
  Voltage: [244.77, 245.94]
  Global_intensity: [0.80, 3.98]
  Sub_metering_1: [0.00, 0.46]
  Sub_metering_2: [0.00, 1.00]
  Sub_metering_3: [0.00, 8.90]

Final LAQP estimate: 550735.02


In [44]:
query_log[1108]

{'query': {'timestamp': (87053530.53088216, 112156789.46911784),
  'Global_reactive_power': (0.0, 0.25219982196670726),
  'Voltage': (244.77406960252299, 245.94),
  'Global_intensity': (0.8, 3.9761764652470073),
  'Sub_metering_1': (0.0, 0.4590564429582048),
  'Sub_metering_2': (0.0, 0.9963810778880567),
  'Sub_metering_3': (0.0, 8.90128872162247)},
 'exact': 5414.512000000001,
 'estimate': 4294.586783796974,
 'error': 1119.925216203027}

Step 7: Evaluate and Extend
Basic Evaluation: Measure Accuracy on Test Queries

In [47]:
# Generate test queries (e.g., 500 new queries)
test_query_log = []
num_queries = 500
attempts = 0
max_attempts = 10000

while len(test_query_log) < num_queries and attempts < max_attempts:
    q = generate_random_query_from_rows()  # or _from_rows()
    exact_result = exact_sum(q, data)
    
    if exact_result > 0.01:  # small threshold to avoid floating-point zero
        test_query_log.append({'query': q, 'exact': exact_result})
    attempts += 1
print(f"Success! Generated {len(test_query_log)} valid queries in {attempts} attempts "
      f"({len(test_query_log)/attempts:.1%} success rate)")

# Compute estimates for each test query
laqp_errors = []
sampling_errors = []
# Optional: aqp_plus_errors = []  # for range-similar baseline

for tq in test_queries:
    query = tq['query']
    exact = tq['exact']
    
    # 1. Pure sampling estimate
    sample_est = sample_sum(query, sample, full_data_size)
    sampling_errors.append(abs(sample_est - exact) / (exact + 1e-6))
    
    # 2. LAQP estimate (reuse the trained model and query_log)
    # Flatten and predict error
    vec = []
    for dim in dimensions:
        l, u = query[dim]
        vec.extend([l, u])
    vec = np.array([vec])
    scaled = scaler.transform(vec)
    pred_error = model.predict(scaled)[0]
    
    # Find error-similar historical query
    best_entry = min(query_log, key=lambda e: abs(e['error'] - pred_error))
    
    # LAQP final estimate
    sample_new = sample_sum(query, sample, full_data_size)
    sample_opt = best_entry['estimate']
    laqp_est = best_entry['exact'] + (sample_new - sample_opt)
    laqp_errors.append(abs(laqp_est - exact) / (exact + 1e-6))

# Results
print(f"Pure Sampling ARE: {np.mean(sampling_errors):.4f}")
print(f"LAQP ARE:          {np.mean(laqp_errors):.4f}")
print(f"Improvement:       {np.mean(sampling_errors) / np.mean(laqp_errors):.2f}x")

Success! Generated 500 valid queries in 577 attempts (86.7% success rate)
Pure Sampling ARE: 0.2529
LAQP ARE:          136016837.2035
Improvement:       0.00x


Implement Error Bounds (Confidence Intervals)

In [46]:
import scipy.stats as stats

def laqp_estimate_with_ci(query, confidence=0.95):
    # ... same as above to get best_entry, sample_new, sample_opt ...
    
    # Compute difference on sample (unscaled for variance)
    mask_new = np.ones(len(sample), dtype=bool)
    mask_opt = np.ones(len(sample), dtype=bool)
    for dim, (l, u) in query.items():
        mask_new &= (sample[dim] >= l) & (sample[dim] <= u)
    for dim, (l, u) in best_entry['query'].items():
        mask_opt &= (sample[dim] >= l) & (sample[dim] <= u)
    
    values_new = sample.loc[mask_new, agg_col].values
    values_opt = sample.loc[mask_opt, agg_col].values
    
    # Difference per tuple (aligned by padding with 0 if needed - simplified)
    # Better: compute variance of (new - opt) contribution
    # Simple CLT approximation on scaled difference
    diff = sample_new - sample_opt
    # Approximate standard error of difference
    se_diff = np.sqrt( (np.var(values_new)/len(values_new) if len(values_new)>1 else 0) +
                       (np.var(values_opt)/len(values_opt) if len(values_opt)>1 else 0) )
    se_diff *= full_data_size / len(sample)  # scale
    
    z = stats.norm.ppf(1 - (1-confidence)/2)
    ci = z * se_diff
    
    final_est = best_entry['exact'] + diff
    return final_est, (final_est - ci, final_est + ci)

# Example
est, (low, high) = laqp_estimate_with_ci(new_query)
print(f"LAQP Estimate: {est:.2f} ± { (high-low)/2 :.2f} (95% CI)")

LAQP Estimate: 59793.25 ± 47.99 (95% CI)


(Later)
Extension 1: Diversification

In [None]:
from sklearn.metrics.pairwise import euclidean_distances

def diversify_query_log(entries, k=500):
    # Features: flattened ranges + error
    features = []
    for e in entries:
        vec = []
        for dim in dimensions:
            vec.extend(e['query'][dim])
        vec.append(e['error'])
        features.append(vec)
    features = np.array(features)
    features = StandardScaler().fit_transform(features)
    
    # Greedy selection
    selected = [0]  # start with first
    while len(selected) < k:
        dists = euclidean_distances(features[selected], features)
        min_dists = dists.min(axis=0)
        next_idx = np.argmax(min_dists)
        selected.append(next_idx)
    return [entries[i] for i in selected]

# Use diversified log for better model/training
diversified_log = diversify_query_log(query_log, k=1000)
# Retrain model on diversified_log if desired

Extension 2: Optimization

In [None]:
from scipy.optimize import minimize_scalar

def range_distance(q1, q2):
    dist = 0
    for dim in dimensions:
        l1, u1 = q1[dim]
        l2, u2 = q2[dim]
        # IoU-style or Euclidean on bounds
        dist += ((l1 - l2)**2 + (u1 - u2)**2)
    return np.sqrt(dist)

def optimized_similarity(alpha, val_queries):
    errors = []
    for vq in val_queries:
        query = vq['query']
        # predict error as before
        vec = [...]  # flatten
        pred_error = model.predict(scaler.transform([vec]))[0]
        
        best_entry = min(query_log, key=lambda e: 
            alpha * abs(e['error'] - pred_error) + 
            (1-alpha) * range_distance(query, e['query']))
        
        # compute LAQP estimate and relative error
        # ... same as before ...
        errors.append(rel_error)
    return np.mean(errors)

# Optimize alpha on validation queries
res = minimize_scalar(lambda a: optimized_similarity(a, val_queries),
                      bounds=(0,1), method='bounded')
best_alpha = res.x
print(f"Best alpha: {best_alpha:.3f}")