# Full Dataset Extraction

Extract all single-item orders with KSE measurements (~57K items)

**Output:** CSV/Parquet file for local analysis and experiments

## 1. Setup

In [None]:
# Authenticate the Colab user before anything else; the BigQuery client below
# is only constructed after this call has returned.
from google.colab import auth
auth.authenticate_user()

from google.cloud import bigquery
import pandas as pd

# Project the BigQuery client is bound to; the SQL in this notebook also spells
# the project out in its fully qualified table names.
PROJECT_ID = "sazoshop"
client = bigquery.Client(project=PROJECT_ID)

# Reached once the client object exists; no query has been sent at this point.
print("Authenticated")

## 2. Check Data Size First

In [None]:
# Count query - pre-flight check of how many records the extraction can return.
#
# The dimensions pattern is the same one the full extraction query uses
# ("<num>x<num>x<num>" with at most one decimal point per number), so values
# such as "1..2x3x4" that the extraction rejects are not counted here either.
# The number is still an upper bound: this query does not de-duplicate order
# item rows and does not apply the extraction's weight/volume range and title
# filters.
#
# NOTE: this is a regular (non-raw) Python string, so "\\d" / "\\." reach
# BigQuery as "\d" / "\." inside its r'...' regex literal.
COUNT_QUERY = """
WITH single_item_orders AS (
  SELECT order_item_order_id
  FROM `sazoshop.firestore_snapshot.v2_order_items`
  GROUP BY order_item_order_id
  HAVING COUNT(DISTINCT order_item_id) = 1
)

SELECT COUNT(*) as total_count
FROM `sazoshop.firestore_snapshot.v2_order_items` oi
INNER JOIN `sazoshop.firestore_collection.v2_kse_cost` kse
  ON oi.order_item_order_id = kse.order_id
WHERE
  oi.order_item_order_id IN (SELECT order_item_order_id FROM single_item_orders)
  AND kse.dimensions IS NOT NULL
  AND kse.actual_weight IS NOT NULL
  AND kse.actual_weight > 0
  AND REGEXP_CONTAINS(kse.dimensions, r'^\\d+\\.?\\d*x\\d+\\.?\\d*x\\d+\\.?\\d*$')
"""

# The result is a one-row, one-column frame; pull the scalar out for printing.
count_result = client.query(COUNT_QUERY).to_dataframe()
print(f"Total records to extract: {count_result['total_count'].iloc[0]:,}")

## 3. Extract Full Dataset

In [None]:
# Full extraction query (aligned with bigquery/dataset.sql)
#
# Structure of the SQL below:
#   * single_item_orders - order ids that have exactly one distinct
#                          order_item_id.
#   * kse_shipping_data  - KSE rows for those orders with non-null dimensions,
#                          a positive actual_weight and a dimensions string
#                          shaped like "<num>x<num>x<num>". The string is split
#                          on 'x' into actual_d1..actual_d3, and volume_m3 is
#                          their product divided by 1,000,000 (presumably the
#                          dimensions are in cm - TODO confirm).
#   * order_item_details - item attributes plus AI estimates read from the JSON
#                          in order_item_product_version_extra; rn numbers the
#                          rows of each order by created_at, newest first.
# The final SELECT keeps only rn = 1, replaces NULL text fields with '' and
# NULL price/thumbnail-count fields with 0, and drops rows whose actual_weight
# is outside (0, 100), whose volume_m3 is outside (0, 1), or whose two titles
# are both NULL. Rows are ordered by shipping_date, newest first.
#
# NOTE: this is a regular (non-raw) Python string, so "\\d" / "\\." below reach
# BigQuery as "\d" / "\." inside its r'...' regex literal.
FULL_QUERY = """
WITH single_item_orders AS (
  SELECT order_item_order_id
  FROM `sazoshop.firestore_snapshot.v2_order_items`
  GROUP BY order_item_order_id
  HAVING COUNT(DISTINCT order_item_id) = 1
),

kse_shipping_data AS (
  SELECT
    kse.order_id,
    kse.dimensions,
    kse.actual_weight,
    kse.volumetric_weight,
    kse.shipping_date,
    CAST(SPLIT(dimensions, 'x')[OFFSET(0)] AS FLOAT64) AS actual_d1,
    CAST(SPLIT(dimensions, 'x')[OFFSET(1)] AS FLOAT64) AS actual_d2,
    CAST(SPLIT(dimensions, 'x')[OFFSET(2)] AS FLOAT64) AS actual_d3,
    (CAST(SPLIT(dimensions, 'x')[OFFSET(0)] AS FLOAT64) *
     CAST(SPLIT(dimensions, 'x')[OFFSET(1)] AS FLOAT64) *
     CAST(SPLIT(dimensions, 'x')[OFFSET(2)] AS FLOAT64)) / 1000000 AS volume_m3
  FROM `sazoshop.firestore_collection.v2_kse_cost` kse
  WHERE kse.order_id IN (SELECT order_item_order_id FROM single_item_orders)
    AND kse.dimensions IS NOT NULL
    AND kse.actual_weight IS NOT NULL
    AND kse.actual_weight > 0
    AND REGEXP_CONTAINS(kse.dimensions, r'^\\d+\\.?\\d*x\\d+\\.?\\d*x\\d+\\.?\\d*$')
),

order_item_details AS (
  SELECT
    oi.order_item_order_id,
    oi.order_item_id,
    oi.order_item_product_id,
    oi.order_item_product_version_id,
    oi.order_item_title_origin,
    oi.order_item_title_target,
    oi.order_item_product_version_info_category,
    oi.order_item_meta_custom_category_name,
    oi.order_item_product_version_site_name,
    oi.order_item_product_version_details,
    oi.order_item_product_version_info_details,
    oi.order_item_meta_materials,
    oi.order_item_meta_clothing_materials,
    oi.order_item_meta_hscode,
    ARRAY_TO_STRING(oi.order_item_product_version_thumbnail_urls, '|') AS thumbnail_urls,
    ARRAY_LENGTH(oi.order_item_product_version_thumbnail_urls) AS thumbnail_count,
    oi.order_item_price_origin_base,
    oi.order_item_price_target_base,
    oi.order_item_product_version_url,
    SAFE_CAST(JSON_EXTRACT_SCALAR(oi.order_item_product_version_extra, '$.weight') AS FLOAT64) AS ai_weight_kg,
    SAFE_CAST(JSON_EXTRACT_SCALAR(oi.order_item_product_version_extra, '$.volume') AS FLOAT64) AS ai_volume_m3,
    JSON_EXTRACT_SCALAR(oi.order_item_product_version_extra, '$.width') AS ai_width,
    JSON_EXTRACT_SCALAR(oi.order_item_product_version_extra, '$.depth') AS ai_depth,
    JSON_EXTRACT_SCALAR(oi.order_item_product_version_extra, '$.height') AS ai_height,
    JSON_EXTRACT_SCALAR(oi.order_item_product_version_extra, '$.volume') AS ai_volume_str,
    JSON_EXTRACT_SCALAR(oi.order_item_product_version_extra, '$.packed_volume') AS ai_packed_volume_str,
    oi.created_at,
    ROW_NUMBER() OVER (PARTITION BY oi.order_item_order_id ORDER BY oi.created_at DESC) AS rn
  FROM `sazoshop.firestore_snapshot.v2_order_items` oi
  WHERE oi.order_item_order_id IN (SELECT order_item_order_id FROM single_item_orders)
)

SELECT
  oid.order_item_order_id AS order_id,
  oid.order_item_id AS item_id,
  oid.order_item_product_id AS product_id,
  oid.order_item_product_version_id AS product_version_id,
  IFNULL(oid.order_item_title_origin, '') AS title_origin,
  IFNULL(oid.order_item_title_target, '') AS title_target,
  IFNULL(oid.order_item_product_version_info_category, '') AS category,
  IFNULL(oid.order_item_meta_custom_category_name, '') AS custom_category,
  IFNULL(oid.order_item_product_version_site_name, '') AS site_name,
  IFNULL(oid.order_item_product_version_details, '') AS product_details,
  IFNULL(oid.order_item_product_version_info_details, '') AS product_info_details,
  IFNULL(oid.order_item_meta_materials, '') AS materials,
  IFNULL(oid.order_item_meta_clothing_materials, '') AS clothing_materials,
  IFNULL(oid.order_item_meta_hscode, '') AS hscode,
  IFNULL(oid.thumbnail_urls, '') AS thumbnail_urls,
  IFNULL(oid.thumbnail_count, 0) AS thumbnail_count,
  IFNULL(oid.order_item_price_origin_base, 0) AS price_origin,
  IFNULL(oid.order_item_price_target_base, 0) AS price_krw,
  IFNULL(oid.order_item_product_version_url, '') AS product_url,
  kse.actual_weight,
  kse.dimensions AS actual_dimensions,
  kse.actual_d1,
  kse.actual_d2,
  kse.actual_d3,
  kse.volume_m3 AS actual_volume_m3,
  kse.volumetric_weight,
  oid.ai_weight_kg,
  oid.ai_volume_m3,
  SAFE_CAST(oid.ai_width AS FLOAT64) AS ai_width_cm,
  SAFE_CAST(oid.ai_depth AS FLOAT64) AS ai_depth_cm,
  SAFE_CAST(oid.ai_height AS FLOAT64) AS ai_height_cm,
  oid.ai_volume_str,
  oid.ai_packed_volume_str,
  kse.shipping_date,
  oid.created_at AS order_created_at
FROM order_item_details oid
INNER JOIN kse_shipping_data kse ON oid.order_item_order_id = kse.order_id
WHERE
  oid.rn = 1
  AND kse.actual_weight > 0
  AND kse.volume_m3 > 0
  AND kse.actual_weight < 100
  AND kse.volume_m3 < 1
  AND (oid.order_item_title_origin IS NOT NULL OR oid.order_item_title_target IS NOT NULL)
ORDER BY kse.shipping_date DESC
"""

# Run the extraction and load the whole result set into a pandas DataFrame,
# then report its row count and in-memory footprint.
print("Extracting full dataset... (may take 1-2 minutes)")
extraction_job = client.query(FULL_QUERY)
df = extraction_job.to_dataframe()

extracted_rows = len(df)
print(f"Extracted {extracted_rows:,} records")

# deep=True also measures the contents of object (string) columns.
footprint_bytes = df.memory_usage(deep=True).sum()
print(f"Memory usage: {footprint_bytes / 1024 / 1024:.1f} MB")

## 4. Data Overview

In [None]:
# List the dtype of every extracted column, then preview the first three rows.
print("=== Column Info ===", df.dtypes, sep="\n")
print("\n=== Sample Data ===")
# Left as the last expression of the cell so the notebook renders it as a table.
df.head(n=3)

In [None]:
# Basic stats: size, shipping date span, cardinalities and AI-estimate coverage.
shipping_dates = df['shipping_date']

print("=== Basic Statistics ===")
print(f"Total records: {len(df):,}")
print(f"Date range: {shipping_dates.min()} ~ {shipping_dates.max()}")

# (label, column) pairs; the leading newline on the first label separates this
# group from the lines above in the printed output.
cardinality_report = (
    ("\nUnique products", 'product_version_id'),
    ("Unique categories", 'category'),
    ("Unique sites", 'site_name'),
)
for report_label, column_name in cardinality_report:
    print(f"{report_label}: {df[column_name].nunique():,}")

# A row counts as "having AI estimates" when its AI weight is present.
print("\n=== Has AI Estimates ===")
has_ai = df['ai_weight_kg'].notna()
lacks_ai = ~has_ai
print(f"With AI estimates: {has_ai.sum():,} ({has_ai.mean()*100:.1f}%)")
print(f"Without AI estimates: {lacks_ai.sum():,} ({lacks_ai.mean()*100:.1f}%)")

In [None]:
# Weight distribution: summary statistics followed by counts per weight bucket.
weight_summary = df['actual_weight'].describe()
print("=== Actual Weight Distribution ===")
print(weight_summary)

print("\n=== Weight Ranges ===")
# Bucket edges and their display names; pd.cut uses right-closed intervals by
# default, so a weight equal to an edge falls into the lower bucket.
weight_bins = [0, 0.5, 1, 2, 5, 10, 100]
labels = ['0-0.5kg', '0.5-1kg', '1-2kg', '2-5kg', '5-10kg', '10kg+']
df['weight_range'] = pd.cut(
    df['actual_weight'],
    bins=weight_bins,
    labels=labels,
)
# Sort by the bucket order rather than by frequency.
bucket_counts = df['weight_range'].value_counts()
print(bucket_counts.sort_index())

In [None]:
# Top categories: the 20 most frequent values of the category column.
category_frequencies = df['category'].value_counts()
print("=== Top 20 Categories ===")
print(category_frequencies.head(20))

## 5. Add Computed Columns

In [None]:
# Sort actual dimensions (max >= mid >= min)
def sort_dims(d1, d2, d3):
    """Return the three dimensions as a tuple ordered largest to smallest."""
    largest, middle, smallest = sorted((d1, d2, d3), reverse=True)
    return largest, middle, smallest

# Original dimensions are kept as actual_d1, actual_d2, actual_d3 from BigQuery.
# Add sorted dimensions (actual_max >= actual_mid >= actual_min) for comparison.
#
# The three columns are sorted as one array instead of row by row with
# df.apply(axis=1): this avoids four Python-level passes over every row and
# also works when the extraction returned zero rows.
actual_dim_cols = ['actual_d1', 'actual_d2', 'actual_d3']
actual_sorted = df[actual_dim_cols].to_numpy(dtype='float64', na_value=float('nan'), copy=True)
actual_sorted.sort(axis=1)  # in place, ascending within each row
df['actual_max'] = actual_sorted[:, 2]
df['actual_mid'] = actual_sorted[:, 1]
df['actual_min'] = actual_sorted[:, 0]

# Calculate actual volume (cm³ and liters) - volume_m3 already from BigQuery
df['actual_volume_cm3'] = df['actual_volume_m3'] * 1000000  # m³ -> cm³
df['actual_volume_L'] = df['actual_volume_m3'] * 1000  # m³ -> L

print("Added columns:")
print("  - Original: actual_d1, actual_d2, actual_d3 (from KSE)")
print("  - Sorted: actual_max, actual_mid, actual_min (for comparison)")
print("  - Volume: actual_volume_cm3, actual_volume_L (converted from actual_volume_m3)")
# Left as the last expression of the cell so the notebook renders a preview.
df[['actual_dimensions', 'actual_d1', 'actual_d2', 'actual_d3', 'actual_max', 'actual_mid', 'actual_min']].head()

In [None]:
# Sort AI dimensions for comparison (ai_width_cm, ai_depth_cm, ai_height_cm already from BigQuery)
def sort_ai_dims(row):
    """Return ai_max/ai_mid/ai_min for one row, largest dimension first.

    Reads 'ai_width_cm', 'ai_depth_cm' and 'ai_height_cm' from ``row``. If any
    of the three is missing, all three outputs are None.
    """
    ai_values = [row['ai_width_cm'], row['ai_depth_cm'], row['ai_height_cm']]
    if any(pd.isna(value) for value in ai_values):
        return pd.Series({'ai_max': None, 'ai_mid': None, 'ai_min': None})
    largest, middle, smallest = sorted(ai_values, reverse=True)
    return pd.Series({'ai_max': largest, 'ai_mid': middle, 'ai_min': smallest})

# Add ai_max >= ai_mid >= ai_min from the three AI dimension columns.
#
# The columns are assigned directly instead of being appended with pd.concat,
# so re-running this cell overwrites them rather than adding duplicate
# ai_max/ai_mid/ai_min columns. The sort is done on one float array, which keeps
# the results float64 (NaN for missing) and works on an empty frame.
ai_dim_cols = ['ai_width_cm', 'ai_depth_cm', 'ai_height_cm']
ai_sorted = df[ai_dim_cols].to_numpy(dtype='float64', na_value=float('nan'), copy=True)
# A row only gets sorted dimensions when all three AI dimensions are present.
ai_incomplete = pd.isna(ai_sorted).any(axis=1)
ai_sorted.sort(axis=1)  # in place, ascending within each row
ai_sorted[ai_incomplete] = float('nan')
df['ai_max'] = ai_sorted[:, 2]
df['ai_mid'] = ai_sorted[:, 1]
df['ai_min'] = ai_sorted[:, 0]

# Calculate AI volume in cm³ and L (ai_volume_m3 already from BigQuery)
df['ai_volume_cm3'] = df['ai_volume_m3'] * 1000000  # m³ -> cm³
df['ai_volume_L'] = df['ai_volume_m3'] * 1000  # m³ -> L

print("Added columns:")
print("  - Original: ai_width_cm, ai_depth_cm, ai_height_cm (from BigQuery)")
print("  - Sorted: ai_max, ai_mid, ai_min (for comparison)")
print("  - Volume: ai_volume_cm3, ai_volume_L (converted from ai_volume_m3)")

In [None]:
# Relative errors of the AI estimates, computed only for rows that have an AI
# weight. Each error is (AI - actual) / actual, so the sign is kept:
# positive = AI overestimated, negative = AI underestimated.
# Dimensions are compared on their SORTED values (max/mid/min) for fairness.
mask = df['ai_weight_kg'].notna()

# (error column, AI column, actual column), in the order the columns are added.
error_specs = [
    ('weight_error', 'ai_weight_kg', 'actual_weight'),
    ('volume_error', 'ai_volume_cm3', 'actual_volume_cm3'),
    ('max_error', 'ai_max', 'actual_max'),
    ('mid_error', 'ai_mid', 'actual_mid'),
    ('min_error', 'ai_min', 'actual_min'),
]
for error_col, ai_col, actual_col in error_specs:
    estimated = df.loc[mask, ai_col]
    measured = df.loc[mask, actual_col]
    df.loc[mask, error_col] = (estimated - measured) / measured

# Plain mean of the three signed dimension errors; a missing component makes
# the average missing as well.
dim_error_total = df.loc[mask, 'max_error'] + df.loc[mask, 'mid_error'] + df.loc[mask, 'min_error']
df.loc[mask, 'avg_dim_error'] = dim_error_total / 3

print("Added error columns (signed: + = overestimate, - = underestimate)")
print("  - weight_error, volume_error")
print("  - max_error, mid_error, min_error (sorted dimensions)")
print("  - avg_dim_error")
print(f"\nItems with errors calculated: {mask.sum():,}")

## 6. Save Dataset

In [None]:
# Columns written to the export, grouped by role. The export keeps the groups
# in the order listed here and the columns in the order listed in each group.
export_column_groups = {
    'ids': ['order_id', 'item_id', 'product_id', 'product_version_id'],
    'text_features': [
        'title_origin', 'title_target', 'category', 'custom_category',
        'site_name', 'product_details', 'product_info_details',
        'materials', 'clothing_materials', 'hscode',
    ],
    'image_price_url_features': [
        'thumbnail_urls', 'thumbnail_count', 'price_origin', 'price_krw', 'product_url',
    ],
    'actual_original_order': [
        'actual_weight', 'actual_dimensions', 'actual_d1', 'actual_d2', 'actual_d3',
    ],
    'actual_sorted': [
        'actual_max', 'actual_mid', 'actual_min',
        'actual_volume_m3', 'actual_volume_cm3', 'actual_volume_L', 'volumetric_weight',
    ],
    'ai_original_order': ['ai_weight_kg', 'ai_width_cm', 'ai_depth_cm', 'ai_height_cm'],
    'ai_sorted': [
        'ai_max', 'ai_mid', 'ai_min',
        'ai_volume_m3', 'ai_volume_cm3', 'ai_volume_L',
        'ai_volume_str', 'ai_packed_volume_str',
    ],
    'errors': [
        'weight_error', 'volume_error', 'max_error', 'mid_error', 'min_error', 'avg_dim_error',
    ],
    'metadata': ['shipping_date', 'order_created_at', 'weight_range'],
}
columns_to_save = [
    column
    for group_columns in export_column_groups.values()
    for column in group_columns
]

# Independent copy, so later edits to df_export do not touch df.
df_export = df.loc[:, columns_to_save].copy()
print(f"Columns to export: {len(columns_to_save)}")
print(f"Rows: {len(df_export):,}")

In [None]:
# Save as CSV, with the current date (YYYYMMDD) embedded in the file name.
from datetime import datetime
import os

timestamp = f"{datetime.now():%Y%m%d}"
csv_filename = 'single_item_kse_full_' + timestamp + '.csv'

# index=False: the row index is not written as a column.
df_export.to_csv(csv_filename, index=False)
print(f"Saved: {csv_filename}")

# Report the size of the file that was just written.
file_size_mb = os.stat(csv_filename).st_size / 1024 / 1024
print(f"File size: {file_size_mb:.1f} MB")

In [None]:
# Also save as Parquet (smaller, faster to load), using the same date stamp
# that was computed for the CSV file name.
parquet_filename = 'single_item_kse_full_{}.parquet'.format(timestamp)
df_export.to_parquet(parquet_filename, index=False)

# Measure the written file before reporting on it.
parquet_size_mb = os.stat(parquet_filename).st_size / 1024 / 1024
print(f"Saved: {parquet_filename}")
print(f"File size: {parquet_size_mb:.1f} MB (Parquet is more compact)")

In [None]:
# Download files through the Colab file helper.
from google.colab import files

# Only the CSV is downloaded. To also download the Parquet file, add
# ("Parquet", parquet_filename) to this tuple.
downloads = (
    ("CSV", csv_filename),
)
for file_label, file_path in downloads:
    print(f"Downloading {file_label}...")
    files.download(file_path)

## 7. Quick Filtering Examples

Once you have this dataset, you can filter for various experiments:

In [None]:
# Example filters you can apply later. Each one builds a boolean mask over
# df_export and selects the matching rows.
ai_weight_missing = df_export['ai_weight_kg'].isna()
weight_error = df_export['weight_error']
actual_weight = df_export['actual_weight']

# 1. Items WITHOUT AI estimates (need to run estimation)
no_ai = df_export.loc[ai_weight_missing]
print(f"Items needing AI estimation: {len(no_ai):,}")

# 2. Items WITH AI estimates (can calculate errors)
has_ai = df_export.loc[~ai_weight_missing]
print(f"Items with AI estimates: {len(has_ai):,}")

# 3. Overestimated items: AI weight more than 50% above the actual weight
overestimated = df_export.loc[weight_error.gt(0.5)]
print(f"Weight overestimated by >50%: {len(overestimated):,}")

# 4. Underestimated items: AI weight more than 50% below the actual weight
underestimated = df_export.loc[weight_error.lt(-0.5)]
print(f"Weight underestimated by >50%: {len(underestimated):,}")

# 5. Specific category: case-insensitive match on any of the listed keywords
is_perfume = df_export['category'].str.contains('향수|퍼퓸|perfume', case=False, na=False)
perfume = df_export.loc[is_perfume]
print(f"Perfume items: {len(perfume):,}")

# 6. Weight range
light = df_export.loc[actual_weight.lt(0.5)]
heavy = df_export.loc[actual_weight.gt(5)]
print(f"Light items (<0.5kg): {len(light):,}")
print(f"Heavy items (>5kg): {len(heavy):,}")

## Done!

You now have the full dataset. Next steps:
1. Download the CSV/Parquet file
2. Use it locally or upload to Google Drive
3. Filter by various conditions for experiments
4. Run new AI estimations on items without estimates