# STARK Amazon SKB Data Extraction and Cleaning

**Extract and process the STARK Amazon Semi-structured Knowledge Base (SKB) dataset**

This notebook downloads the STARK Amazon dataset directly from Hugging Face, extracts node information including product attributes, reviews, and graph edge relationships (also_buy, also_view, has_brand, has_category, has_color), and performs data cleaning operations to create a structured CSV file suitable for analysis and machine learning tasks.

## Output
- Clean CSV file (`clean_stark_amazon_skb.csv`, written to Google Drive) with 1M+ product nodes
- Includes: title, description, features, brand, price, rank, reviews, ratings, categories, and edge relationships
- Combined text field for NLP/embedding tasks

## Requirements
- Google Colab environment
- Google Drive access (the setup cell mounts it at `/content/drive`)
- ~4GB download (cached after first run)

In [None]:
# ============================================================================
# MINIMAL SETUP
# ============================================================================

# Mount Google Drive at /content/drive; later cells write their CSV output
# under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

# Clone the repository
# (`!` runs a shell command and `%cd` changes the notebook's working
# directory; both are IPython syntax, not plain Python.)
# NOTE(review): no later cell in this notebook appears to import from the
# cloned repo -- confirm whether the clone is still needed.
!git clone https://github.com/snap-stanford/stark.git
%cd stark

# Install minimal dependencies
# (-q keeps pip's output quiet)
print("Installing minimal dependencies...")
!pip install pandas huggingface_hub tqdm -q

# Check numpy version
# (printed for troubleshooting only; the value is not used afterwards)
import numpy as np
print(f"Current numpy version: {np.__version__}")

print("✅ Minimal setup complete!")

In [None]:
# ============================================================================
# Load and extract SKB data
# ============================================================================

from huggingface_hub import hf_hub_download
import os
import pandas as pd
import zipfile

# Fetch the pre-processed Amazon SKB archive from the Hugging Face Hub
print("Downloading Amazon SKB data...")
repo_id = "snap-stanford/stark"
filename = "skb/amazon/processed.zip"

file_path = hf_hub_download(
    repo_id=repo_id,
    filename=filename,
    repo_type="dataset",
)
print(f"✅ Downloaded to: {file_path}")

# Unpack the archive into a fixed working directory
extract_dir = "/content/amazon_skb"
os.makedirs(extract_dir, exist_ok=True)

print("Extracting files...")
with zipfile.ZipFile(file_path, 'r') as archive:
    archive.extractall(extract_dir)

print("✅ Extracted!")

# Print an indented tree of what was extracted, at most 10 files per directory
print(f"\nContents of {extract_dir}:")
for dirpath, _subdirs, filenames in os.walk(extract_dir):
    # Nesting depth = number of path separators below extract_dir
    depth = dirpath.replace(extract_dir, '').count(os.sep)
    dir_pad = ' ' * 2 * depth
    file_pad = ' ' * 2 * (depth + 1)

    print(f'{dir_pad}{os.path.basename(dirpath)}/')
    for shown_name in filenames[:10]:
        print(f'{file_pad}{shown_name}')

    hidden_count = len(filenames) - 10
    if hidden_count > 0:
        print(f'{file_pad}... and {hidden_count} more files')

In [None]:
# Install torch
# (the conversion cell below imports torch to read edge_index.pt and
# edge_types.pt; -q keeps pip's output quiet)
!pip install torch -q

print("✅ Torch installed")

In [None]:
# ============================================================================
# Load Amazon SKB with edges and convert to CSV
# ============================================================================

import pickle
import pandas as pd
import torch
import numpy as np
from tqdm import tqdm

# Edge type id -> name. The names become the per-node edge-list keys (and,
# later, the CSV column names). The ids are assumed to match the integer
# codes stored in edge_types.pt; the mapping shipped with the dataset is
# loaded below and compared against this one.
edge_type_dict = {
    0: 'also_buy',
    1: 'also_view',
    2: 'has_brand',
    3: 'has_category',
    4: 'has_color'
}

# Load all the data files
print("Loading data files...")

# Load node info
# NOTE: pickle.load can execute arbitrary code while unpickling; only run
# this on files obtained from a source you trust.
with open('/content/amazon_skb/processed/node_info.pkl', 'rb') as f:
    node_info = pickle.load(f)
print(f"✅ Loaded {len(node_info)} nodes")

# Load edge index
edge_index = torch.load('/content/amazon_skb/processed/edge_index.pt')
print(f"✅ Loaded edge_index with shape {edge_index.shape}")

# Load edge types
edge_types = torch.load('/content/amazon_skb/processed/edge_types.pt')
print(f"✅ Loaded {len(edge_types)} edge types")

# Load edge type dict (same pickle caveat as above)
with open('/content/amazon_skb/processed/edge_type_dict.pkl', 'rb') as f:
    edge_type_mapping = pickle.load(f)
print(f"✅ Loaded edge type mapping: {edge_type_mapping}")

# Surface any disagreement between the dataset's own mapping and the
# hard-coded one instead of silently mislabelling edges.
if not isinstance(edge_type_mapping, dict) or edge_type_mapping != edge_type_dict:
    print("⚠️ Loaded edge type mapping differs from the hard-coded edge_type_dict; "
          "edges are labelled using edge_type_dict")

print("\nBuilding edge dictionary...")
# Convert to numpy, then to plain Python scalars: they compare equal to the
# numpy values but are cheaper to hash and store than numpy scalar objects.
edges = edge_index.numpy()
edge_types_np = edge_types.numpy()
edge_list = list(zip(edges[0].tolist(), edges[1].tolist(), edge_types_np.tolist()))

# Create a dictionary mapping each node to its edges by type
edge_dict = {node_id: {etype: [] for etype in edge_type_dict.values()}
             for node_id in node_info.keys()}

# Each edge is recorded on both of its endpoints. Edges whose type id is not
# in edge_type_dict are counted and skipped (indexing the per-node dict with
# a placeholder name would raise KeyError).
skipped_edges = 0
for src, dst, etype in tqdm(edge_list, desc="Processing edges"):
    edge_name = edge_type_dict.get(etype)
    if edge_name is None:
        skipped_edges += 1
        continue
    if src in edge_dict:
        edge_dict[src][edge_name].append(int(dst))
    if dst in edge_dict:
        edge_dict[dst][edge_name].append(int(src))

if skipped_edges:
    print(f"⚠️ Skipped {skipped_edges} edges with an unknown edge type")

print(f"✅ Built edge dictionary for {len(edge_dict)} nodes")

# Now build the dataframe with all information: one row per node, combining
# its attributes, its reviews and its per-type edge lists.
print("\nBuilding DataFrame with node info, reviews, and edges...")
node_data = []

for node_id, node_info_dict in tqdm(node_info.items(), desc="Processing nodes"):
    # Extract key fields (missing keys become empty strings)
    title = node_info_dict.get("title", "")
    description = node_info_dict.get("description", "")
    feature = node_info_dict.get("feature", "")
    global_category = node_info_dict.get("global_category", "")
    category_value = node_info_dict.get("category")
    categories = ", ".join(category_value) if isinstance(category_value, list) else ""
    brand = node_info_dict.get("brand", "")
    price = node_info_dict.get("price", "")
    rank = node_info_dict.get("rank", "")

    # Extract review details; entries that are not dicts are ignored
    reviews = node_info_dict.get("review", [])
    review_texts = []
    review_ratings = []
    if isinstance(reviews, list):
        for review in reviews:
            if not isinstance(review, dict):
                continue
            text = review.get("reviewText", "")
            # A stored None must not turn into the literal text "None"
            review_texts.append("" if text is None else str(text))
            review_ratings.append(review.get("overall", None))

    # Get edges for this node; an unknown node simply has no edges
    node_edges = edge_dict.get(node_id, {})

    # Store structured row. List-valued fields are written to the CSV as
    # their Python-literal text.
    node_data.append({
        "node_id": node_id,
        "title": title,
        "description": description,
        "feature": feature,
        "global_category": global_category,
        "categories": categories,
        "brand": brand,
        "price": price,
        "rank": rank,
        "reviews": " | ".join(filter(None, review_texts)),
        "ratings": review_ratings,
        "also_buy": node_edges.get("also_buy", []),
        "also_view": node_edges.get("also_view", []),
        "has_brand": node_edges.get("has_brand", []),
        "has_category": node_edges.get("has_category", []),
        "has_color": node_edges.get("has_color", [])
    })

# Convert to DataFrame
df = pd.DataFrame(node_data)

print(f"\n✅ Created DataFrame with {len(df)} rows and {len(df.columns)} columns")
print(f"\nColumns: {list(df.columns)}")

# Save to CSV
output_path = '/content/drive/MyDrive/stark_amazon_skb_with_edges.csv'
df.to_csv(output_path, index=False)

print(f"\n🎉 SUCCESS! Saved to {output_path}")
print("\nFirst 3 rows:")
display(df.head(3))
print("\nDataFrame info:")
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print().
df.info()

### Cleaning Procedures

In [None]:
# ============================================================================
# Load raw Amazon SKB with edges
# ============================================================================

import pandas as pd

# Read back the raw CSV written by the conversion cell
file_path = '/content/drive/MyDrive/stark_amazon_skb_with_edges.csv'
node_df = pd.read_csv(file_path)

# Quick sanity check: row count, column names, first rows
row_count = node_df.shape[0]
column_names = node_df.columns.tolist()
print(f"Loaded {row_count} rows")
print(f"\nColumns: {column_names}")
display(node_df.head())

In [None]:
# ============================================================================
# Convert text columns to lowercase
# ============================================================================

text_columns = ['title', 'feature', 'description', 'global_category', 'categories', 'brand', 'reviews']

# Lowercase every listed column that is actually present in the frame
for col in [name for name in text_columns if name in node_df.columns]:
    lowered = node_df[col].str.lower()
    node_df[col] = lowered

print("Text normalized to lowercase")
display(node_df.head())

In [None]:
# ============================================================================
# Function to clean text - remove punctuation, brackets, extra spaces
# ============================================================================

import string

# Translation table that deletes every ASCII punctuation character.
# string.punctuation already contains "[" and "]", so brackets are covered.
# Built once here rather than on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text_field(desc):
    """Strip punctuation and collapse whitespace in a text field.

    Args:
        desc: The raw field value. A list is joined with single spaces
            (items are converted with ``str``); a string is used as is;
            anything else (e.g. ``None`` or a float NaN) is treated as
            empty text.

    Returns:
        The text with all ``string.punctuation`` characters removed and
        every run of whitespace replaced by one space, with no leading or
        trailing whitespace. Punctuation is deleted, not replaced by a
        space, so "machine-washable" becomes "machinewashable".
    """
    if isinstance(desc, list):
        # map(str, ...) keeps a stray non-string item from raising TypeError
        desc = ' '.join(map(str, desc))
    elif not isinstance(desc, str):
        desc = ""
    # Remove punctuation and brackets, then normalise whitespace
    return ' '.join(desc.translate(_PUNCTUATION_TABLE).split())

# Compute the cleaned versions of description and feature first ...
cleaned_description = node_df['description'].apply(clean_text_field)
cleaned_feature = node_df['feature'].apply(clean_text_field)

# ... then swap them in for the originals. The cleaned columns are appended
# at the end of the frame, description before feature.
node_df = node_df.drop(columns=['feature', 'description']).assign(
    description=cleaned_description,
    feature=cleaned_feature,
)

print("Description and feature cleaned")
display(node_df.head())

In [None]:
# ============================================================================
# Clean price column - remove non-numeric characters and convert to numeric
# ============================================================================

# For a price range such as "$5 - $10", keep only the lower bound: drop
# everything from the first dash onwards. Without this step the digits of
# both bounds would be glued together ("510") by the stripping below.
# (?s) lets "." also match newlines so the whole tail is removed.
node_df['price'] = node_df['price'].replace(r'(?s)\s*[-–—].*', '', regex=True)
# Strip everything except digits and the decimal point (currency symbols,
# thousands separators, surrounding text).
node_df['price'] = node_df['price'].replace(r'[^\d.]', '', regex=True)
# Values with nothing left become missing; anything that still is not a
# valid number is coerced to NaN.
node_df['price'] = node_df['price'].replace('', pd.NA)
node_df['price'] = pd.to_numeric(node_df['price'], errors='coerce')

print("Price format fixed")
display(node_df.head())

In [None]:
# ============================================================================
# Extract numeric part from rank, remove commas
# ============================================================================

import re

def _first_digit_run(value):
    """Return the first run of digits in ``value`` (commas removed first), or "Unknown"."""
    found = re.search(r'\d+', str(value).replace(',', ''))
    return found.group(0) if found else "Unknown"


# Compute the cleaned values, then replace the original column with them;
# the cleaned column ends up as the last column of the frame.
cleaned_rank = node_df['rank'].apply(_first_digit_run)
node_df = node_df.drop(columns=['rank']).assign(rank=cleaned_rank)

print("Rank format fixed")
display(node_df.head())

In [None]:
# ============================================================================
# Calculate avg_rating and rating_count from ratings column
# ============================================================================

import ast


def _parse_ratings(raw):
    """Parse one ``ratings`` cell (the text of a Python list) into a list.

    ``ast.literal_eval`` only accepts Python literals, so unlike ``eval`` it
    cannot execute code embedded in the CSV. Non-string cells (e.g. NaN)
    yield an empty list, and ``None`` entries (reviews without a rating) are
    dropped so they neither break the sum nor count as ratings. Malformed
    text raises, as it did before.
    """
    if not isinstance(raw, str):
        return []
    return [rating for rating in ast.literal_eval(raw) if rating is not None]


# Parse each cell once and derive both statistics from the parsed list
parsed_ratings = node_df['ratings'].apply(_parse_ratings)

# Nodes without any rating get an average of 0
node_df['avg_rating'] = parsed_ratings.apply(
    lambda ratings: sum(ratings) / len(ratings) if ratings else 0
)
node_df['rating_count'] = parsed_ratings.apply(len)

print("Average rating and rating count calculated")
display(node_df.head())

In [None]:
# ============================================================================
# Remove duplicate node IDs from edge columns
# ============================================================================

import ast

edge_columns = ['also_buy', 'also_view', 'has_brand', 'has_category', 'has_color']


def _unique_sorted_ids(raw):
    """Parse one edge cell (the text of a Python list) into unique, sorted ids.

    ``ast.literal_eval`` only accepts Python literals, so unlike ``eval`` it
    cannot execute code embedded in the CSV. Sorting makes the output order
    deterministic instead of depending on set iteration order. Non-string
    cells (e.g. NaN) yield an empty list.
    """
    if not isinstance(raw, str):
        return []
    return sorted(set(ast.literal_eval(raw)))


for col in edge_columns:
    if col in node_df.columns:
        node_df[col] = node_df[col].apply(_unique_sorted_ids)

print("Duplicates removed from edge data")
display(node_df.head())

In [None]:
# ============================================================================
# Create combined_text column
# ============================================================================

# Progress message; the row count is taken from the current node_df
print(f"Creating combined_text column for {len(node_df)} rows...")

def create_combined_text(row):
    """
    Build one "Label: value. Label: value. ..." string from a row.

    ``row`` must support ``.get(key)`` and ``row[key]`` (a DataFrame row or
    a plain dict). Fields are emitted in a fixed order and a field is left
    out when it is missing/NaN or, for text fields, blank. Price only needs
    to be non-missing, rank is also left out when it is 'Unknown', and the
    rating is only included when it is greater than 0.
    """
    def has_text(key):
        # Present, not NaN, and not just whitespace
        return pd.notna(row.get(key)) and bool(str(row[key]).strip())

    segments = []

    leading_fields = (
        ("Title", "title"),
        ("Description", "description"),
        ("Features", "feature"),
        ("Brand", "brand"),
        ("Reviews", "reviews"),
    )
    for label, key in leading_fields:
        if has_text(key):
            segments.append(f"{label}: {row[key]}")

    # Price is numeric, so only the missing-value check applies
    if pd.notna(row.get('price')):
        segments.append(f"Price: {row['price']}")

    category_fields = (
        ("Global Category", "global_category"),
        ("Categories", "categories"),
    )
    for label, key in category_fields:
        if has_text(key):
            segments.append(f"{label}: {row[key]}")

    # 'Unknown' is the placeholder for a rank that could not be parsed
    if has_text('rank') and str(row['rank']) != 'Unknown':
        segments.append(f"Rank: {row['rank']}")

    # An average of 0 means "no ratings", so it is not reported
    if pd.notna(row.get('avg_rating')) and row['avg_rating'] > 0:
        segments.append(f"Rating: {row['avg_rating']}")

    return ". ".join(segments)

# Build the combined_text column one row at a time
node_df['combined_text'] = node_df.apply(create_combined_text, axis=1)

print("✅ Created combined_text column")

# Spot-check the first row
print("\nSample combined_text from new file:")
print(node_df['combined_text'].iloc[0])

# Character-length summary of the new column
text_lengths = node_df['combined_text'].str.len()
print("\nLength statistics:")
print(f"Avg length: {text_lengths.mean():.0f} chars")
print(f"Max length: {text_lengths.max():.0f} chars")
print(f"Min length: {text_lengths.min():.0f} chars")


In [None]:
# Write the cleaned frame to Google Drive (without the index column)
output_path = '/content/drive/MyDrive/clean_stark_amazon_skb.csv'
node_df.to_csv(output_path, index=False)

# Report where it went and what was written
final_columns = node_df.columns.tolist()
print(f"Cleaned data saved to {output_path}")
print(f"\nFinal shape: {node_df.shape}")
print(f"Columns: {final_columns}")