# =============================================================================
# Cell 1: Setup and Imports
# =============================================================================

In [1]:
import requests
import pandas as pd
import numpy as np
import json
import os
import random
from datetime import datetime
from faker import Faker
import warnings

# Suppress all warnings for the remainder of the session
warnings.filterwarnings("ignore")

# Fix every random source to the same seed so reruns match
random.seed(42)
np.random.seed(42)
fake = Faker()
Faker.seed(42)

# Make sure the output folders exist before anything is written
for _data_dir in ("data/raw", "data/processed"):
    os.makedirs(_data_dir, exist_ok=True)

print("Setup complete - ready to collect e-commerce data")

Setup complete - ready to collect e-commerce data


# =============================================================================
# Cell 2: Product Category Templates
# =============================================================================

In [2]:
# Realistic product categories with brands and price ranges
#
# Each key is a category name; create_product_dataset compares it
# case-insensitively with the `category` field of the API template products,
# so the spelling must match the API exactly. "jewelery" is the spelling the
# API returns (see the Cell 3 output) - do not "correct" it.
#
# Fields per category:
#   types             - product type names; one is picked at random per product
#   brands            - brand names; one is picked at random per product
#   price_range       - (min, max) bounds passed to random.uniform for the base price
#   popularity_factor - multiplier applied to the generated review count
PRODUCT_CATEGORIES = {
    "electronics": {
        "types": ["smartphones", "laptops", "tablets", "headphones", "cameras", "speakers"],
        "brands": ["Apple", "Samsung", "Sony", "Dell", "HP", "Canon", "Bose", "JBL"],
        "price_range": (50, 2000),
        "popularity_factor": 1.5  # Electronics get more reviews
    },
    "jewelery": {
        "types": ["rings", "necklaces", "earrings", "bracelets", "watches", "chains"],
        "brands": ["Tiffany", "Cartier", "Pandora", "Swarovski", "Kay", "Zales"],
        "price_range": (25, 1000),
        "popularity_factor": 0.8
    },
    "men's clothing": {
        "types": ["shirts", "pants", "jackets", "shoes", "accessories", "activewear"],
        "brands": ["Nike", "Adidas", "Levi's", "Ralph Lauren", "Calvin Klein", "Under Armour"],
        "price_range": (15, 400),
        "popularity_factor": 1.0
    },
    "women's clothing": {
        "types": ["dresses", "tops", "pants", "shoes", "bags", "accessories"],
        "brands": ["Zara", "H&M", "Kate Spade", "Michael Kors", "Lululemon", "Anthropologie"],
        "price_range": (20, 500),
        "popularity_factor": 1.3
    }
}

# Report how many category templates were defined
print(f"Product templates ready for {len(PRODUCT_CATEGORIES)} categories")

Product templates ready for 4 categories


# =============================================================================
# Cell 3: API Data Collection
# =============================================================================

In [3]:
def fetch_base_products(url="https://fakestoreapi.com/products", timeout=10):
    """Get sample products from Fake Store API as templates.

    Args:
        url: Endpoint to request. Defaults to the Fake Store API product list.
        timeout: Seconds passed to ``requests.get`` as the request timeout.

    Returns:
        list[dict] | None: The decoded JSON product list on success. ``None``
        when the request fails, the status code is not 200, or the payload is
        not a list of dicts (callers index each item by key, so any other
        shape would only fail later with a confusing error).
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            print(f"API request failed with status {response.status_code}")
            return None

        products = response.json()

        # A 200 response can still carry an error object or other non-list body;
        # reject it here instead of letting downstream indexing blow up.
        if not isinstance(products, list) or not all(isinstance(p, dict) for p in products):
            print(f"API returned unexpected payload of type {type(products).__name__}")
            return None

        print(f"Fetched {len(products)} template products from API")
        return products
    except Exception as e:
        # Best-effort fetch: the notebook falls back to pure generation on failure.
        print(f"Error fetching from API: {e}")
        return None

# Fetch the template products once; later cells read `base_products`
base_products = fetch_base_products()

if not base_products:
    print("Could not fetch base products - will use fallback generation")
else:
    # Summarise what the API returned
    categories = [item['category'] for item in base_products]
    prices = [item['price'] for item in base_products]

    print("API data overview:")
    print(f"  Categories: {list(set(categories))}")
    print(f"  Price range: ${min(prices):.2f} - ${max(prices):.2f}")
    print("  Ready to expand dataset")

Fetched 20 template products from API
API data overview:
  Categories: ["women's clothing", 'electronics', 'jewelery', "men's clothing"]
  Price range: $7.95 - $999.99
  Ready to expand dataset


# =============================================================================
# Cell 4: Product Generation Functions
# =============================================================================

In [4]:
def generate_realistic_product(category, product_id, base_template=None):
    """Generate a single realistic product for the category.

    Args:
        category: Key into ``PRODUCT_CATEGORIES``; raises ``KeyError`` if unknown.
        product_id: Identifier stored in the returned record's ``id`` field.
        base_template: Optional dict; when it contains an ``image`` key that
            URL is reused, otherwise a placeholder image URL is used.

    Returns:
        dict: Product record with keys ``id``, ``title``, ``price``,
        ``description``, ``category``, ``image`` and a nested ``rating`` dict
        holding ``rate`` (1.0-5.0, one decimal) and ``count`` (at least 5).
    """

    category_data = PRODUCT_CATEGORIES[category]

    # Select random attributes
    product_type = random.choice(category_data["types"])
    brand = random.choice(category_data["brands"])

    # Generate realistic price
    min_price, max_price = category_data["price_range"]
    base_price = random.uniform(min_price, max_price)

    # Add some price variation (market inefficiencies)
    if random.random() < 0.05:  # 5% get unusual pricing
        if random.random() < 0.5:
            base_price *= random.uniform(1.5, 2.0)  # Overpriced
        else:
            base_price *= random.uniform(0.5, 0.8)  # Underpriced gems

    price = round(base_price, 2)

    # Generate rating with realistic distribution
    # Higher prices get slight rating boost, but with variation.
    # A category with a single fixed price (min == max) has no spread to
    # normalise against, so it gets no price-based boost instead of raising
    # ZeroDivisionError.
    # NOTE: the unusual-pricing multiplier above can move `price` outside
    # [min_price, max_price], so this factor is not strictly within 0-1; the
    # final rating is clamped to 1.0-5.0 regardless.
    price_span = max_price - min_price
    price_factor = (price - min_price) / price_span if price_span else 0.0
    base_rating = random.normalvariate(4.0, 0.8)
    rating = max(1.0, min(5.0, base_rating + price_factor * 0.3))
    rating = round(rating, 1)

    # Generate review count based on rating and category popularity
    popularity = category_data["popularity_factor"]
    rating_boost = (rating - 1) / 4  # 0-1 scale
    base_reviews = random.lognormvariate(4, 1.2)
    review_count = max(5, int(base_reviews * popularity * (1 + rating_boost)))

    # Create product title and description
    title = f"{brand} {product_type.title()} - {fake.catch_phrase()}"
    description = f"Quality {product_type} from {brand}. {fake.text(max_nb_chars=150)}"

    # Use base template image if available
    image_url = "https://via.placeholder.com/300"
    if base_template and 'image' in base_template:
        image_url = base_template['image']

    return {
        "id": product_id,
        "title": title,
        "price": price,
        "description": description,
        "category": category,
        "image": image_url,
        "rating": {
            "rate": rating,
            "count": review_count
        }
    }

def create_product_dataset(target_size=2000):
    """Generate a large, realistic product dataset.

    Products are split as evenly as possible across the categories in
    ``PRODUCT_CATEGORIES``. When ``target_size`` is not a multiple of the
    category count, the leftover products go one each to the first categories
    so that exactly ``target_size`` products are produced.

    Args:
        target_size: Total number of products to generate. Values below zero
            are treated as zero.

    Returns:
        list[dict]: Product records with sequential ids starting at 1000.
    """

    print(f"Generating {target_size} realistic products...")

    products = []
    # divmod keeps the remainder that plain floor division would discard
    per_category, remainder = divmod(max(0, target_size), len(PRODUCT_CATEGORIES))

    for position, category in enumerate(PRODUCT_CATEGORIES):
        category_count = per_category + (1 if position < remainder else 0)
        print(f"  Creating {category_count} {category} products...")

        # Find base template for this category (one template is picked per
        # category and shared by all of its generated products)
        base_template = None
        if base_products:
            category_templates = [p for p in base_products if p['category'].lower() == category.lower()]
            if category_templates:
                base_template = random.choice(category_templates)

        # Generate products for this category
        for _ in range(category_count):
            product_id = len(products) + 1000  # Start IDs from 1000
            products.append(generate_realistic_product(category, product_id, base_template))

    print(f"Generated {len(products)} total products")
    return products

# Build the full synthetic catalogue
print("Starting product generation...")
large_dataset = create_product_dataset(2000)

# Sanity-check what was produced
if not large_dataset:
    print("Product generation failed")
else:
    categories = [item['category'] for item in large_dataset]
    prices = [item['price'] for item in large_dataset]
    ratings = [item['rating']['rate'] for item in large_dataset]

    print("\nDataset summary:")
    print(f"  Total products: {len(large_dataset)}")
    print(f"  Price range: ${min(prices):.2f} - ${max(prices):.2f}")
    print(f"  Rating range: {min(ratings):.1f} - {max(ratings):.1f}")

    # Per-category product counts
    for category in set(categories):
        count = categories.count(category)
        print(f"  {category}: {count} products")

Starting product generation...
Generating 2000 realistic products...
  Creating 500 electronics products...
  Creating 500 jewelery products...
  Creating 500 men's clothing products...
  Creating 500 women's clothing products...
Generated 2000 total products

Dataset summary:
  Total products: 2000
  Price range: $13.58 - $2708.43
  Rating range: 1.0 - 5.0
  women's clothing: 500 products
  electronics: 500 products
  jewelery: 500 products
  men's clothing: 500 products


# =============================================================================
# Cell 5: Create Enhanced DataFrame
# =============================================================================

In [5]:
def create_business_dataframe(products_data):
    """Convert product data to DataFrame with business metrics.

    Args:
        products_data: Sequence of product dicts. A nested ``rating`` dict
            with ``rate`` and ``count`` keys, if present, is flattened into
            ``rating_score`` and ``rating_count``; otherwise those two columns
            must already exist in the records.

    Returns:
        pandas.DataFrame | None: ``None`` when ``products_data`` is empty or
        ``None``. Otherwise the product columns plus:
            value_score          - rating_score per 100 currency units of price
            popularity_score     - rating_count divided by the largest rating_count
            performance_category - 2x2 label from price and rating vs. their medians
            revenue_potential    - rating_score * rating_count * price / 1000
            title_length / description_length - character counts
    """

    if not products_data:
        print("No product data to process")
        return None

    print(f"Processing {len(products_data)} products into business DataFrame...")

    # Convert to DataFrame
    df = pd.DataFrame(products_data)

    # Extract rating components
    if 'rating' in df.columns:
        df['rating_score'] = df['rating'].apply(lambda x: x['rate'])
        df['rating_count'] = df['rating'].apply(lambda x: x['count'])
        df = df.drop('rating', axis=1)

    # Add business intelligence features
    print("Adding business metrics...")

    # Value score - rating quality per dollar spent
    df['value_score'] = df['rating_score'] / (df['price'] / 100)

    # Popularity score - normalized review volume
    df['popularity_score'] = df['rating_count'] / df['rating_count'].max()

    # Performance categories (simple 2x2 matrix), computed with whole-column
    # boolean masks instead of a Python function applied row by row.
    # Values equal to the median count as "high" on both axes.
    high_price = df['price'] >= df['price'].median()
    high_rating = df['rating_score'] >= df['rating_score'].median()
    df['performance_category'] = np.where(
        high_rating,
        np.where(high_price, 'Premium Star', 'Value Champion'),
        np.where(high_price, 'Overpriced', 'Budget Basic'),
    )

    # Revenue potential estimate
    df['revenue_potential'] = (df['rating_score'] * df['rating_count'] * df['price']) / 1000

    # Simple text features
    df['title_length'] = df['title'].str.len()
    df['description_length'] = df['description'].str.len()

    return df

# Turn the generated records into the enriched DataFrame
enhanced_df = create_business_dataframe(large_dataset)

if enhanced_df is None:
    print("Failed to create DataFrame")
else:
    print("\nEnhanced DataFrame created:")
    print(f"  Shape: {enhanced_df.shape}")
    print(f"  Columns: {list(enhanced_df.columns)}")

    # Share of products in each performance quadrant
    print("\nPerformance categories:")
    perf_counts = enhanced_df['performance_category'].value_counts()
    for category, count in perf_counts.items():
        pct = (count / len(enhanced_df)) * 100
        print(f"  {category}: {count} ({pct:.1f}%)")

    # Five products with the highest rating-per-price score
    print("\nTop 5 value champions:")
    top_value = enhanced_df.nlargest(5, 'value_score')
    for idx, row in top_value.iterrows():
        print(f"  {row['title'][:40]}... - ${row['price']:.2f} - {row['rating_score']}/5")

Processing 2000 products into business DataFrame...
Adding business metrics...

Enhanced DataFrame created:
  Shape: (2000, 14)
  Columns: ['id', 'title', 'price', 'description', 'category', 'image', 'rating_score', 'rating_count', 'value_score', 'popularity_score', 'performance_category', 'revenue_potential', 'title_length', 'description_length']

Performance categories:
  Premium Star: 505 (25.2%)
  Budget Basic: 501 (25.1%)
  Value Champion: 499 (24.9%)
  Overpriced: 495 (24.8%)

Top 5 value champions:
  Cartier Earrings - Configurable responsi... - $14.21 - 5.0/5
  Nike Shirts - Programmable homogeneous d... - $13.58 - 4.4/5
  Under Armour Accessories - Cloned upward... - $15.28 - 4.7/5
  Ralph Lauren Jackets - Ergonomic asymmet... - $21.74 - 4.5/5
  Ralph Lauren Activewear - Profit-focused... - $22.91 - 4.7/5


# =============================================================================
# Cell 6: Save Dataset
# =============================================================================

In [6]:
def save_dataset(products, dataframe):
    """Save both raw and processed data.

    Writes, relative to the current working directory:
        data/raw/products_<timestamp>.json       - products plus a metadata block
        data/processed/products_<timestamp>.csv  - the DataFrame (if given)
        data/processed/products_latest.csv       - same content, fixed name
        data/processed/summary_<timestamp>.txt   - short plain-text summary

    The output directories are created if they do not exist, so the function
    does not depend on an earlier setup cell having been run.

    Args:
        products: JSON-serialisable list of product records.
        dataframe: Processed DataFrame, or ``None`` to write only the raw JSON.
            The summary reads its ``category``, ``price``, ``rating_score``
            and ``performance_category`` columns.

    Returns:
        bool: ``True`` when every file was written, ``False`` if any error
        occurred (the error is printed, not raised).
    """

    # One clock reading drives both the file names and the summary header
    now = datetime.now()
    timestamp = now.strftime('%Y%m%d_%H%M%S')

    try:
        os.makedirs('data/raw', exist_ok=True)
        os.makedirs('data/processed', exist_ok=True)

        # Save raw JSON
        raw_file = f'data/raw/products_{timestamp}.json'
        with open(raw_file, 'w', encoding='utf-8') as f:
            json.dump({
                'products': products,
                'metadata': {
                    'timestamp': timestamp,
                    'total_products': len(products),
                    'generation_method': 'realistic_simulation'
                }
            }, f, indent=2)

        print(f"Raw data saved: {raw_file}")

        # Save processed CSV
        if dataframe is not None:
            csv_file = f'data/processed/products_{timestamp}.csv'
            dataframe.to_csv(csv_file, index=False)

            # Also save as latest version
            latest_file = 'data/processed/products_latest.csv'
            dataframe.to_csv(latest_file, index=False)

            print(f"Processed data saved: {csv_file}")
            print(f"Latest version: {latest_file}")

            # Create simple summary
            summary_file = f'data/processed/summary_{timestamp}.txt'
            with open(summary_file, 'w', encoding='utf-8') as f:
                f.write("E-COMMERCE DATASET SUMMARY\n")
                f.write("=" * 25 + "\n\n")
                f.write(f"Generated: {now.strftime('%Y-%m-%d %H:%M:%S')}\n")
                f.write(f"Total Products: {len(dataframe)}\n")
                f.write(f"Categories: {dataframe['category'].nunique()}\n")
                f.write(f"Price Range: ${dataframe['price'].min():.2f} - ${dataframe['price'].max():.2f}\n")
                f.write(f"Average Rating: {dataframe['rating_score'].mean():.2f}\n\n")

                f.write("Performance Distribution:\n")
                for cat, count in dataframe['performance_category'].value_counts().items():
                    pct = (count / len(dataframe)) * 100
                    f.write(f"  {cat}: {count} ({pct:.1f}%)\n")

            print(f"Summary saved: {summary_file}")

        return True

    except Exception as e:
        # Best-effort save: report the problem and let the caller decide
        print(f"Error saving files: {e}")
        return False

# Persist the raw records and the processed frame, if both exist
if not large_dataset or enhanced_df is None:
    print("No data to save")
else:
    print("\nSaving dataset...")
    success = save_dataset(large_dataset, enhanced_df)

    if not success:
        print("Error saving dataset")
    else:
        print("\nData collection complete!")
        print(f"Ready for analysis with {len(enhanced_df)} products")


Saving dataset...
Raw data saved: data/raw/products_20250525_160043.json
Processed data saved: data/processed/products_20250525_160043.csv
Latest version: data/processed/products_latest.csv
Summary saved: data/processed/summary_20250525_160043.txt

Data collection complete!
Ready for analysis with 2000 products


# =============================================================================
# Cell 7: Dataset Validation & Preview
# =============================================================================

In [7]:
def validate_dataset(dataframe):
    """Quick validation and preview of the dataset.

    Checks for missing values, that ``price`` and ``rating_score`` are
    numeric, that no price is negative and that ratings lie within 1-5, then
    prints the first rows of the key columns.

    Args:
        dataframe: DataFrame with at least the columns ``title``,
            ``category``, ``price``, ``rating_score`` and
            ``performance_category``; ``None`` is reported and rejected.

    Returns:
        bool: ``True`` only when every check passed; ``False`` when
        ``dataframe`` is ``None`` or any check printed a warning.
    """

    if dataframe is None:
        print("No dataframe to validate")
        return False

    print("Dataset validation:")
    is_valid = True

    # Check for missing values
    missing = dataframe.isnull().sum()
    if missing.sum() > 0:
        print("  Warning: Missing values found")
        print(missing[missing > 0])
        is_valid = False
    else:
        print("  ✓ No missing values")

    # Check data types: the range checks below only make sense on numbers
    non_numeric = [
        column for column in ('price', 'rating_score')
        if not pd.api.types.is_numeric_dtype(dataframe[column])
    ]
    if non_numeric:
        print(f"  Warning: Non-numeric columns: {non_numeric}")
        is_valid = False
    else:
        print("  ✓ Data types look good")

    # Check value ranges (skipped for columns that failed the dtype check,
    # where a numeric comparison would raise instead of reporting)
    if 'price' not in non_numeric:
        if dataframe['price'].min() < 0:
            print("  Warning: Negative prices found")
            is_valid = False
        else:
            print("  ✓ Price values valid")

    if 'rating_score' not in non_numeric:
        if not (1 <= dataframe['rating_score'].min() and dataframe['rating_score'].max() <= 5):
            print("  Warning: Ratings outside 1-5 range")
            is_valid = False
        else:
            print("  ✓ Rating values valid")

    # Show sample data
    print("\nDataset preview:")
    print(dataframe[['title', 'category', 'price', 'rating_score', 'performance_category']].head())

    return is_valid

# Validate the final dataset and only announce readiness when validation
# reports success, rather than printing it unconditionally
if enhanced_df is not None:
    if validate_dataset(enhanced_df):
        print(f"\nDataset ready for exploratory data analysis!")
    else:
        print("\nDataset has validation warnings - review them before analysis")
else:
    print("No dataset to validate")

Dataset validation:
  ✓ No missing values
  ✓ Data types look good
  ✓ Price values valid
  ✓ Rating values valid

Dataset preview:
                                               title     category    price  \
0  Apple Smartphones - Sharable bifurcated algorithm  electronics  1496.02   
1       Dell Smartphones - Reactive explicit product  electronics   503.69   
2   HP Headphones - Customizable systemic monitoring  electronics  1155.25   
3     Bose Smartphones - Virtual national throughput  electronics   238.60   
4  Samsung Headphones - Open-source maximized sup...  electronics  1126.48   

   rating_score performance_category  
0           4.9         Premium Star  
1           4.4         Premium Star  
2           3.9           Overpriced  
3           4.8       Value Champion  
4           5.0         Premium Star  

Dataset ready for exploratory data analysis!
