In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import random

# Seed NumPy's global RNG so repeated runs draw the same random values
np.random.seed(42)

# Style labels; each one becomes a "<style>_comfort" / "_willing" / "_avoid"
# column in the user table
style_categories = [
    'Casual',
    'Formal',
    'Vintage',
    'Sporty',
    'Bohemian',
    'Minimalist',
]

# Garment types a product can belong to
clothing_categories = [
    'Shirts',
    'Pants',
    'Dresses',
    'Jackets',
    'Accessories',
    'Shoes',
]

# Colour palette for products
colors = [
    'Black', 'White', 'Blue', 'Red', 'Green',
    'Yellow', 'Pink', 'Purple', 'Brown', 'Gray',
]

# Price tiers, cheapest first
price_tiers = [
    'Budget',
    'Mid-range',
    'Premium',
]

# Generate user data
num_users = 1000
users = []

# Capture "now" once: every signup date is then measured from the same
# instant instead of re-reading the clock for each user (which could mix two
# reference days if the loop happened to run across midnight).
signup_reference = datetime.now()

# The random draws inside the loop are made in a fixed order (gender, age,
# tier, one comfort score per style, signup offset); keep that order so a
# seeded run keeps producing the same users.
for user_id in range(1, num_users + 1):
    # Basic user info
    gender = np.random.choice(['Male', 'Female', 'Non-binary'], p=[0.35, 0.60, 0.05])
    age = np.random.randint(18, 65)  # upper bound is exclusive: ages 18-64
    # Reuse the shared tier list rather than re-typing it; the weights are in
    # the same Budget / Mid-range / Premium order.
    subscription_tier = np.random.choice(price_tiers, p=[0.5, 0.3, 0.2])

    # Style preferences, one entry per style in each dict
    style_comfort = {}
    style_willing = {}
    style_avoid = {}

    for style in style_categories:
        comfort_level = np.random.randint(1, 11)  # 1-10 scale
        style_comfort[f'{style}_comfort'] = comfort_level

        # Willing to try if comfort level is medium-low (3-6)
        style_willing[f'{style}_willing'] = 1 if 3 <= comfort_level <= 6 else 0

        # Avoid if comfort level is very low (1-2)
        style_avoid[f'{style}_avoid'] = 1 if comfort_level < 3 else 0

    # Signed up between 1 and 364 days before the reference instant
    days_since_signup = np.random.randint(1, 365)
    signup_date = (signup_reference - timedelta(days=days_since_signup)).strftime('%Y-%m-%d')

    # Create user record (key order defines the column order of user_df)
    user = {
        'user_id': user_id,
        'gender': gender,
        'age': age,
        'subscription_tier': subscription_tier,
        'signup_date': signup_date,
        **style_comfort,
        **style_willing,
        **style_avoid
    }

    users.append(user)

# Create user dataframe
user_df = pd.DataFrame(users)

# Build the product catalogue
num_products = 500
products = []

for product_id in range(1, num_products + 1):
    # Descriptive attributes. The random draws are made in the same sequence
    # as the record fields below so a seeded run yields the same catalogue.
    category = np.random.choice(clothing_categories)
    style = np.random.choice(style_categories)
    color = np.random.choice(colors)
    price_tier = np.random.choice(price_tiers, p=[0.4, 0.4, 0.2])

    # The price band depends on the tier; randint's upper bound is exclusive.
    if price_tier == 'Budget':
        price = np.random.randint(10, 30)
    elif price_tier == 'Mid-range':
        price = np.random.randint(30, 80)
    else:
        price = np.random.randint(80, 200)

    # Season is drawn before stock status, matching the field order.
    season = np.random.choice(['Spring', 'Summer', 'Fall', 'Winter', 'All-season'])
    in_stock = np.random.choice([True, False], p=[0.9, 0.1])

    # Assemble the record (key order defines the column order of product_df)
    product = {
        'product_id': product_id,
        'category': category,
        'style': style,
        'color': color,
        'price_tier': price_tier,
        'price': price,
        'season': season,
        'in_stock': in_stock,
    }
    products.append(product)

# Turn the list of records into a dataframe
product_df = pd.DataFrame(products)

# Generate user interaction data (browsing, carting, purchasing)
num_interactions = 20000
interactions = []

# Current date for reference
current_date = datetime.now()

# Index both tables once so each iteration does a direct label lookup instead
# of a boolean scan over the whole DataFrame. IDs are unique by construction.
comfort_by_user = user_df.set_index('user_id')
style_by_product = product_df.set_index('product_id')['style']

for _ in range(num_interactions):
    user_id = np.random.randint(1, num_users + 1)
    product_id = np.random.randint(1, num_products + 1)

    # Get user's comfort level for the product's style
    style = style_by_product.at[product_id]
    comfort_level = comfort_by_user.at[user_id, f'{style}_comfort']

    # Higher comfort level increases likelihood of positive interaction.
    # Modelled as a funnel: every interaction is at least a view, a share of
    # views reaches the cart, and 60% of carted items are purchased.
    browse_probability = 0.3 + (comfort_level / 20)  # 0.35 (comfort 1) to 0.80 (comfort 10)
    cart_probability = browse_probability * 0.7      # reached the cart (includes purchases)
    purchase_probability = cart_probability * 0.6    # carted and then purchased

    # Record the deepest funnel stage reached. The three probabilities sum to
    # 1, and the cart and purchase shares both grow with comfort. (Previously
    # cart/purchase were derived from 1 - p_view, which made them *shrink* as
    # comfort rose, and the two funnel values above were never used.)
    p_view = 1 - cart_probability
    p_cart = cart_probability - purchase_probability
    p_purchase = purchase_probability

    interaction_type = np.random.choice(
        ['view', 'cart', 'purchase'],
        p=[p_view, p_cart, p_purchase]
    )

    # Generate interaction timestamp (0-89 days ago). Only the day is
    # randomised; the time of day is always that of current_date.
    days_ago = np.random.randint(0, 90)
    timestamp = (current_date - timedelta(days=days_ago)).strftime('%Y-%m-%d %H:%M:%S')

    # Create interaction record
    interaction = {
        'user_id': user_id,
        'product_id': product_id,
        'interaction_type': interaction_type,
        'timestamp': timestamp
    }

    interactions.append(interaction)

# Create interaction dataframe
interaction_df = pd.DataFrame(interactions)

# Generate subscription delivery data
num_deliveries = 5000
deliveries = []

# Lookup tables built once, replacing the per-iteration boolean scans over
# user_df / product_df. IDs are unique by construction. Filtering by tier
# keeps product_df's row order, so sampling from a cached subset draws the
# same rows as filtering afresh on every delivery.
user_lookup = user_df.set_index('user_id')
product_style = product_df.set_index('product_id')['style']
products_by_tier = {
    tier: product_df[product_df['price_tier'] == tier]
    for tier in user_df['subscription_tier'].unique()
}

for _ in range(num_deliveries):
    user_id = np.random.randint(1, num_users + 1)

    # Get user's subscription tier
    subscription_tier = user_lookup.at[user_id, 'subscription_tier']

    # Generate delivery date (0-179 days ago). It is drawn independently of
    # the user's signup_date, so a delivery can predate the signup.
    days_ago = np.random.randint(0, 180)
    delivery_date = (current_date - timedelta(days=days_ago)).strftime('%Y-%m-%d')

    # Generate 3-5 items per delivery
    num_items = np.random.randint(3, 6)

    # Products whose price tier matches the subscription tier
    # (in_stock is not taken into account here)
    tier_products = products_by_tier[subscription_tier]

    # Randomly select products for the delivery
    if len(tier_products) >= num_items:
        delivery_products = tier_products.sample(num_items)['product_id'].tolist()
    else:
        # Not enough products in the tier: take them all and top up with
        # random products from the rest of the catalogue.
        delivery_products = tier_products['product_id'].tolist()
        remaining = num_items - len(delivery_products)
        other_products = product_df[~product_df['product_id'].isin(delivery_products)].sample(remaining)['product_id'].tolist()
        delivery_products.extend(other_products)

    # Create delivery records
    for product_id in delivery_products:
        # Determine if item was returned
        style = product_style.at[product_id]
        comfort_level = user_lookup.at[user_id, f'{style}_comfort']

        # Higher comfort level decreases likelihood of return: 0.45 at
        # comfort 1, falling to a floor of 0.10 from comfort 8 upwards.
        return_probability = max(0.1, 0.5 - (comfort_level / 20))
        returned = np.random.choice([True, False], p=[return_probability, 1-return_probability])

        # Create delivery record
        delivery = {
            'user_id': user_id,
            'product_id': product_id,
            'delivery_date': delivery_date,
            'subscription_tier': subscription_tier,
            'returned': returned,
            'feedback_rating': None if returned else np.random.randint(1, 6)  # 1-5 rating if not returned
        }

        deliveries.append(delivery)

# Create delivery dataframe
delivery_df = pd.DataFrame(deliveries)

# Write each table to its own CSV file in the working directory, without the
# pandas row index
for csv_name, frame in (
    ('hnm_users.csv', user_df),
    ('hnm_products.csv', product_df),
    ('hnm_interactions.csv', interaction_df),
    ('hnm_deliveries.csv', delivery_df),
):
    frame.to_csv(csv_name, index=False)

print("Datasets created successfully!")


Datasets created successfully!
