In [None]:
# RETAILROCKET DATASET - EXPLORATORY DATA ANALYSIS
# Dataset: E-commerce user behavior (views, add-to-cart, purchases)
# Goal: Understand user patterns, item popularity, and conversion funnels

In [None]:
import os
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

In [None]:
# Plot appearance: matplotlib style sheet plus seaborn colour palette
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# pandas display: both options are set to None (no fixed column cap / width)
for display_option in ('display.max_columns', 'display.width'):
    pd.set_option(display_option, None)

In [None]:
# 1. PROJECT SETUP

In [None]:
# Resolve the project root: the working directory itself when it contains a
# "data" folder, otherwise the directory one level up.
cwd = Path.cwd()
PROJECT_ROOT = cwd if (cwd / "data").exists() else cwd.parent
print("project root:", PROJECT_ROOT)

# Input files are looked up under <root>/data/raw
DATA_DIR = PROJECT_ROOT.joinpath("data", "raw")
print("data dir:", DATA_DIR)

# List at most the first ten entries found in the data directory
raw_files = list(DATA_DIR.glob("*"))
print("files:", raw_files[:10])

In [None]:

# 2. DATA LOADING

In [None]:
def _read_raw(filename, **read_kwargs):
    """Read one CSV from DATA_DIR, forwarding keyword arguments to pd.read_csv."""
    return pd.read_csv(DATA_DIR / filename, **read_kwargs)


# Files carrying a 'timestamp' column get it parsed as a 64-bit integer
timestamp_dtype = {'timestamp': 'int64'}

events = _read_raw("events.csv", dtype=timestamp_dtype)
category_tree = _read_raw("category_tree.csv")
prop1 = _read_raw("item_properties_part1.csv", dtype=timestamp_dtype)
prop2 = _read_raw("item_properties_part2.csv", dtype=timestamp_dtype)

In [None]:
# Stack the two property parts row-wise into a single frame with a fresh index
property_parts = [prop1, prop2]
item_properties = pd.concat(property_parts, ignore_index=True)

In [None]:
# Report the merged shape next to the shapes of the two source parts
merged_shape, part1_shape, part2_shape = item_properties.shape, prop1.shape, prop2.shape
print(f"Item properties merged: {merged_shape} (part1: {part1_shape}, part2: {part2_shape})")


In [None]:
# 3. INITIAL DATA OVERVIEW

In [None]:
# Registry of the loaded tables, keyed by a short display name
datasets = dict(
    events=events,
    category_tree=category_tree,
    item_properties=item_properties,
)

# For each table: shape, column names, deep memory footprint, a 3-row preview
# and per-column null counts.
for name, df in datasets.items():
    megabytes = df.memory_usage(deep=True).sum() / 1e6
    preview = df.head(3).to_string(index=False)
    null_counts = df.isna().sum()

    print(f"\n{name.upper()}")
    print(f"Shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")
    print(f"Memory: {megabytes:.2f} MB")
    print("\nFirst 3 rows:")
    print(preview)
    print(f"Nulls:\n{null_counts}")

In [None]:
# 4. EVENTS - PREPROCESSING & ANALYSIS

In [None]:
# Parse the millisecond timestamps into datetimes (replaces the column in place)
events['timestamp'] = pd.to_datetime(events['timestamp'], unit='ms')

# Derive calendar features from the parsed timestamp via the .dt accessor
event_time = events['timestamp'].dt
events['date'] = event_time.date
events['hour'] = event_time.hour
events['dayofweek'] = event_time.dayofweek
events['day_name'] = event_time.day_name()

In [None]:
# Names of the key columns in the events table
item_col, user_col, event_col = "itemid", "visitorid", "event"

# Distinct values per key column; a column missing from the frame is skipped
print("\nunique counts (sample):")
for c in (user_col, item_col, event_col):
    if c not in events.columns:
        continue
    print(c, "unique:", events[c].nunique())


# Items and visitors with the most event rows
top_items = events[item_col].value_counts().head(10)
top_users = events[user_col].value_counts().head(10)
print("\nTop 10 items (by events):\n", top_items)
print("\nTop 10 users (by events):\n", top_users)

In [None]:
# Headline numbers for the events table: volume, cardinalities and time coverage
first_event = events['timestamp'].min()
last_event = events['timestamp'].max()

print("\nBasic Statistics:")
print(f"  Total events: {len(events):,}")
print(f"  Unique visitors: {events['visitorid'].nunique():,}")
print(f"  Unique items: {events['itemid'].nunique():,}")
print(f"  Date range: {first_event} to {last_event}")
print(f"  Days span: {(last_event - first_event).days} days")

In [None]:
# Count of rows per event type, with each type's share of all events
print("\nEvent Type Distribution:")
event_dist = events['event'].value_counts()
n_events = len(events)
for event_type, count in event_dist.items():
    pct = count / n_events * 100
    print(f"  {event_type}: {count:,} ({pct:.2f}%)")

In [None]:
# Totals per funnel stage, read from event_dist (0 when a stage is absent)
total_views, total_addtocart, total_transactions = (
    event_dist.get(stage, 0) for stage in ('view', 'addtocart', 'transaction')
)

print("\nConversion Funnel:")
# Rates relative to views; "N/A" is printed for each when there are no views
if total_views > 0:
    print(f"  Views ‚Üí AddToCart: {total_addtocart/total_views*100:.2f}%")
    print(f"  Views ‚Üí Purchase: {total_transactions/total_views*100:.2f}%")
else:
    print("N/A")
    print("N/A")
# Rate relative to add-to-cart events; "N/A" when there are none
if total_addtocart > 0:
    print(f"  AddToCart ‚Üí Purchase: {total_transactions/total_addtocart*100:.2f}%")
else:
    print("N/A")

In [None]:
# Earliest, then latest, event timestamp
for boundary in (events['timestamp'].min(), events['timestamp'].max()):
    print(boundary)

In [None]:
# 5. ITEM PROPERTIES - analysis

In [None]:
# Parse the millisecond timestamps of the property records into datetimes
property_timestamps_ms = item_properties['timestamp']
item_properties['timestamp'] = pd.to_datetime(property_timestamps_ms, unit='ms')

In [None]:
# Headline numbers for the property table: volume, cardinalities, time coverage
property_times = item_properties['timestamp']
summary_lines = [
    "\nBasic Statistics:",
    f"  Total property records: {len(item_properties):,}",
    f"  Unique items: {item_properties['itemid'].nunique():,}",
    f"  Unique properties: {item_properties['property'].nunique()}",
    f"  Date range: {property_times.min()} to {property_times.max()}",
]
print("\n".join(summary_lines))


In [None]:
print("\nüìã Property Types Overview:")
# Number of records per property name, most frequent first
prop_counts = item_properties['property'].value_counts()
print(f"  Total unique property types: {len(prop_counts):,}")

# Show only the most common properties (top 10)
top_props = prop_counts.head(10)

# Distinct items per top property, computed in one grouped pass over the rows
# of those properties rather than re-filtering the whole frame per property.
top_rows = item_properties[item_properties['property'].isin(top_props.index)]
items_per_prop = top_rows.groupby('property')['itemid'].nunique()

print("\n  Top 10 Most Common Properties:")
for prop, count in top_props.items():
    unique_items = items_per_prop.loc[prop]
    print(f"    {prop}: {count:,} records across {unique_items:,} items")


In [None]:
# Property coverage per item: number of distinct property names each item has
print("\nüìä Property Coverage per Item:")
props_per_item = item_properties.groupby('itemid')['property'].nunique()

coverage_buckets = (
    ("1 property", props_per_item == 1),
    ("2+ properties", props_per_item >= 2),
    ("5+ properties", props_per_item >= 5),
)
for bucket_label, in_bucket in coverage_buckets:
    print(f"  Items with {bucket_label}: {in_bucket.sum():,}")


In [None]:
# Sample item with all properties
print("\nüîç Sample Item Properties:")

# Rank items by their number of distinct properties, highest first
props_per_item_sorted = (
    item_properties
    .groupby('itemid')['property']
    .nunique()
    .sort_values(ascending=False)
)

# Take the 11th-ranked item (or the last one when fewer exist) rather than the
# absolute maximum, which might be an outlier.
sample_rank = min(10, len(props_per_item_sorted) - 1)
sample_item = props_per_item_sorted.iloc[sample_rank]      # distinct-property count
sample_item_id = props_per_item_sorted.index[sample_rank]  # the item's id

# First five property records of that item
is_sample_item = item_properties['itemid'] == sample_item_id
sample_props = item_properties.loc[is_sample_item, ['property', 'value']].head(5)

print(f"Item {sample_item_id} (has {sample_item} properties, showing first 5):")
print(sample_props.to_string(index=False))
remaining = sample_item - 5
if remaining > 0:
    print(f"  ... and {remaining} more properties")

In [None]:
# 6. CATEGORY TREE - HIERARCHY ANALYSIS

In [None]:
# Size of the category tree and the number of distinct ids in each column
print("\nBasic Statistics:")
print(f"  Total categories: {len(category_tree):,}")
print(f"  Unique category IDs: {category_tree['categoryid'].nunique():,}")
print(f"  Unique parent IDs: {category_tree['parentid'].nunique():,}")

In [None]:
# Root categories: rows whose parentid is missing
is_root = category_tree['parentid'].isna()
root_categories = category_tree[is_root]
print(f"\n Root Categories (top-level): {len(root_categories)}")

In [None]:
# Depth analysis
print("\nüìè Category Hierarchy Depth Analysis:")

# Dictionary mapping each categoryid to its parentid
parent_lookup = {
    category_id: parent_id
    for category_id, parent_id in zip(category_tree['categoryid'], category_tree['parentid'])
}
def get_category_depth_fast(cat_id, parent_dict, max_depth=20):
    """Count the parent links between ``cat_id`` and the top of its chain.

    Args:
        cat_id: Category id to start from.
        parent_dict: Mapping of category id -> parent id. A missing parent
            (anything ``pd.isna`` reports as missing) marks a root.
        max_depth: Maximum number of links to follow.

    Returns:
        The number of links walked before reaching a root, an id that is not a
        key of ``parent_dict``, or the ``max_depth`` limit (so 0 for a root or
        an unknown id). Returns -1 when an id is visited twice, i.e. a cycle.
    """
    depth = 0
    node = cat_id
    seen = set()

    while True:
        # Stop at the depth limit or when the chain leaves the mapping
        if depth >= max_depth or node not in parent_dict:
            return depth

        # Revisiting an id means the parent links form a cycle
        if node in seen:
            return -1
        seen.add(node)

        parent = parent_dict[node]
        if pd.isna(parent):
            return depth

        node = parent
        depth += 1


In [None]:
# Sample up to 1000 categories for the depth calculation (all of them when the
# tree has at most 1000 rows). The fixed random_state makes the sample, and
# therefore the depth statistics derived from it, reproducible across runs.
sample_size = min(1000, len(category_tree))
sample_cats = category_tree['categoryid'].sample(sample_size, random_state=42).tolist()

# Depth of every sampled category; -1 marks a cycle
depths = [get_category_depth_fast(cat, parent_lookup) for cat in sample_cats]
valid_depths = [d for d in depths if d >= 0]  # Filter out cycles

In [None]:
# Summarise the sampled depths; cycles (depth -1) are excluded from the stats
if not valid_depths:
    print("  Could not calculate depths (possible data issues)")
else:
    cycle_count = len(depths) - len(valid_depths)
    print(f"  Sample size: {len(valid_depths)} categories")
    print(f"  Min depth: {min(valid_depths)}")
    print(f"  Max depth: {max(valid_depths)}")
    print(f"  Avg depth: {np.mean(valid_depths):.2f}")
    if cycle_count != 0:
        print(f"  ‚ö†Ô∏è  Circular references detected: {cycle_count}")

In [None]:
# Categories with children: number of rows per parentid value
cats_with_children = category_tree['parentid'].value_counts()

children_report = [
    "\nüë∂ Categories with Children:",
    f"  Categories that are parents: {len(cats_with_children)}",
    f"  Max children per category: {cats_with_children.max()}",
    f"  Avg children per parent: {cats_with_children.mean():.2f}",
]
print("\n".join(children_report))

In [None]:
# Items in events vs item_properties: distinct item ids on each side
items_in_events = set(events['itemid'].unique())
items_in_properties = set(item_properties['itemid'].unique())

# Overlap and the two one-sided differences
items_both = items_in_events.intersection(items_in_properties)
items_only_events = items_in_events.difference(items_in_properties)
items_only_properties = items_in_properties.difference(items_in_events)

# Share of event items that also appear in the property table
covered_pct = len(items_both)/len(items_in_events)*100

print("\nüîó Events ‚Üî Item Properties:")
print(f"  Items in BOTH: {len(items_both):,} ({covered_pct:.1f}% of events)")
print(f"  Items ONLY in events: {len(items_only_events):,} (no metadata)")
print(f"  Items ONLY in properties: {len(items_only_properties):,} (no interactions)")


In [None]:
# Get categoryid from properties: keep the 'categoryid' property rows and
# expose their value as a numeric 'categoryid' column.
# .loc + .copy() yields an independent frame, so the column edits below are not
# applied to a chained-indexing slice of item_properties (SettingWithCopyWarning).
is_category_row = item_properties['property'] == 'categoryid'
items_with_cat = item_properties.loc[is_category_row, ['itemid', 'value']].copy()
items_with_cat.columns = ['itemid', 'categoryid']
# Values that cannot be parsed as numbers become NaN
items_with_cat['categoryid'] = pd.to_numeric(items_with_cat['categoryid'], errors='coerce')


In [None]:
# Categories in category_tree vs properties.
# The sets are built unconditionally (an empty items_with_cat just yields empty
# sets) so cats_only_props and its siblings always exist for later cells; only
# the printed report is skipped when there is nothing to compare.
cats_in_properties = set(items_with_cat['categoryid'].dropna().astype(int).unique())
cats_in_tree = set(category_tree['categoryid'].unique())

cats_both = cats_in_properties & cats_in_tree
cats_only_props = cats_in_properties - cats_in_tree
cats_only_tree = cats_in_tree - cats_in_properties

if len(items_with_cat) > 0:
    print(f"\nüîó Item Properties ‚Üî Category Tree:")
    print(f"  Categories in BOTH: {len(cats_both):,}")
    print(f"  Categories ONLY in properties: {len(cats_only_props):,} (orphan categories)")
    print(f"  Categories ONLY in tree: {len(cats_only_tree):,} (unused categories)")


In [None]:
# Data completeness summary
print(f"\nüìä Overall Data Completeness:")

# Events whose item id appears anywhere in the property table
events_with_props = events[events['itemid'].isin(items_in_properties)]
print(f"  Events with item metadata: {len(events_with_props):,} ({len(events_with_props)/len(events)*100:.1f}%)")

if len(items_with_cat) > 0:
    # items_with_cat may hold several category rows for one item; merging on the
    # raw rows would repeat each event once per row and inflate the count
    # (potentially above 100%). Keep one row per item first -- the last one in
    # row order, which is not necessarily the most recent by timestamp.
    item_category = items_with_cat.drop_duplicates(subset='itemid', keep='last')
    events_with_category = events.merge(item_category, on='itemid', how='inner')
    print(f"  Events with category info: {len(events_with_category):,} ({len(events_with_category)/len(events)*100:.1f}%)")


In [None]:
# Number of fully duplicated rows in each table
print("\nüîç Duplicates:")
tables_to_check = (
    ("Events", events),
    ("Item Properties", item_properties),
    ("Category Tree", category_tree),
)
for table_label, table in tables_to_check:
    print(f"  {table_label}: {table.duplicated().sum():,}")


In [None]:
# Sanity checks on the id columns of events: negative and missing values
event_items = events['itemid']
event_visitors = events['visitorid']

print("\n‚ö†Ô∏è  Invalid Values:")
print(f"  Negative item IDs in events: {(event_items < 0).sum()}")
print(f"  Negative visitor IDs: {(event_visitors < 0).sum()}")
print(f"  Null item IDs in events: {event_items.isna().sum()}")
print(f"  Null visitor IDs: {event_visitors.isna().sum()}")

In [None]:
# Missing-value counts for one key column of each table
missing_event_times = events['timestamp'].isna().sum()
missing_property_values = item_properties['value'].isna().sum()
missing_parents = category_tree['parentid'].isna().sum()

print("\nüìâ Missing Data:")
print(f"  Events - missing timestamps: {missing_event_times}")
print(f"  Item Properties - missing values: {missing_property_values:,}")
print(f"  Category Tree - missing parentid: {missing_parents} (root categories)")


In [None]:

# Closing list of data challenges, based on the overlap sets computed earlier
print("\n‚ö†Ô∏è  Challenges:")

n_items_without_metadata = len(items_only_events)
if n_items_without_metadata > 0:
    print(f"  ‚Ä¢ {n_items_without_metadata:,} items in events lack metadata (cold start problem)")

print("  ‚Ä¢ Item properties in long format (needs pivoting for modeling)")

n_orphan_categories = len(cats_only_props)
if n_orphan_categories > 0:
    print(f"  ‚Ä¢ {n_orphan_categories} orphan categories not in tree")