In [2]:
# ============================================================================
# RETAILROCKET DATASET - EXPLORATORY DATA ANALYSIS
# ============================================================================
# Dataset: E-commerce user behavior (views, add-to-cart, purchases)
# Goal: Understand user patterns, item popularity, and conversion funnels
# ============================================================================

In [3]:
import os
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

In [4]:
# Visualization settings
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

In [5]:
# ============================================================================
# 1. PROJECT SETUP
# ============================================================================

In [6]:
# Setup project paths (portable & reusable)
# - Prints current working directory (for debugging)
# - Defines PROJECT_ROOT as the main project folder
# - Builds a reusable DATA_DIR path to raw data

cwd = Path(os.getcwd())
if (cwd / "data").exists():
    PROJECT_ROOT = cwd
else:
    PROJECT_ROOT = cwd.parent  
print("project root:", PROJECT_ROOT)

DATA_DIR = PROJECT_ROOT / "data" / "raw" / "retailrocket"
print("data dir:", DATA_DIR)
print("files:", list(DATA_DIR.glob("*"))[:10])  

project root: c:\Users\yasmi\projects\dynamic-recommendation-system
data dir: c:\Users\yasmi\projects\dynamic-recommendation-system\data\raw\retailrocket
files: [WindowsPath('c:/Users/yasmi/projects/dynamic-recommendation-system/data/raw/retailrocket/category_tree.csv'), WindowsPath('c:/Users/yasmi/projects/dynamic-recommendation-system/data/raw/retailrocket/events.csv'), WindowsPath('c:/Users/yasmi/projects/dynamic-recommendation-system/data/raw/retailrocket/item_properties_part1.csv'), WindowsPath('c:/Users/yasmi/projects/dynamic-recommendation-system/data/raw/retailrocket/item_properties_part2.csv')]


In [7]:
# ============================================================================
# 2. DATA LOADING
# ============================================================================

In [8]:
events = pd.read_csv(DATA_DIR / "events.csv",dtype={'timestamp': 'int64'})
category_tree = pd.read_csv(DATA_DIR / "category_tree.csv")
prop1 = pd.read_csv(DATA_DIR / "item_properties_part1.csv",dtype={'timestamp': 'int64'})
prop2 = pd.read_csv(DATA_DIR / "item_properties_part2.csv",dtype={'timestamp': 'int64'})

In [9]:
# Merging properties
item_properties = pd.concat([prop1, prop2], ignore_index=True)

In [10]:
print(f"Item properties merged: {item_properties.shape} (part1: {prop1.shape}, part2: {prop2.shape})")


Item properties merged: (20275902, 4) (part1: (10999999, 4), part2: (9275903, 4))


In [11]:
# ============================================================================
# 3. INITIAL DATA OVERVIEW
# ============================================================================

In [12]:
datasets = {
    'events': events,
    'category_tree': category_tree,
    'item_properties': item_properties
}

for name, df in datasets.items():
    print(f"\n{name.upper()}")
    print(f"Shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")
    print(f"Memory: {df.memory_usage(deep=True).sum() / 1e6:.2f} MB")
    print(f"\nFirst 3 rows:")
    print(df.head(3).to_string(index=False))
    print(f"Nulls:\n{df.isna().sum()}")


EVENTS
Shape: (2756101, 5)
Columns: ['timestamp', 'visitorid', 'event', 'itemid', 'transactionid']
Memory: 256.82 MB

First 3 rows:
    timestamp  visitorid event  itemid  transactionid
1433221332117     257597  view  355908            NaN
1433224214164     992329  view  248676            NaN
1433221999827     111016  view  318965            NaN
Nulls:
timestamp              0
visitorid              0
event                  0
itemid                 0
transactionid    2733644
dtype: int64

CATEGORY_TREE
Shape: (1669, 2)
Columns: ['categoryid', 'parentid']
Memory: 0.03 MB

First 3 rows:
 categoryid  parentid
       1016     213.0
        809     169.0
        570       9.0
Nulls:
categoryid     0
parentid      25
dtype: int64

ITEM_PROPERTIES
Shape: (20275902, 4)
Columns: ['timestamp', 'itemid', 'property', 'value']
Memory: 3067.60 MB

First 3 rows:
    timestamp  itemid   property                           value
1435460400000  460429 categoryid                            1338
144150840

In [13]:
# ============================================================================
# 4. EVENTS - PREPROCESSING & ANALYSIS
# ============================================================================

In [14]:
# Convert timestamp
events['timestamp'] = pd.to_datetime(events['timestamp'], unit='ms')
events['date'] = events['timestamp'].dt.date
events['hour'] = events['timestamp'].dt.hour
events['dayofweek'] = events['timestamp'].dt.dayofweek
events['day_name'] = events['timestamp'].dt.day_name()

In [15]:
item_col ="itemid"
user_col = "visitorid"
event_col = "event" 

# unique counts
print("\nunique counts (sample):")
for c in [user_col, item_col, event_col]:
    if c in events.columns:
        print(c, "unique:", events[c].nunique())


# top products and top users in the sample
print("\nTop 10 items (by events):\n", events[item_col].value_counts().head(10))
print("\nTop 10 users (by events):\n", events[user_col].value_counts().head(10))


unique counts (sample):
visitorid unique: 1407580
itemid unique: 235061
event unique: 3

Top 10 items (by events):
 itemid
187946    3412
461686    2978
5411      2334
370653    1854
219512    1800
257040    1647
298009    1642
96924     1633
309778    1628
384302    1608
Name: count, dtype: int64

Top 10 users (by events):
 visitorid
1150086    7757
530559     4328
152963     3024
895999     2474
163561     2410
371606     2345
286616     2252
684514     2246
892013     2024
861299     1991
Name: count, dtype: int64


In [16]:
#Basic Statistics
print(f"\nBasic Statistics:")
print(f"  Total events: {len(events):,}")
print(f"  Unique visitors: {events['visitorid'].nunique():,}")
print(f"  Unique items: {events['itemid'].nunique():,}")
print(f"  Date range: {events['timestamp'].min()} to {events['timestamp'].max()}")
print(f"  Days span: {(events['timestamp'].max() - events['timestamp'].min()).days} days")


Basic Statistics:
  Total events: 2,756,101
  Unique visitors: 1,407,580
  Unique items: 235,061
  Date range: 2015-05-03 03:00:04.384000 to 2015-09-18 02:59:47.788000
  Days span: 137 days


In [17]:
print("\nEvent Type Distribution:")
event_dist = events['event'].value_counts()
for event_type, count in event_dist.items():
    pct = count / len(events) * 100
    print(f"  {event_type}: {count:,} ({pct:.2f}%)")


Event Type Distribution:
  view: 2,664,312 (96.67%)
  addtocart: 69,332 (2.52%)
  transaction: 22,457 (0.81%)


In [18]:
# Conversion metrics
total_views = event_dist.get('view', 0)
total_addtocart = event_dist.get('addtocart', 0)
total_transactions = event_dist.get('transaction', 0)

print("\nConversion Funnel:")
print(f"  Views → AddToCart: {total_addtocart/total_views*100:.2f}%" if total_views > 0 else "N/A")
print(f"  Views → Purchase: {total_transactions/total_views*100:.2f}%" if total_views > 0 else "N/A")
print(f"  AddToCart → Purchase: {total_transactions/total_addtocart*100:.2f}%" if total_addtocart > 0 else "N/A")


Conversion Funnel:
  Views → AddToCart: 2.60%
  Views → Purchase: 0.84%
  AddToCart → Purchase: 32.39%


In [19]:
print(events['timestamp'].min())
print(events['timestamp'].max())

2015-05-03 03:00:04.384000
2015-09-18 02:59:47.788000


In [20]:
# ============================================================================
# 5. ITEM PROPERTIES - analysis
# ============================================================================


In [21]:
# Convert timestamp
item_properties['timestamp'] = pd.to_datetime(item_properties['timestamp'], unit='ms')


In [22]:
print(f"\nBasic Statistics:")
print(f"  Total property records: {len(item_properties):,}")
print(f"  Unique items: {item_properties['itemid'].nunique():,}")
print(f"  Unique properties: {item_properties['property'].nunique()}")
print(f"  Date range: {item_properties['timestamp'].min()} to {item_properties['timestamp'].max()}")



Basic Statistics:
  Total property records: 20,275,902
  Unique items: 417,053
  Unique properties: 1104
  Date range: 2015-05-10 03:00:00 to 2015-09-13 03:00:00


In [43]:
print("\n📋 Property Types Overview:")
prop_counts = item_properties['property'].value_counts()
print(f"  Total unique property types: {len(prop_counts):,}")

# Show only the most common properties (top 10)
print(f"\n  Top 10 Most Common Properties:")
for prop, count in prop_counts.head(10).items():
    unique_items = item_properties[item_properties['property'] == prop]['itemid'].nunique()
    print(f"    {prop}: {count:,} records across {unique_items:,} items")



📋 Property Types Overview:
  Total unique property types: 1,104

  Top 10 Most Common Properties:
    888: 3,000,398 records across 417,053 items
    790: 1,790,516 records across 417,053 items
    available: 1,503,639 records across 417,053 items
    categoryid: 788,214 records across 417,053 items
    6: 631,471 records across 409,065 items
    283: 597,419 records across 417,053 items
    776: 574,220 records across 407,305 items
    678: 481,966 records across 417,019 items
    364: 476,486 records across 417,053 items
    202: 448,938 records across 414,217 items


In [24]:
# Property coverage per item
print("\n📊 Property Coverage per Item:")
props_per_item = item_properties.groupby('itemid')['property'].nunique()
print(f"  Items with 1 property: {(props_per_item == 1).sum():,}")
print(f"  Items with 2+ properties: {(props_per_item >= 2).sum():,}")
print(f"  Items with 5+ properties: {(props_per_item >= 5).sum():,}")



📊 Property Coverage per Item:
  Items with 1 property: 0
  Items with 2+ properties: 417,053
  Items with 5+ properties: 417,053


In [25]:
# Sample item with all properties
print("\n🔍 Sample Item Properties:")
# Find an item with many properties but not the absolute max (which might be an outlier)
props_per_item_sorted = item_properties.groupby('itemid')['property'].nunique().sort_values(ascending=False)
sample_item = props_per_item_sorted.iloc[min(10, len(props_per_item_sorted)-1)]  # 11th item or last
sample_item_id = props_per_item_sorted.index[min(10, len(props_per_item_sorted)-1)]
sample_props = item_properties[item_properties['itemid'] == sample_item_id][['property', 'value']].head(5)
print(f"Item {sample_item_id} (has {sample_item} properties, showing first 5):")
print(sample_props.to_string(index=False))
if sample_item > 5:
    print(f"  ... and {sample_item - 5} more properties")


🔍 Sample Item Properties:
Item 406081 (has 56 properties, showing first 5):
 property                         value
      355                        586893
      523 991263 1238769 126079 1141242
available                             0
      678                        367680
      981    288796 23770 769062 769062
  ... and 51 more properties


In [26]:
# ============================================================================
# 6. CATEGORY TREE - HIERARCHY ANALYSIS
# ============================================================================

In [27]:
print(f"\nBasic Statistics:")
print(f"  Total categories: {len(category_tree):,}")
print(f"  Unique category IDs: {category_tree['categoryid'].nunique():,}")
print(f"  Unique parent IDs: {category_tree['parentid'].nunique():,}")



Basic Statistics:
  Total categories: 1,669
  Unique category IDs: 1,669
  Unique parent IDs: 362


In [28]:
print(f"\nBasic Statistics:")
print(f"  Total categories: {len(category_tree):,}")
print(f"  Unique category IDs: {category_tree['categoryid'].nunique():,}")
print(f"  Unique parent IDs: {category_tree['parentid'].nunique():,}")


Basic Statistics:
  Total categories: 1,669
  Unique category IDs: 1,669
  Unique parent IDs: 362


In [29]:
# Root categories (no parent)
root_categories = category_tree[category_tree['parentid'].isna()]
print(f"\n Root Categories (top-level): {len(root_categories)}")


 Root Categories (top-level): 25


In [30]:
# Depth analysis - optimized with dictionary lookup
print("\n📏 Category Hierarchy Depth Analysis:")

# Create parent lookup dictionary for faster access
parent_lookup = dict(zip(category_tree['categoryid'], category_tree['parentid']))
def get_category_depth_fast(cat_id, parent_dict, max_depth=20):
    """Calculate depth using dictionary lookup (much faster)"""
    depth = 0
    current = cat_id
    visited = set()
    
    while depth < max_depth and current in parent_dict:
        if current in visited:  # Detect cycles
            return -1
        visited.add(current)
        
        parent = parent_dict[current]
        if pd.isna(parent):
            return depth
        current = parent
        depth += 1
    
    return depth



📏 Category Hierarchy Depth Analysis:


In [31]:
# Sample categories for depth calculation (or use all if not too many)
sample_size = min(1000, len(category_tree))
sample_cats = category_tree['categoryid'].sample(sample_size).tolist()
depths = [get_category_depth_fast(cat, parent_lookup) for cat in sample_cats]
valid_depths = [d for d in depths if d >= 0]  # Filter out cycles

In [32]:
if valid_depths:
    print(f"  Sample size: {len(valid_depths)} categories")
    print(f"  Min depth: {min(valid_depths)}")
    print(f"  Max depth: {max(valid_depths)}")
    print(f"  Avg depth: {np.mean(valid_depths):.2f}")
    if len(depths) != len(valid_depths):
        print(f"  ⚠️  Circular references detected: {len(depths) - len(valid_depths)}")
else:
    print("  Could not calculate depths (possible data issues)")

  Sample size: 1000 categories
  Min depth: 0
  Max depth: 5
  Avg depth: 2.42


In [33]:
# Categories with children
cats_with_children = category_tree['parentid'].value_counts()
print(f"\n👶 Categories with Children:")
print(f"  Categories that are parents: {len(cats_with_children)}")
print(f"  Max children per category: {cats_with_children.max()}")
print(f"  Avg children per parent: {cats_with_children.mean():.2f}")


👶 Categories with Children:
  Categories that are parents: 362
  Max children per category: 31
  Avg children per parent: 4.54


In [34]:
# Items in events vs item_properties
items_in_events = set(events['itemid'].unique())
items_in_properties = set(item_properties['itemid'].unique())

items_both = items_in_events & items_in_properties
items_only_events = items_in_events - items_in_properties
items_only_properties = items_in_properties - items_in_events

print(f"\n🔗 Events ↔ Item Properties:")
print(f"  Items in BOTH: {len(items_both):,} ({len(items_both)/len(items_in_events)*100:.1f}% of events)")
print(f"  Items ONLY in events: {len(items_only_events):,} (no metadata)")
print(f"  Items ONLY in properties: {len(items_only_properties):,} (no interactions)")



🔗 Events ↔ Item Properties:
  Items in BOTH: 185,246 (78.8% of events)
  Items ONLY in events: 49,815 (no metadata)
  Items ONLY in properties: 231,807 (no interactions)


In [35]:
# Get categoryid from properties
items_with_cat = item_properties[item_properties['property'] == 'categoryid'][['itemid', 'value']]
items_with_cat.columns = ['itemid', 'categoryid']
items_with_cat['categoryid'] = pd.to_numeric(items_with_cat['categoryid'], errors='coerce')


In [36]:
# Categories in category_tree vs properties
if len(items_with_cat) > 0:
    cats_in_properties = set(items_with_cat['categoryid'].dropna().astype(int).unique())
    cats_in_tree = set(category_tree['categoryid'].unique())
    
    cats_both = cats_in_properties & cats_in_tree
    cats_only_props = cats_in_properties - cats_in_tree
    cats_only_tree = cats_in_tree - cats_in_properties
    
    print(f"\n🔗 Item Properties ↔ Category Tree:")
    print(f"  Categories in BOTH: {len(cats_both):,}")
    print(f"  Categories ONLY in properties: {len(cats_only_props):,} (orphan categories)")
    print(f"  Categories ONLY in tree: {len(cats_only_tree):,} (unused categories)")



🔗 Item Properties ↔ Category Tree:
  Categories in BOTH: 1,212
  Categories ONLY in properties: 30 (orphan categories)
  Categories ONLY in tree: 457 (unused categories)


In [37]:
# Data completeness summary
print(f"\n📊 Overall Data Completeness:")
events_with_props = events[events['itemid'].isin(items_in_properties)]
print(f"  Events with item metadata: {len(events_with_props):,} ({len(events_with_props)/len(events)*100:.1f}%)")

if len(items_with_cat) > 0:
    events_with_category = events.merge(items_with_cat, on='itemid', how='inner')
    print(f"  Events with category info: {len(events_with_category):,} ({len(events_with_category)/len(events)*100:.1f}%)")



📊 Overall Data Completeness:
  Events with item metadata: 2,500,516 (90.7%)
  Events with category info: 5,644,678 (204.8%)


In [38]:
print("\n🔍 Duplicates:")
print(f"  Events: {events.duplicated().sum():,}")
print(f"  Item Properties: {item_properties.duplicated().sum():,}")
print(f"  Category Tree: {category_tree.duplicated().sum():,}")



🔍 Duplicates:
  Events: 460
  Item Properties: 0
  Category Tree: 0


In [39]:
print("\n⚠️  Invalid Values:")
print(f"  Negative item IDs in events: {(events['itemid'] < 0).sum()}")
print(f"  Negative visitor IDs: {(events['visitorid'] < 0).sum()}")
print(f"  Null item IDs in events: {events['itemid'].isna().sum()}")
print(f"  Null visitor IDs: {events['visitorid'].isna().sum()}")


⚠️  Invalid Values:
  Negative item IDs in events: 0
  Negative visitor IDs: 0
  Null item IDs in events: 0
  Null visitor IDs: 0


In [40]:
print("\n📉 Missing Data:")
print(f"  Events - missing timestamps: {events['timestamp'].isna().sum()}")
print(f"  Item Properties - missing values: {item_properties['value'].isna().sum():,}")
print(f"  Category Tree - missing parentid: {category_tree['parentid'].isna().sum()} (root categories)")



📉 Missing Data:
  Events - missing timestamps: 0
  Item Properties - missing values: 0
  Category Tree - missing parentid: 25 (root categories)


In [41]:

print("\n⚠️  Challenges:")
if len(items_only_events) > 0:
    print(f"  • {len(items_only_events):,} items in events lack metadata (cold start problem)")
print("  • Item properties in long format (needs pivoting for modeling)")
if len(cats_only_props) > 0:
    print(f"  • {len(cats_only_props)} orphan categories not in tree")


⚠️  Challenges:
  • 49,815 items in events lack metadata (cold start problem)
  • Item properties in long format (needs pivoting for modeling)
  • 30 orphan categories not in tree
