In [None]:
# ============================================================================
# RETAILROCKET DATASET - EXPLORATORY DATA ANALYSIS
# ============================================================================
# Dataset: E-commerce user behavior (views, add-to-cart, purchases)
# Goal: Understand user patterns, item popularity, and conversion funnels
# ============================================================================

In [None]:
import os
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

In [None]:
# Visualization settings
# matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*", while
# older releases only ship "seaborn-darkgrid". Use whichever name this
# installation provides (falling back to matplotlib's default style) so the
# configuration cell does not raise on a different matplotlib version.
preferred_styles = ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid')
plt.style.use(next((s for s in preferred_styles if s in plt.style.available), 'default'))
sns.set_palette("husl")
# None removes the column limit and lets pandas auto-detect the display width.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

In [None]:
# ============================================================================
# 1. PROJECT SETUP
# ============================================================================

In [None]:
# Setup project paths (portable & reusable)
# - Prints current working directory (for debugging)
# - Defines PROJECT_ROOT as the nearest folder (cwd or one of its ancestors)
#   that contains a "data" directory, so the notebook also works when it is
#   launched from a nested folder
# - Builds a reusable DATA_DIR path to raw data

cwd = Path.cwd()
print("cwd:", cwd)

# Walk upward from cwd; the first directory holding "data" is the project root.
# When no such directory exists, fall back to the parent of cwd.
PROJECT_ROOT = next(
    (candidate for candidate in (cwd, *cwd.parents) if (candidate / "data").exists()),
    cwd.parent,
)
print("project root:", PROJECT_ROOT)

DATA_DIR = PROJECT_ROOT / "data" / "raw" / "retailrocket"
print("data dir:", DATA_DIR)
if not DATA_DIR.is_dir():
    # Surface a wrong path here rather than as a FileNotFoundError while loading.
    print("WARNING: data dir not found - check where the notebook is launched from")
# Sorted so the listing is stable across runs and file systems.
print("files:", sorted(DATA_DIR.glob("*"))[:10])

In [None]:
# ============================================================================
# 2. DATA LOADING
# ============================================================================

In [None]:
# Load the raw tables from DATA_DIR. The same dtype override (timestamp -> int64)
# is applied to events and to both item-property parts; category_tree is read
# with pandas' default dtype inference.
timestamp_dtype = {'timestamp': 'int64'}

events = pd.read_csv(DATA_DIR / "events.csv", dtype=timestamp_dtype)
category_tree = pd.read_csv(DATA_DIR / "category_tree.csv")
prop1 = pd.read_csv(DATA_DIR / "item_properties_part1.csv", dtype=timestamp_dtype)
prop2 = pd.read_csv(DATA_DIR / "item_properties_part2.csv", dtype=timestamp_dtype)

In [None]:
# Stack the two item-property parts row-wise into one table;
# ignore_index=True discards the per-file indexes and renumbers rows from 0.
item_properties = pd.concat(objs=[prop1, prop2], axis=0, ignore_index=True)

In [67]:
# Report the shape of the merged table next to the shapes of its two parts.
# end="\n\n" emits the trailing blank line in the same call.
print(
    f"✓ Item properties merged: {item_properties.shape} (part1: {prop1.shape}, part2: {prop2.shape})",
    end="\n\n",
)

✓ Item properties merged: (20275902, 4) (part1: (10999999, 4), part2: (9275903, 4))



In [None]:
# ============================================================================
# 3. INITIAL DATA OVERVIEW
# ============================================================================

In [None]:
# Preview every CSV in the raw data folder: file name, size on disk (MB) and
# the first three rows. Entries are sorted so the report order does not depend
# on the order in which the file system returns them.
for csv_path in sorted(DATA_DIR.iterdir()):
    if csv_path.suffix.lower() != ".csv":
        continue  # skip non-CSV entries
    print(csv_path.name, "size(MB)=", round(csv_path.stat().st_size / 1e6, 2))
    # nrows=3 already limits the frame to three rows, so no extra .head() is needed.
    print(pd.read_csv(csv_path, nrows=3).to_string(index=False))
    print("-" * 60)


In [None]:
# Inspecting Events: column names, dtypes and memory usage, missing values per
# column, and summary statistics (transposed so each column is one row).
print("columns:", list(events.columns))
print("\ninfo:")
# DataFrame.info() prints its report itself and returns None, so it is called
# directly instead of being wrapped in print().
events.info()
print("null counts:\n", events.isna().sum())
display(events.describe(include='all').T)


In [None]:
# Inspecting Category Tree: column names, dtypes and memory usage, missing
# values per column, and summary statistics (transposed so each column is one row).
print("columns:", list(category_tree.columns))
print("\ninfo:")
# DataFrame.info() prints its report itself and returns None, so it is called
# directly instead of being wrapped in print().
category_tree.info()
print("null counts:\n", category_tree.isna().sum())
display(category_tree.describe(include='all').T)


In [None]:
# Inspecting Item Properties: column names, dtypes and memory usage, missing
# values per column, and summary statistics (transposed so each column is one row).
print("columns:", list(item_properties.columns))
print("\ninfo:")
# DataFrame.info() prints its report itself and returns None, so it is called
# directly instead of being wrapped in print().
item_properties.info()
print("null counts:\n", item_properties.isna().sum())
display(item_properties.describe(include='all').T)


In [None]:
# Preview the first rows of the events table (head() defaults to five rows).
events.head()

In [None]:
# Show the dtype of the timestamp column followed by its first values,
# one item per line.
print(events['timestamp'].dtype, events['timestamp'].head(), sep="\n")

In [None]:
# Convert events['timestamp'] from milliseconds to pandas datetimes.
# The conversion overwrites the column in place, so it is guarded to keep the
# cell safe to re-run: a second execution must not print already-converted
# values under the "Raw timestamp values" heading.
# NOTE(review): the result is timezone-naive; presumably UTC - confirm.
if pd.api.types.is_datetime64_any_dtype(events['timestamp']):
    print("timestamp column is already datetime - skipping conversion")
else:
    print("Raw timestamp values:")
    print(events['timestamp'].head())

    # Convert from milliseconds to datetime
    events['timestamp'] = pd.to_datetime(events['timestamp'], unit='ms')

# Verify the conversion
print("\nConverted timestamps:")
print(events['timestamp'].head())
print(f"\nDate range: {events['timestamp'].min()} to {events['timestamp'].max()}")

In [None]:
# Earliest and latest event timestamp, one per line.
print(events['timestamp'].min(), events['timestamp'].max(), sep="\n")

In [None]:
# Preview the first rows again to check how the timestamp column now renders.
events.head()

In [None]:
# Column names of the events table used by the summaries below
user_col, item_col, event_col = "visitorid", "itemid", "event"

# unique counts - only for the columns that are actually present in events
print("\nunique counts (sample):")
tracked_cols = [col for col in (user_col, item_col, event_col) if col in events.columns]
for col in tracked_cols:
    print(col, "unique:", events[col].nunique())

# distribution of event types (views, purchases etc.)
event_distribution = events[event_col].value_counts()
print("\nevent counts:\n", event_distribution)

# ten most frequent item ids, then ten most frequent visitor ids, by event count
top_items = events[item_col].value_counts().head(10)
print("\nTop 10 items (by events):\n", top_items)
top_users = events[user_col].value_counts().head(10)
print("\nTop 10 users (by events):\n", top_users)

