In [96]:
# ============================================================================
# RETAILROCKET DATASET - EXPLORATORY DATA ANALYSIS
# ============================================================================
# Dataset: E-commerce user behavior (views, add-to-cart, purchases)
# Goal: Understand user patterns, item popularity, and conversion funnels
# ============================================================================

In [97]:
import os
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

In [98]:
# Notebook-wide display defaults.
# Plotting: matplotlib style sheet plus seaborn's default colour palette.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# pandas: lift the column-count limit and the fixed display width so wide
# frames are not elided when printed.
pd.options.display.max_columns = None
pd.options.display.width = None

In [99]:
# ============================================================================
# 1. PROJECT SETUP
# ============================================================================

In [100]:
# Setup project paths (portable & reusable)
# - Defines PROJECT_ROOT as the nearest directory, starting at the current
#   working directory and walking upwards, that contains a "data" folder.
#   This lets the notebook run from the project root or from any
#   sub-folder depth (e.g. notebooks/ or notebooks/eda/).
# - Falls back to the parent of the working directory when no ancestor has
#   a "data" folder (the previous behaviour for a one-level-deep notebook).
# - Builds a reusable DATA_DIR path to raw data and previews its contents.

cwd = Path(os.getcwd())
PROJECT_ROOT = next(
    (
        candidate
        for candidate in (cwd, *cwd.parents)
        if (candidate / "data").exists()
    ),
    cwd.parent,
)
print("project root:", PROJECT_ROOT)

DATA_DIR = PROJECT_ROOT / "data" / "raw" / "retailrocket"
print("data dir:", DATA_DIR)
# Preview at most 10 entries; an empty list means DATA_DIR is missing or empty.
print("files:", list(DATA_DIR.glob("*"))[:10])

project root: c:\Users\yasmi\projects\dynamic-recommendation-system
data dir: c:\Users\yasmi\projects\dynamic-recommendation-system\data\raw\retailrocket
files: [WindowsPath('c:/Users/yasmi/projects/dynamic-recommendation-system/data/raw/retailrocket/category_tree.csv'), WindowsPath('c:/Users/yasmi/projects/dynamic-recommendation-system/data/raw/retailrocket/events.csv'), WindowsPath('c:/Users/yasmi/projects/dynamic-recommendation-system/data/raw/retailrocket/item_properties_part1.csv'), WindowsPath('c:/Users/yasmi/projects/dynamic-recommendation-system/data/raw/retailrocket/item_properties_part2.csv')]


In [101]:
# ============================================================================
# 2. DATA LOADING
# ============================================================================

In [102]:
# Load the raw CSV files from DATA_DIR.
# The timestamp column is pinned to int64 for every file that has one, so
# it is read as exact integers rather than an inferred dtype.
timestamp_dtype = {'timestamp': 'int64'}

events = pd.read_csv(DATA_DIR / "events.csv", dtype=timestamp_dtype)
category_tree = pd.read_csv(DATA_DIR / "category_tree.csv")
prop1, prop2 = (
    pd.read_csv(DATA_DIR / part_file, dtype=timestamp_dtype)
    for part_file in ("item_properties_part1.csv", "item_properties_part2.csv")
)

In [103]:
# Stack the two item-property parts row-wise into a single frame;
# ignore_index=True discards the per-part indexes and renumbers the rows.
item_property_parts = [prop1, prop2]
item_properties = pd.concat(item_property_parts, ignore_index=True)

In [104]:
# Report the merged shape next to the shapes of the two source parts.
merged_shape, part1_shape, part2_shape = (
    item_properties.shape,
    prop1.shape,
    prop2.shape,
)
print(
    "Item properties merged: {} (part1: {}, part2: {})".format(
        merged_shape, part1_shape, part2_shape
    )
)


Item properties merged: (20275902, 4) (part1: (10999999, 4), part2: (9275903, 4))


In [105]:
# ============================================================================
# 3. INITIAL DATA OVERVIEW
# ============================================================================

In [106]:
# Registry of the loaded frames, keyed by a display name.
datasets = dict(
    events=events,
    category_tree=category_tree,
    item_properties=item_properties,
)


def _print_overview(name, df):
    """Print a plain-text overview of one DataFrame.

    Shows the upper-cased name, shape, column list, deep memory usage in
    MB (bytes / 1e6), the first three rows without the index, and the
    per-column null counts.
    """
    memory_mb = df.memory_usage(deep=True).sum() / 1e6
    report_lines = [
        f"\n{name.upper()}",
        f"Shape: {df.shape}",
        f"Columns: {list(df.columns)}",
        f"Memory: {memory_mb:.2f} MB",
        "\nFirst 3 rows:",
        df.head(3).to_string(index=False),
        f"Nulls:\n{df.isna().sum()}",
    ]
    print("\n".join(report_lines))


for name, df in datasets.items():
    _print_overview(name, df)


EVENTS
Shape: (2756101, 5)
Columns: ['timestamp', 'visitorid', 'event', 'itemid', 'transactionid']
Memory: 256.82 MB

First 3 rows:
    timestamp  visitorid event  itemid  transactionid
1433221332117     257597  view  355908            NaN
1433224214164     992329  view  248676            NaN
1433221999827     111016  view  318965            NaN
Nulls:
timestamp              0
visitorid              0
event                  0
itemid                 0
transactionid    2733644
dtype: int64

CATEGORY_TREE
Shape: (1669, 2)
Columns: ['categoryid', 'parentid']
Memory: 0.03 MB

First 3 rows:
 categoryid  parentid
       1016     213.0
        809     169.0
        570       9.0
Nulls:
categoryid     0
parentid      25
dtype: int64

ITEM_PROPERTIES
Shape: (20275902, 4)
Columns: ['timestamp', 'itemid', 'property', 'value']
Memory: 3067.60 MB

First 3 rows:
    timestamp  itemid   property                           value
1435460400000  460429 categoryid                            1338
144150840

In [107]:
# ============================================================================
# 4. EVENTS - PREPROCESSING & ANALYSIS
# ============================================================================

In [108]:
# Parse the millisecond timestamps into datetimes, then derive calendar
# features from them (new columns are added to `events` in this order).
events['timestamp'] = pd.to_datetime(events['timestamp'], unit='ms')

timestamp_parts = events['timestamp'].dt
events['date'] = timestamp_parts.date
events['hour'] = timestamp_parts.hour
events['dayofweek'] = timestamp_parts.dayofweek
events['day_name'] = timestamp_parts.day_name()

In [109]:
# Headline numbers for the events log: volume, distinct visitors/items,
# and the covered time window.
first_event = events['timestamp'].min()
last_event = events['timestamp'].max()

summary_lines = [
    "\nBasic Statistics:",
    f"  Total events: {len(events):,}",
    f"  Unique visitors: {events['visitorid'].nunique():,}",
    f"  Unique items: {events['itemid'].nunique():,}",
    f"  Date range: {first_event} to {last_event}",
    f"  Days span: {(last_event - first_event).days} days",
]
for summary_line in summary_lines:
    print(summary_line)


Basic Statistics:
  Total events: 2,756,101
  Unique visitors: 1,407,580
  Unique items: 235,061
  Date range: 2015-05-03 03:00:04.384000 to 2015-09-18 02:59:47.788000
  Days span: 137 days


In [110]:
# Count events per type and show each type's share of all events.
print("\nEvent Type Distribution:")
event_dist = events['event'].value_counts()
total_events = len(events)
for event_type, count in event_dist.items():
    print(f"  {event_type}: {count:,} ({count / total_events * 100:.2f}%)")


Event Type Distribution:
  view: 2,664,312 (96.67%)
  addtocart: 69,332 (2.52%)
  transaction: 22,457 (0.81%)


In [None]:
# Conversion metrics.
# These are ratios of raw event counts per type taken from `event_dist`;
# they are not computed per visitor or per session.
total_views = event_dist.get('view', 0)
total_addtocart = event_dist.get('addtocart', 0)
total_transactions = event_dist.get('transaction', 0)


def _format_rate(numerator, denominator):
    """Return numerator/denominator as a 'xx.xx%' string.

    Returns "N/A" when the denominator is not positive, so the caller can
    always print the metric label next to the value.
    """
    if denominator > 0:
        return f"{numerator / denominator * 100:.2f}%"
    return "N/A"


print("\nConversion Funnel:")
# The label is always printed; only the value falls back to "N/A".
print(f"  Views → AddToCart: {_format_rate(total_addtocart, total_views)}")
print(f"  Views → Purchase: {_format_rate(total_transactions, total_views)}")
print(f"  AddToCart → Purchase: {_format_rate(total_transactions, total_addtocart)}")


📊 Conversion Funnel:
  Views → AddToCart: 2.60%
  Views → Purchase: 0.84%
  AddToCart → Purchase: 32.39%


In [112]:
# Sanity check on the timestamp column.
# The preprocessing cell earlier in this notebook already converts
# `timestamp` from epoch milliseconds, so the values printed first are
# normally datetimes rather than raw integers. The conversion below runs
# only if the column is not yet a datetime dtype (e.g. this cell was
# executed against a freshly loaded frame).
print("Timestamp values before check:")
print(events['timestamp'].head())

if not pd.api.types.is_datetime64_any_dtype(events['timestamp']):
    # Convert from milliseconds to datetime
    events['timestamp'] = pd.to_datetime(events['timestamp'], unit='ms')

# Verify the conversion
print("\nConverted timestamps:")
print(events['timestamp'].head())
print(f"\nDate range: {events['timestamp'].min()} to {events['timestamp'].max()}")

Raw timestamp values:
0   2015-06-02 05:02:12.117
1   2015-06-02 05:50:14.164
2   2015-06-02 05:13:19.827
3   2015-06-02 05:12:35.914
4   2015-06-02 05:02:17.106
Name: timestamp, dtype: datetime64[ns]

Converted timestamps:
0   2015-06-02 05:02:12.117
1   2015-06-02 05:50:14.164
2   2015-06-02 05:13:19.827
3   2015-06-02 05:12:35.914
4   2015-06-02 05:02:17.106
Name: timestamp, dtype: datetime64[ns]

Date range: 2015-05-03 03:00:04.384000 to 2015-09-18 02:59:47.788000


In [113]:
# Earliest and latest event timestamps, one per line.
print(events['timestamp'].min(), events['timestamp'].max(), sep="\n")

2015-05-03 03:00:04.384000
2015-09-18 02:59:47.788000


In [114]:
# Preview the first five rows, including the derived date/time columns.
events.head(n=5)

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid,date,hour,dayofweek,day_name
0,2015-06-02 05:02:12.117,257597,view,355908,,2015-06-02,5,1,Tuesday
1,2015-06-02 05:50:14.164,992329,view,248676,,2015-06-02,5,1,Tuesday
2,2015-06-02 05:13:19.827,111016,view,318965,,2015-06-02,5,1,Tuesday
3,2015-06-02 05:12:35.914,483717,view,253185,,2015-06-02,5,1,Tuesday
4,2015-06-02 05:02:17.106,951259,view,367447,,2015-06-02,5,1,Tuesday


In [115]:
# Column names used for the entity-level summaries below.
item_col = "itemid"
user_col = "visitorid"
event_col = "event"

# Distinct-value count for each key column that is present in `events`.
print("\nunique counts (sample):")
present_cols = [col for col in (user_col, item_col, event_col) if col in events.columns]
for col in present_cols:
    print(f"{col} unique: {events[col].nunique()}")

# Distribution of event types (views, purchases etc.)
print("\nevent counts:\n", events[event_col].value_counts())

# Most frequent items and visitors, ranked by number of events.
top_n = 10
for label, col in (("items", item_col), ("users", user_col)):
    print(f"\nTop {top_n} {label} (by events):\n", events[col].value_counts().head(top_n))




unique counts (sample):
visitorid unique: 1407580
itemid unique: 235061
event unique: 3

event counts:
 event
view           2664312
addtocart        69332
transaction      22457
Name: count, dtype: int64

Top 10 items (by events):
 itemid
187946    3412
461686    2978
5411      2334
370653    1854
219512    1800
257040    1647
298009    1642
96924     1633
309778    1628
384302    1608
Name: count, dtype: int64

Top 10 users (by events):
 visitorid
1150086    7757
530559     4328
152963     3024
895999     2474
163561     2410
371606     2345
286616     2252
684514     2246
892013     2024
861299     1991
Name: count, dtype: int64
