In [2]:
import os
from pathlib import Path
import pandas as pd
import numpy as np

In [3]:
# Setup project paths (portable & reusable)
# - Prints current working directory (for debugging)
# - Defines PROJECT_ROOT as the main project folder
# - Builds a reusable DATA_DIR path to raw data

cwd = Path(os.getcwd())
if (cwd / "data").exists():
    PROJECT_ROOT = cwd
else:
    PROJECT_ROOT = cwd.parent  
print("project root:", PROJECT_ROOT)

DATA_DIR = PROJECT_ROOT / "data" / "raw" / "retailrocket"
print("data dir:", DATA_DIR)
print("files:", list(DATA_DIR.glob("*"))[:10])  

project root: c:\Users\yasmi\projects\dynamic-recommendation-system
data dir: c:\Users\yasmi\projects\dynamic-recommendation-system\data\raw\retailrocket
files: [WindowsPath('c:/Users/yasmi/projects/dynamic-recommendation-system/data/raw/retailrocket/category_tree.csv'), WindowsPath('c:/Users/yasmi/projects/dynamic-recommendation-system/data/raw/retailrocket/events.csv'), WindowsPath('c:/Users/yasmi/projects/dynamic-recommendation-system/data/raw/retailrocket/item_properties_part1.csv'), WindowsPath('c:/Users/yasmi/projects/dynamic-recommendation-system/data/raw/retailrocket/item_properties_part2.csv')]


In [None]:
events = pd.read_csv(DATA_DIR / "events.csv")
category_tree = pd.read_csv(DATA_DIR / "category_tree.csv")
prop1 = pd.read_csv(DATA_DIR / "item_properties_part1.csv")
prop2 = pd.read_csv(DATA_DIR / "item_properties_part2.csv")

In [None]:
# Merging properties
item_properties = pd.concat([prop1, prop2], ignore_index=True)

In [None]:
print("Part1 shape:", prop1.shape)
print("Part2 shape:", prop2.shape)
print("Merged shape:", item_properties.shape)

Part1 shape: (10999999, 4)
Part2 shape: (9275903, 4)
Merged shape: (20275902, 4)


In [5]:
for f in DATA_DIR.iterdir():
    if f.suffix.lower() in [".csv"]:
        print(f.name, "size(MB)=", round(f.stat().st_size/1e6,2))
        print(pd.read_csv(f, nrows=3).head(3).to_string(index=False))
        print("-"*60)


category_tree.csv size(MB)= 0.01
 categoryid  parentid
       1016       213
        809       169
        570         9
------------------------------------------------------------
events.csv size(MB)= 94.24
    timestamp  visitorid event  itemid  transactionid
1433221332117     257597  view  355908            NaN
1433224214164     992329  view  248676            NaN
1433221999827     111016  view  318965            NaN
------------------------------------------------------------
item_properties_part1.csv size(MB)= 484.32
    timestamp  itemid   property                           value
1435460400000  460429 categoryid                            1338
1441508400000  206783        888         1116713 960601 n277.200
1439089200000  395014        400 n552.000 639502 n720.000 424566
------------------------------------------------------------
item_properties_part2.csv size(MB)= 408.93
    timestamp  itemid  property           value
1433041200000  183478       561          769062
14396940000

In [None]:
# Inspecting Events 
print("columns:", list(events.columns))
print("\ninfo:")
print("null counts:\n", events.isna().sum())
display(events.describe(include='all').T)


In [12]:
# Inspecting Category Tree
print("columns:", list(category_tree.columns))
print("\ninfo:")
print("null counts:\n", category_tree.isna().sum())
display(category_tree.describe(include='all').T)


columns: ['categoryid', 'parentid']

info:
null counts:
 categoryid     0
parentid      25
dtype: int64


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
categoryid,1669.0,849.285201,490.195116,0.0,427.0,848.0,1273.0,1698.0
parentid,1644.0,847.571168,505.058485,8.0,381.0,866.0,1291.0,1698.0


In [18]:
# Inspecting Item Properties 
print("columns:", list(item_properties.columns))
print("\ninfo:")
print("null counts:\n", item_properties.isna().sum())
display(item_properties.describe(include='all').T)


columns: ['timestamp', 'itemid', 'property', 'value']

info:
null counts:
 timestamp    0
itemid       0
property     0
value        0
dtype: int64


Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
timestamp,20275902.0,,,,1435156943682.8833,3327797780.567144,1431226800000.0,1432436400000.0,1433646000000.0,1437879600000.0,1442113200000.0
itemid,20275902.0,,,,233390.432525,134845.230668,0.0,116516.0,233483.0,350304.0,466866.0
property,20275902.0,1104.0,888.0,3000398.0,,,,,,,
value,20275902.0,1966868.0,769062.0,1537247.0,,,,,,,


In [19]:
events.head()

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,1433221332117,257597,view,355908,
1,1433224214164,992329,view,248676,
2,1433221999827,111016,view,318965,
3,1433221955914,483717,view,253185,
4,1433221337106,951259,view,367447,


In [33]:
# Convert timestamp 
events["timestamp"] = pd.to_datetime(events["timestamp"], unit = 'ms')
print(events["timestamp"].head())
print(events["timestamp"].min(), events["timestamp"].max())

0   1970-01-01 00:23:53.221332117
1   1970-01-01 00:23:53.224214164
2   1970-01-01 00:23:53.221999827
3   1970-01-01 00:23:53.221955914
4   1970-01-01 00:23:53.221337106
Name: timestamp, dtype: datetime64[ns]
1970-01-01 00:23:50.622004384 1970-01-01 00:24:02.545187788
