In [25]:
import os
from pathlib import Path
import pandas as pd
import numpy as np

In [30]:
# Set up project paths (portable & reusable)
# - Locates PROJECT_ROOT: the nearest folder, starting at the current working
#   directory and walking upwards, that contains a "data" directory
# - Builds a reusable DATA_DIR path to the raw RetailRocket files
# - Prints both paths and a short file listing (for debugging)

def find_project_root(start, marker="data"):
    """Return the closest directory at or above ``start`` that contains ``marker``.

    Parameters
    ----------
    start : str or Path
        Directory where the search begins (normally the working directory).
    marker : str, default "data"
        Name of the sub-directory that identifies the project root.

    Returns
    -------
    Path
        The first of ``start`` and its ancestors that holds a ``marker``
        directory. If none does, ``start.parent`` is returned so the result
        matches the previous "one level up" assumption.
    """
    start = Path(start)
    # Check the starting folder first, then each ancestor from nearest to farthest,
    # so the notebook may live at any depth below the project root.
    for candidate in (start, *start.parents):
        if (candidate / marker).is_dir():
            return candidate
    return start.parent


cwd = Path.cwd()
PROJECT_ROOT = find_project_root(cwd)
print("project root:", PROJECT_ROOT)

DATA_DIR = PROJECT_ROOT / "data" / "raw" / "retailrocket"
print("data dir:", DATA_DIR)
# glob() yields entries in arbitrary order; sort so the listing is reproducible.
print("files:", sorted(DATA_DIR.glob("*"))[:10])

project root: c:\Users\yasmi\projects\dynamic-recommendation-system
data dir: c:\Users\yasmi\projects\dynamic-recommendation-system\data\raw\retailrocket
files: [WindowsPath('c:/Users/yasmi/projects/dynamic-recommendation-system/data/raw/retailrocket/category_tree.csv'), WindowsPath('c:/Users/yasmi/projects/dynamic-recommendation-system/data/raw/retailrocket/events.csv'), WindowsPath('c:/Users/yasmi/projects/dynamic-recommendation-system/data/raw/retailrocket/item_properties_part1.csv'), WindowsPath('c:/Users/yasmi/projects/dynamic-recommendation-system/data/raw/retailrocket/item_properties_part2.csv')]


In [10]:
# Load the raw RetailRocket tables.
# Paths are built from DATA_DIR (defined in the path-setup cell) instead of a
# hard-coded "../data/..." string, so the reads resolve whether the kernel runs
# from the project root or from a sub-folder.
# NOTE: the two item_properties parts are several hundred MB each on disk.
events = pd.read_csv(DATA_DIR / "events.csv")
category_tree = pd.read_csv(DATA_DIR / "category_tree.csv")
item_properties1 = pd.read_csv(DATA_DIR / "item_properties_part1.csv")
item_properties2 = pd.read_csv(DATA_DIR / "item_properties_part2.csv")

In [36]:
# Preview every CSV in DATA_DIR: file name, size on disk, and its first 3 rows.
# iterdir() yields entries in arbitrary order, so sort for a reproducible listing.
for f in sorted(DATA_DIR.iterdir()):
    # Skip anything that is not a regular *.csv file (e.g. sub-directories).
    if not (f.is_file() and f.suffix.lower() == ".csv"):
        continue
    print(f.name, "size(MB)=", round(f.stat().st_size / 1e6, 2))
    # nrows=3 already limits the frame to three rows, so no extra .head() is needed.
    print(pd.read_csv(f, nrows=3).to_string(index=False))
    print("-" * 60)


category_tree.csv size(MB)= 0.01
 categoryid  parentid
       1016       213
        809       169
        570         9
------------------------------------------------------------
events.csv size(MB)= 94.24
    timestamp  visitorid event  itemid  transactionid
1433221332117     257597  view  355908            NaN
1433224214164     992329  view  248676            NaN
1433221999827     111016  view  318965            NaN
------------------------------------------------------------
item_properties_part1.csv size(MB)= 484.32
    timestamp  itemid   property                           value
1435460400000  460429 categoryid                            1338
1441508400000  206783        888         1116713 960601 n277.200
1439089200000  395014        400 n552.000 639502 n720.000 424566
------------------------------------------------------------
item_properties_part2.csv size(MB)= 408.93
    timestamp  itemid  property           value
1433041200000  183478       561          769062
14396940000

In [37]:
# Cell 3 — helper functions to guess key columns
def guess_col(cols, keywords):
    """Return the first column whose name contains one of ``keywords``.

    Matching is a case-insensitive substring test. Keywords are tried in the
    order given, so an earlier keyword wins over a later one even if the later
    one matches a column that appears first in ``cols``.

    Parameters
    ----------
    cols : iterable
        Column labels to search. Non-string labels are compared via ``str()``.
    keywords : iterable of str
        Candidate substrings, highest priority first.

    Returns
    -------
    object or None
        The matching element of ``cols`` (in its original form, not
        lower-cased), or ``None`` when no keyword matches any column.
    """
    # Pair each original label with its lower-cased text once, up front.
    lowered = [(col, str(col).lower()) for col in cols]
    for keyword in keywords:
        # Lower-case the keyword too; otherwise "Visitor" could never match.
        needle = str(keyword).lower()
        for col, name in lowered:
            if needle in name:
                return col
    return None

# Read only a handful of rows from events.csv: enough to inspect the header.
sample = pd.read_csv(DATA_DIR / "events.csv", nrows=5)
cols = list(sample.columns)
print("columns:", cols)

# One keyword list per role (user, item, time, event), in that order;
# within each list the earlier keywords take priority.
user_col, item_col, time_col, event_col = (
    guess_col(cols, candidates)
    for candidates in (
        ["user", "visitor", "visitorid", "userid"],
        ["item", "product", "itemid", "offerid"],
        ["time", "date", "timestamp", "ts"],
        ["event", "event_type", "action", "eventtype"],
    )
)

print(
    "guessed -> user:", user_col,
    "item:", item_col,
    "time:", time_col,
    "event_type:", event_col,
)


columns: ['timestamp', 'visitorid', 'event', 'itemid', 'transactionid']
guessed -> user: visitorid item: itemid time: timestamp event_type: event


In [14]:
# Peek at the first rows of the part-1 item properties table.
item_properties1.head()

Unnamed: 0,timestamp,itemid,property,value
0,1435460400000,460429,categoryid,1338
1,1441508400000,206783,888,1116713 960601 n277.200
2,1439089200000,395014,400,n552.000 639502 n720.000 424566
3,1431226800000,59481,790,n15360.000
4,1431831600000,156781,917,828513


In [15]:
# Peek at the first rows of the part-2 item properties table.
item_properties2.head()

Unnamed: 0,timestamp,itemid,property,value
0,1433041200000,183478,561,769062
1,1439694000000,132256,976,n26.400 1135780
2,1435460400000,420307,921,1149317 1257525
3,1431831600000,403324,917,1204143
4,1435460400000,230701,521,769062
