In this project, we will start by defining the paths correctly.

In [1]:
import pandas as pd
import os

# Data directories, expressed relative to the current working directory
RAW_PATH = os.path.join("..", "data", "raw")
PROCESSED_PATH = os.path.join("..", "data", "processed")

# Report up front whether the raw-data folder is reachable, so a wrong
# working directory is noticed before any file is read
if os.path.exists(RAW_PATH):
    print(f"Data path confirmed: {RAW_PATH}")
else:
    print(f"WARNING: Data path not found at {RAW_PATH}")
    print("Please ensure your CSV files are in the 'data/raw' folder!")

Data path confirmed: ..\data\raw


In this section, we load the three key tables (orders, order items, and reviews) and keep only orders whose status is 'delivered', so that canceled, unavailable, or otherwise incomplete orders do not end up in the data we train on.

In [2]:
# 1. Load the Core Tables
# Reads the orders, order-items and reviews CSVs from RAW_PATH.
print("Loading datasets...")
try:
    orders = pd.read_csv(os.path.join(RAW_PATH, "olist_orders_dataset.csv"))
    items = pd.read_csv(os.path.join(RAW_PATH, "olist_order_items_dataset.csv"))
    reviews = pd.read_csv(os.path.join(RAW_PATH, "olist_order_reviews_dataset.csv"))
except FileNotFoundError as e:
    # Report which file is missing, then stop the cell. Continuing would
    # either fail below with an unrelated NameError (fresh kernel) or
    # silently reuse frames left over from an earlier run (warm kernel).
    print(f"Error loading files: {e}")
    raise
else:
    print("Datasets loaded successfully.")

# 2. Pre-processing Dates
# Convert the string date to a real datetime object
orders['order_purchase_timestamp'] = pd.to_datetime(orders['order_purchase_timestamp'])

# 3. Filter for Valid Demand (Business Logic)
# We only want to train our model on completed sales, so keep only rows whose
# status is exactly 'delivered'. .copy() gives an independent frame, so later
# column assignments do not trigger SettingWithCopyWarning.
filtered_orders = orders[orders['order_status'] == 'delivered'].copy()

print(f"Orders Loaded: {len(orders)}")
print(f"Orders after filtering for 'Delivered': {len(filtered_orders)}")

Loading datasets...
Datasets loaded successfully.
Orders Loaded: 99441
Orders after filtering for 'Delivered': 96478
