In [1]:
# ===== Cell 1: Import and Load Raw Data =====
from pathlib import Path

import numpy as np
import pandas as pd

# The three order timestamps are parsed as datetimes at load time because the
# delay features are later computed by subtracting them from one another.
_order_date_columns = [
    'order_purchase_timestamp',
    'order_delivered_customer_date',
    'order_estimated_delivery_date',
]
orders = pd.read_csv(
    "../data/olist_orders_dataset.csv",
    parse_dates=_order_date_columns,
)

# The remaining tables are read with pandas' default type inference.
items = pd.read_csv("../data/olist_order_items_dataset.csv")
products = pd.read_csv("../data/olist_products_dataset.csv")
payments = pd.read_csv("../data/olist_order_payments_dataset.csv")
reviews = pd.read_csv("../data/olist_order_reviews_dataset.csv")

In [2]:
# ===== Cell 2: Clean and Create Delay Features =====
# Largest lateness (in whole days) that is still recorded as a delay.
MAX_DELAY_DAYS = 30

# Keep only orders where all three timestamps are present: every feature
# below is a difference between, or an attribute of, these columns.
orders = orders.dropna(subset=[
    'order_delivered_customer_date',
    'order_estimated_delivery_date',
    'order_purchase_timestamp'
])

# Whole days between the actual and the estimated delivery. `.dt.days` takes
# the (floored) day component, so a delivery less than 24 h late yields 0 and
# an early delivery yields a negative number.
days_past_estimate = (
    orders['order_delivered_customer_date'] - orders['order_estimated_delivery_date']
).dt.days

# Vectorised mask instead of a per-row Python lambda: same result, no Python
# call per row.
is_recorded_delay = (days_past_estimate > 0) & (days_past_estimate <= MAX_DELAY_DAYS)

orders = orders.assign(
    # Lateness in days when it lies in 1..MAX_DELAY_DAYS, otherwise 0.
    # NOTE(review): delays longer than MAX_DELAY_DAYS are zeroed as well, so
    # they become indistinguishable from on-time orders -- confirm that this
    # is the intended outlier handling.
    delay_days=days_past_estimate.where(is_recorded_delay, 0),
    # Whole days from purchase to delivery at the customer.
    shipping_days=(
        orders['order_delivered_customer_date'] - orders['order_purchase_timestamp']
    ).dt.days,
    # Calendar month (1-12) of the purchase.
    month=orders['order_purchase_timestamp'].dt.month,
)


In [3]:
# ===== Cell 3: Merge Datasets =====
# Collapse the order-level tables to exactly one row per order_id before
# joining them onto the item rows.
# NOTE(review): this assumes payments and reviews can contain several rows for
# the same order_id (e.g. split payments, repeated reviews) -- confirm against
# the data. Joining such tables raw multiplies every item row once per payment
# and once per review, which inflates any later sum, count or mean.
order_payment_totals = (
    payments.groupby('order_id', as_index=False)['payment_value'].sum()
)
order_review_scores = (
    reviews.groupby('order_id', as_index=False)['review_score'].mean()
)

# Intended grain of the result: one row per order item (an order without
# items keeps a single row with empty item columns because of the left join).
# `payment_value` is the order's total payment and `review_score` the order's
# mean score; both repeat on every item row of the same order, so aggregate
# them per order_id, not per row.
# validate='many_to_one' makes pandas raise if the right-hand table ever has
# duplicate keys instead of silently duplicating rows.
order_full = (
    orders
    .merge(items, on='order_id', how='left')
    .merge(products[['product_id', 'product_category_name']], on='product_id', how='left')
    .merge(order_payment_totals, on='order_id', how='left', validate='many_to_one')
    .merge(order_review_scores, on='order_id', how='left', validate='many_to_one')
)


In [5]:
# ===== Cell 4: Save Cleaned Data =====
# Create the output folder first so the save also works when ../CleanDeta
# does not exist yet; DataFrame.to_csv does not create missing directories.
cleaned_csv_path = Path("../CleanDeta/order_full_cleaned.csv")
cleaned_csv_path.parent.mkdir(parents=True, exist_ok=True)

# index=False: the row index is not written as a column in the CSV.
order_full.to_csv(cleaned_csv_path, index=False)
print(" Cleaned dataset saved to CleanDeta/order_full_cleaned.csv")

 Cleaned dataset saved to CleanDeta/order_full_cleaned.csv
