In [1]:
import os 
import pandas as pd
import numpy as np
from math import radians, cos, sin, asin, sqrt

# Directory layout used by this notebook:
#   PROCESSED_DIR - parquet tables read as inputs by the loading cell
#   FEATURES_DIR  - destination for the engineered feature table
PROCESSED_DIR = "./data/processed"
FEATURES_DIR = "./data/features"

# Create the output directory up front; exist_ok makes re-running this cell safe.
os.makedirs(FEATURES_DIR, exist_ok=True)

# Echo the configuration so the rendered notebook records which paths were used.
for _label, _directory in (
    ("PROCESSED_DIR =", PROCESSED_DIR),
    ("FEATURES_DIR  =", FEATURES_DIR),
):
    print(_label, _directory)

PROCESSED_DIR = ./data/processed
FEATURES_DIR  = ./data/features


In [3]:
def _read_processed(table_name):
    """Read `<PROCESSED_DIR>/<table_name>.parquet` into a DataFrame."""
    return pd.read_parquet(f"{PROCESSED_DIR}/{table_name}.parquet")


# One DataFrame per processed parquet file; the variable names are used by the cells below.
orders = _read_processed("orders")
customers = _read_processed("customers")
items = _read_processed("order_items")
payments = _read_processed("order_payments")
reviews = _read_processed("order_reviews")
products = _read_processed("products")
sellers = _read_processed("sellers")
geo = _read_processed("geolocation")
catmap = _read_processed("category_translation")

# Lightweight confirmation that the load succeeded, plus the size of the base table.
print("Data loaded successfully.")
print("Orders shape:", orders.shape)

Data loaded successfully.
Orders shape: (99441, 11)


In [5]:
# Derive calendar features from the purchase timestamp.
# The .dt accessor requires the column to already be datetime-typed.
purchase_dt = orders["order_purchase_timestamp"].dt

orders["purchase_year"] = purchase_dt.year
orders["purchase_month"] = purchase_dt.month
orders["purchase_dow"] = purchase_dt.dayofweek
orders["purchase_hour"] = purchase_dt.hour

# Preview only the newly added columns.
time_feature_cols = ["purchase_year", "purchase_month", "purchase_dow", "purchase_hour"]
orders[time_feature_cols].head()


Unnamed: 0,purchase_year,purchase_month,purchase_dow,purchase_hour
0,2017,10,0,10
1,2018,7,1,20
2,2018,8,2,8
3,2017,11,5,19
4,2018,2,1,21


In [7]:
# Collapse the item-level table to one row per order.
item_aggregations = {
    "n_items": pd.NamedAgg(column="order_item_id", aggfunc="count"),      # item rows in the order
    "n_sellers": pd.NamedAgg(column="seller_id", aggfunc="nunique"),      # distinct sellers
    "n_products": pd.NamedAgg(column="product_id", aggfunc="nunique"),    # distinct products
    "price_sum": pd.NamedAgg(column="price", aggfunc="sum"),              # summed item prices
    "freight_sum": pd.NamedAgg(column="freight_value", aggfunc="sum"),    # summed freight values
    "avg_price": pd.NamedAgg(column="price", aggfunc="mean"),             # mean item price
}

# as_index=False keeps order_id as a regular column, ready for merging.
order_item_agg = items.groupby("order_id", as_index=False).agg(**item_aggregations)

order_item_agg.head()


Unnamed: 0,order_id,n_items,n_sellers,n_products,price_sum,freight_sum,avg_price
0,00010242fe8c5a6d1ba2dd792cb16214,1,1,1,58.9,13.29,58.9
1,00018f77f2f0320c557190d7a144bdd3,1,1,1,239.9,19.93,239.9
2,000229ec398224ef6ca0657da4fc703e,1,1,1,199.0,17.87,199.0
3,00024acbcdf0a6daa1e931b038114c75,1,1,1,12.99,12.79,12.99
4,00042b26cf59d7ce69dfabb4e55b4fd9,1,1,1,199.9,18.14,199.9


In [9]:
# Collapse the payment rows to one row per order.
payments_by_order = payments.groupby("order_id")

order_pay_agg = payments_by_order.agg(
    # summed payment_value across the order's payment rows
    pay_total=pd.NamedAgg(column="payment_value", aggfunc="sum"),
    # number of distinct payment_type values used
    pay_types=pd.NamedAgg(column="payment_type", aggfunc="nunique"),
    # largest payment_installments value among the rows
    installments_max=pd.NamedAgg(column="payment_installments", aggfunc="max"),
)
order_pay_agg = order_pay_agg.reset_index()

order_pay_agg.head()


Unnamed: 0,order_id,pay_total,pay_types,installments_max
0,00010242fe8c5a6d1ba2dd792cb16214,72.19,1,2
1,00018f77f2f0320c557190d7a144bdd3,259.83,1,3
2,000229ec398224ef6ca0657da4fc703e,216.87,1,5
3,00024acbcdf0a6daa1e931b038114c75,25.78,1,2
4,00042b26cf59d7ce69dfabb4e55b4fd9,218.04,1,3


In [11]:
def _most_common_category(categories):
    """Return the most frequent non-null value of `categories`, or "unknown".

    `Series.mode()` ignores NaN by default, so a group whose categories are
    all missing yields an empty result and falls back to "unknown". When
    several values tie, the first one returned by `mode()` is used.
    """
    modes = categories.mode()  # computed once per group (previously evaluated twice)
    return modes.iloc[0] if not modes.empty else "unknown"


# Attach the English category name to each product (left join keeps products
# whose category has no translation; their English name is NaN).
products_full = products.merge(catmap, how="left", on="product_category_name")

# Aggregate product-level attributes to one row per order:
# mean physical dimensions plus the dominant English category.
product_agg = (
    items
    .merge(products_full, on="product_id", how="left")
    .groupby("order_id")
    .agg(
        avg_product_weight=("product_weight_g", "mean"),
        avg_product_length=("product_length_cm", "mean"),
        avg_product_height=("product_height_cm", "mean"),
        avg_product_width=("product_width_cm", "mean"),
        main_category=("product_category_name_english", _most_common_category),
    )
    .reset_index()
)

product_agg.head()


Unnamed: 0,order_id,avg_product_weight,avg_product_length,avg_product_height,avg_product_width,main_category
0,00010242fe8c5a6d1ba2dd792cb16214,650.0,28.0,9.0,14.0,cool_stuff
1,00018f77f2f0320c557190d7a144bdd3,30000.0,50.0,30.0,40.0,pet_shop
2,000229ec398224ef6ca0657da4fc703e,3050.0,33.0,13.0,33.0,furniture_decor
3,00024acbcdf0a6daa1e931b038114c75,200.0,16.0,10.0,15.0,perfumery
4,00042b26cf59d7ce69dfabb4e55b4fd9,3750.0,35.0,40.0,30.0,garden_tools


In [12]:
def haversine_km(lat1, lon1, lat2, lon2):
    """Great-circle distance in kilometres between two points given in degrees.

    Uses the haversine formula with a sphere of radius 6371 km. The inputs are
    passed through NumPy ufuncs, so scalars, arrays and pandas Series all work
    and are combined element-wise; NaN inputs produce NaN outputs.

    Parameters
    ----------
    lat1, lon1 : latitude / longitude of the first point(s), in degrees.
    lat2, lon2 : latitude / longitude of the second point(s), in degrees.

    Returns
    -------
    Distance(s) in kilometres, with the same shape as the broadcast inputs.
    """
    # Convert degrees -> radians
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make arcsin return NaN; clamp it.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    return 6371 * c  # distance in km

# 1) Represent every ZIP prefix by the mean of its latitude/longitude rows
geo_avg = (
    geo
    .groupby("geolocation_zip_code_prefix", as_index=False)
    .agg(lat=("geolocation_lat", "mean"), lon=("geolocation_lng", "mean"))
)

# 2) Look up coordinates for customers and sellers by ZIP prefix.
#    Left joins keep rows whose prefix has no geolocation entry (coords become NaN).
customers_geo = (
    customers
    .merge(
        geo_avg,
        how="left",
        left_on="customer_zip_code_prefix",
        right_on="geolocation_zip_code_prefix",
    )
    .drop(columns=["geolocation_zip_code_prefix"])
    .rename(columns={"lat": "customer_lat", "lon": "customer_lon"})
)

sellers_geo = (
    sellers
    .merge(
        geo_avg,
        how="left",
        left_on="seller_zip_code_prefix",
        right_on="geolocation_zip_code_prefix",
    )
    .drop(columns=["geolocation_zip_code_prefix"])
    .rename(columns={"lat": "seller_lat", "lon": "seller_lon"})
)

# 3) The items table has no customer_id column, so bring it in from orders
items_with_cust = items.merge(
    orders.loc[:, ["order_id", "customer_id"]],
    how="left",
    on="order_id",
)

# 4) Attach customer coordinates, then seller coordinates, to every item row
items_geo = items_with_cust.merge(
    customers_geo[["customer_id", "customer_lat", "customer_lon"]],
    how="left",
    on="customer_id",
)
items_geo = items_geo.merge(
    sellers_geo[["seller_id", "seller_lat", "seller_lon"]],
    how="left",
    on="seller_id",
)

# 5) Vectorized customer-to-seller distance for each item row
items_geo["distance_seller_customer_km"] = haversine_km(
    lat1=items_geo["customer_lat"],
    lon1=items_geo["customer_lon"],
    lat2=items_geo["seller_lat"],
    lon2=items_geo["seller_lon"],
)

# 6) Mean distance over the items of each order
distance_agg = (
    items_geo
    .groupby("order_id")["distance_seller_customer_km"]
    .mean()
    .rename("avg_distance_km")
    .reset_index()
)

# Preview results
distance_agg.head(3)

Unnamed: 0,order_id,avg_distance_km
0,00010242fe8c5a6d1ba2dd792cb16214,301.504681
1,00018f77f2f0320c557190d7a144bdd3,585.563937
2,000229ec398224ef6ca0657da4fc703e,312.343511


In [13]:
# Merge everything into one "features" table with one row per order.
# validate="many_to_one" makes pandas raise MergeError if a right-hand table
# has duplicate keys, instead of silently multiplying order rows.
features = (
    orders
    .merge(order_item_agg, on="order_id", how="left", validate="many_to_one")
    .merge(order_pay_agg, on="order_id", how="left", validate="many_to_one")
    .merge(product_agg, on="order_id", how="left", validate="many_to_one")
    # use the corrected per-order distance aggregation
    .merge(distance_agg, on="order_id", how="left", validate="many_to_one")
    .merge(customers, on="customer_id", how="left", validate="many_to_one")
)

# Left joins against unique keys must preserve the number of order rows.
assert len(features) == len(orders), "feature merge changed the number of order rows"

print("Final features shape:", features.shape)
features.head()

Final features shape: (99441, 34)


Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,actual_delivery_days,delivery_delay_days,...,avg_product_weight,avg_product_length,avg_product_height,avg_product_width,main_category,avg_distance_km,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18,8.0,-8.0,...,500.0,19.0,8.0,13.0,housewares,18.57611,7c396fd4830fd04220f754e42b4e5bff,3149,sao paulo,SP
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13,13.0,-6.0,...,400.0,19.0,13.0,19.0,perfumery,851.495069,af07308b275d755c9edb36a90c618231,47813,barreiras,BA
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04,9.0,-18.0,...,420.0,24.0,19.0,21.0,auto,514.410666,3a653a41f6f9fc3d2a113cf8398680e8,75265,vianopolis,GO
3,949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,delivered,2017-11-18 19:28:06,2017-11-18 19:45:59,2017-11-22 13:39:59,2017-12-02 00:28:42,2017-12-15,13.0,-13.0,...,450.0,30.0,10.0,20.0,pet_shop,1822.226336,7c142cf63193a1473d2e66489a9ae977,59296,sao goncalo do amarante,RN
4,ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,delivered,2018-02-13 21:18:39,2018-02-13 22:20:29,2018-02-14 19:46:34,2018-02-16 18:17:02,2018-02-26,2.0,-10.0,...,250.0,51.0,15.0,15.0,stationery,29.676625,72632f0f9dd73dfee390c9b22eb56dd6,9195,santo andre,SP


In [14]:
# Persist the engineered feature table (the DataFrame index is not written).
features_path = FEATURES_DIR + "/features.parquet"
features.to_parquet(features_path, index=False)

print("Feature dataset saved to", FEATURES_DIR)

# Round-trip check: read the file back and show its shape and first rows.
features_check = pd.read_parquet(features_path)
print("Reloaded features shape:", features_check.shape)
features_check.head()


Feature dataset saved to ./data/features
Reloaded features shape: (99441, 34)


Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,actual_delivery_days,delivery_delay_days,...,avg_product_weight,avg_product_length,avg_product_height,avg_product_width,main_category,avg_distance_km,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18,8.0,-8.0,...,500.0,19.0,8.0,13.0,housewares,18.57611,7c396fd4830fd04220f754e42b4e5bff,3149,sao paulo,SP
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13,13.0,-6.0,...,400.0,19.0,13.0,19.0,perfumery,851.495069,af07308b275d755c9edb36a90c618231,47813,barreiras,BA
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04,9.0,-18.0,...,420.0,24.0,19.0,21.0,auto,514.410666,3a653a41f6f9fc3d2a113cf8398680e8,75265,vianopolis,GO
3,949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,delivered,2017-11-18 19:28:06,2017-11-18 19:45:59,2017-11-22 13:39:59,2017-12-02 00:28:42,2017-12-15,13.0,-13.0,...,450.0,30.0,10.0,20.0,pet_shop,1822.226336,7c142cf63193a1473d2e66489a9ae977,59296,sao goncalo do amarante,RN
4,ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,delivered,2018-02-13 21:18:39,2018-02-13 22:20:29,2018-02-14 19:46:34,2018-02-16 18:17:02,2018-02-26,2.0,-10.0,...,250.0,51.0,15.0,15.0,stationery,29.676625,72632f0f9dd73dfee390c9b22eb56dd6,9195,santo andre,SP
