# Olist Data Cleaning

This notebook expects Olist raw CSVs in `data/raw/`, applies basic cleaning, and writes cleaned tables to `data/processed/`. Paths are resolved relative to the project root so it works cross‑platform.

In [1]:
from pathlib import Path
import pandas as pd

# Go up one level from the notebook directory
PROJECT_ROOT = Path().resolve().parents[0]
RAW_DATA_DIR = PROJECT_ROOT / "data" / "raw"
PROCESSED_DATA_DIR = PROJECT_ROOT / "data" / "processed"
PROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)

RAW_DATA_DIR, PROCESSED_DATA_DIR

(PosixPath('/home/gwei4/e_commerce_customer_ai_platform/data/raw'),
 PosixPath('/home/gwei4/e_commerce_customer_ai_platform/data/processed'))

In [2]:
def save_clean(df: pd.DataFrame, name: str):
    PROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)
    out_path = PROCESSED_DATA_DIR / f"{name}.parquet"
    df.to_parquet(out_path, index=False)
    print("✔️ saved", out_path)

# (0) Orders

In [3]:
# === Orders ===
orders = pd.read_csv(RAW_DATA_DIR / "olist_orders_dataset.csv")
orders = orders.drop_duplicates(subset="order_id")

# parse timestamps
ts_cols = [
    "order_purchase_timestamp",
    "order_approved_at",
    "order_delivered_carrier_date",
    "order_delivered_customer_date",
    "order_estimated_delivery_date",
]
for col in ts_cols:
    orders[col] = pd.to_datetime(orders[col])

# derive features
orders["delivery_time_days"] = (
    orders["order_delivered_customer_date"] - orders["order_purchase_timestamp"]
).dt.days
orders["approval_lag_hours"] = (
    orders["order_approved_at"] - orders["order_purchase_timestamp"]
).dt.total_seconds() / 3600
orders["days_diff_estimate"] = (
    orders["order_estimated_delivery_date"] - orders["order_delivered_customer_date"]
).dt.days

# extract date parts
orders["purchase_year"]  = orders["order_purchase_timestamp"].dt.year
orders["purchase_month"] = orders["order_purchase_timestamp"].dt.month
orders["purchase_day"]   = orders["order_purchase_timestamp"].dt.day
orders["purchase_week"]  = orders["order_purchase_timestamp"].dt.isocalendar().week

# sanity checks
print("Missingness:\n", orders[ts_cols].isnull().mean().round(3))
print("Negative delivery days:", (orders["delivery_time_days"] < 0).sum())
print(orders["delivery_time_days"].describe())

# save
save_clean(orders, "orders_clean")
orders.head()

Missingness:
 order_purchase_timestamp         0.000
order_approved_at                0.002
order_delivered_carrier_date     0.018
order_delivered_customer_date    0.030
order_estimated_delivery_date    0.000
dtype: float64
Negative delivery days: 0
count    96476.000000
mean        12.094086
std          9.551746
min          0.000000
25%          6.000000
50%         10.000000
75%         15.000000
max        209.000000
Name: delivery_time_days, dtype: float64
✔️ saved /home/gwei4/e_commerce_customer_ai_platform/data/processed/orders_clean.parquet


Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,delivery_time_days,approval_lag_hours,days_diff_estimate,purchase_year,purchase_month,purchase_day,purchase_week
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18,8.0,0.178333,7.0,2017,10,2,40
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13,13.0,30.713889,5.0,2018,7,24,30
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04,9.0,0.276111,17.0,2018,8,8,32
3,949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,delivered,2017-11-18 19:28:06,2017-11-18 19:45:59,2017-11-22 13:39:59,2017-12-02 00:28:42,2017-12-15,13.0,0.298056,12.0,2017,11,18,46
4,ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,delivered,2018-02-13 21:18:39,2018-02-13 22:20:29,2018-02-14 19:46:34,2018-02-16 18:17:02,2018-02-26,2.0,1.030556,9.0,2018,2,13,7


# (1) Customers

In [4]:
from pathlib import Path
import pandas as pd
import unidecode

# ─── Paths & helper ─────────────────────────────────────────────────────────────
PROJECT_ROOT       = Path().resolve().parents[0]
RAW_DATA_DIR       = PROJECT_ROOT / "data" / "raw"
PROCESSED_DATA_DIR = PROJECT_ROOT / "data" / "processed"
PROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)

def save_clean(df: pd.DataFrame, name: str):
    out_path = PROCESSED_DATA_DIR / f"{name}.parquet"
    df.to_parquet(out_path, index=False)
    print("✔️ saved", out_path)

# ─── Load & peek ────────────────────────────────────────────────────────────────
cust = pd.read_csv(RAW_DATA_DIR / "olist_customers_dataset.csv")
cust.head()

# ─── 1. Deduplicate on order-level customer_id ─────────────────────────────────
cust = cust.drop_duplicates(subset="customer_id")

# ─── 2. (Optional) one-row-per-person instead of per-order ──────────────────────
# cust = cust.drop_duplicates(subset="customer_unique_id")

# ─── 3. Normalize ZIP code prefix to 5-digit string ────────────────────────────
cust["customer_zip_code_prefix"] = (
    cust["customer_zip_code_prefix"]
        .astype(str)
        .str.zfill(5)
)

# ─── 4. Standardize text fields ─────────────────────────────────────────────────
cust["customer_city"] = (
    cust["customer_city"]
        .str.strip()
        .str.lower()
        .map(unidecode.unidecode)
)
cust["customer_state"] = cust["customer_state"].str.upper()

# ─── 5. Enrich with lat/lon via Olist geolocation table ────────────────────────
geo = pd.read_csv(RAW_DATA_DIR / "olist_geolocation_dataset.csv")
geo = (
    geo
      .drop_duplicates(subset="geolocation_zip_code_prefix")
      .rename(columns={
          "geolocation_zip_code_prefix": "customer_zip_code_prefix",
          "geolocation_lat":        "latitude",
          "geolocation_lng":        "longitude",
      })
)
geo["customer_zip_code_prefix"] = (
    geo["customer_zip_code_prefix"]
      .astype(str)
      .str.zfill(5)
)

cust = cust.merge(
    geo[["customer_zip_code_prefix", "latitude", "longitude"]],
    on="customer_zip_code_prefix",
    how="left"
)

# ─── 6. Sanity checks ───────────────────────────────────────────────────────────
print("Nulls per column:\n", cust.isnull().mean().round(3))
print("Unique customers (unique_id):", cust["customer_unique_id"].nunique())

# ─── 7. Save cleaned customers ─────────────────────────────────────────────────
save_clean(cust, "customers_clean")

# final peek
cust.head()

Nulls per column:
 customer_id                 0.000
customer_unique_id          0.000
customer_zip_code_prefix    0.000
customer_city               0.000
customer_state              0.000
latitude                    0.003
longitude                   0.003
dtype: float64
Unique customers (unique_id): 96096
✔️ saved /home/gwei4/e_commerce_customer_ai_platform/data/processed/customers_clean.parquet


Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state,latitude,longitude
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP,-20.509897,-47.397866
1,18955e83d337fd6b2def6b18a428ac77,290c77bc529b7ac935b93aa66c333dc3,9790,sao bernardo do campo,SP,-23.726853,-46.545746
2,4e7b3e00288586ebd08712fdd0374a03,060e732b5b29e8181a18229c7b0b2b5e,1151,sao paulo,SP,-23.527788,-46.66031
3,b2b6027bc5c5109e529d4dc6358b12c3,259dac757896d24d7702b9acbbff3f3c,8775,mogi das cruzes,SP,-23.49693,-46.185352
4,4f2d8ab171c80ec8364f7c12e35b23ad,345ecd01c38d18a9036ed96c73b8d066,13056,campinas,SP,-22.987222,-47.151073


# (2) Order Items

In [5]:
# ─── Load & peek ────────────────────────────────────────────────────────────────
order_items = pd.read_csv(RAW_DATA_DIR / "olist_order_items_dataset.csv")
order_items.head()

# ─── 1. Drop exact duplicates on (order_id, order_item_id) ──────────────────────
order_items = order_items.drop_duplicates(subset=["order_id", "order_item_id"])

# ─── 2. Parse shipping limit date ───────────────────────────────────────────────
order_items["shipping_limit_date"] = pd.to_datetime(order_items["shipping_limit_date"])

# ─── 3. Ensure numeric types for monetary columns ──────────────────────────────
order_items["price"]         = order_items["price"].astype(float)
order_items["freight_value"] = order_items["freight_value"].astype(float)

# ─── 4. Derive total cost per item ──────────────────────────────────────────────
order_items["total_cost"] = order_items["price"] + order_items["freight_value"]

# ─── 5. Sanity checks ───────────────────────────────────────────────────────────
print("Any negative prices?",       (order_items["price"] < 0).any())
print("Any negative freight?",      (order_items["freight_value"] < 0).any())
print("Total cost summary:\n",      order_items["total_cost"].describe())

# ─── 6. Save cleaned order_items ───────────────────────────────────────────────
save_clean(order_items, "order_items_clean")

# final peek
order_items.head()

Any negative prices? False
Any negative freight? False
Total cost summary:
 count    112650.000000
mean        140.644059
std         190.724394
min           6.080000
25%          55.220000
50%          92.320000
75%         157.937500
max        6929.310000
Name: total_cost, dtype: float64
✔️ saved /home/gwei4/e_commerce_customer_ai_platform/data/processed/order_items_clean.parquet


Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value,total_cost
0,00010242fe8c5a6d1ba2dd792cb16214,1,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,2017-09-19 09:45:35,58.9,13.29,72.19
1,00018f77f2f0320c557190d7a144bdd3,1,e5f2d52b802189ee658865ca93d83a8f,dd7ddc04e1b6c2c614352b383efe2d36,2017-05-03 11:05:13,239.9,19.93,259.83
2,000229ec398224ef6ca0657da4fc703e,1,c777355d18b72b67abbeef9df44fd0fd,5b51032eddd242adc84c38acab88f23d,2018-01-18 14:48:30,199.0,17.87,216.87
3,00024acbcdf0a6daa1e931b038114c75,1,7634da152a4610f1595efa32f14722fc,9d7a1d34a5052409006425275ba1c2b4,2018-08-15 10:10:18,12.99,12.79,25.78
4,00042b26cf59d7ce69dfabb4e55b4fd9,1,ac6c3623068f30de03045865e4e10089,df560393f3a51e74553ab94004ba5c87,2017-02-13 13:57:51,199.9,18.14,218.04


# (3) Payments

In [6]:
# ─── Load & peek ────────────────────────────────────────────────────────────────
payments = pd.read_csv(RAW_DATA_DIR / "olist_order_payments_dataset.csv")
payments.head()

# ─── 1. Drop exact duplicates on (order_id, payment_sequential) ─────────────────
payments = payments.drop_duplicates(subset=["order_id", "payment_sequential"])

# ─── 2. Cast types ──────────────────────────────────────────────────────────────
payments["payment_installments"] = payments["payment_installments"].astype(int)
payments["payment_value"]        = payments["payment_value"].astype(float)

# ─── 3. Sanity checks ───────────────────────────────────────────────────────────
print("Missingness per column:\n", payments.isnull().mean().round(3))
print("Payment types:\n", payments["payment_type"].value_counts())
print("Payment value stats:\n", payments["payment_value"].describe())

# ─── 4. Save cleaned payments ───────────────────────────────────────────────────
save_clean(payments, "order_payments_clean")

# final peek
payments.head()

Missingness per column:
 order_id                0.0
payment_sequential      0.0
payment_type            0.0
payment_installments    0.0
payment_value           0.0
dtype: float64
Payment types:
 payment_type
credit_card    76795
boleto         19784
voucher         5775
debit_card      1529
not_defined        3
Name: count, dtype: int64
Payment value stats:
 count    103886.000000
mean        154.100380
std         217.494064
min           0.000000
25%          56.790000
50%         100.000000
75%         171.837500
max       13664.080000
Name: payment_value, dtype: float64
✔️ saved /home/gwei4/e_commerce_customer_ai_platform/data/processed/order_payments_clean.parquet


Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value
0,b81ef226f3fe1789b1e8b2acac839d17,1,credit_card,8,99.33
1,a9810da82917af2d9aefd1278f1dcfa0,1,credit_card,1,24.39
2,25e8ea4e93396b6fa0d3dd708e76c1bd,1,credit_card,1,65.71
3,ba78997921bbcdc1373bb41e913ab953,1,credit_card,8,107.78
4,42fdf880ba16b47b59251dd489d4441a,1,credit_card,2,128.45


# (4) Reviews

In [7]:
# ─── Load & peek ────────────────────────────────────────────────────────────────
reviews = pd.read_csv(RAW_DATA_DIR / "olist_order_reviews_dataset.csv")
reviews.head()

# ─── 1. Drop duplicate reviews by review_id ─────────────────────────────────────
reviews = reviews.drop_duplicates(subset="review_id")

# ─── 2. Parse datetime columns ──────────────────────────────────────────────────
reviews["review_creation_date"]     = pd.to_datetime(reviews["review_creation_date"])
reviews["review_answer_timestamp"]  = pd.to_datetime(reviews["review_answer_timestamp"])

# ─── 3. Compute response lag in days ────────────────────────────────────────────
reviews["response_time_days"] = (
    reviews["review_answer_timestamp"] - reviews["review_creation_date"]
).dt.days

# ─── 4. Derive comment lengths ──────────────────────────────────────────────────
# handle missing titles/messages
reviews["review_comment_title"]   = reviews["review_comment_title"].fillna("")
reviews["review_comment_message"] = reviews["review_comment_message"].fillna("")
reviews["title_length"]   = reviews["review_comment_title"].str.len()
reviews["message_length"] = reviews["review_comment_message"].str.len()

# ─── 5. Sanity checks ───────────────────────────────────────────────────────────
print("Missingness per column:\n", reviews.isnull().mean().round(3))
print("Score distribution:\n", reviews["review_score"].value_counts().sort_index())
print("Response time stats (days):\n", reviews["response_time_days"].describe())
print("Comment length stats:\n",
      "title:", reviews["title_length"].describe(),
      "\nmessage:", reviews["message_length"].describe())

# ─── 6. Save cleaned reviews ───────────────────────────────────────────────────
save_clean(reviews, "order_reviews_clean")

# final peek
reviews.head()

Missingness per column:
 review_id                  0.0
order_id                   0.0
review_score               0.0
review_comment_title       0.0
review_comment_message     0.0
review_creation_date       0.0
review_answer_timestamp    0.0
response_time_days         0.0
title_length               0.0
message_length             0.0
dtype: float64
Score distribution:
 review_score
1    11282
2     3114
3     8097
4    19007
5    56910
Name: count, dtype: int64
Response time stats (days):
 count    98410.000000
mean         2.583183
std          9.915193
min          0.000000
25%          1.000000
50%          1.000000
75%          3.000000
max        518.000000
Name: response_time_days, dtype: float64
Comment length stats:
 title: count    98410.000000
mean         1.397724
std          4.406631
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max         26.000000
Name: title_length, dtype: float64 
message: count    98410.000000
mean        28.3

Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp,response_time_days,title_length,message_length
0,7bc2406110b926393aa56f80a40eba40,73fc7af87114b39712e6da79b0a377eb,4,,,2018-01-18,2018-01-18 21:46:59,0,0,0
1,80e641a11e56f04c1ad469d5645fdfde,a548910a1c6147796b98fdf73dbeba33,5,,,2018-03-10,2018-03-11 03:05:13,1,0,0
2,228ce5500dc1d8e020d8d1322874b6f0,f9e4b658b201a9f2ecdecbb34bed034b,5,,,2018-02-17,2018-02-18 14:36:24,1,0,0
3,e64fb393e7b32834bb789ff8bb30750e,658677c97b385a9be170737859d3511b,5,,Recebi bem antes do prazo estipulado.,2017-04-21,2017-04-21 22:02:06,0,0,37
4,f7c4243c7fe1938f181bec41a392bdeb,8e6bfb81e283fa7e4f11123a3fb894f1,5,,Parabéns lojas lannister adorei comprar pela I...,2018-03-01,2018-03-02 10:26:53,1,0,100


# (5) geolocation

In [8]:
# ─── Load & peek ────────────────────────────────────────────────────────────────
sellers = pd.read_csv(RAW_DATA_DIR / "olist_sellers_dataset.csv")
sellers.head()

# ─── 1. Deduplicate on seller_id ────────────────────────────────────────────────
sellers = sellers.drop_duplicates(subset="seller_id")

# ─── 2. Normalize ZIP code prefix as zero-padded 8-digit string ───────────────
sellers["seller_zip_code_prefix"] = (
    sellers["seller_zip_code_prefix"]
        .astype(str)
        .str.zfill(8)
)

# ─── 3. Standardize text fields ─────────────────────────────────────────────────
sellers["seller_city"] = (
    sellers["seller_city"]
        .str.strip()
        .str.lower()
        .map(unidecode.unidecode)
)
sellers["seller_state"] = sellers["seller_state"].str.upper()

# ─── 4. Sanity checks ───────────────────────────────────────────────────────────
print("Nulls per column:\n", sellers.isnull().mean().round(3))
print("Unique sellers:", sellers["seller_id"].nunique())

# ─── 5. Save cleaned sellers ───────────────────────────────────────────────────
save_clean(sellers, "sellers_clean")

# final peek
sellers.head()

Nulls per column:
 seller_id                 0.0
seller_zip_code_prefix    0.0
seller_city               0.0
seller_state              0.0
dtype: float64
Unique sellers: 3095
✔️ saved /home/gwei4/e_commerce_customer_ai_platform/data/processed/sellers_clean.parquet


Unnamed: 0,seller_id,seller_zip_code_prefix,seller_city,seller_state
0,3442f8959a84dea7ee197c632cb2df15,13023,campinas,SP
1,d1b65fc7debc3361ea86b5f14c68d2e2,13844,mogi guacu,SP
2,ce3ad9de960102d0677a81f5d0bb7b2d,20031,rio de janeiro,RJ
3,c0f3eea2e14555b6faeea3dd58c1b1c3,4195,sao paulo,SP
4,51a04a8a6bdcb23deccc82b0b80742cf,12914,braganca paulista,SP


# (6) Products

In [9]:

# ─── Load & peek ────────────────────────────────────────────────────────────────
products = pd.read_csv(RAW_DATA_DIR / "olist_products_dataset.csv")
products.head()

# ─── 1. Drop duplicate products ─────────────────────────────────────────────────
products = products.drop_duplicates(subset="product_id")

# ─── 2. Cast numeric columns ────────────────────────────────────────────────────
num_cols = [
    "product_name_lenght", 
    "product_description_lenght",
    "product_photos_qty",
    "product_weight_g",
    "product_length_cm",
    "product_height_cm",
    "product_width_cm",
]
for col in num_cols:
    products[col] = pd.to_numeric(products[col], errors="coerce")

# ─── 3. Derive volume and density ───────────────────────────────────────────────
products["product_volume_cm3"] = (
    products["product_length_cm"]
    * products["product_width_cm"]
    * products["product_height_cm"]
)
products["product_density_g_cm3"] = (
    products["product_weight_g"] / products["product_volume_cm3"]
)

# ─── 4. Merge human-readable category names ─────────────────────────────────────
cat_trans = pd.read_csv(RAW_DATA_DIR / "product_category_name_translation.csv")
# keep only the English name and original Portuguese key
cat_trans = cat_trans.rename(
    columns={"product_category_name_english": "product_category_name_en"}
)[["product_category_name", "product_category_name_en"]]

products = products.merge(
    cat_trans,
    on="product_category_name",
    how="left"
)

# drop the Portuguese category column
products = products.drop(columns=["product_category_name"])

# ─── 5. Sanity checks ───────────────────────────────────────────────────────────
print("Nulls per column:\n", products.isnull().mean().round(3))
print("Missing categories:", products["product_category_name_en"].isnull().sum())
print("Volume stats (cm³):\n", products["product_volume_cm3"].describe())
print("Density stats (g/cm³):\n", products["product_density_g_cm3"].describe())

# ─── 6. Save cleaned products ──────────────────────────────────────────────────
save_clean(products, "products_clean")

# final peek
products.head()

Nulls per column:
 product_id                    0.000
product_name_lenght           0.019
product_description_lenght    0.019
product_photos_qty            0.019
product_weight_g              0.000
product_length_cm             0.000
product_height_cm             0.000
product_width_cm              0.000
product_volume_cm3            0.000
product_density_g_cm3         0.000
product_category_name_en      0.019
dtype: float64
Missing categories: 623
Volume stats (cm³):
 count     32949.000000
mean      16564.096695
std       27057.041650
min         168.000000
25%        2880.000000
50%        6840.000000
75%       18480.000000
max      296208.000000
Name: product_volume_cm3, dtype: float64
Density stats (g/cm³):
 count    32949.000000
mean         0.203690
std          1.009271
min          0.000000
25%          0.066176
50%          0.116550
75%          0.195869
max         85.227273
Name: product_density_g_cm3, dtype: float64
✔️ saved /home/gwei4/e_commerce_customer_ai_platform/dat

Unnamed: 0,product_id,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm,product_volume_cm3,product_density_g_cm3,product_category_name_en
0,1e9e8ef04dbcff4541ed26657ea517e5,40.0,287.0,1.0,225.0,16.0,10.0,14.0,2240.0,0.100446,perfumery
1,3aa071139cb16b67ca9e5dea641aaa2f,44.0,276.0,1.0,1000.0,30.0,18.0,20.0,10800.0,0.092593,art
2,96bd76ec8810374ed1b65e291975717f,46.0,250.0,1.0,154.0,18.0,9.0,15.0,2430.0,0.063374,sports_leisure
3,cef67bcfe19066a932b7673e239eb23d,27.0,261.0,1.0,371.0,26.0,4.0,26.0,2704.0,0.137204,baby
4,9dc1a7de274444849c219cff195d0b71,37.0,402.0,4.0,625.0,20.0,17.0,13.0,4420.0,0.141403,housewares


# (7) Geolocation

In [10]:
# ─── Load & peek ────────────────────────────────────────────────────────────────
geo = pd.read_csv(RAW_DATA_DIR / "olist_geolocation_dataset.csv")
geo.head()

# ─── 1. Deduplicate on ZIP prefix ───────────────────────────────────────────────
geo = geo.drop_duplicates(subset="geolocation_zip_code_prefix")

# ─── 2. Zero-pad ZIP prefixes ───────────────────────────────────────────────────
geo["geolocation_zip_code_prefix"] = (
    geo["geolocation_zip_code_prefix"]
        .astype(str)
        .str.zfill(8)
)

# ─── 3. Cast lat/lng to numeric ─────────────────────────────────────────────────
geo["geolocation_lat"] = pd.to_numeric(geo["geolocation_lat"], errors="coerce")
geo["geolocation_lng"] = pd.to_numeric(geo["geolocation_lng"], errors="coerce")

# ─── 4. Rename columns for consistency ──────────────────────────────────────────
geo = geo.rename(columns={
    "geolocation_zip_code_prefix": "zip_code_prefix",
    "geolocation_lat":            "latitude",
    "geolocation_lng":            "longitude",
    "geolocation_city":           "city",
    "geolocation_state":          "state"
})

# ─── 5. Sanity checks ───────────────────────────────────────────────────────────
print("Nulls per column:\n", geo.isnull().mean().round(3))
print("Unique ZIP prefixes:", geo["zip_code_prefix"].nunique())

# ─── 6. Save cleaned geolocation ────────────────────────────────────────────────
save_clean(geo, "geolocation_clean")

# final peek
geo.head()

Nulls per column:
 zip_code_prefix    0.0
latitude           0.0
longitude          0.0
city               0.0
state              0.0
dtype: float64
Unique ZIP prefixes: 19015
✔️ saved /home/gwei4/e_commerce_customer_ai_platform/data/processed/geolocation_clean.parquet


Unnamed: 0,zip_code_prefix,latitude,longitude,city,state
0,1037,-23.545621,-46.639292,sao paulo,SP
1,1046,-23.546081,-46.64482,sao paulo,SP
3,1041,-23.544392,-46.639499,sao paulo,SP
4,1035,-23.541578,-46.641607,sao paulo,SP
5,1012,-23.547762,-46.635361,são paulo,SP


# (8) Product Category Translation

In [11]:
# ─── Load & peek ────────────────────────────────────────────────────────────────
cat_trans = pd.read_csv(
    RAW_DATA_DIR / "product_category_name_translation.csv",
    dtype=str
)
cat_trans.head()

# ─── 1. Drop duplicate category keys ────────────────────────────────────────────
cat_trans = cat_trans.drop_duplicates(subset="product_category_name")

# ─── 2. Strip & standardize text ────────────────────────────────────────────────
cat_trans["product_category_name"] = (
    cat_trans["product_category_name"]
       .str.strip()
)
cat_trans["product_category_name_english"] = (
    cat_trans["product_category_name_english"]
       .str.strip()
)

# ─── 3. Rename columns for clarity ──────────────────────────────────────────────
cat_trans = cat_trans.rename(columns={
    "product_category_name":             "category_br",
    "product_category_name_english":     "category_en"
})

# ─── 4. Sanity checks ───────────────────────────────────────────────────────────
print("Nulls per column:\n", cat_trans.isnull().mean().round(3))
print("Total categories:", cat_trans.shape[0])

# ─── 5. Save cleaned translation table ──────────────────────────────────────────
save_clean(cat_trans, "category_translation_clean")

# final peek
cat_trans.head()

Nulls per column:
 category_br    0.0
category_en    0.0
dtype: float64
Total categories: 71
✔️ saved /home/gwei4/e_commerce_customer_ai_platform/data/processed/category_translation_clean.parquet


Unnamed: 0,category_br,category_en
0,beleza_saude,health_beauty
1,informatica_acessorios,computers_accessories
2,automotivo,auto
3,cama_mesa_banho,bed_bath_table
4,moveis_decoracao,furniture_decor


In [12]:
print("All tables cleaned and saved to", PROCESSED_DATA_DIR)

All tables cleaned and saved to /home/gwei4/e_commerce_customer_ai_platform/data/processed
