In [2]:
# Data Cleaning for shop_sales_project
# This notebook performs the following tasks on the raw CSV files:
 #   1. Load and inspect each dataset
 #   2. Remove duplicate rows
 #   3. Handle missing values
 #   4. Standardize column formats and types
 #   5. Save cleaned datasets for analysis

In [3]:
import pandas as pd 

In [4]:
#load the datasets used in this project

customers = pd.read_csv('../data/olist_customers_dataset.csv')
orders = pd.read_csv('../data/olist_orders_dataset.csv')
items = pd.read_csv('../data/olist_order_items_dataset.csv')

In [5]:
#create a function to display each csv info
def info(file):
    file.info()
    print(file.isna().sum())

In [6]:
## Inspect the customers dataset: preview, structure, and missing values
info(customers)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 5 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   customer_id               99441 non-null  object
 1   customer_unique_id        99441 non-null  object
 2   customer_zip_code_prefix  99441 non-null  int64 
 3   customer_city             99441 non-null  object
 4   customer_state            99441 non-null  object
dtypes: int64(1), object(4)
memory usage: 3.8+ MB
customer_id                 0
customer_unique_id          0
customer_zip_code_prefix    0
customer_city               0
customer_state              0
dtype: int64


In [7]:
# Inspect the items dataset: preview, structure, and missing values
info(items)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112650 entries, 0 to 112649
Data columns (total 7 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   order_id             112650 non-null  object 
 1   order_item_id        112650 non-null  int64  
 2   product_id           112650 non-null  object 
 3   seller_id            112650 non-null  object 
 4   shipping_limit_date  112650 non-null  object 
 5   price                112650 non-null  float64
 6   freight_value        112650 non-null  float64
dtypes: float64(2), int64(1), object(4)
memory usage: 6.0+ MB
order_id               0
order_item_id          0
product_id             0
seller_id              0
shipping_limit_date    0
price                  0
freight_value          0
dtype: int64


In [8]:
# Inspect the orders dataset: preview, structure, and missing values
info(orders)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 8 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   order_id                       99441 non-null  object
 1   customer_id                    99441 non-null  object
 2   order_status                   99441 non-null  object
 3   order_purchase_timestamp       99441 non-null  object
 4   order_approved_at              99281 non-null  object
 5   order_delivered_carrier_date   97658 non-null  object
 6   order_delivered_customer_date  96476 non-null  object
 7   order_estimated_delivery_date  99441 non-null  object
dtypes: object(8)
memory usage: 6.1+ MB
order_id                            0
customer_id                         0
order_status                        0
order_purchase_timestamp            0
order_approved_at                 160
order_delivered_carrier_date     1783
order_delivered_customer_date    2

In [9]:
# Drop rows where the order approval timestamp is missing
# because we cannot analyze orders without this information
orders.dropna(subset = ["order_approved_at"],inplace = True)

In [10]:
# Remove duplicate rows in each dataset to ensure data quality
orders =orders.drop_duplicates() 
items =items.drop_duplicates() 
customers = customers.drop_duplicates() 

In [11]:
# Standardize column data types for the customers dataset

customers = customers.astype({
    "customer_id" : str,
    "customer_unique_id" : str,
    "customer_zip_code_prefix" : str,
    "customer_city" : str,
    "customer_state" : str
})
# here we set "customer_zip_code_prefix" as a string,
#because zip codes are stored as strings because they are identifiers,
#and some values in it can start with a 0

In [12]:
# Standardize column data types for the items dataset

items["shipping_limit_date"] = pd.to_datetime(items["shipping_limit_date"])
items = items.astype({
    "order_id" : str,
    "order_item_id" : int,
    "product_id" : str,
    "seller_id" : str,
    "price" : float,
    "freight_value" : float
})

In [13]:
# Standardize column data types for the orders dataset

orders = orders.astype({
    "order_id" : str,
    "customer_id" : str,
    "order_status" : str
})
# create a function to speed up the process
def date_time(column) : 
    orders[column] = pd.to_datetime(orders[column])

date_time("order_purchase_timestamp")
date_time("order_approved_at")
date_time("order_delivered_carrier_date")
date_time("order_delivered_customer_date")
date_time("order_estimated_delivery_date")
                                    

In [15]:
# Save the cleaned datasets

customers.to_csv("../output/customers_cleaned.csv", index=False)
orders.to_csv("../output/orders_cleaned.csv", index=False)
items.to_csv("../output/items_cleaned.csv", index=False)