### This notebook will analyze the cleaned datasets
#### The input data comes from the cleaned CSV files produced in the data_cleaning notebook.

In [1]:
# import pandas
import pandas as pd

In [2]:
# Load the cleaned csv datasets

items = pd.read_csv("../output/items_cleaned.csv")
orders = pd.read_csv("../output/orders_cleaned.csv")
# Read the zip code prefix as text. Left to type inference, read_csv parses an
# all-digit column as integers, which drops any leading 0 before the column
# can be converted to a string later in the notebook.
customers = pd.read_csv(
    "../output/customers_cleaned.csv",
    dtype={"customer_zip_code_prefix": str},
)

In [3]:
# Give the columns of the cleaned items dataset explicit dtypes before analysis:
# the shipping limit becomes a datetime, the id columns become strings,
# the item position becomes an integer, and price / freight value become floats.

items["shipping_limit_date"] = pd.to_datetime(items["shipping_limit_date"])
items = items.astype(
    {
        **dict.fromkeys(["order_id", "product_id", "seller_id"], str),
        "order_item_id": int,
        **dict.fromkeys(["price", "freight_value"], float),
    }
)

In [4]:
# Give the columns of the cleaned orders dataset explicit dtypes before analysis:
# ids and the status become strings, every timestamp column becomes a datetime.

orders = orders.astype(
    dict.fromkeys(["order_id", "customer_id", "order_status"], str)
)


def date_time(column):
    """Convert ``orders[column]`` to datetimes and store the result back on ``orders``.

    Note that this writes to the notebook-level ``orders`` frame rather than
    returning a value.
    """
    orders[column] = pd.to_datetime(orders[column])


# Apply the conversion to each timestamp column in turn
for timestamp_column in (
    "order_purchase_timestamp",
    "order_approved_at",
    "order_delivered_carrier_date",
    "order_delivered_customer_date",
    "order_estimated_delivery_date",
):
    date_time(timestamp_column)

In [5]:
# Store every column of the cleaned customers dataset as a string before analysis.
#
# "customer_zip_code_prefix" is kept as a string because it is an identifier
# rather than a quantity, and some values can start with a 0.
# NOTE(review): this cast only preserves leading zeros that are still present
# when the cell runs — confirm the column was not already parsed as an integer
# when the CSV was read.

customers = customers.astype(
    dict.fromkeys(
        [
            "customer_id",
            "customer_unique_id",
            "customer_zip_code_prefix",
            "customer_city",
            "customer_state",
        ],
        str,
    )
)

#### calculating the total number of orders in the **orders** dataset

In [6]:
# Number of rows in the orders table, shown as the cell output
orders.shape[0]

99281

> 99,281 orders were placed 

#### calculating orders by status 

In [7]:
# Tally the orders per status, most frequent status first (the value_counts defaults,
# spelled out here for the reader)
order_status_count = orders["order_status"].value_counts(
    sort=True,
    ascending=False,
    dropna=True,
)
print(order_status_count)

order_status
delivered      96464
shipped         1107
unavailable      609
canceled         484
invoiced         314
processing       301
approved           2
Name: count, dtype: int64


> 96464 orders are delivered, while 484 are canceled 

In [8]:
# Count the orders whose status is neither "canceled" nor "delivered":
# keep only the statuses outside that pair, then count the non-missing entries.
undelivered_orders = (
    orders["order_status"]
    .loc[lambda status: ~status.isin(["canceled", "delivered"])]
    .count()
)
print(undelivered_orders)

2333


> there are 2333 undelivered orders (orders that are neither delivered nor canceled)

#### calculating the average delivery time 

In [9]:
# Keep only the orders whose status is "delivered"
delivered = orders.loc[orders["order_status"].eq("delivered")]

# Elapsed time from purchase to customer delivery, one value per delivered order.
# Despite its name, avg_delivery_time holds the individual durations; the
# average itself is the .mean() shown as the cell output.
avg_delivery_time = delivered["order_delivered_customer_date"].sub(
    delivered["order_purchase_timestamp"]
)
avg_delivery_time.mean()

Timedelta('12 days 13:23:40.390437090')

>  the average delivery time is around 12 and a half days

#### calculating orders per city/state 

In [10]:
# Attach the customer columns to each order. A left join on customer_id keeps
# every row of orders, including any without a matching customer.
orders_customers = pd.merge(orders, customers, how="left", on="customer_id")

# Five states with the most orders
orders_customers["customer_state"].value_counts().head(5)

customer_state
SP    41667
RJ    12832
MG    11619
RS     5456
PR     5038
Name: count, dtype: int64

>  - top performing states are : SP,RJ,MG,RS,PR
>  - the state Sao Paulo(SP) has the most orders

In [11]:
# Five cities with the most orders
orders_customers["customer_city"].value_counts().head(5)

customer_city
sao paulo         15511
rio de janeiro     6870
belo horizonte     2768
brasilia           2128
curitiba           1519
Name: count, dtype: int64

> - top performing cities are : sao paulo,rio de janeiro,belo horizonte,brasilia,curitiba
> - the top performing city is sao paulo with 15511 orders 

#### calculating the total revenue 

##### calculating the total revenue without the freight value

In [12]:
# Attach the item rows to their orders. With a left join, an order that has no
# matching item row is kept with a missing price, which sum() skips by default.
orders_items = pd.merge(orders, items, how="left", on="order_id")

# Revenue from item prices only (freight excluded)
total_revenue = orders_items["price"].sum()
print(f"{total_revenue} $")

13589973.270000001 $


> the total revenue without the freight_value is 13589973.27$

##### calculating the total revenue with the freight value

In [13]:
# Add the freight value to the price row by row, then total the result
total_revenue_with_freight = (
    orders_items["price"].add(orders_items["freight_value"]).sum()
)
print(f"{total_revenue_with_freight} $")

15841598.639999997 $


> the total revenue with the freight value is 15841598.64 $

#### calculating the delivery delays 

In [14]:
# An order counts as delayed when the customer delivery timestamp is later than
# the estimated delivery date. Rows where either value is missing never match.
# NOTE(review): if the estimated date carries no time of day, a delivery made
# later on the estimated day itself is counted as late — confirm this is intended.
delivery_delays = orders.loc[
    orders["order_delivered_customer_date"].gt(
        orders["order_estimated_delivery_date"]
    )
]
delivery_delays.shape[0]

7827

> there are 7827 delayed deliveries

#### calculating trends over time 

In [15]:
# Index the orders by purchase timestamp so they can be grouped by calendar month
orders_by_time = orders.set_index("order_purchase_timestamp")

# Orders per month; "ME" labels each bin with the month-end date
monthly_orders_trend = orders_by_time["order_id"].resample("ME").count()

# Ten busiest months, highest order count first
monthly_orders_trend.sort_values(ascending=False).head(10)

order_purchase_timestamp
2017-11-30    7535
2018-01-31    7268
2018-03-31    7208
2018-04-30    6939
2018-05-31    6872
2018-02-28    6724
2018-08-31    6459
2018-07-31    6291
2018-06-30    6167
2017-12-31    5666
Name: order_id, dtype: int64

> **the top trending months are :**
> 1. november 2017 with the most orders.
> 2. january 2018
> 3. march 2018