In [91]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np


In [294]:
# Functions that are used throughout the project 
def call_data(name,columns_dropped):
    """Load a CSV file and discard the columns found at the given positions.

    Parameters
    ----------
    name
        Anything ``pd.read_csv`` accepts as its first argument (the callers
        in this notebook pass file names).
    columns_dropped
        Zero-based positions of the columns to remove (the callers in this
        notebook pass lists of ints).

    Returns
    -------
    pandas.DataFrame
        The loaded table without the selected columns.
    """
    table = pd.read_csv(name)

    # Translate the positions into labels, then drop those labels
    unwanted = table.columns[columns_dropped]
    return table.drop(columns=unwanted)

#Function to drop duplicates and NaN
def drop_clean_table(name_table,columns_to_drop_duplicate,columns_to_drop_NaN):
    """Remove duplicated rows, then rows with missing values, in place.

    Parameters
    ----------
    name_table
        DataFrame to clean. It is modified in place.
    columns_to_drop_duplicate
        Columns that identify a duplicate row (``subset`` of ``drop_duplicates``).
    columns_to_drop_NaN
        Columns in which a missing value discards the row (``subset`` of ``dropna``).

    Returns
    -------
    The same ``name_table`` object that was passed in, after cleaning.
    """
    # Order matters: duplicates are removed first, missing values second
    cleaning_steps = (
        (name_table.drop_duplicates, columns_to_drop_duplicate),
        (name_table.dropna, columns_to_drop_NaN),
    )
    for step, key_columns in cleaning_steps:
        step(subset=key_columns, inplace=True)
    return name_table

In [295]:
# Load the orders table, discarding the columns at positions 1, 3, 4 and 7
order_date = call_data(
    name="olist_orders_dataset.csv",
    columns_dropped=[1, 3, 4, 7],
)

In [296]:
# Keep only completed deliveries that have both a carrier departure date and a
# customer arrival date (the original run went from 96478 to 96469 rows).
# The filter and the dropna are chained so the result is a fresh frame: calling
# dropna(inplace=True) on a filtered slice triggers SettingWithCopyWarning.
order_date = order_date[order_date["order_status"] == "delivered"].dropna(
    subset=["order_delivered_carrier_date", "order_delivered_customer_date"]
)

# Parse both date columns and express their difference in (fractional) days
date_departure = pd.to_datetime(order_date["order_delivered_carrier_date"])
date_arrival = pd.to_datetime(order_date["order_delivered_customer_date"])
time_delay = (date_arrival - date_departure) / np.timedelta64(1, "D")

In [297]:
# Attach the computed durations to the orders as a "Delivery Delay" column,
# matching rows through their shared index
delay_column = time_delay.to_frame(name="Delivery Delay")
date_merged = pd.merge(
    left=order_date,
    right=delay_column,
    how="inner",
    left_index=True,
    right_index=True,
)

In [298]:
# Index by order_id so this table can later be merged with seller_order,
# which carries the same key
date_merged = date_merged.set_index(keys="order_id")

In [299]:
# Load the sellers table, discarding the column at position 1
seller_state = call_data(
    name="olist_sellers_dataset.csv",
    columns_dropped=[1],
)

In [300]:
# One row per seller_id, and no rows missing seller_id or seller_state
seller_state = drop_clean_table(
    name_table=seller_state,
    columns_to_drop_duplicate=["seller_id"],
    columns_to_drop_NaN=["seller_id", "seller_state"],
)

In [301]:
# Index by seller_id so this table can be merged with seller_order,
# which carries the same key
seller_state = seller_state.set_index(keys="seller_id")

In [302]:
# Load the order-items table, discarding the columns at positions 1, 2, 4, 5 and 6
seller_order = call_data(
    name="olist_order_items_dataset.csv",
    columns_dropped=[1, 2, 4, 5, 6],
)

In [303]:
# One row per (order_id, seller_id) pair, and no rows missing either key
order_seller_keys = ["order_id", "seller_id"]
seller_order = drop_clean_table(
    name_table=seller_order,
    columns_to_drop_duplicate=order_seller_keys,
    columns_to_drop_NaN=order_seller_keys,
)

In [304]:
# Index by seller_id so this table can be merged with seller_state,
# which carries the same key
seller_order = seller_order.set_index(keys="seller_id")

In [305]:
# Combine both tables on their index to relate each order to the place
# it departs from
seller_order = pd.merge(
    left=seller_order,
    right=seller_state,
    how="inner",
    left_index=True,
    right_index=True,
)

In [306]:
# Re-index by order_id (replacing the current index) so this table can be
# merged with date_merged, which carries the same key
seller_order = seller_order.set_index(keys="order_id")

In [307]:
# Combine the delivery durations with the seller information; both tables
# are matched through their index
place_time = pd.merge(
    left=date_merged,
    right=seller_order,
    how="inner",
    left_index=True,
    right_index=True,
)

In [308]:
# Per-state summary: the number of rows and the mean "Delivery Delay" for
# every seller_state. The grouping is built once and reused for both statistics.
state_groups = place_time.groupby("seller_state")

# Call .mean() directly: passing np.mean to agg() is deprecated in recent pandas
delay_orders = state_groups["Delivery Delay"].mean()

# count() returns one non-null count per column; only the first column is kept
# as the order counter. Selecting position 0 (rather than dropping the
# hard-coded positions 1-4) keeps working if place_time gains or loses columns.
num_orders_place = state_groups.count().iloc[:, [0]]

# Put both statistics side by side, aligned on the seller_state index.
# concat is used (instead of building a frame from delay_orders alone) so the
# order counts and the mean delay end up in a single table.
delay_orders_delivery = pd.concat([num_orders_place, delay_orders], axis=1)

# Name the columns
delay_orders_delivery.columns = ["Delivery Orders", "Delivery Delay(Days)"]

# Shortest mean delay first; ties are broken by the larger number of orders
delay_orders_delivery = delay_orders_delivery.sort_values(
    by=["Delivery Delay(Days)", "Delivery Orders"],
    ascending=[True, False],
)
print(delay_orders_delivery)

              Delivery Orders  Delivery Delay(Days)
seller_state                                       
RS                       1964              8.064388
MS                         49              8.098366
RN                         51              8.716393
SP                      69395              9.095651
RJ                       4229              9.225426
PB                         35              9.371992
DF                        808              9.519663
MG                       7745              9.621394
PR                       7555              9.938253
PA                          8              9.954369
SE                          9             10.292127
GO                        451             10.298114
BA                        550             10.519742
ES                        310             10.534219
SC                       3608             10.597436
PE                        403             10.608309
MT                        136             11.434185
PI          