In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Análises iniciais
Primeiramente, irei analizar os datasets individualmente, afim de responder as seguintes questões:
* Há dados faltantes?
* Qual o tamanho das bases?
* Há dados incosistentes?
* Qual a proporção de classes para cada variável?

## Dataset 1 - Customers / Clientes

In [4]:
df_customers = pd.read_csv("data/olist_customers_dataset.csv")

In [5]:
df_customers.head()

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP
1,18955e83d337fd6b2def6b18a428ac77,290c77bc529b7ac935b93aa66c333dc3,9790,sao bernardo do campo,SP
2,4e7b3e00288586ebd08712fdd0374a03,060e732b5b29e8181a18229c7b0b2b5e,1151,sao paulo,SP
3,b2b6027bc5c5109e529d4dc6358b12c3,259dac757896d24d7702b9acbbff3f3c,8775,mogi das cruzes,SP
4,4f2d8ab171c80ec8364f7c12e35b23ad,345ecd01c38d18a9036ed96c73b8d066,13056,campinas,SP


In [9]:
df_customers.shape

(99441, 5)

In [6]:
df_customers.isnull().sum()/df_customers.shape[0]

customer_id                 0.0
customer_unique_id          0.0
customer_zip_code_prefix    0.0
customer_city               0.0
customer_state              0.0
dtype: float64

In [7]:
df_customers.nunique()

customer_id                 99441
customer_unique_id          96096
customer_zip_code_prefix    14994
customer_city                4119
customer_state                 27
dtype: int64

In [20]:
cs = df_customers['customer_state'].value_counts().reset_index()
cs['proporcao'] = cs['customer_state'] / cs['customer_state'].sum()
cs['proporcao_acumulada'] = cs['proporcao'].cumsum()
cs.head()

Unnamed: 0,index,customer_state,proporcao,proporcao_acumulada
0,SP,41746,0.419807,0.419807
1,RJ,12852,0.129242,0.549049
2,MG,11635,0.117004,0.666053
3,RS,5466,0.054967,0.721021
4,PR,5045,0.050734,0.771754


In [32]:
print('''{} estados são responsáveis por aproximadamente 55% dos clientes.

Esses estados são:
{}'''.format(
    cs[cs['proporcao_acumulada'] <= 0.55]['index'].count(), cs[cs['proporcao_acumulada'] <= 0.55]['index']))

2 estados são responsáveis por aproximadamente 55% dos clientes.

Esses estados são:
0    SP
1    RJ
Name: index, dtype: object


In [28]:
cy = df_customers['customer_city'].value_counts().reset_index()
cy['proporcao'] = cy['customer_city'] / cy['customer_city'].sum(); 
cy['proporcao_acumulada'] = cy['proporcao'].cumsum()
cy

Unnamed: 0,index,customer_city,proporcao,proporcao_acumulada
0,sao paulo,15540,0.156274,0.156274
1,rio de janeiro,6882,0.069207,0.225480
2,belo horizonte,2773,0.027886,0.253366
3,brasilia,2131,0.021430,0.274796
4,curitiba,1521,0.015296,0.290092
...,...,...,...,...
4114,santo antonio de goias,1,0.000010,0.999960
4115,duas barras,1,0.000010,0.999970
4116,tapurah,1,0.000010,0.999980
4117,ourizona,1,0.000010,0.999990


In [33]:
print('''{} cidades são responsáveis por 50% ds clientes.
Essas cidades são: 
{}'''.format(
    cy[cy['proporcao_acumulada'] <= 0.5]['index'].count(), cy[cy['proporcao_acumulada'] <= 0.5]['index']))

39 cidades são responsáveis por 50% ds clientes.
Essas cidades são: 
0                 sao paulo
1            rio de janeiro
2            belo horizonte
3                  brasilia
4                  curitiba
5                  campinas
6              porto alegre
7                  salvador
8                 guarulhos
9     sao bernardo do campo
10                  niteroi
11              santo andre
12                   osasco
13                   santos
14                  goiania
15      sao jose dos campos
16                fortaleza
17                 sorocaba
18                   recife
19            florianopolis
20                  jundiai
21           ribeirao preto
22                    belem
23              nova iguacu
24                  barueri
25             juiz de fora
26                 contagem
27              sao goncalo
28          mogi das cruzes
29                  vitoria
30               uberlandia
31               piracicaba
32                 sao luis
33    s

## Dataset 2 - Geolocation / Geocalização

In [34]:
df_geo = pd.read_csv("data/olist_geolocation_dataset.csv")

In [35]:
df_geo.head()

Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
0,1037,-23.545621,-46.639292,sao paulo,SP
1,1046,-23.546081,-46.64482,sao paulo,SP
2,1046,-23.546129,-46.642951,sao paulo,SP
3,1041,-23.544392,-46.639499,sao paulo,SP
4,1035,-23.541578,-46.641607,sao paulo,SP


In [38]:
df_geo.shape[0]

1000163

In [37]:
df_geo.isnull().sum() / df_geo.shape[0]

geolocation_zip_code_prefix    0.0
geolocation_lat                0.0
geolocation_lng                0.0
geolocation_city               0.0
geolocation_state              0.0
dtype: float64

## Dataset 3 - Orders / Ordens de compra

In [40]:
df_orders = pd.read_csv("data/olist_orders_dataset.csv")

In [41]:
df_orders

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13 00:00:00
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04 00:00:00
3,949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,delivered,2017-11-18 19:28:06,2017-11-18 19:45:59,2017-11-22 13:39:59,2017-12-02 00:28:42,2017-12-15 00:00:00
4,ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,delivered,2018-02-13 21:18:39,2018-02-13 22:20:29,2018-02-14 19:46:34,2018-02-16 18:17:02,2018-02-26 00:00:00
...,...,...,...,...,...,...,...,...
99436,9c5dedf39a927c1b2549525ed64a053c,39bd1228ee8140590ac3aca26f2dfe00,delivered,2017-03-09 09:54:05,2017-03-09 09:54:05,2017-03-10 11:18:03,2017-03-17 15:08:01,2017-03-28 00:00:00
99437,63943bddc261676b46f01ca7ac2f7bd8,1fca14ff2861355f6e5f14306ff977a7,delivered,2018-02-06 12:58:58,2018-02-06 13:10:37,2018-02-07 23:22:42,2018-02-28 17:37:56,2018-03-02 00:00:00
99438,83c1379a015df1e13d02aae0204711ab,1aa71eb042121263aafbe80c1b562c9c,delivered,2017-08-27 14:46:43,2017-08-27 15:04:16,2017-08-28 20:52:26,2017-09-21 11:24:17,2017-09-27 00:00:00
99439,11c177c8e97725db2631073c19f07b62,b331b74b18dc79bcdf6532d51e1637c1,delivered,2018-01-08 21:28:27,2018-01-08 21:36:21,2018-01-12 15:35:03,2018-01-25 23:32:54,2018-02-15 00:00:00


In [42]:
df_orders.shape

(99441, 8)

In [43]:
df_orders.isnull().sum() / df_orders.shape[0]

order_id                         0.000000
customer_id                      0.000000
order_status                     0.000000
order_purchase_timestamp         0.000000
order_approved_at                0.001609
order_delivered_carrier_date     0.017930
order_delivered_customer_date    0.029817
order_estimated_delivery_date    0.000000
dtype: float64

In [48]:
 df_orders[df_orders['order_approved_at'].isnull()]

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
1130,00b1cb0320190ca0daa2c88b35206009,3532ba38a3fd242259a514ac2b6ae6b6,canceled,2018-08-28 15:26:39,,,,2018-09-12 00:00:00
1801,ed3efbd3a87bea76c2812c66a0b32219,191984a8ba4cbb2145acb4fe35b69664,canceled,2018-09-20 13:54:16,,,,2018-10-17 00:00:00
1868,df8282afe61008dc26c6c31011474d02,aa797b187b5466bc6925aaaa4bb3bed1,canceled,2017-03-04 12:14:30,,,,2017-04-10 00:00:00
2029,8d4c637f1accf7a88a4555f02741e606,b1dd715db389a2077f43174e7a675d07,canceled,2018-08-29 16:27:49,,,,2018-09-13 00:00:00
2161,7a9d4c7f9b068337875b95465330f2fc,7f71ae48074c0cfec9195f88fcbfac55,canceled,2017-05-01 16:12:39,,,,2017-05-30 00:00:00
...,...,...,...,...,...,...,...,...
97696,5a00b4d35edffc56b825c3646a99ba9d,6a3bdf004ca96338fb5fad1b8d93c2e6,canceled,2017-07-02 15:38:46,,,,2017-07-25 00:00:00
98415,227c804e2a44760671a6a5697ea549e4,62e7477e75e542243ee62a0ba73f410f,canceled,2017-09-28 15:02:56,,,,2017-10-16 00:00:00
98909,e49e7ce1471b4693482d40c2bd3ad196,e4e7ab3f449aeb401f0216f86c2104db,canceled,2018-08-07 11:16:28,,,,2018-08-10 00:00:00
99283,3a3cddda5a7c27851bd96c3313412840,0b0d6095c5555fe083844281f6b093bb,canceled,2018-08-31 16:13:44,,,,2018-10-01 00:00:00


In [49]:
 df_orders[df_orders['order_delivered_carrier_date'].isnull()]

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
6,136cce7faa42fdb2cefd53fdc79a6098,ed0271e0b7da060a393796590e7b737a,invoiced,2017-04-11 12:22:08,2017-04-13 13:25:17,,,2017-05-09 00:00:00
103,0760a852e4e9d89eb77bf631eaaf1c84,d2a79636084590b7465af8ab374a8cf5,invoiced,2018-08-03 17:44:42,2018-08-07 06:15:14,,,2018-08-21 00:00:00
128,15bed8e2fec7fdbadb186b57c46c92f2,f3f0e613e0bdb9c7cee75504f0f90679,processing,2017-09-03 14:22:03,2017-09-03 14:30:09,,,2017-10-03 00:00:00
266,8e24261a7e58791d10cb1bf9da94df5c,64a254d30eed42cd0e6c36dddb88adf0,unavailable,2017-11-16 15:09:28,2017-11-16 15:26:57,,,2017-12-05 00:00:00
324,d3c8851a6651eeff2f73b0e011ac45d0,957f8e082185574de25992dc659ebbc0,processing,2016-10-05 22:44:13,2016-10-06 15:51:05,,,2016-12-09 00:00:00
...,...,...,...,...,...,...,...,...
99283,3a3cddda5a7c27851bd96c3313412840,0b0d6095c5555fe083844281f6b093bb,canceled,2018-08-31 16:13:44,,,,2018-10-01 00:00:00
99313,e9e64a17afa9653aacf2616d94c005b8,b4cd0522e632e481f8eaf766a2646e86,processing,2018-01-05 23:07:24,2018-01-09 07:18:05,,,2018-02-06 00:00:00
99347,a89abace0dcc01eeb267a9660b5ac126,2f0524a7b1b3845a1a57fcf3910c4333,canceled,2018-09-06 18:45:47,,,,2018-09-27 00:00:00
99348,a69ba794cc7deb415c3e15a0a3877e69,726f0894b5becdf952ea537d5266e543,unavailable,2017-08-23 16:28:04,2017-08-28 15:44:47,,,2017-09-15 00:00:00


In [51]:
df_orders[df_orders['order_delivered_customer_date'].isnull()]

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
6,136cce7faa42fdb2cefd53fdc79a6098,ed0271e0b7da060a393796590e7b737a,invoiced,2017-04-11 12:22:08,2017-04-13 13:25:17,,,2017-05-09 00:00:00
44,ee64d42b8cf066f35eac1cf57de1aa85,caded193e8e47b8362864762a83db3c5,shipped,2018-06-04 16:44:48,2018-06-05 04:31:18,2018-06-05 14:32:00,,2018-06-28 00:00:00
103,0760a852e4e9d89eb77bf631eaaf1c84,d2a79636084590b7465af8ab374a8cf5,invoiced,2018-08-03 17:44:42,2018-08-07 06:15:14,,,2018-08-21 00:00:00
128,15bed8e2fec7fdbadb186b57c46c92f2,f3f0e613e0bdb9c7cee75504f0f90679,processing,2017-09-03 14:22:03,2017-09-03 14:30:09,,,2017-10-03 00:00:00
154,6942b8da583c2f9957e990d028607019,52006a9383bf149a4fb24226b173106f,shipped,2018-01-10 11:33:07,2018-01-11 02:32:30,2018-01-11 19:39:23,,2018-02-07 00:00:00
...,...,...,...,...,...,...,...,...
99283,3a3cddda5a7c27851bd96c3313412840,0b0d6095c5555fe083844281f6b093bb,canceled,2018-08-31 16:13:44,,,,2018-10-01 00:00:00
99313,e9e64a17afa9653aacf2616d94c005b8,b4cd0522e632e481f8eaf766a2646e86,processing,2018-01-05 23:07:24,2018-01-09 07:18:05,,,2018-02-06 00:00:00
99347,a89abace0dcc01eeb267a9660b5ac126,2f0524a7b1b3845a1a57fcf3910c4333,canceled,2018-09-06 18:45:47,,,,2018-09-27 00:00:00
99348,a69ba794cc7deb415c3e15a0a3877e69,726f0894b5becdf952ea537d5266e543,unavailable,2017-08-23 16:28:04,2017-08-28 15:44:47,,,2017-09-15 00:00:00


In [50]:
df_orders['order_status'].value_counts()

delivered      96478
shipped         1107
canceled         625
unavailable      609
invoiced         314
processing       301
created            5
approved           2
Name: order_status, dtype: int64

Por se tratar de uma base que tem todos as ordens de compra, há pedidos que foram cancelados ou não foram aprovados por quaiquer motivos. Portanto há valores ausentes nesses casos mencionados.

## Dataset 4 - Pays / Pagamentos

In [52]:
df_pays = pd.read_csv("data/olist_order_payments_dataset.csv")

In [53]:
df_pays.head()

Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value
0,b81ef226f3fe1789b1e8b2acac839d17,1,credit_card,8,99.33
1,a9810da82917af2d9aefd1278f1dcfa0,1,credit_card,1,24.39
2,25e8ea4e93396b6fa0d3dd708e76c1bd,1,credit_card,1,65.71
3,ba78997921bbcdc1373bb41e913ab953,1,credit_card,8,107.78
4,42fdf880ba16b47b59251dd489d4441a,1,credit_card,2,128.45


In [54]:
df_pays.shape[0]

103886

In [55]:
df_pays.isnull().sum() / df_pays.shape[0]

order_id                0.0
payment_sequential      0.0
payment_type            0.0
payment_installments    0.0
payment_value           0.0
dtype: float64

In [57]:
df_pays['payment_sequential'].unique()

array([ 1,  2,  4,  5,  3,  8,  6,  7, 10, 11, 17, 19, 27, 12,  9, 15, 13,
       14, 16, 25, 22, 26, 29, 28, 18, 21, 24, 23, 20], dtype=int64)

In [58]:
df_pays['payment_type'].value_counts()

credit_card    76795
boleto         19784
voucher         5775
debit_card      1529
not_defined        3
Name: payment_type, dtype: int64

In [60]:
df_pays.describe()

Unnamed: 0,payment_sequential,payment_installments,payment_value
count,103886.0,103886.0,103886.0
mean,1.092679,2.853349,154.10038
std,0.706584,2.687051,217.494064
min,1.0,0.0,0.0
25%,1.0,1.0,56.79
50%,1.0,1.0,100.0
75%,1.0,4.0,171.8375
max,29.0,24.0,13664.08


## Dataset 5 - Products / Produtos

In [69]:
df_prodts = pd.read_csv("data/olist_products_dataset.csv")

In [70]:
df_prodts.head()

Unnamed: 0,product_id,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
0,1e9e8ef04dbcff4541ed26657ea517e5,perfumaria,40.0,287.0,1.0,225.0,16.0,10.0,14.0
1,3aa071139cb16b67ca9e5dea641aaa2f,artes,44.0,276.0,1.0,1000.0,30.0,18.0,20.0
2,96bd76ec8810374ed1b65e291975717f,esporte_lazer,46.0,250.0,1.0,154.0,18.0,9.0,15.0
3,cef67bcfe19066a932b7673e239eb23d,bebes,27.0,261.0,1.0,371.0,26.0,4.0,26.0
4,9dc1a7de274444849c219cff195d0b71,utilidades_domesticas,37.0,402.0,4.0,625.0,20.0,17.0,13.0


In [81]:
df_prodts.shape

(32951, 9)

In [82]:
df_prodts.isna().sum() / df_prodts.shape[0]

product_id                    0.000000
product_category_name         0.018512
product_name_lenght           0.018512
product_description_lenght    0.018512
product_photos_qty            0.018512
product_weight_g              0.000061
product_length_cm             0.000061
product_height_cm             0.000061
product_width_cm              0.000061
dtype: float64

## Dataset 6 - Order_Items / Ordens_Itens

In [73]:
df_oitens = pd.read_csv("data/olist_order_items_dataset.csv")

In [76]:
df_oitens.head()

Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
0,00010242fe8c5a6d1ba2dd792cb16214,1,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,2017-09-19 09:45:35,58.9,13.29
1,00018f77f2f0320c557190d7a144bdd3,1,e5f2d52b802189ee658865ca93d83a8f,dd7ddc04e1b6c2c614352b383efe2d36,2017-05-03 11:05:13,239.9,19.93
2,000229ec398224ef6ca0657da4fc703e,1,c777355d18b72b67abbeef9df44fd0fd,5b51032eddd242adc84c38acab88f23d,2018-01-18 14:48:30,199.0,17.87
3,00024acbcdf0a6daa1e931b038114c75,1,7634da152a4610f1595efa32f14722fc,9d7a1d34a5052409006425275ba1c2b4,2018-08-15 10:10:18,12.99,12.79
4,00042b26cf59d7ce69dfabb4e55b4fd9,1,ac6c3623068f30de03045865e4e10089,df560393f3a51e74553ab94004ba5c87,2017-02-13 13:57:51,199.9,18.14


In [79]:
df_oitens.shape

(112650, 7)

In [80]:
df_oitens.isna().sum() / df_oitens.shape[0]

order_id               0.0
order_item_id          0.0
product_id             0.0
seller_id              0.0
shipping_limit_date    0.0
price                  0.0
freight_value          0.0
dtype: float64

## Dataset 7 - Sellers / Vendedores

In [71]:
df_sellers = pd.read_csv("data/olist_sellers_dataset.csv")

In [72]:
df_sellers.head()

Unnamed: 0,seller_id,seller_zip_code_prefix,seller_city,seller_state
0,3442f8959a84dea7ee197c632cb2df15,13023,campinas,SP
1,d1b65fc7debc3361ea86b5f14c68d2e2,13844,mogi guacu,SP
2,ce3ad9de960102d0677a81f5d0bb7b2d,20031,rio de janeiro,RJ
3,c0f3eea2e14555b6faeea3dd58c1b1c3,4195,sao paulo,SP
4,51a04a8a6bdcb23deccc82b0b80742cf,12914,braganca paulista,SP


In [77]:
df_sellers.shape

(3095, 4)

In [78]:
df_sellers.isna().sum() / df_sellers.shape[0]

seller_id                 0.0
seller_zip_code_prefix    0.0
seller_city               0.0
seller_state              0.0
dtype: float64

In [89]:
ss = df_sellers['seller_state'].value_counts().reset_index()
ss['proporcao'] = ss['seller_state'] / ss['seller_state'].sum()
ss['proporcao_acumulada'] = ss['proporcao'].cumsum()
ss

Unnamed: 0,index,seller_state,proporcao,proporcao_acumulada
0,SP,1849,0.597415,0.597415
1,PR,349,0.112763,0.710178
2,MG,244,0.078837,0.789015
3,SC,190,0.061389,0.850404
4,RJ,171,0.05525,0.905654
5,RS,129,0.04168,0.947334
6,GO,40,0.012924,0.960258
7,DF,30,0.009693,0.969952
8,ES,23,0.007431,0.977383
9,BA,19,0.006139,0.983522


In [92]:
print('''{} estados são responsáveis por aproximadamente 78% dos vendedores

Os estados são os seguintes:
{}'''.format(
    ss[ss['proporcao_acumulada'] < 0.79]['index'].count(), ss[ss['proporcao_acumulada'] < 0.79]['index']))

3 estados são responsáveis por aproximadamente 78% dos vendedores

Os estados são os seguintes:
0    SP
1    PR
2    MG
Name: index, dtype: object


In [86]:
sc = df_sellers['seller_city'].value_counts().reset_index()
sc['proporcao'] = sc['seller_city'] / sc['seller_city'].sum()
sc['proporcao_acumulada'] = sc['proporcao'].cumsum()
sc

Unnamed: 0,index,seller_city,proporcao,proporcao_acumulada
0,sao paulo,694,0.224233,0.224233
1,curitiba,127,0.041034,0.265267
2,rio de janeiro,96,0.031018,0.296284
3,belo horizonte,68,0.021971,0.318255
4,ribeirao preto,52,0.016801,0.335057
...,...,...,...,...
606,lages - sc,1,0.000323,0.998708
607,angra dos reis,1,0.000323,0.999031
608,sbc,1,0.000323,0.999354
609,congonhas,1,0.000323,0.999677


In [93]:
print('''{} cidade são responsáveis por aproximadamente 78% dos vendedores

Os estados são os seguintes:
{}'''.format(
    sc[sc['proporcao_acumulada'] < 0.79]['index'].count(), sc[sc['proporcao_acumulada'] < 0.51]['index']))

137 cidade são responsáveis por aproximadamente 78% dos vendedores

Os estados são os seguintes:
0                 sao paulo
1                  curitiba
2            rio de janeiro
3            belo horizonte
4            ribeirao preto
5                 guarulhos
6                  ibitinga
7               santo andre
8                  campinas
9                   maringa
10    sao jose do rio preto
11                 sorocaba
12                   osasco
13    sao bernardo do campo
14             porto alegre
15                 brasilia
16                 londrina
17                  goiania
18                joinville
19                 blumenau
20                   franca
Name: index, dtype: object
