In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
# Zip-code prefix columns are read as text (dtype str) rather than parsed as numbers.
geo = pd.read_csv(
    "DataSet/olist_geolocation_dataset.csv",
    dtype={"geolocation_zip_code_prefix": str},
)
customers = pd.read_csv(
    "DataSet/olist_customers_dataset.csv",
    dtype={"customer_zip_code_prefix": str},
)
sellers = pd.read_csv(
    "DataSet/olist_sellers_dataset.csv",
    dtype={"seller_zip_code_prefix": str},
)
# The orders table is loaded with pandas' default dtype inference.
orders = pd.read_csv("DataSet/olist_orders_dataset.csv")

In [3]:
# The geolocation table contains several rows for the same zip-code prefix.
# Building a GeoDataFrame from the duplicated rows did not succeed, so the
# duplicates are removed: only the first row seen for each prefix is kept.
geo = geo.drop_duplicates(subset='geolocation_zip_code_prefix', keep='first')

In [4]:
# Preview the first five de-duplicated geolocation rows.
geo.head(n=5)

Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
0,1037,-23.545621,-46.639292,sao paulo,SP
1,1046,-23.546081,-46.64482,sao paulo,SP
3,1041,-23.544392,-46.639499,sao paulo,SP
4,1035,-23.541578,-46.641607,sao paulo,SP
5,1012,-23.547762,-46.635361,são paulo,SP


In [5]:
# Preview the first five customer rows.
customers.head(n=5)

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP
1,18955e83d337fd6b2def6b18a428ac77,290c77bc529b7ac935b93aa66c333dc3,9790,sao bernardo do campo,SP
2,4e7b3e00288586ebd08712fdd0374a03,060e732b5b29e8181a18229c7b0b2b5e,1151,sao paulo,SP
3,b2b6027bc5c5109e529d4dc6358b12c3,259dac757896d24d7702b9acbbff3f3c,8775,mogi das cruzes,SP
4,4f2d8ab171c80ec8364f7c12e35b23ad,345ecd01c38d18a9036ed96c73b8d066,13056,campinas,SP


In [6]:
# Preview the first five seller rows.
sellers.head(n=5)

Unnamed: 0,seller_id,seller_zip_code_prefix,seller_city,seller_state
0,3442f8959a84dea7ee197c632cb2df15,13023,campinas,SP
1,d1b65fc7debc3361ea86b5f14c68d2e2,13844,mogi guacu,SP
2,ce3ad9de960102d0677a81f5d0bb7b2d,20031,rio de janeiro,RJ
3,c0f3eea2e14555b6faeea3dd58c1b1c3,4195,sao paulo,SP
4,51a04a8a6bdcb23deccc82b0b80742cf,12914,braganca paulista,SP


## Zip_Code

### Understanding the Zip_Code
Brinco de Ouro da Princesa
Av. Imperatriz Dona Tereza Cristina, 87
Jardim Guarani, Campinas
SP, 13100-200, Brazil


    1 – the first character indicates the region, in our case São Paulo
    3 – the second character indicates the sub-region, Campinas
    1 – the third character indicates the sector, Campinas and surroundings
    0 – the fourth character indicates the sub-sector
    0 – the fifth character indicates the sub-sector divider
    200 – the last three characters indicate the delivery area, respectively the exact address

ref - https://www.europacco.com/en/find-zip/br?country=br

In [7]:
# Look at the geolocation rows again, this time focusing on the zip-code prefix column.
geo.head(n=5)

Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
0,1037,-23.545621,-46.639292,sao paulo,SP
1,1046,-23.546081,-46.64482,sao paulo,SP
3,1041,-23.544392,-46.639499,sao paulo,SP
4,1035,-23.541578,-46.641607,sao paulo,SP
5,1012,-23.547762,-46.635361,são paulo,SP


### Finding 1
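When I tried to join `sellers` with `geo`, the geo dataset turned out to have **missing values**: some zip-code prefixes have no coordinates.
I tried looking up the (lat, lng) values on Google Maps using the nearest zip code,
but the results turned out not to be exactly correct.
Instead, I'll fill the missing values with the **nearest value**, i.e. the values of the neighbouring zip-code prefix once the rows are sorted by prefix.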

In [8]:
# Collect every zip-code prefix that appears for a seller or a customer
# (sellers first, then customers) into a single one-column frame whose column
# name matches the key column of the geolocation table.
seller_prefixes = sellers['seller_zip_code_prefix'].tolist()
customer_prefixes = customers['customer_zip_code_prefix'].tolist()
zipcodes = pd.DataFrame(
    {"geolocation_zip_code_prefix": seller_prefixes + customer_prefixes}
)
zipcodes.shape

(102536, 1)

In [9]:
# Preview the first five collected prefixes.
zipcodes.head(n=5)

Unnamed: 0,geolocation_zip_code_prefix
0,13023
1,13844
2,20031
3,4195
4,12914


In [10]:
# Keep one row per distinct prefix (the first occurrence of each).
zipcodes = zipcodes.drop_duplicates()
zipcodes.shape

(15078, 1)

In [11]:
# Outer join on the prefix: prefixes used by customers/sellers that have no
# geolocation row come through with NaN in the geolocation columns, and
# geolocation rows whose prefix is not used by any customer/seller are kept too.
geo = zipcodes.merge(geo, how='outer', on='geolocation_zip_code_prefix')
geo.head(n=5)

Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
0,1001,-23.549292,-46.633559,sao paulo,SP
1,1002,-23.548318,-46.635421,sao paulo,SP
2,1003,-23.549032,-46.635313,sao paulo,SP
3,1004,-23.550116,-46.635122,sao paulo,SP
4,1005,-23.549819,-46.635606,sao paulo,SP


In [12]:
# Number of missing values per column after the outer join.
geo.isna().sum()

geolocation_zip_code_prefix      0
geolocation_lat                162
geolocation_lng                162
geolocation_city               162
geolocation_state              162
dtype: int64

In [13]:
# Impute the geolocation columns for prefixes that have no coordinates by
# borrowing the values of the neighbouring prefix.
geo = (
    geo
    # Guarantee exactly one row per prefix before ordering.
    .drop_duplicates(subset=['geolocation_zip_code_prefix'])
    # The prefixes are strings, and a plain string sort only puts numerically
    # close prefixes next to each other when every value is zero-padded to the
    # same width. Sorting on the numeric value gives the same order in that
    # case and the intended order otherwise. Values that cannot be parsed
    # become NaN and are placed last.
    .sort_values(
        by='geolocation_zip_code_prefix',
        key=lambda prefixes: pd.to_numeric(prefixes, errors='coerce'),
    )
    # Fills all columns (lat, lng, city, state) from the preceding prefix.
    .ffill()
    # Forward fill cannot reach gaps at the very top of the frame (no preceding
    # row), so fall back to the following prefix for those.
    .bfill()
)
print(geo.shape)

(19177, 5)


In [14]:
def save_file(df, filename):
    """Write ``df`` as a timestamped CSV file inside the ``DataSet/`` folder.

    The file is named ``DataSet/<filename>_<YYYY-MM-DD_HH-MM-SS>.csv``, where the
    timestamp is taken from ``datetime.now()`` at call time. The index is not
    written.

    Args:
        df: Object exposing ``to_csv`` (presumably a pandas DataFrame).
        filename: Base name of the output file, without directory or extension.

    Returns:
        None.
    """
    # Timestamp layout: year-month-day, underscore, hour-minute-second.
    stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    df.to_csv(f"DataSet/{filename}_{stamp}.csv", index=False)

In [15]:
# Persist the completed geolocation table as a timestamped CSV.
save_file(df=geo, filename="geolocation")