In [1]:
import os
import pandas as pd
import numpy as np
import time
import sqlalchemy as db
import unidecode
from datetime import date
from dotenv import load_dotenv

# Download and inspect Dataset

In [51]:
# Load environment variables from .env file
load_dotenv()

# Access environment variable
geo_csv_local=os.environ['olist_geolocation_local']
geo_csv_azure=os.environ['olist_geolocation_azure']
customer_csv=os.environ['olist_customers_local']
seller_zip_csv=os.environ['olist_seller_zip_local']
db_local=os.environ['olist_db_local']
db_azure=os.environ['olist_db_azure']
export_path_local=os.environ['export_path_local']

In [3]:
# Download data from storage, convert to a dataframe
geo_df = pd.read_csv(geo_csv_local)
geo_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000163 entries, 0 to 1000162
Data columns (total 5 columns):
 #   Column                       Non-Null Count    Dtype  
---  ------                       --------------    -----  
 0   geolocation_zip_code_prefix  1000163 non-null  int64  
 1   geolocation_lat              1000163 non-null  float64
 2   geolocation_lng              1000163 non-null  float64
 3   geolocation_city             1000163 non-null  object 
 4   geolocation_state            1000163 non-null  object 
dtypes: float64(2), int64(1), object(2)
memory usage: 38.2+ MB


In [4]:
# Look at first 5 entries
geo_df.head()

Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
0,1037,-23.545621,-46.639292,sao paulo,SP
1,1046,-23.546081,-46.64482,sao paulo,SP
2,1046,-23.546129,-46.642951,sao paulo,SP
3,1041,-23.544392,-46.639499,sao paulo,SP
4,1035,-23.541578,-46.641607,sao paulo,SP


In [5]:
# Zip code prefix should be 5 digits long with leading zeroes
geo_df['geolocation_zip_code_prefix'] = geo_df['geolocation_zip_code_prefix'].apply(lambda x: '{0:0>5}'.format(x))

In [6]:
# Change lat/lng fields to strings to check for duplicates
geo_df['geolocation_lat'] = geo_df['geolocation_lat'].apply(str)
geo_df['geolocation_lng'] = geo_df['geolocation_lng'].apply(str)
geo_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000163 entries, 0 to 1000162
Data columns (total 5 columns):
 #   Column                       Non-Null Count    Dtype 
---  ------                       --------------    ----- 
 0   geolocation_zip_code_prefix  1000163 non-null  object
 1   geolocation_lat              1000163 non-null  object
 2   geolocation_lng              1000163 non-null  object
 3   geolocation_city             1000163 non-null  object
 4   geolocation_state            1000163 non-null  object
dtypes: object(5)
memory usage: 38.2+ MB


In [7]:
# Count duplicates
num_dupes = geo_df.duplicated()
print(num_dupes.sum() / len(geo_df))

0.2617883285024541


In [8]:
# Inspect unique values in city (1)
city = list(geo_df['geolocation_city'].unique())
print('Unique city values: ', geo_df['geolocation_city'].nunique(), '\n', city)

Unique city values:  8011 
 ['sao paulo', 'são paulo', 'sao bernardo do campo', 'jundiaí', 'taboão da serra', 'sãopaulo', 'sp', 'sa£o paulo', 'sao jose dos campos', 'osasco', 'carapicuíba', 'carapicuiba', 'barueri', 'santana de parnaiba', 'pirapora do bom jesus', 'santana de parnaíba', 'jandira', 'itapevi', 'cotia', 'taboao da serra', 'vargem grande paulista', 'embu das artes', 'itapecerica da serra', 'embu', 'são lourenço da serra', 'sao lourenco da serra', 'embu-guacu', 'embu-guaçu', 'embu guaçu', 'juquitiba', 'embu guacu', 'embuguacu', 'guarulhos', 'adamantina', 'guarulhos-sp', 'aruja', 'arujá', 'santa isabel', 'mairipora', 'mairiporã', 'cajamar', 'caieiras', 'jordanesia', 'polvilho', 'mauá', 'jordanésia', 'franco da rocha', 'francisco morato', 'poa', 'itaquaquecetuba', 'ferraz de vasconcelos', 'poá', 'suzano', 'mogi das cruzes', 'mogidascruzes', 'salesopolis', 'biritiba-mirim', 'guararema', 'salesópolis', 'biritiba mirim', 'santo andre', 'santo andré', 'maua', 'ribeirão pires', 'ri

In [9]:
# Write function to replace special characters (portuguese orthography)
def replace_special_characters(df, column):
    df[column] = df[column].apply(unidecode.unidecode)
    return df

In [10]:
# Run function on merged_df
replace_special_characters(geo_df, 'geolocation_city')

Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
0,01037,-23.54562128115268,-46.63929204800168,sao paulo,SP
1,01046,-23.54608112703553,-46.64482029837157,sao paulo,SP
2,01046,-23.54612896641469,-46.64295148361138,sao paulo,SP
3,01041,-23.5443921648681,-46.63949930627844,sao paulo,SP
4,01035,-23.541577961711493,-46.64160722329613,sao paulo,SP
...,...,...,...,...,...
1000158,99950,-28.06863887662893,-52.01070524698279,tapejara,RS
1000159,99900,-27.87712511771236,-52.22488216264145,getulio vargas,RS
1000160,99950,-28.07185541645192,-52.01471586864233,tapejara,RS
1000161,99980,-28.38893187660049,-51.84687132274168,david canabarro,RS


In [11]:
# Count duplicates
num_dupes = geo_df.duplicated()
print(num_dupes.sum() / len(geo_df))

0.27962142170826154


In [12]:
# Inspect unique values in city
city = list(geo_df['geolocation_city'].unique())
print('Unique city values: ', geo_df['geolocation_city'].nunique(), '\n', city)

Unique city values:  5969 
 ['sao paulo', 'sao bernardo do campo', 'jundiai', 'taboao da serra', 'saopaulo', 'sp', 'saPSo paulo', 'sao jose dos campos', 'osasco', 'carapicuiba', 'barueri', 'santana de parnaiba', 'pirapora do bom jesus', 'jandira', 'itapevi', 'cotia', 'vargem grande paulista', 'embu das artes', 'itapecerica da serra', 'embu', 'sao lourenco da serra', 'embu-guacu', 'embu guacu', 'juquitiba', 'embuguacu', 'guarulhos', 'adamantina', 'guarulhos-sp', 'aruja', 'santa isabel', 'mairipora', 'cajamar', 'caieiras', 'jordanesia', 'polvilho', 'maua', 'franco da rocha', 'francisco morato', 'poa', 'itaquaquecetuba', 'ferraz de vasconcelos', 'suzano', 'mogi das cruzes', 'mogidascruzes', 'salesopolis', 'biritiba-mirim', 'guararema', 'biritiba mirim', 'santo andre', 'ribeirao pires', 'rio grande da serra', 'sao caetano do sul', 'sbcampo', 'diadema', 'santos', 'bertioga', 'caruara', 'sao vicente', 'guaruja', 'cubatao', 'sao sebastiao', 'ubatuba', 'ilhabela', 'caraguatatuba', 'maresias', 

In [13]:
# Inspect duplicates to verify
num_dupes = geo_df.duplicated(keep=False)
display(geo_df[num_dupes].sort_values(['geolocation_lat', 'geolocation_lng']))

Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
774591,68903,-0.002317920530416,-51.09144531474383,macapa,AP
774648,68903,-0.002317920530416,-51.09144531474383,macapa,AP
774337,68903,-0.0048387683330362,-51.091224003171845,macapa,AP
774732,68903,-0.0048387683330362,-51.091224003171845,macapa,AP
775064,68903,-0.0048387683330362,-51.091224003171845,macapa,AP
...,...,...,...,...,...
769391,68275,42.16725082536653,-6.898559051071016,porto trombetas,PA
769436,68275,42.16725082536653,-6.898559051071016,porto trombetas,PA
769489,68275,42.16725082536653,-6.898559051071016,porto trombetas,PA
860562,83252,42.18400274298598,-8.723762147513938,ilha dos valadares,PR


In [14]:
# Drop duplicates
geo_df.drop_duplicates(inplace = True)
geo_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 720496 entries, 0 to 1000161
Data columns (total 5 columns):
 #   Column                       Non-Null Count   Dtype 
---  ------                       --------------   ----- 
 0   geolocation_zip_code_prefix  720496 non-null  object
 1   geolocation_lat              720496 non-null  object
 2   geolocation_lng              720496 non-null  object
 3   geolocation_city             720496 non-null  object
 4   geolocation_state            720496 non-null  object
dtypes: object(5)
memory usage: 33.0+ MB


# Check for dupes based on location info only

In [15]:
# Can 1 city have multiple zip code prefixes? Yes
# Can there be duplicate zip code prefixes? Yes
subset = ['geolocation_zip_code_prefix', 'geolocation_lat', 'geolocation_lng']
loc_dupe = geo_df.duplicated(subset = subset, keep = False)
geo_df[loc_dupe].sort_values(by = 'geolocation_lat')

Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
813290,76850,-10.78071026459307,-65.33175973540759,guajara-mirim,RO
813631,76850,-10.78071026459307,-65.33175973540759,guajara mirim,RO
815072,76930,-11.33501778449655,-62.276737115089645,alvorada do oeste,RO
814616,76930,-11.33501778449655,-62.276737115089645,alvorada d'oeste,RO
815291,76963,-11.421487424569468,-61.44937318799787,cacoal,RO
...,...,...,...,...,...
813109,76868,-9.431109145449556,-61.99530666279655,machadinho d oeste,RO
813385,76846,-9.655558978861762,-65.73848496097509,vista alegre do abuna,RO
814075,76846,-9.655558978861762,-65.73848496097509,porto velho,RO
779381,69919,-9.958020162651277,-67.84582514698864,rio bracnco,AC


In [16]:
# Remove dupes
geo_df = geo_df.drop_duplicates(subset = subset, keep = 'first').reset_index(drop = True)
geo_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 720154 entries, 0 to 720153
Data columns (total 5 columns):
 #   Column                       Non-Null Count   Dtype 
---  ------                       --------------   ----- 
 0   geolocation_zip_code_prefix  720154 non-null  object
 1   geolocation_lat              720154 non-null  object
 2   geolocation_lng              720154 non-null  object
 3   geolocation_city             720154 non-null  object
 4   geolocation_state            720154 non-null  object
dtypes: object(5)
memory usage: 27.5+ MB


In [17]:
display(geo_df.sort_values('geolocation_zip_code_prefix'))

Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
84,01001,-23.54929199999999,-46.633559478233785,sao paulo,SP
165,01001,-23.550497706907517,-46.63433817805407,sao paulo,SP
184,01001,-23.55064182209015,-46.63440979032252,sao paulo,SP
218,01001,-23.54969829946912,-46.63390859285005,sao paulo,SP
370,01001,-23.550263371631395,-46.63419639384839,sao paulo,SP
...,...,...,...,...,...
720105,99980,-28.389731581235868,-51.85075455842014,david canabarro,RS
719802,99980,-28.38921783052984,-51.84601199170674,david canabarro,RS
720153,99980,-28.38893187660049,-51.84687132274168,david canabarro,RS
719916,99990,-28.329471980815523,-51.76910882440838,muliterno,RS


In [18]:
# Inspect unique values in city (2)
print('Unique city values: ', geo_df['geolocation_city'].nunique())

Unique city values:  5957


# Use city_names from customer table

In [21]:
# Import customers database
cust_df = pd.read_csv(customer_csv)
cust_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 5 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   customer_id               99441 non-null  object
 1   customer_unique_id        99441 non-null  object
 2   customer_zip_code_prefix  99441 non-null  int64 
 3   customer_city             99441 non-null  object
 4   customer_state            99441 non-null  object
dtypes: int64(1), object(4)
memory usage: 3.8+ MB


In [22]:
# Change field to string object to match geolocation table
cust_df['customer_zip_code_prefix'] = cust_df['customer_zip_code_prefix'].apply(str)

In [23]:
# Extract zip_code and city fields only
zip_df = cust_df[['customer_zip_code_prefix', 'customer_city']]

In [24]:
# Drop duplicates
zip = zip_df.drop_duplicates()

In [25]:
display(zip.sort_values('customer_zip_code_prefix'))

Unnamed: 0,customer_zip_code_prefix,customer_city
23914,1003,sao paulo
11747,1004,sao paulo
11378,1005,sao paulo
20535,1006,sao paulo
24302,1007,sao paulo
...,...,...
25169,99960,charrua
17275,99965,agua santa
41113,99970,ciriaco
956,99980,david canabarro


In [26]:
# Merge geo_df with zip with a 'left join'
merged_df = geo_df.merge(zip, how='left', left_on='geolocation_zip_code_prefix', right_on='customer_zip_code_prefix')

In [27]:
# Replace geolocation_city with customer_city only if customer_city exists
merged_df['geolocation_city'] = merged_df['customer_city'].combine_first(merged_df['geolocation_city'])

In [28]:
# Remove customer columns
merged_df.drop(columns=['customer_city', 'customer_zip_code_prefix'], inplace=True)

In [29]:
# Inspect unique values in city
city = list(merged_df['geolocation_city'].unique())
print('Unique city values: ', merged_df['geolocation_city'].nunique(), '\n', city)

Unique city values:  5847 
 ['sao paulo', 'jundiai', 'taboao da serra', 'saopaulo', 'sp', 'sao jose dos campos', 'osasco', 'carapicuiba', 'barueri', 'santana de parnaiba', 'pirapora do bom jesus', 'jandira', 'itapevi', 'cotia', 'vargem grande paulista', 'embu das artes', 'itapecerica da serra', 'embu', 'sao lourenco da serra', 'embu-guacu', 'embu guacu', 'juquitiba', 'embuguacu', 'guarulhos', 'adamantina', 'guarulhos-sp', 'aruja', 'santa isabel', 'mairipora', 'cajamar', 'caieiras', 'jordanesia', 'polvilho', 'maua', 'franco da rocha', 'francisco morato', 'poa', 'itaquaquecetuba', 'ferraz de vasconcelos', 'suzano', 'mogi das cruzes', 'salesopolis', 'biritiba-mirim', 'guararema', 'biritiba mirim', 'santo andre', 'ribeirao pires', 'rio grande da serra', 'sao caetano do sul', 'sao bernardo do campo', 'sbcampo', 'diadema', 'santos', 'bertioga', 'caruara', 'sao vicente', 'guaruja', 'cubatao', 'sao sebastiao', 'ubatuba', 'ilhabela', 'caraguatatuba', 'maresias', 'mongagua', 'praia grande', 'ita

In [30]:
display(merged_df)

Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
0,01037,-23.54562128115268,-46.63929204800168,sao paulo,SP
1,01046,-23.54608112703553,-46.64482029837157,sao paulo,SP
2,01046,-23.54612896641469,-46.64295148361138,sao paulo,SP
3,01041,-23.5443921648681,-46.63949930627844,sao paulo,SP
4,01035,-23.541577961711493,-46.64160722329613,sao paulo,SP
...,...,...,...,...,...
722416,99965,-28.180655165648112,-52.034366727397455,agua santa,RS
722417,99950,-28.072187652217004,-52.01127167058397,tapejara,RS
722418,99950,-28.068863628939333,-52.01296438020919,tapejara,RS
722419,99950,-28.06863887662893,-52.01070524698279,tapejara,RS


   # Use city name from seller table where not exist in customer table

In [31]:
# Import seller zip code file, prepped in pgadmin
seller_df = pd.read_csv(seller_zip_csv)
seller_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85 entries, 0 to 84
Data columns (total 2 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   seller_city             85 non-null     object
 1   seller_zip_code_prefix  85 non-null     int64 
dtypes: int64(1), object(1)
memory usage: 1.5+ KB


In [32]:
# Drop duplicates
seller_df = seller_df.drop_duplicates()
seller_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85 entries, 0 to 84
Data columns (total 2 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   seller_city             85 non-null     object
 1   seller_zip_code_prefix  85 non-null     int64 
dtypes: int64(1), object(1)
memory usage: 1.5+ KB


In [33]:
display(seller_df.sort_values('seller_zip_code_prefix'))

Unnamed: 0,seller_city,seller_zip_code_prefix
48,sao paulo,1001
49,sao paulo,1039
50,sao paulo,1040
51,sao paulo,1126
52,sao paulo,1212
...,...,...
10,cordilheira alta,89819
34,porto alegre,91901
13,estancia velha,93608
4,bento goncalves,95711


In [34]:
# Change field to string object to match geolocation table
seller_df['seller_zip_code_prefix'] = seller_df['seller_zip_code_prefix'].apply(str)

In [35]:
# Merge merged_df with zip with a 'left join'
final_df = merged_df.merge(seller_df, how='left', left_on='geolocation_zip_code_prefix', right_on='seller_zip_code_prefix')

In [36]:
final_df.head()

Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state,seller_city,seller_zip_code_prefix
0,1037,-23.54562128115268,-46.63929204800168,sao paulo,SP,,
1,1046,-23.54608112703553,-46.64482029837157,sao paulo,SP,,
2,1046,-23.54612896641469,-46.64295148361138,sao paulo,SP,,
3,1041,-23.5443921648681,-46.63949930627844,sao paulo,SP,,
4,1035,-23.541577961711493,-46.64160722329613,sao paulo,SP,,


In [37]:
# Replace geolocation_city with seller_city only if seller_city exists
final_df['geolocation_city'] = final_df['seller_city'].combine_first(final_df['geolocation_city'])

In [38]:
# Remove customer columns
final_df.drop(columns=['seller_city', 'seller_zip_code_prefix'], inplace=True)

In [39]:
display(final_df)

Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
0,01037,-23.54562128115268,-46.63929204800168,sao paulo,SP
1,01046,-23.54608112703553,-46.64482029837157,sao paulo,SP
2,01046,-23.54612896641469,-46.64295148361138,sao paulo,SP
3,01041,-23.5443921648681,-46.63949930627844,sao paulo,SP
4,01035,-23.541577961711493,-46.64160722329613,sao paulo,SP
...,...,...,...,...,...
722449,99965,-28.180655165648112,-52.034366727397455,agua santa,RS
722450,99950,-28.072187652217004,-52.01127167058397,tapejara,RS
722451,99950,-28.068863628939333,-52.01296438020919,tapejara,RS
722452,99950,-28.06863887662893,-52.01070524698279,tapejara,RS


# Look for dupes

In [40]:
# Count complete duplicates
num_dupes = final_df.duplicated()
num_dupes.sum()

0

In [41]:
# Check for duplicates based on location info only
subset = ['geolocation_zip_code_prefix', 'geolocation_lat', 'geolocation_lng']
loc_dupe = final_df.duplicated(subset = subset, keep = False)
final_df[loc_dupe].sort_values(by = 'geolocation_lat')

Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
506105,55485,-10.895990269778954,-37.06039133666111,santo antonio das queimadas,PE
506104,55485,-10.895990269778954,-37.06039133666111,jurema,PE
491702,48355,-11.716065903961772,-38.227774780029016,apora,BA
491701,48355,-11.716065903961772,-38.227774780029016,itamira,BA
491740,48355,-11.720562650666556,-38.22460325413346,apora,BA
...,...,...,...,...,...
528769,62600,-3.74011963481903,-39.5452731337601,itapaje,CE
506262,55485,-8.040275033445656,-34.875009162122446,santo antonio das queimadas,PE
506261,55485,-8.040275033445656,-34.875009162122446,jurema,PE
506290,55485,-8.760694500000003,-36.190030500000006,santo antonio das queimadas,PE


In [42]:
# Remove dupes
final_df = final_df.drop_duplicates(subset = subset, keep = 'first').reset_index(drop = True)
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 720154 entries, 0 to 720153
Data columns (total 5 columns):
 #   Column                       Non-Null Count   Dtype 
---  ------                       --------------   ----- 
 0   geolocation_zip_code_prefix  720154 non-null  object
 1   geolocation_lat              720154 non-null  object
 2   geolocation_lng              720154 non-null  object
 3   geolocation_city             720154 non-null  object
 4   geolocation_state            720154 non-null  object
dtypes: object(5)
memory usage: 27.5+ MB


# Rename Cols to remove 'geolocation'

In [43]:
# Rename columns
final_df.rename(columns={'geolocation_zip_code_prefix': 'zip_code_prefix', 'geolocation_lat': 'lat', 'geolocation_lng': 'lng', 'geolocation_city': 'city', 'geolocation_state': 'state',}, inplace=True)
final_df.head()

Unnamed: 0,zip_code_prefix,lat,lng,city,state
0,1037,-23.54562128115268,-46.63929204800168,sao paulo,SP
1,1046,-23.54608112703553,-46.64482029837157,sao paulo,SP
2,1046,-23.54612896641469,-46.64295148361138,sao paulo,SP
3,1041,-23.5443921648681,-46.63949930627844,sao paulo,SP
4,1035,-23.541577961711493,-46.64160722329613,sao paulo,SP


In [44]:
# Inspect unique values in city
city = final_df['city'].unique().tolist()
print('Unique city values: ', final_df['city'].nunique(), '\n', city)

Unique city values:  5831 
 ['sao paulo', 'jundiai', 'taboao da serra', 'saopaulo', 'sp', 'sao jose dos campos', 'osasco', 'carapicuiba', 'barueri', 'santana de parnaiba', 'pirapora do bom jesus', 'jandira', 'itapevi', 'cotia', 'vargem grande paulista', 'embu das artes', 'itapecerica da serra', 'embu', 'sao lourenco da serra', 'embu-guacu', 'embu guacu', 'juquitiba', 'embuguacu', 'guarulhos', 'adamantina', 'guarulhos-sp', 'aruja', 'santa isabel', 'mairipora', 'cajamar', 'caieiras', 'jordanesia', 'polvilho', 'maua', 'franco da rocha', 'francisco morato', 'poa', 'itaquaquecetuba', 'ferraz de vasconcelos', 'suzano', 'mogi das cruzes', 'salesopolis', 'biritiba-mirim', 'guararema', 'biritiba mirim', 'santo andre', 'ribeirao pires', 'rio grande da serra', 'sao caetano do sul', 'sao bernardo do campo', 'sbcampo', 'diadema', 'santos', 'bertioga', 'caruara', 'sao vicente', 'guaruja', 'cubatao', 'sao sebastiao', 'ubatuba', 'ilhabela', 'caraguatatuba', 'maresias', 'mongagua', 'praia grande', 'ita

In [45]:
# Check for duplicates based on location info only
subset = ['lat', 'lng']
loc_dupe = final_df.duplicated(subset = subset, keep = False)
final_df[loc_dupe].sort_values(by = 'lat')

Unnamed: 0,zip_code_prefix,lat,lng,city,state
547948,68925,-0.0359277645283952,-51.16788044715518,santana,AP
548052,68927,-0.0359277645283952,-51.16788044715518,santana,AP
547904,68926,-0.0535231382433362,-51.16074622168749,santana,AP
547805,68925,-0.0535231382433362,-51.16074622168749,santana,AP
547631,68700,-1.2106477480160116,-47.17575463799356,capanema,PA
...,...,...,...,...,...
548223,68903,0.0037175935876586,-51.06095157995029,macapa,AP
548234,68904,0.0186163640328727,-51.087058404002775,macapa,AP
548150,68902,0.0186163640328727,-51.087058404002775,macapa,AP
548075,68906,0.0349717437610148,-51.062614502768845,macapa,AP


In [46]:
# Final sort by zip_code_prefix and re-index
final_df.sort_values(['zip_code_prefix'], ignore_index=True, inplace=True)
final_df

Unnamed: 0,zip_code_prefix,lat,lng,city,state
0,01001,-23.54929199999999,-46.633559478233785,sao paulo,SP
1,01001,-23.550497706907517,-46.63433817805407,sao paulo,SP
2,01001,-23.55064182209015,-46.63440979032252,sao paulo,SP
3,01001,-23.54969829946912,-46.63390859285005,sao paulo,SP
4,01001,-23.550263371631395,-46.63419639384839,sao paulo,SP
...,...,...,...,...,...
720149,99980,-28.389731581235868,-51.85075455842014,david canabarro,RS
720150,99980,-28.38921783052984,-51.84601199170674,david canabarro,RS
720151,99980,-28.38893187660049,-51.84687132274168,david canabarro,RS
720152,99990,-28.329471980815523,-51.76910882440838,muliterno,RS


# Export to csv

In [52]:
# Export to .csv file
today = date.today()
final_df.to_csv(f'{export_path_local}geolocation_{today}.csv')