In [1]:
# Import libraries
from joblib import Parallel, delayed
from unidecode import unidecode
from sqlalchemy import create_engine
from dotenv import load_dotenv 
import json
import urllib
import os
import datetime
import psycopg2
import requests
import numpy
import pandas as pd

In [2]:
# Read credentials
load_dotenv()

user = os.getenv('user')
password = os.getenv('password')
host = os.getenv('host')
port = os.getenv('port')
db = os.getenv('db')

In [3]:
# Connect to PostgreSQL database
engine = create_engine(f'postgresql://{user}:{password}@{host}:{port}/{db}')
#engine = create_engine(f'postgresql://postgres:password@localhost:5432/final_project')

In [4]:
# Import CSVs.
# Database of Brazilian cities obtained from https://www.ibge.gov.br/geociencias/organizacao-do-territorio/estrutura-territorial/23701-divisao-territorial-brasileira.html?=&t=downloads
# Use 2018 data in accordance with the timestamps on reviews.

df_sellers = pd.read_csv("C:/Users/Clinton/Desktop/Final Project/olist_sellers_dataset.csv")
#df_geolocation = pd.read_csv("C:/Users/Clinton/Desktop/Final Project/olist_geolocation_dataset.csv")
#df_cities = pd.read_excel("C:/Users/Clinton/Desktop/Final Project/RELATORIO_DTB_BRASIL_MUNICIPIO.xls")

In [5]:
# Inspect head of sellers df
print(df_sellers.head())

                          seller_id  seller_zip_code_prefix  \
0  3442f8959a84dea7ee197c632cb2df15                   13023   
1  d1b65fc7debc3361ea86b5f14c68d2e2                   13844   
2  ce3ad9de960102d0677a81f5d0bb7b2d                   20031   
3  c0f3eea2e14555b6faeea3dd58c1b1c3                    4195   
4  51a04a8a6bdcb23deccc82b0b80742cf                   12914   

         seller_city seller_state  
0           campinas           SP  
1         mogi guacu           SP  
2     rio de janeiro           RJ  
3          sao paulo           SP  
4  braganca paulista           SP  


In [6]:
# Inspect sellers df
print(df_sellers.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3095 entries, 0 to 3094
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   seller_id               3095 non-null   object
 1   seller_zip_code_prefix  3095 non-null   int64 
 2   seller_city             3095 non-null   object
 3   seller_state            3095 non-null   object
dtypes: int64(1), object(3)
memory usage: 96.8+ KB
None


In [7]:
print(df_sellers.describe())

       seller_zip_code_prefix
count             3095.000000
mean             32291.059451
std              32713.453830
min               1001.000000
25%               7093.500000
50%              14940.000000
75%              64552.500000
max              99730.000000


In [8]:
# Check for missing data
print(df_sellers.isna().sum())

seller_id                 0
seller_zip_code_prefix    0
seller_city               0
seller_state              0
dtype: int64


In [9]:
# Check for uniqueness. Note that a state can have multiple cities, a city can have multiple zipcode prefixes, 
# and a zipcode prefix can have multiple sellers. Each seller is its own entry.
print(df_sellers.nunique())

seller_id                 3095
seller_zip_code_prefix    2246
seller_city                611
seller_state                23
dtype: int64


In [10]:
# # Check for variations on same city. There is a number here, and some obvious errors.
# print(df_sellers['seller_city'].sort_values().unique())

['04482255' 'abadia de goias' 'afonso claudio' 'aguas claras df'
 'alambari' 'alfenas' 'almirante tamandare' 'alvares machado' 'alvorada'
 'americana' 'amparo' 'ampere' 'anapolis' 'andira-pr' 'andradas'
 'angra dos reis' 'angra dos reis rj' 'ao bernardo do campo' 'aparecida'
 'aparecida de goiania' 'aperibe' 'apucarana' 'aracaju' 'aracatuba'
 'araguari' 'arapongas' 'araquari' 'ararangua' 'araraquara' 'araras'
 'araucaria' 'araxa' 'arinos' 'armacao dos buzios'
 "arraial d'ajuda (porto seguro)" 'artur nogueira' 'aruja' 'arvorezinha'
 'assis' 'atibaia' 'auriflama' 'auriflama/sp' 'avare' 'bady bassitt'
 'baependi' 'bage' 'bahia' 'balenario camboriu' 'balneario camboriu'
 'bandeirantes' 'barbacena' 'barbacena/ minas gerais' 'bariri'
 'barra mansa' 'barra velha' 'barretos' 'barrinha' 'barro alto' 'barueri'
 'batatais' 'bauru' 'bebedouro' 'belford roxo' 'belo horizont'
 'belo horizonte' 'bento goncalves' 'bertioga' 'betim' 'birigui'
 'blumenau' 'bocaiuva do sul' 'bofete' 'boituva' 'bom jardim

In [11]:
# We should note that Brazilian zipcodes are actually 5 digits with a leading 0 in cases represented by a 4-digit code here.
# We cannot store numbers with leading zeros, so we should convert to string.

# Convert zipcodes to strings and add '0' where zipcode is 4 digits
def zip_to_str(x):
    if len(str(x)) == 4:
        return '0' + str(x)
    else:
        return str(x)
    
df_sellers['seller_zip_code_prefix'] = df_sellers['seller_zip_code_prefix'].apply(zip_to_str)

In [12]:
# # API call to load Brazilian zip-codes, cities, and state codes.
# url = 'https://parseapi.back4app.com/classes/CEP?limit=1000000&excludeKeys=bairro,logradouro,numero,info'
# headers = {
#     'X-Parse-Application-Id': '0yGhkskBgC6LMtROXg0SoyHMyl6yYa4SStdCLBpX', # This is the fake app's application id
#     'X-Parse-Master-Key': 'Dv9aEYXQtwEQRmeR4BMXX8YadeE9CyNy6PJFJPQe' # This is the fake app's readonly master key
# }
# cep = json.loads(requests.get(url, headers=headers).content.decode('utf-8')) # Here you have the data that you need
# #print(json.dumps(data, indent=2))

In [13]:
# # Store state codes/cities/zip-code prefixes in a nested dictionary
# zip_codes = {}
# for i in cep['results']:
#     zip_code = str(i['CEP'])[:5]
#     state = i['estado'][:2]
#     if state not in zip_codes: 
#         zip_codes[state] = {i['cidade']: set()}
#     elif i['cidade'] not in zip_codes[state]:
#         zip_codes[state][i['cidade']] = set()
#     zip_codes[state][i['cidade']].add(zip_code)

In [15]:
# # Check for mismatching state-codes for given zip-code.
# # We manually verify that the incorrect city entries here have the wrong state codes.
# # We conclude these entries are incorrect and subsequently replace them.
# incorrect = set()
# count_city = {}
# count_state = {}
# for state in zip_codes:
#     for city in zip_codes[state]:
#         for zip_code in zip_codes[state][city]:
#             if zip_code in df_sellers['seller_zip_code_prefix'].values:
#                 name = df_sellers.loc[df_sellers['seller_zip_code_prefix'] == zip_code, 'seller_city'].iloc[0]
#                 state2 = df_sellers.loc[df_sellers['seller_zip_code_prefix'] == zip_code, 'seller_state'].iloc[0]
#                 try:
#                     assert state2 == state
#                 except:
#                     try:
#                         count_state[state2] +=1
#                         count_city[name] +=1
#                     except:
#                         count_state[state2] =1
#                         count_city[name] =1
#                     incorrect.add((state2, name))
# print(incorrect)
# print(count_state)
# print(count_city)

{('SP', 'aguas claras df'), ('SP', 'goioere'), ('SP', 'rio bonito'), ('SP', 'chapeco'), ('SP', 'caxias do sul'), ('SP', 'marechal candido rondon'), ('SP', 'laranjeiras do sul'), ('SP', 'laguna'), ('SP', 'rio de janeiro'), ('SP', 'porto alegre'), ('SP', 'tocantins'), ('SP', 'sao jose dos pinhais'), ('SP', 'andradas'), ('SP', 'curitiba'), ('SP', 'volta redonda'), ('SP', 'belo horizonte'), ('SP', 'vila velha'), ('SP', 'sertanopolis'), ('SP', 'juiz de fora'), ('RN', 'rio de janeiro'), ('SP', 'londrina')}
{'RN': 1, 'SP': 2}
{'rio de janeiro': 1, 'volta redonda': 1, 'rio bonito': 1, 'vila velha': 1, 'belo horizonte': 1, 'juiz de fora': 1, 'tocantins': 1, 'andradas': 1, 'aguas claras df': 1, 'curitiba': 1, 'sao jose dos pinhais': 1, 'laranjeiras do sul': 1, 'marechal candido rondon': 1, 'londrina': 1, 'sertanopolis': 1, 'goioere': 1, 'laguna': 1, 'chapeco': 1, 'porto alegre': 1, 'caxias do sul': 2}


In [16]:
# Remove accents (Optional step)
#df_sellers['seller_city'] = df_sellers['seller_city'].apply(lambda x: unidecode(x))

#print(df_sellers['seller_city'].sort_values().unique())

In [18]:
# Remove redundant columns
df_sellers.drop(labels=['seller_city', 'seller_state'], axis=1, inplace=True)

In [17]:
# Write DataFrame to SQL
df_sellers.to_sql('sellers', con=engine, if_exists='replace', index=False)

InternalError: (psycopg2.errors.DependentObjectsStillExist) cannot drop table sellers because other objects depend on it
DETAIL:  view seller_count_by_zip depends on table sellers
view seller_count_by_location depends on view seller_count_by_zip
HINT:  Use DROP ... CASCADE to drop the dependent objects too.

[SQL: 
DROP TABLE sellers]
(Background on this error at: https://sqlalche.me/e/14/2j85)