# Initial exploratory

In [1]:
import pandas as pd

## Import CSV file

In [2]:
# Load the customers table (file 1 of 9 in the dataset) into a DataFrame
customers_csv = 'olist_customers_dataset.csv'
df = pd.read_csv(customers_csv)

## How many Rows and Columns?

In [3]:
# Report the dataset dimensions: df.shape is a (rows, columns) tuple
shape = df.shape
n_rows, n_cols = shape
print(f'There are {n_rows} rows and {n_cols} columns.')

There are 99441 rows and 5 columns.


## Are there any duplicate rows?

In [4]:
# Mark every row that repeats an earlier row across all columns,
# then report whether at least one such row exists
duplicate_mask = df.duplicated()
duplicate_mask.any()

False

## What is the data type of each column?

In [5]:
# List the dtype pandas inferred for each column when the CSV was read
df.dtypes

customer_id                 object
customer_unique_id          object
customer_zip_code_prefix     int64
customer_city               object
customer_state              object
dtype: object

---
- The column 'customer_zip_code_prefix' was read as an integer (`int64`).
- A zip code is an identifier, not a quantity, so it should be stored as a string; as an integer it loses any leading zero.
---

### Check the content of dataset

In [6]:
# Preview the first 4 rows to sanity-check the column contents
df.head(4)

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP
1,18955e83d337fd6b2def6b18a428ac77,290c77bc529b7ac935b93aa66c333dc3,9790,sao bernardo do campo,SP
2,4e7b3e00288586ebd08712fdd0374a03,060e732b5b29e8181a18229c7b0b2b5e,1151,sao paulo,SP
3,b2b6027bc5c5109e529d4dc6358b12c3,259dac757896d24d7702b9acbbff3f3c,8775,mogi das cruzes,SP


---
- Need to add a leading zero to the zip code in rows where it has only 4 digits
---

## Converting the zip code column to string and adding the missing leading zero

In [7]:
# Zip codes are labels, not numbers: cast them to text, then right-justify
# to 5 characters using '0' as the fill so short prefixes regain their leading zero
zip_as_text = df['customer_zip_code_prefix'].astype(str)
df['customer_zip_code_prefix'] = zip_as_text.str.rjust(5, fillchar='0')

In [8]:
# Preview the first 4 rows again, now that the zip code column holds zero-padded strings
df.head(4)

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP
1,18955e83d337fd6b2def6b18a428ac77,290c77bc529b7ac935b93aa66c333dc3,9790,sao bernardo do campo,SP
2,4e7b3e00288586ebd08712fdd0374a03,060e732b5b29e8181a18229c7b0b2b5e,1151,sao paulo,SP
3,b2b6027bc5c5109e529d4dc6358b12c3,259dac757896d24d7702b9acbbff3f3c,8775,mogi das cruzes,SP


## Adding a column with the full state name

### Import state name dataset from csv file

In [9]:
# Load the state lookup table; its `state` column holds the two-letter state codes
# that are later matched against `customer_state` when the two tables are merged
state_names = pd.read_csv('states_name.csv')

In [10]:
# Show the state codes available in the lookup table (the key used for the join)
state_names.state.values

array(['AC', 'AL', 'AM', 'AP', 'BA', 'CE', 'DF', 'ES', 'GO', 'MA', 'MG',
       'MS', 'MT', 'PA', 'PB', 'PE', 'PI', 'PR', 'RJ', 'RN', 'RO', 'RR',
       'RS', 'SC', 'SE', 'SP', 'TO'], dtype=object)

### Adding new column by joining dataset

In [11]:
# Left-join the state lookup onto the customers so that every customer row is kept.
# validate='many_to_one' makes pandas raise a MergeError if a state code appears more
# than once in state_names, which would otherwise silently duplicate customer rows.
df = df.merge(
    state_names,
    left_on='customer_state',
    right_on='state',
    how='left',
    validate='many_to_one',
)

In [12]:
# The merge brings in 'state', which only repeats the codes already in 'customer_state'; remove it.
# Reassigning the result (instead of inplace=True) follows current pandas guidance and
# avoids mutating the frame in place.
df = df.drop(columns=['state'])

In [13]:
# Spot-check the new 'state_full_name' column on a random sample of rows.
# A fixed random_state makes the sample (and this cell's output) reproducible across re-runs.
df.sample(15, random_state=42)

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state,state_full_name
76962,258167dc1951e74259a1b26f7313a04d,097c28892c718f31ed9aded0fd0a8991,37730,campestre,MG,Minas Gerais
31976,2683f227b4ca959e6183a3ebbd98b3d4,e88baae856b093e436aef60537fa28d8,91787,porto alegre,RS,Rio Grande do Sul
88835,52857131a4ec3f7a9ffcfdfa5d417ae4,f96464832b8cf295888b6035b62c6fd4,75800,jatai,GO,Goiás
2376,df4c4273ca572a5dccb721c358e92c8d,f971b51b5cde228a6212927455e6a973,40325,salvador,BA,Bahia
48228,1e88b5f20d4c915f63c41dae61f1c23f,3274f2b2174433e889854e791974e48c,38055,uberaba,MG,Minas Gerais
65564,1285e7456591a89200ad22c4015c09ea,a1644581211b1c1d4933064a411ad7de,62051,sobral,CE,Ceará
36373,fd354b19b1fd11c045837f4beda41f6a,54610a858799a89ccd365119745512fd,93601,estancia velha,RS,Rio Grande do Sul
1664,f7bc089803fdef571f770ab60bbb8745,701ada356e78d72bfd5b279280762b79,51230,recife,PE,Pernambuco
24291,5ae4eee4964d6a3bb060f0758b9a900c,a90a0376ae36782a3717853cb03e37e6,42738,lauro de freitas,BA,Bahia
57784,135ed82d00d601dd15e861f2bf1deb93,39ee665787cdce6191c4b41431bdc4c2,90030,porto alegre,RS,Rio Grande do Sul


In [16]:
# Count the null entries in every column
null_counts = df.isna().sum()
null_counts

customer_id                 0
customer_unique_id          0
customer_zip_code_prefix    0
customer_city               0
customer_state              0
state_full_name             0
dtype: int64

## Load the DataFrame into a SQL database

In [None]:
# Use SQLAlchemy to connect to the database and persist the cleaned customers table
import os

import sqlalchemy
from sqlalchemy import create_engine

# Read the connection URL from the environment so that credentials never live in the notebook.
# Expected format: postgresql+psycopg2://<user>:<password>@<host>:5432/<database>
# A missing variable raises KeyError immediately instead of attempting a connection.
# NOTE(review): a password was previously committed in this cell; it should be rotated.
engine = create_engine(os.environ['DATABASE_URL'])

# Write the DataFrame to the database table; an existing "customers" table is replaced
# and the pandas index is not written as a column
df.to_sql('customers', engine, index=False, if_exists='replace')

print('DataFrame loaded to table "customers" successfully!')