In [19]:
import os

import pandas as pd

# MinIO connection settings.
# Endpoint and credentials are read from the environment when available so
# that secrets do not have to live in the notebook; the original literals are
# kept only as fallbacks so existing setups keep working unchanged.
s3_url = os.environ.get("MINIO_ENDPOINT_URL", "http://54.161.237.227:9000/")
access_key = os.environ.get("MINIO_ACCESS_KEY", "minio_access_key")
secret_key = os.environ.get("MINIO_SECRET_KEY", "minio_secret_key")

# Passed to pandas as `storage_options` for every s3:// read/write below.
storage_options = {
    "key": access_key,
    "secret": secret_key,
    "client_kwargs": {
        "endpoint_url": s3_url
    }
}

file_path = 's3://bronze/data/orders.csv'

# Read the orders CSV from the bronze bucket.
# NOTE(review): the displayed frame has an "Unnamed: 0" column, which suggests
# the CSV was written together with its index; consider index_col=0 once the
# expected downstream schema is confirmed.
try:
    df = pd.read_csv(
        file_path,
        sep=',',
        quotechar='"',  # commas inside double quotes are not treated as separators
        storage_options=storage_options,
        # Malformed lines are still dropped, but 'warn' reports each one
        # instead of discarding it silently. The displayed output jumps from
        # orderID 10249 to 10254, which may mean rows are being dropped here.
        on_bad_lines='warn'
    )
except Exception as e:
    print("Erro ao ler o arquivo:", e)
    # Re-raise so later cells do not run against a missing or stale `df`.
    raise

# Kept outside the try block so a rendering problem is not reported as a
# read error.
display(df)


Unnamed: 0,orderID,customerID,employeeID,orderDate,requiredDate,shippedDate,shipVia,freight,shipName,shipAddress,shipCity,shipRegion,shipPostalCode,shipCountry
0,10248,VINET,5,1996-07-04 00:00:00.000,1996-08-01 00:00:00.000,1996-07-16 00:00:00.000,3,32.38,Vins et alcools Chevalier,59 rue de l'Abbaye,Reims,,51100,France
1,10249,TOMSP,6,1996-07-05 00:00:00.000,1996-08-16 00:00:00.000,1996-07-10 00:00:00.000,1,11.61,Toms Spezialitäten,Luisenstr. 48,Münster,,44087,Germany
2,10254,CHOPS,5,1996-07-11 00:00:00.000,1996-08-08 00:00:00.000,1996-07-23 00:00:00.000,2,22.98,Chop-suey Chinese,Hauptstr. 31,Bern,,3012,Switzerland
3,10255,RICSU,9,1996-07-12 00:00:00.000,1996-08-09 00:00:00.000,1996-07-15 00:00:00.000,3,148.33,Richter Supermarkt,Starenweg 5,Genève,,1204,Switzerland
4,10257,HILAA,4,1996-07-16 00:00:00.000,1996-08-13 00:00:00.000,1996-07-22 00:00:00.000,3,81.91,HILARION-Abastos,Carrera 22 con Ave. Carlos Soublette #8-35,San Cristóbal,Táchira,5022,Venezuela
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
649,11072,ERNSH,4,1998-05-05 00:00:00.000,1998-06-02 00:00:00.000,,2,258.64,Ernst Handel,Kirchgasse 6,Graz,,8010,Austria
650,11073,PERIC,2,1998-05-05 00:00:00.000,1998-06-02 00:00:00.000,,2,24.95,Pericles Comidas clásicas,Calle Dr. Jorge Cash 321,México D.F.,,05033,Mexico
651,11074,SIMOB,7,1998-05-06 00:00:00.000,1998-06-03 00:00:00.000,,2,18.44,Simons bistro,Vinbæltet 34,Kobenhavn,,1734,Denmark
652,11075,RICSU,8,1998-05-06 00:00:00.000,1998-06-03 00:00:00.000,,2,6.19,Richter Supermarkt,Starenweg 5,Genève,,1204,Switzerland


In [16]:
# Rename the product id/name columns to the names the later astype cell
# expects ('cd_produto', 'Nome_produto').
# errors='raise' makes a missing source column fail here with a KeyError;
# with the default ('ignore') the rename is a silent no-op and the problem
# only surfaces later.
# NOTE(review): the only frame loaded in the cells visible here is
# orders.csv, whose displayed header has no productID/productName column --
# confirm which frame this cell is meant to run on.
df_renamed = df.rename(
    columns={
        'productID': 'cd_produto',
        'productName': 'Nome_produto',
    },
    errors='raise',
)

In [17]:

# Target dtype for each column of the renamed frame. Key order is the same as
# in the original mapping.
product_dtypes = dict(
    cd_produto='int',
    Nome_produto='string',
    supplierID='int',
    categoryID='int',
    quantityPerUnit='string',
    unitPrice='float',
    unitsInStock='int',
    unitsOnOrder='int',
    reorderLevel='int',
    discontinued='int',
)

# Cast the listed columns; columns not listed keep their current dtype.
df_formatted = df_renamed.astype(product_dtypes)

display(df_formatted)

Unnamed: 0,cd_produto,Nome_produto,supplierID,categoryID,quantityPerUnit,unitPrice,unitsInStock,unitsOnOrder,reorderLevel,discontinued
0,1,Chai,1,1,10 boxes x 20 bags,18.00,39,0,10,0
1,2,Chang,1,1,24 - 12 oz bottles,19.00,17,40,25,0
2,3,Aniseed Syrup,1,2,12 - 550 ml bottles,10.00,13,70,25,0
3,4,Chef Anton's Cajun Seasoning,2,2,48 - 6 oz jars,22.00,53,0,0,0
4,5,Chef Anton's Gumbo Mix,2,2,36 boxes,21.35,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...
72,73,Röd Kaviar,17,8,24 - 150 g jars,15.00,101,0,5,0
73,74,Longlife Tofu,4,7,5 kg pkg.,10.00,4,20,5,0
74,75,Rhönbräu Klosterbier,12,1,24 - 0.5 l bottles,7.75,125,0,25,0
75,76,Lakkalikööri,23,1,500 ml,18.00,57,0,20,0


In [20]:
# Destination object in the silver bucket for the Parquet copy.
parquet_path = 's3://silver/orders.parquet'

# NOTE(review): this writes `df` exactly as loaded from the CSV, not the
# renamed/typed `df_formatted` built in the cells above -- confirm that the
# raw frame is what should land in silver.
try:
    df.to_parquet(
        parquet_path,
        engine='pyarrow',
        index=False,  # do not write the DataFrame index to the file
        storage_options=storage_options
    )
except Exception as e:
    print("Erro ao salvar o arquivo Parquet:", e)
    # Re-raise so a failed write is not mistaken for a successful run.
    raise
else:
    print("Arquivo Parquet salvo com sucesso em:", parquet_path)


Arquivo Parquet salvo com sucesso em: s3://silver/orders.parquet
