In [None]:
from pathlib import Path

import chardet
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

class CSVToParquetConverter:
    """Stream a semicolon-delimited CSV file into a gzip-compressed Parquet file.

    The CSV is read in chunks of ``chunksize`` rows, and each chunk is
    appended to a single Parquet file. The text encoding is guessed with
    ``chardet`` when the instance is created. The Parquet schema is inferred
    from the first chunk and reused for every later chunk.

    NOTE(review): because the schema comes from the first chunk only, a later
    chunk whose inferred dtypes differ (e.g. a column that is empty in the
    first chunk) may fail to convert — confirm against real data.

    Args:
        csv_file: Path of the CSV file to read.
        parquet_file: Path of the Parquet file to write.
        chunksize: Number of CSV rows per chunk.
    """

    def __init__(self, csv_file, parquet_file, chunksize=100_000):
        self.csv_file = csv_file
        self.parquet_file = parquet_file
        self.chunksize = chunksize
        # Detection opens and samples the CSV, so the file must already exist.
        self.codificacao = self.detectar_codificacao()
        # Created lazily from the first chunk; None while no file is open.
        self.parquet_writer = None

    def detectar_codificacao(self):
        """Guess the CSV encoding from the first 100 000 bytes of the file.

        Returns:
            The encoding name to hand to ``pd.read_csv``. A detection result
            of ``'ascii'`` or ``None`` is returned as ``'utf-8'``.
        """
        with open(self.csv_file, 'rb') as f:
            resultado = chardet.detect(f.read(100000))
        codificacao = resultado['encoding']
        print(f"Codificação detectada: {codificacao}")

        # Only a sample is inspected, so 'ascii' merely means the sample held
        # no non-ASCII bytes; the rest of the file may still contain some.
        # UTF-8 decodes pure ASCII identically, so it is a safe superset.
        # None (no guess available) falls back to the same default.
        if codificacao is None or codificacao.lower() == 'ascii':
            return 'utf-8'
        return codificacao

    def converter(self):
        """Convert the CSV file to Parquet, chunk by chunk.

        A ``UnicodeDecodeError`` raised while reading is reported with
        ``print`` and not re-raised; any other exception propagates. In every
        case the CSV reader and the Parquet writer are closed before
        returning.
        """
        try:
            # The chunked reader is a context manager, so the underlying file
            # handle is released even when a chunk fails.
            with pd.read_csv(
                self.csv_file,
                sep=';',
                chunksize=self.chunksize,
                low_memory=False,
                encoding=self.codificacao,
                on_bad_lines='skip',
            ) as csv_stream:
                for i, chunk in enumerate(csv_stream):
                    print(f"Processando chunk {i}")
                    if self.parquet_writer is None:
                        self.inicializar_parquet_writer(chunk)

                    self.escrever_chunk(chunk)

        except UnicodeDecodeError as e:
            print(f"Erro ao decodificar o arquivo: {e}")

        finally:
            # Always close the writer: without this a failure mid-stream
            # leaks the handle and leaves the Parquet file unfinished.
            self.finalizar()

    def inicializar_parquet_writer(self, chunk):
        """Open the Parquet writer using the schema inferred from ``chunk``."""
        parquet_schema = pa.Table.from_pandas(df=chunk).schema
        self.parquet_writer = pq.ParquetWriter(
            self.parquet_file, parquet_schema, compression='gzip'
        )

    def escrever_chunk(self, chunk):
        """Convert ``chunk`` with the writer's schema and append it to the file."""
        table = pa.Table.from_pandas(chunk, schema=self.parquet_writer.schema)
        self.parquet_writer.write_table(table)

    def finalizar(self):
        """Close the Parquet writer if one is open; safe to call repeatedly."""
        if self.parquet_writer is not None:
            self.parquet_writer.close()
            # Drop the reference so a later conversion starts a fresh writer.
            self.parquet_writer = None


In [None]:
# Example usage: convert ./files/csv/<file_name>.csv into
# ./files/parquet/<file_name>.parquet.
file_name = 'nome_do_arquivo'
csv_file = f'./files/csv/{file_name}.csv'
parquet_file = f'./files/parquet/{file_name}.parquet'

# The Parquet writer opens the target path directly, so the output folder
# must exist beforehand; create it to keep the cell runnable on a fresh
# checkout.
Path(parquet_file).parent.mkdir(parents=True, exist_ok=True)

converter = CSVToParquetConverter(csv_file, parquet_file)
converter.converter()

In [None]:
import pandas as pd

# Example usage: load the converted Parquet file back into pandas.
# `file_name` comes from the conversion cell above, so run that cell first.
parquet_file = f'./files/parquet/{file_name}.parquet'
df = pd.read_parquet(path=parquet_file)

# A bare trailing expression makes the notebook render the DataFrame.
df
