## Install

In [1]:
%pip install -q py7zr
%pip install -q xmltodict

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.3.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.3.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


## Imports

In [2]:
import os
import zipfile
import pandas
import requests
import py7zr
import glob
import xmltodict

## Data acquisition

### Data to classify

In [7]:
def get_class_items():
  folder_path = '../data/raw'
  processed_files = 0
  for file_name in os.listdir(folder_path):
    if not file_name.endswith('.zip'):
      continue
    zip_path = os.path.join(folder_path, file_name)

    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
      file_items_name = f'{file_name.removesuffix('.zip')}_NotaFiscalItem.csv'
      with zip_ref.open(file_items_name) as file:
        file.seek(0)
        data_frame = pandas.read_csv(file, delimiter=';', encoding='latin1')
        data_frame.rename(columns={
          'CHAVE DE ACESSO': 'access_key',
          'DATA EMISSÃO': 'emission_date',
          'CPF/CNPJ Emitente': 'emission_owner',
          'NÚMERO PRODUTO': 'product_index',
          'DESCRIÇÃO DO PRODUTO/SERVIÇO': 'description',
          'CÓDIGO NCM/SH': 'ncm',
          'NCM/SH (TIPO DE PRODUTO)': 'ncm_description',
          'CFOP': 'cfop',
          'QUANTIDADE': 'quantity',
          'UNIDADE': 'unit_kind',
          'VALOR UNITÁRIO': 'unitary_value',
          'VALOR TOTAL': 'total_value',
        }, inplace=True)
        selected_fields = data_frame \
          .loc[:, [
            'access_key',
            'emission_date',
            'emission_owner',
            'product_index',
            'description',
            'ncm',
            'ncm_description',
            'cfop',
            'quantity',
            'unit_kind',
            'unitary_value',
            'total_value',
          ]]
        selected_fields['quantity'] = selected_fields['quantity'].str.replace(',', '.')
        selected_fields['total_value'] = selected_fields['total_value'].str.replace(',', '.')
        selected_fields['unitary_value'] = selected_fields['unitary_value'].str.replace(',', '.')
        yield selected_fields \
          .astype({
            'access_key': 'str',
            'emission_date': 'datetime64[ms]',
            'emission_owner': 'str',
            'product_index': 'int32',
            'description': 'str',
            'ncm': 'str',
            'ncm_description': 'str',
            'cfop': 'str',
            'quantity': 'float64',
            'unit_kind': 'str',
            'unitary_value': 'float64',
            'total_value': 'float64',
          })
        processed_files += 1
        print(f'Processed files {processed_files}')


def get_class_combined():
  combined_path = '../data/combined.parquet.br'
  if os.path.exists(combined_path):
    print('Reading cache combined data')
    return pandas.read_parquet(combined_path)
  combined_data = pandas.concat(get_class_items(), ignore_index=True)
  print('Start compression')
  combined_data.to_parquet(combined_path, index=False, compression='brotli')
  return combined_data

data_frame = get_class_combined()

Reading cache combined data


#### Analyze dataset

In [8]:
total_rows = len(data_frame)
data_frame.info() 

data_frame.drop_duplicates(inplace=True)

unique_rows = len(data_frame)
removed_rows = total_rows - unique_rows
print(f'Total rows: {total_rows}')
print(f'Unique rows: {unique_rows}')
print(f'Removed rows: {removed_rows}')
print(f'Removed rows percentage: {round(removed_rows / total_rows, 4)}%')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17752229 entries, 0 to 17752228
Data columns (total 12 columns):
 #   Column           Dtype         
---  ------           -----         
 0   access_key       object        
 1   emission_date    datetime64[ms]
 2   emission_owner   object        
 3   product_index    int32         
 4   description      object        
 5   ncm              object        
 6   ncm_description  object        
 7   cfop             object        
 8   quantity         float64       
 9   unit_kind        object        
 10  unitary_value    float64       
 11  total_value      float64       
dtypes: datetime64[ms](1), float64(3), int32(1), object(7)
memory usage: 1.5+ GB
Total rows: 17752229
Unique rows: 17673249
Removed rows: 78980
Removed rows percentage: 0.0044%


In [9]:
table = dict(map(lambda x: (str(x), {'unique': 0}), range(0, 100)))
for table_item in data_frame['access_key'].unique():
  table[table_item[0:2]]['unique'] += 1
  
table = dict(filter(lambda x: x[1]['unique'] > 0, table.items()))
for key in table.keys():
  current_state = data_frame.where(data_frame['access_key'].str.startswith(key))
  total_value = current_state['total_value']
  table[key] |= {
    'total': total_value.count(),
    'total_value': round(total_value.sum(), 2),
  }

In [10]:
state_codes = dict(map(
  lambda x: (str(x['id']), x),
  requests.get('https://servicodados.ibge.gov.br/api/v1/localidades/estados').json(),
))

In [11]:
meta_table_rows = list(table.items())
meta_table_rows.sort(key=lambda x: x[1]['total'], reverse=True)
for key, value in meta_table_rows:
  print('\t'.join([
    state_codes.get(key)['sigla'],
    state_codes.get(key)['nome'],
    "{:,}".format(value['unique']),
    "{:,}".format(value['total']),
    "{:,}".format(int(value['total_value'])),
  ]).replace(',', '.'))

SP	São Paulo	968.433	4.372.236	99.859.384.457
RJ	Rio de Janeiro	849.573	2.581.830	46.588.402.728
RS	Rio Grande do Sul	426.544	1.904.333	8.236.053.721
MG	Minas Gerais	353.555	1.342.135	18.058.849.286
DF	Distrito Federal	385.766	1.063.719	15.600.639.407
PR	Paraná	272.867	943.222	8.510.054.005
SC	Santa Catarina	223.651	575.369	5.383.060.717
PE	Pernambuco	147.843	510.864	6.940.477.382
BA	Bahia	112.289	442.067	1.592.486.551
MS	Mato Grosso do Sul	91.216	420.025	945.018.198
PA	Pará	101.956	407.554	1.028.692.878
AM	Amazonas	99.436	397.293	1.335.224.610
RN	Rio Grande do Norte	97.188	330.951	449.921.854
GO	Goiás	119.522	322.409	23.389.753.224
CE	Ceará	83.192	290.382	672.199.675
MA	Maranhão	72.172	287.946	381.705.137
ES	Espírito Santo	89.008	250.336	3.452.116.826
MT	Mato Grosso	106.433	247.252	490.956.647
PB	Paraíba	59.998	173.599	463.042.991
PI	Piauí	35.788	170.278	220.885.632
RO	Rondônia	48.759	156.884	371.488.485
RR	Roraima	26.410	121.510	598.998.213
AC	Acre	22.273	87.804	132.491.524
AL	Alagoa

#### Pre processing

In [12]:
data_frame.drop_duplicates(inplace=True)

In [14]:
filtered_df = data_frame[data_frame['description'].str.contains('coca', case=False)]


In [15]:
filtered_df

Unnamed: 0,access_key,emission_date,emission_owner,product_index,description,ncm,ncm_description,cfop,quantity,unit_kind,unitary_value,total_value
2300,31211029455568000189550010000088621399676822,2021-10-28 00:00:00,29455568000189,1,PACOCA,20079990,"Doces, pures e pastas, de outras frutas",5102,10.0,UNIDAD,9.60,96.00
6286,42211033154286000119550010000017641952962790,2021-10-28 09:48:55,33154286000119,4,DOCE DE COCO BRANCO COCADA CREMOSA 3.7 KG,20079990,"Doces, pures e pastas, de outras frutas",5102,24.0,LITRO,68.00,1632.00
11520,22211008318561000145550010000054771100353639,2021-10-28 11:41:43,08318561000145,16,COCA COLA 2 LT,22021000,"Águas, incluindo as águas minerais e as águas ...",5101,5.0,UNIDAD,10.50,52.50
12695,21211033567458000186550010000004611821118979,2021-10-28 12:28:08,33567458000186,42,DOCE SAABOR PACOCA,20081100,Amendoins preparados ou conservados,5102,15.0,KG,25.60,384.00
13106,50211006982507000174550010000012461000005252,2021-10-28 12:52:00,06982507000174,1,COCA COLA 2LTS,22021000,"Águas, incluindo as águas minerais e as águas ...",5405,4.0,UNIDAD,10.00,40.00
...,...,...,...,...,...,...,...,...,...,...,...,...
17749044,28240948429852000125550010000026401585698265,2024-09-13 20:05:13,48429852000125,2,REFRIGERANTE SAB. COCA COLA 350ML SCHIN,22021000,"Águas, incluindo as águas minerais e as águas ...",5405,100.0,UNIDAD,2.39,239.00
17749045,28240948429852000125550010000026401585698265,2024-09-13 20:05:13,48429852000125,3,REFRIGERANTE COCA COLA 350ML,22021000,"Águas, incluindo as águas minerais e as águas ...",5405,25.0,UNIDAD,2.39,59.75
17750476,53240942633567000145550010000001351928509150,2024-09-14 08:18:56,42633567000145,1,COCA COLA 310 ML,3024990,"Outros peixes frescos ou refrigerados, exceto ...",5949,8475.0,UNIDAD,1.99,16865.25
17750477,53240942633567000145550010000001351928509150,2024-09-14 08:18:56,42633567000145,2,COCA COLA S/A 310 ML,3024990,"Outros peixes frescos ou refrigerados, exceto ...",5949,2664.0,UNIDAD,2.09,5567.76


In [4]:
all_descriptions = dict()
for descriptions_dict in data_frame['description']:
  all_descriptions[descriptions_dict] = all_descriptions.get(descriptions_dict, 0) + 1
  


In [12]:
items = list(all_descriptions.items())
items.sort(key=lambda x: x[1], reverse=True)
principal = items[:1000]

In [13]:
principal

[('PACOM DPF NOVO BRASAO - PROMASP', 107107),
 ('PERSONALIZACAO NOVO PASSAPORTE BRASILEIRO PACOM DPF 32 PAG.', 44078),
 ('QUEROSENE DE AVIACAO JET A', 43634),
 ('CENOURA', 43500),
 ('MELANCIA', 37840),
 ('BATATA DOCE', 37379),
 ('TOMATE', 36161),
 ('BANANA PRATA', 35066),
 ('OLEO DIESEL B S10', 34021),
 ('Prestacao de Servicos', 29860),
 ('JET A NAO TABELADO - LI', 29832),
 ('GASOLINA COMUM', 29301),
 ('CEBOLA', 29291),
 ('BETERRABA', 29179),
 ('PEPINO', 27633),
 ('BATATA INGLESA', 24528),
 ('ALFACE', 24044),
 ('BANANA', 23979),
 ('COENTRO', 22673),
 ('MACA', 22259),
 ('ABACAXI', 21979),
 ('ABOBORA', 21182),
 ('PE DE BRINCADEIRA PRE ESCOLA 4 E 5 ANOS E 11 MESES', 20883),
 ('LIMAO', 20828),
 ('CEBOLINHA', 20004),
 ('LARANJA', 19527),
 ('FILTRO DE OLEO', 18511),
 ('LARANJA PERA', 18497),
 ('ALHO', 18245),
 ('PIMENTAO VERDE', 17514),
 ('CHUCHU', 17021),
 ('FILTRO DE COMBUSTIVEL', 16898),
 ('COUVE', 16454),
 ('MACAXEIRA', 16040),
 ('QUIABO', 15617),
 ('HORTIFRUTIGRANJEIROS/COUVE', 14377),


In [9]:
list(filter(lambda x: 'coca' in x[0] and 'cola' in x[0], items))

[('refri coca cola 2,5L', 5),
 ('625 refrigerantes cocacola 2,5 litros', 1),
 ('refrigerante coca cola 2 lt', 1),
 ('coca-cola zero 2L', 1),
 ('Ref coca cola pet 200ml', 1),
 ('Doce Pacoca Nbonn 136g Rolha Chocolate Ao Leite Nbonn CX', 1)]

### Classes data

In [3]:

def get_class_items():
  folder_path = '../data/tce_rs'
  processed_files = 0
  for file_name in os.listdir(folder_path):
    if not file_name.endswith('.xml.7z'):
      continue
    zip_path = os.path.join(folder_path, file_name)
    
    with py7zr.SevenZipFile(zip_path, mode='r') as zip_ref:
      zip_ref.extractall(folder_path)
      
  for invoice_xml in glob.glob(f'{folder_path}/**/*.xml'):
    processed_files += 1
    with open(invoice_xml, 'r', encoding='utf-8') as file:
      data = xmltodict.parse(file.read())
      products = data.get('nfeProc', {}).get('NFe', {}).get('infNFe', {}).get('det', [])
      product_list = products \
        if isinstance(products, list) \
        else [products]
      
      if not product_list:
        continue
        
      access_key = data['nfeProc']['protNFe']['infProt']['chNFe']
      cnpj = data['nfeProc']['NFe']['infNFe']['emit']['CNPJ']
      emission_date = pandas.to_datetime(data['nfeProc']['NFe']['infNFe']['ide']['dhEmi']) \
        .tz_convert(None)
      for product in product_list:
        ean = product['prod']['cEAN']
        if not ean or ean == 'SEM GTIN':
          continue
        
        yield {
          'access_key': access_key,
          'emission_date': emission_date,
          'emission_owner': cnpj,
          'product_index': int(product['@nItem']),
          'description': product['prod']['xProd'],
          'ean': ean,
          'ncm': product['prod']['NCM'],
          'cfop': product['prod']['CFOP'],
          'quantity': float(product['prod']['qCom']),
          'unit_kind': product['prod']['uCom'],
          'unitary_value': float(product['prod']['vUnCom']),
          'total_value': float(product['prod']['vProd']),
        }
        
    if processed_files % 100 == 0:
      print(f'Processed files: {processed_files}')

def get_class_combined():
  class_path = '../data/class.parquet.br'
  if os.path.exists(class_path):
    print('Reading cache combined data')
    return pandas.read_parquet(class_path)

  class_data = pandas.DataFrame(get_class_items()) \
    .astype({
      'access_key': 'str',
      'emission_date': 'datetime64[ms]',
      'emission_owner': 'str',
      'product_index': 'int32',
      'description': 'str',
      'ncm': 'str',
      'ean': 'str',
      'cfop': 'str',
      'quantity': 'float64',
      'unit_kind': 'str',
      'unitary_value': 'float64',
      'total_value': 'float64',
    })
  print('Start compression')
  class_data.to_parquet(class_path, index=False, compression='brotli')
  return class_data

class_data = get_class_combined()

Reading cache combined data


In [4]:
class_data.drop_duplicates(inplace=True)

In [5]:
descriptions_dict = dict()
for row in class_data.itertuples():
  if not descriptions_dict.get(row.description):
    descriptions_dict[row.description] = set()
  descriptions_dict[row.description].add(row.ean)
  
selected_eans = list(map(lambda x: x[0], filter(lambda x: len(x[1]) > 1, descriptions_dict.items())))

In [6]:
descriptions_dict

{'TOALHEIRO INTERFOLHAS INOX POLIDO NOBRE SELECT 41885': {'7899682779860'},
 'SABONETEIRA LIQUIDA INOX POLIDO C/RESERV NOBRE SELECT 53235': {'7899682727991'},
 'CABO FLEX 1X 2,50MM 750V VM C/ 100MTS - CORFIO': {'7898258651081'},
 'CABO FLEX 1X 2,50MM 750V AZ C/ 100MTS - CORFIO': {'7898258651074'},
 'CABO FLEX 1X 2,50MM 750V VD C/ 100MTS - CORFIO': {'7898258651104'},
 'PAINEL LED SLIM EMBUTIR 172X172X17,85mm 12W 6500K BIV. 840LM': {'7899440032701'},
 'BICARBONATO DE SODIO MENTA 200G CLEAN OK-Ped Anex:20373': {'7898057430054'},
 'LENCOL DE BORRACHA MADEITEX-Ped Anex:20373': {'7898903991012'},
 'PAPEL HIG. VELVET NEUTRO DUETTO 30M C/12': {'7896026800990'},
 'ARGAMASSA NIVELA RAPIDO CINZA 20KG': {'7897255926789'},
 'SELF ESM COREPROT ACET SOLVBASE NOVA-B2-0,81': {'7891260420355'},
 'ROLO ATLAS ESPUMA 406/09 - 9CM': {'7896380140299'},
 'CONDUITE CORRUGADO 25 TRAMONTINA': {'7891435049077'},
 'CABO FLEX 1X 2,50MM 750V BR C/ 100MTS - CORFIO': {'7898258651098'},
 'FOGAO INDUCAO FISCHER 1B PR*26