In [21]:
%pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\Users\renato.valentim\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip' command.


### Imports & Paths

In [22]:
from utils import *

In [23]:

# Full path of the corpus CSV that the next section loads with Spark.
# NOTE(review): MAIN_PATH is not defined in this notebook; presumably it is
# provided by `from utils import *` - confirm.
CORPUS_PATH = f"{MAIN_PATH}/ofertas_matching_enriquecidas.csv"

### Corpus Data & Transformations

In [24]:
# Load the corpus CSV (semicolon-separated, quoted fields may span lines) and
# normalise its column names; `spark`, `f` and CORPUS_PATH come from earlier cells.
df_corpus = (
    spark.read
    .options(
        header=True,
        inferSchema=True,
        multiline=True,
        quote='"',
        escape="\\",
        sep=";",
        ignoreLeadingWhiteSpace=True,
        ignoreTrailingWhiteSpace=True,
    )
    .csv(CORPUS_PATH)
    # Snake-case the descriptive columns.
    .withColumnRenamed("CATEGORIA", "categoria")
    .withColumnRenamed("Marca", "marca")
    .withColumnRenamed("Cor", "cor")
    .withColumnRenamed("PRECO MEDIO", "preco_medio")
    .withColumnRenamed("DESC_PRODUCT", "desc_product")
    .withColumnRenamed("ID_PRODUCT", "id_product")
    # Source attribute columns 3..8 are renumbered to 1..6.
    .withColumnRenamed("atributo 3", "atributo_1")
    .withColumnRenamed("atributo 4", "atributo_2")
    .withColumnRenamed("atributo 5", "atributo_3")
    .withColumnRenamed("atributo 6", "atributo_4")
    .withColumnRenamed("atributo 7", "atributo_5")
    .withColumnRenamed("atributo 8", "atributo_6")
    # Price text -> double: drop the "R$ " prefix, drop "." characters,
    # then turn "," into "." before casting.
    .withColumn(
        "preco_medio",
        f.regexp_replace(
            f.regexp_replace(
                f.regexp_replace(f.col("preco_medio"), "R\\$ ", ""),
                "\\.",
                "",
            ),
            ",",
            ".",
        ).cast("double"),
    )
)

In [25]:
# Apply the two text-normalisation helpers in sequence: accent removal on the
# listed columns, then upper-casing. Both helpers are defined outside this
# notebook; their exact behaviour is inferred from their names - confirm in utils.
df = convert_string_columns_to_upper(
    remove_accents_df(
        df=df_corpus,
        columns=[
            "desc_product",
            "categoria",
            "marca",
            "cor",
            "atributo_1",
            "atributo_2",
            "atributo_3",
            "atributo_4",
            "atributo_5",
            "atributo_6",
        ],
    )
)

In [None]:
# Terms handed to remove_terms_from_column for the product description column.
# NOTE(review): the helper lives outside this notebook; matching rules
# (whole word vs. substring) are not visible here - confirm in utils.
terms = [
    "OUTLET",
    "ANUNCIO",
    "OLD",
]
df = remove_terms_from_column(df, terms, column_name="desc_product")

In [26]:
columns = ["categoria", "marca", "cor"]

# Pass each of the three listed columns through clean_text_column, one at a time,
# rebinding df to the result of every call.
for column in columns:
    df = clean_text_column(df, column)

In [None]:
# Report the number of rows in df, formatted with thousands separators.
offer_count = df.count()
print("Número de Ofertas:", f"{offer_count:,}")

In [30]:
def generate_actions_attributes(rows):
    """
    Convert each row into an Elasticsearch bulk action document.

    Args:
        rows: Iterable of row-like objects that support ``row["column"]``
            lookups for ``id_product``, ``desc_product``, ``categoria``,
            ``preco_medio``, ``cor`` and ``atributo_1`` .. ``atributo_6``.

    Yields:
        dict: One action per row, targeting the ``ofertas_corpus_attributes``
        index, with ``_id`` set to the stringified ``id_product`` and the
        remaining fields under ``_source``.

    Notes:
        ``preco_medio`` is emitted as a string, except that a missing price
        (``None``) is emitted as ``None`` instead of the literal text "None",
        so the document carries a real null rather than a bogus price string.
        ``marca`` is not part of the emitted document.
    """
    attribute_fields = [f"atributo_{position}" for position in range(1, 7)]

    for row in rows:
        price = row['preco_medio']

        source = {
            "desc_product": row['desc_product'],
            "categoria": row['categoria'],
            # str(None) would index the text "None"; keep nulls as nulls.
            "preco_medio": None if price is None else str(price),
            "cor": row['cor'],
        }
        for field in attribute_fields:
            source[field] = row[field]

        yield {
            "_index": "ofertas_corpus_attributes",
            "_id": str(row['id_product']),  # id_product doubles as the document _id
            "_source": source,
        }

### Elasticsearch

In [32]:
# Collect the DataFrame rows and build the list of actions for batch sending.
rows = df.collect()
actions = [action for action in generate_actions_attributes(rows)]

In [33]:
# Preview the first two actions. The print and the slice are separate statements
# so the cell displays only the list; joined by a comma they formed a tuple and
# the cell rendered as `(None, [...])` because print() returns None.
print("Aqui estão alguns exemplos das actions criadas para index no ES")
actions[:2]

Aqui estão alguns exemplos das actions criadas para index no ES


(None,
 [{'_index': 'ofertas_corpus_attributes',
   '_id': '61567317',
   '_source': {'desc_product': 'BALCAO DE COZINHA 3 PORTAS 1 GAVETA DAMA DEMOBILE AMENDOLA BRANCO',
    'categoria': 'ARMARIOS BALCOES GUARDA ROUPAS ESTANTES RACKS E PAINEIS DE TV',
    'preco_medio': '248.32',
    'cor': 'BRANCO',
    'atributo_1': 'NAO INFORMADO',
    'atributo_2': '3',
    'atributo_3': 'NAO INFORMADO',
    'atributo_4': 'NAO INFORMADO',
    'atributo_5': 'NAO INFORMADO',
    'atributo_6': 'NAO INFORMADO'}},
  {'_index': 'ofertas_corpus_attributes',
   '_id': '125712474',
   '_source': {'desc_product': 'GUARDA ROUPA SOLTEIRO RAVI 2 PORTAS 2 GAVETAS 100 MDF OLMO ITALIANO MAMBEL',
    'categoria': 'ARMARIOS BALCOES GUARDA ROUPAS ESTANTES RACKS E PAINEIS DE TV',
    'preco_medio': '1697.74',
    'cor': 'OLMO ITALIANO MAMBEL',
    'atributo_1': 'NAO INFORMADO',
    'atributo_2': '2',
    'atributo_3': '2',
    'atributo_4': 'NAO INFORMADO',
    'atributo_5': 'NAO INFORMADO',
    'atributo_6': 'NAO IN