### Installing Dependencies

In [1]:
%pip install -r requirements.txt




You should consider upgrading via the 'c:\Users\renato.valentim\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip' command.


### Imports and Paths

In [2]:
import os

from utils import *

In [3]:
# Location of the enriched catalog CSV (file name carries the stamp 13032025), built on top of MAIN_PATH.
# MAIN_PATH is not defined in this notebook; presumably it is exported by `from utils import *` — TODO confirm.
CATALOG_PATH = f"{MAIN_PATH}/base_catalogo_mobly_enriquecida_13032025.csv"

### Loading Query Data

In [4]:
# Load the catalog CSV and normalise its columns for the search step.
# The reader is configured for a header row, comma separator, double-quoted
# fields with backslash escapes, quoted values that span several lines, and
# trimming of leading/trailing whitespace.
# NOTE(review): inferSchema may type a purely numeric SKU column as a number
# before the string cast below (leading zeros would be lost) — confirm against the data.
df_catalog = (
    spark.read
    .options(
        header=True,
        inferSchema=True,
        multiline=True,
        quote='"',
        escape="\\",
        sep=",",
        ignoreLeadingWhiteSpace=True,
        ignoreTrailingWhiteSpace=True,
    )
    .csv(CATALOG_PATH)

    # Rename the source headers to the snake_case names used by the rest of the notebook.
    .withColumnRenamed("Categoria", "categoria")
    .withColumnRenamed("Marca", "marca")
    .withColumnRenamed("Cor", "cor")
    .withColumnRenamed("Preço", "preco_medio")
    .withColumnRenamed("Nome", "desc_product")
    .withColumnRenamed("SKU", "sku")
    # NOTE(review): source columns "ATRIBUTO 3".."ATRIBUTO 8" become
    # atributo_1..atributo_6 (offset of 2) — presumably intentional; confirm.
    .withColumnRenamed("ATRIBUTO 3", "atributo_1")
    .withColumnRenamed("ATRIBUTO 4", "atributo_2")
    .withColumnRenamed("ATRIBUTO 5", "atributo_3")
    .withColumnRenamed("ATRIBUTO 6", "atributo_4")
    .withColumnRenamed("ATRIBUTO 7", "atributo_5")
    .withColumnRenamed("ATRIBUTO 8", "atributo_6")

    # Price text -> double: remove the "R$ " prefix, remove every ".",
    # turn "," into "." and cast (e.g. "R$ 1.234,56" -> 1234.56).
    .withColumn(
        "preco_medio",
        f.regexp_replace(
            f.regexp_replace(
                f.regexp_replace(f.col("preco_medio"), "R\\$ ", ""),
                "\\.",
                "",
            ),
            ",",
            ".",
        ).cast("double"),
    )

    # SKUs are handled as strings from here on.
    .withColumn("sku", f.col("sku").cast("string"))
)

In [5]:
# Run the catalog through two helpers that live outside this notebook:
# remove_accents_df on the listed text columns, then
# convert_string_columns_to_upper on the whole frame
# (their exact behaviour is inferred from their names — TODO confirm).
df = convert_string_columns_to_upper(
    remove_accents_df(
        df=df_catalog,
        columns=[
            "desc_product",
            "categoria",
            "marca",
            "cor",
            "atributo_1",
            "atributo_2",
            "atributo_3",
            "atributo_4",
            "atributo_5",
            "atributo_6",
        ],
    )
)

In [6]:
# Columns that go through the clean_text_column helper (defined outside this notebook).
columns = ["desc_product", "categoria", "marca", "cor"]

# Apply the helper one column at a time, feeding each result into the next call.
for column in columns:
    df = clean_text_column(df, column)

### Performing Elastic Search

In [7]:
# Pull every row of the cleaned catalog DataFrame onto the driver as a Python list;
# this list is what the search call in the next cell receives.
df_rows = df.collect()

In [8]:
# Value passed as num_results_per_query below and embedded in the output file name later on.
NUM_RESULTS_PER_QUERY = 20
# perform_elastic_search_attributes is defined outside this notebook;
# presumably it issues one search per catalog row — TODO confirm.
final_results = perform_elastic_search_attributes(df_rows, num_results_per_query=NUM_RESULTS_PER_QUERY)

Searching: 100%|██████████| 140/140 [00:08<00:00, 16.87it/s]


### Saving Results in .CSV

In [9]:
# Current local time at minute resolution (YYYYMMDD_HHMM), used in the file name below.
timestamp = datetime.now().strftime("%Y%m%d_%H%M")
# Relative path, so it resolves against the kernel's current working directory.
output_path = f"outputs/attributes_{NUM_RESULTS_PER_QUERY}_most_similar_descriptions_{timestamp}_.csv"

In [10]:
# Column names handed to save_csv together with the search results.
# Presumably they must follow the field order of each entry in final_results — TODO confirm.
column_names = [
    # identification and descriptions
    'sku',
    'catalog_desc',
    'id_product',
    'similar_desc',
    # scores
    'elastic_score',
    'normalized_score',
    # product attributes
    'preco_medio',
    'categoria',
    'marca',
    'cor',
    'atributo_1',
    'atributo_2',
    'atributo_3',
    'atributo_4',
    'atributo_5',
    'atributo_6',
]

save_csv(final_results, output_path, column_names)

### Loading Final Results

In [11]:
json_data = load_json('attributes-map.json')

# categoria -> list of six labels taken from item["atributos"]
# (keys atributo_1 .. atributo_6); a missing key yields "NAO SE APLICA".
json_map = {
    item["categoria"]: [
        item["atributos"].get(f"atributo_{slot}", "NAO SE APLICA")
        for slot in range(1, 7)
    ]
    for item in json_data
}

In [12]:
# Read back the results CSV written to output_path, using the same reader
# settings as the catalog load (header row, inferred types, multi-line quoted
# fields, backslash escapes, whitespace trimming).
df_result = (
    spark.read
    .options(
        header=True,
        inferSchema=True,
        multiline=True,
        quote='"',
        escape="\\",
        sep=",",
        ignoreLeadingWhiteSpace=True,
        ignoreTrailingWhiteSpace=True,
    )
    .csv(output_path)
)

In [13]:

# Convert both Spark DataFrames to pandas and run prefixar_atributos over every
# row, passing json_map (categoria -> attribute labels) as the lookup.
# The results are stored back under the same names, so after the first run they
# no longer have .toPandas(). The hasattr guards make re-running this cell a
# no-op instead of an AttributeError, and keep prefixar_atributos from being
# applied a second time to already-processed rows.
if hasattr(df_result, "toPandas"):
    df_result = df_result.toPandas().apply(lambda row: prefixar_atributos(row, json_map), axis=1)
if hasattr(df, "toPandas"):
    df = df.toPandas().apply(lambda row: prefixar_atributos(row, json_map), axis=1)

In [14]:
# Build the delivery table: search results joined with the catalog on sku.
#
# Columns that exist in both frames. After the merge, pandas suffixes the copy
# from the left frame (search results) with "_x" and the copy from the right
# frame (catalog) with "_y".
shared_columns = ['preco_medio', 'categoria', 'marca', 'cor'] + [
    f'atributo_{i}' for i in range(1, 7)
]

# Columns that only the search results provide (besides the join key).
result_only_columns = [
    'sku', 'id_product', 'catalog_desc', 'similar_desc', 'elastic_score', 'normalized_score',
]

# Inner join on sku. The previous rename_axis calls only named each frame's
# index, which a column-based merge discards, so they were dropped.
df_delivery = pd.merge(
    df_result,
    df,
    how='inner',
    on='sku',
    suffixes=('_x', '_y'),
)

# Output layout: result-only columns first, then for every shared column the
# catalog value ("_y") followed by the search-result value ("_x").
selected_columns = result_only_columns + [
    f'{name}{suffix}' for name in shared_columns for suffix in ('_y', '_x')
]

# Final names: catalog values get a "catalog_" prefix, search-result values
# keep the plain column name.
column_renames = {
    'sku': 'catalog_sku',
    **{f'{name}_y': f'catalog_{name}' for name in shared_columns},
    **{f'{name}_x': name for name in shared_columns},
}

# Select, rename, keep the first row per (catalog_sku, id_product), then sort.
# NOTE(review): because the de-duplication leaves one row per
# (catalog_sku, id_product), the normalized_score sort key cannot change the
# order; if the best-scoring row per pair was meant to be kept, sort before
# dropping duplicates — confirm intent.
df_delivery = (
    df_delivery[selected_columns]
    .rename(columns=column_renames)
    .drop_duplicates(subset=['catalog_sku', 'id_product'])
    .sort_values(by=['catalog_sku', 'id_product', 'normalized_score'])
)

In [15]:
# Report how many rows the delivery frame holds, formatted with thousands separators.
print('Número de resultados retornado:', f"{len(df_delivery):,}")

Número de resultados retornado: 2,718


In [16]:

# Fresh minute-resolution timestamp (YYYYMMDD_HHMM); rebinds the `timestamp` taken when the raw results were saved.
timestamp = datetime.now().strftime("%Y%m%d_%H%M")
# ELASTIC_PATH is not defined in this notebook; presumably it is exported by `from utils import *` — TODO confirm.
refined_output_path = f"{ELASTIC_PATH}/outputs/refined_outputs/{timestamp}_elastic_search_attributes.csv"

In [17]:
# Save the delivery frame as CSV.
# DataFrame.to_csv does not create missing parent folders, so make sure the
# target directory exists before writing (assumes refined_output_path is a
# local filesystem path — confirm if ELASTIC_PATH can point elsewhere).
os.makedirs(os.path.dirname(refined_output_path), exist_ok=True)
# index=False leaves the pandas index out of the file; "utf-8-sig" writes a
# UTF-8 byte-order mark at the start of the file.
df_delivery.to_csv(refined_output_path, index=False, encoding="utf-8-sig")
