### Installing Dependencies

In [1]:
# Install every package listed in requirements.txt; the %pip magic targets the running kernel's environment.
%pip install -r requirements.txt




You should consider upgrading via the 'c:\Users\renato.valentim\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip' command.


### Imports and Paths

In [2]:
from utils import *

In [3]:
# Path of the catalogue CSV read below. MAIN_PATH is not defined in this notebook;
# presumably it comes from the `utils` star import -- TODO confirm.
CATALOG_PATH = f"{MAIN_PATH}/ofertas_catalogo_mobly_enriquecidas.csv"

### Loading Query Data

In [4]:
# Read the semicolon-separated catalogue CSV (header row, inferred schema,
# multi-line quoted fields) and normalise the column names and the price column.
df_catalog = (
  spark.read
  .options(
    header=True,
    inferSchema=True,
    multiline=True,
    quote='"',
    escape="\\",
    sep=";",
    ignoreLeadingWhiteSpace=True,
    ignoreTrailingWhiteSpace=True,
  )
  .csv(CATALOG_PATH)

  # Descriptive columns: switch to lower-case names.
  .withColumnRenamed("Categoria", "categoria")
  .withColumnRenamed("Marca", "marca")
  .withColumnRenamed("Cor", "cor")

  # Price: remove the "R$ " prefix, drop every ".", turn "," into "."
  # and only then cast the cleaned text to double.
  .withColumnRenamed("Preço", "preco_medio")
  .withColumn(
    "preco_medio",
    f.regexp_replace(
      f.regexp_replace(
        f.regexp_replace(f.col("preco_medio"), "R\\$ ", ""),
        "\\.",
        "",
      ),
      ",",
      ".",
    ).cast("double"),
  )

  .withColumnRenamed("Nome", "desc_product")
  .withColumnRenamed("SKU", "sku")

  # Source attribute columns 3..8 are renumbered to atributo_1..atributo_6.
  .withColumnRenamed("ATRIBUTO 3", "atributo_1")
  .withColumnRenamed("ATRIBUTO 4", "atributo_2")
  .withColumnRenamed("ATRIBUTO 5", "atributo_3")
  .withColumnRenamed("ATRIBUTO 6", "atributo_4")
  .withColumnRenamed("ATRIBUTO 7", "atributo_5")
  .withColumnRenamed("ATRIBUTO 8", "atributo_6")
)

In [5]:
# remove_accents_df is applied to the listed text columns and its result is then
# passed through convert_string_columns_to_upper. Both helpers are defined
# outside this notebook, so their exact behaviour should be checked there.
df = convert_string_columns_to_upper(
    remove_accents_df(
        df=df_catalog,
        columns=[
            "desc_product", "categoria", "marca", "cor",
            # atributo_1 .. atributo_6
            *(f"atributo_{n}" for n in range(1, 7)),
        ],
    )
)

In [6]:
columns = ["categoria", "marca", "cor"]

# Run clean_text_column once per listed column, feeding each result back into df.
for column in columns:
    df = clean_text_column(df, column)

### Performing Elastic Search

In [7]:
# Materialise all rows of df locally; df_rows is the input of the search step.
# NOTE(review): assumes df is a Spark DataFrame small enough to collect -- confirm.
df_rows = df.collect()

In [8]:
# Value forwarded as num_results_per_query; it is also embedded in the output file name.
NUM_RESULTS_PER_QUERY = 10

final_results = perform_elastic_search_attributes(
    df_rows,
    num_results_per_query=NUM_RESULTS_PER_QUERY,
)

Searching: 100%|██████████| 99/99 [00:07<00:00, 13.49it/s]


### Saving Results in .CSV

In [9]:
# Current time rendered as YYYYMMDD_HHMM and embedded in the output file name.
timestamp = f"{datetime.now():%Y%m%d_%H%M}"

# NOTE(review): the file name ends with "_" right before ".csv" -- confirm this is intended.
output_path = f"outputs/attributes_{NUM_RESULTS_PER_QUERY}_most_similar_descriptions_{timestamp}_.csv"

In [10]:
# Column names handed to save_csv, in this order.
column_names = [
    "sku", "catalog_desc", "id_product", "similar_desc",
    "elastic_score", "normalized_score", "preco_medio",
    "categoria", "marca", "cor",
    "atributo_1", "atributo_2", "atributo_3",
    "atributo_4", "atributo_5", "atributo_6",
]

# Write the search results to output_path.
save_csv(final_results, output_path, column_names)

### Loading Final Results

In [11]:
# Load the CSV that was just written back into Spark: comma-separated,
# header row, inferred schema, multi-line quoted fields.
df_result = (
    spark.read
    .options(
        header=True,
        inferSchema=True,
        multiline=True,
        quote='"',
        escape="\\",
        sep=",",
        ignoreLeadingWhiteSpace=True,
        ignoreTrailingWhiteSpace=True,
    )
    .csv(output_path)
)

In [12]:
# Print a preview of the reloaded results without truncating cell contents.
df_result.show(truncate=False)

+----------------------+-----------------------------------+----------+------------------------------------------------------------------------------------+-------------+-----------------+-----------+-------------------------------------------------------------+-------------+--------------+-------------+-------------+-------------+-------------+-------------+-------------+
|sku                   |catalog_desc                       |id_product|similar_desc                                                                        |elastic_score|normalized_score |preco_medio|categoria                                                    |marca        |cor           |atributo_1   |atributo_2   |atributo_3   |atributo_4   |atributo_5   |atributo_6   |
+----------------------+-----------------------------------+----------+------------------------------------------------------------------------------------+-------------+-----------------+-----------+------------------------------------------------

In [13]:
# Report how many rows the reloaded results contain, formatted with thousands separators.
print('Número de resultados retornado:', f'{df_result.count():,}')

Número de resultados retornado: 990
