### Installing Dependencies

In [21]:
%pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\Users\renato.valentim\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip' command.


### Imports and Paths

In [22]:
from utils import *

In [23]:
# Path of the input catalog CSV that the loading cell reads.
# MAIN_PATH is not defined in this notebook; presumably it is provided by
# `from utils import *` (TODO confirm).
# The alternative input file is kept below, commented out, for quick switching.
# CATALOG_PATH = f"{MAIN_PATH}/mobly_top_100.csv"
CATALOG_PATH = f"{MAIN_PATH}/ofertas_base_ouro_07032025.csv"

### Loading Query Data

In [24]:
# Read the catalog CSV: semicolon-separated, double-quoted, backslash-escaped,
# with a header row and fields that may span several lines.
#
# Schema inference is deliberately disabled so every column is read as a string.
# With inference on, an all-digit EAN column is parsed as a number first, which
# strips leading zeros before the cast to string below can preserve them.
df = (
  spark.read
  .option("header", True)
  .option("inferSchema", False)
  .option("multiline", True)
  .option("quote", '"')
  .option("escape", "\\")
  .option("sep", ";")
  .option("ignoreLeadingWhiteSpace", True)
  .option("ignoreTrailingWhiteSpace", True)
  .csv(CATALOG_PATH)

  # Adapt the "Nome" field to the appropriate column name depending on the input file
  .withColumnRenamed("Nome", "desc_product")
  .withColumnRenamed("EAN", "sku")
  # Explicit cast keeps sku a string even if the reader options change later
  .withColumn("sku", f.col("sku").cast(StringType()))
)

# remove_accents_df and clean_text_column are defined outside this notebook
# (presumably in utils); judging by their names they strip accents and clean
# the text of the given column — TODO confirm. The result is then upper-cased.
df = remove_accents_df(df=df, columns=["desc_product"])
df = clean_text_column(df, "desc_product").withColumn('desc_product', f.upper('desc_product'))

# Preview the prepared catalog without truncating long descriptions
df.show(truncate=False)

+-------------+----------------------------------------------------------------------------------------+
|sku          |desc_product                                                                            |
+-------------+----------------------------------------------------------------------------------------+
|7909903272879|ESTANTE HOME RIPADA PARA TV ATE 85 POLEGADAS COM LED 3 GAVETAS NOBRE CLEAN GELIUS MOVEIS|
|7908308742543|GUARDA ROUPA 2 PORTAS 3 GAVETAS AVILA NOVO HORIZONTE                                    |
|7909903075975|GUARDA ROUPA 4 PORTAS 2 GAVETAS LOGAN MAMBEL                                            |
|7908308742420|GUARDA ROUPA 6 PORTAS 6 GAVETAS COM ESPELHO RENO NOVO HORIZONTE                         |
|7908308742451|GUARDA ROUPA 6 PORTAS 6 GAVETAS COM ESPELHO RENO NOVO HORIZONTE                         |
|7908308742277|GUARDA ROUPA 6 PORTAS 6 GAVETAS RENO NOVO HORIZONTE                                     |
|7908167227984|GUARDA ROUPA CASAL 3 PORTAS 1 ESPELHO 10

### Performing Elastic Search

In [25]:
# Materialize every row of the Spark DataFrame on the driver as a local list;
# this list is what gets handed to the search step.
df_rows = df.collect()

In [26]:
# Number of results requested per query (forwarded as `num_results_per_query`);
# the same constant is also embedded in the output file name.
NUM_RESULTS_PER_QUERY = 100

# perform_elastic_search is defined outside this notebook; it receives the
# collected catalog rows and its return value is kept for the export step.
final_results = perform_elastic_search(
    df_rows,
    num_results_per_query=NUM_RESULTS_PER_QUERY,
)

Searching: 100%|██████████| 129/129 [00:08<00:00, 14.91it/s]


### Saving Results in .CSV

In [27]:
# Minute-resolution timestamp: two runs within the same minute share one path.
run_started_at = datetime.now()
timestamp = run_started_at.strftime("%Y%m%d_%H%M")

# Destination of the exported results, relative to the working directory.
output_path = (
    f"outputs/{NUM_RESULTS_PER_QUERY}_most_similar_descriptions_{timestamp}_.csv"
)

In [28]:
# Export the search results. save_csv is defined outside this notebook; the
# third argument is presumably the ordered list of CSV column names (it matches
# the header shown when the file is read back) — TODO confirm.
save_csv(
    final_results,
    output_path,
    [
        "sku",
        "catalog_desc",
        "id_product",
        "similar_desc",
        "elastic_score",
        "normalized_score",
    ],
)

### Loading Final Results

In [29]:
# Read the exported CSV back into Spark for inspection: header row,
# semicolon-separated, double-quoted, backslash-escaped, multi-line fields
# allowed, surrounding whitespace trimmed, column types inferred.
# NOTE(review): with inferSchema on, an all-digit sku column is presumably
# inferred as a number, so leading zeros would not survive this read — confirm
# whether that matters for downstream use.
df_result = spark.read.options(
    header=True,
    inferSchema=True,
    multiline=True,
    quote='"',
    escape="\\",
    sep=";",
    ignoreLeadingWhiteSpace=True,
    ignoreTrailingWhiteSpace=True,
).csv(output_path)

In [30]:
# Print a preview of the reloaded results with full (untruncated) column values.
df_result.show(truncate=False)

+-------------+----------------------------------------------------------------------------------------+----------+-----------------------------------------------------------------------------------------------------------------------------------------------------------+-------------+-----------------+
|sku          |catalog_desc                                                                            |id_product|similar_desc                                                                                                                                               |elastic_score|normalized_score |
+-------------+----------------------------------------------------------------------------------------+----------+-----------------------------------------------------------------------------------------------------------------------------------------------------------+-------------+-----------------+
|7909903272879|ESTANTE HOME RIPADA PARA TV ATE 85 POLEGADAS COM LED 3 GAVETAS NOBRE CLEA