# Config

Change the root to `src/` so that the project modules can be imported from this notebook, then install the required libraries

In [0]:
%run "./includes/config"

In [0]:
!pip install unidecode

Import libraries and functions

In [0]:
import pyspark.pandas as ps
from dbc.functions.df_helpers import replace_decimal_character
from dbc.services.read_data import read_tabular
from dbc.services.save_data import save_df
from unidecode import unidecode
from utils.constants import *
from dbc.utils.helpers import get_year_month_params

Get year and month parameters

In [0]:
# Unpack the year and month that get_year_month_params() returns for this run
year, month = get_year_month_params()

# Load sales and costs data

Read sales and costs DataFrames

In [0]:
# Sales are read from the gold layer for the selected year/month
sales_df = read_tabular(
    ADLS_LAYER_GOLD, ADLS_CATEGORY_SALES, year, month
)

# Both cost tables are read from the bronze layer for the same period
products_costs_df = read_tabular(
    ADLS_LAYER_BRONZE, ADLS_CATEGORY_PRODUCTS_COSTS, year, month
)
other_costs_df = read_tabular(
    ADLS_LAYER_BRONZE, ADLS_CATEGORY_OTHER_COSTS, year, month
)

In [0]:
# Build the target column names: each COSTS_COLS entry lower-cased, then
# transliterated to plain ASCII with unidecode
normalized_columns_names = []
for raw_name in COSTS_COLS:
    normalized_columns_names.append(unidecode(raw_name.lower()))

# Rename the columns of both cost tables positionally.
# NOTE(review): assumes both frames have their columns in COSTS_COLS order — confirm
products_costs_df.columns = normalized_columns_names
other_costs_df.columns = normalized_columns_names

In [0]:
# Pull the product descriptions and their costs out as plain Python lists
product_descriptions = products_costs_df['descricao'].to_list()
product_costs = products_costs_df['custo'].to_list()

# Lookup table: product description -> cost (a repeated description keeps its last cost)
product_costs_dict = {
    description: cost
    for description, cost in zip(product_descriptions, product_costs)
}

# Identify products in each sale

Select columns and filter rows of interest in sales df

In [0]:
# Columns carried forward, plus the flag column that is only used for filtering
columns_to_select = ['id_venda', 'unidades_vendidas', 'titulo_anuncio']
flag_column = 'count_produto'

# Narrow the frame to the needed columns, then keep the rows whose flag equals True
sales_df = sales_df[columns_to_select + [flag_column]]
sales_df = sales_df[sales_df[flag_column] == True]

# The flag column is no longer needed once the filter has been applied
sales_df = sales_df[columns_to_select]

Iterate through sales DataFrame to identify products in each sale

In [0]:
# Accumulator for the records that will become products_per_sale_df
rows = list()

In [0]:
# Reset the accumulator here so this cell is idempotent: re-running it on its
# own would otherwise append a second copy of every row to the existing list
rows = []

# NOTE(review): assumes titulo_anuncio is never null; .lower() raises on a
# missing title — confirm that upstream guarantees this
for row in sales_df.itertuples():

    # Keyword matching is done against the lower-cased ad title
    ad_title = row.titulo_anuncio.lower()

    # Every sale gets exactly one main product: AKAI/FEM when the title
    # mentions 'akai' or 'feminin', STR/BZ otherwise
    if ('akai' in ad_title) or ('feminin' in ad_title):
        product_list = ['AKAI/FEM']
    else:
        product_list = ['STR/BZ']

    # A title mentioning 'faixa' additionally includes the FX product
    if 'faixa' in ad_title:
        product_list.append('FX')

    # Emit one output row per product found in the sale. The production cost
    # is the units sold times the cost looked up in product_costs_dict; a
    # product code missing from that dictionary raises KeyError here.
    for product in product_list:
        rows.append({
            'id_venda': row.id_venda,
            'unidades_vendidas': row.unidades_vendidas,
            'categoria_produto': product,
            'custo_producao': row.unidades_vendidas * product_costs_dict[product]
        })

In [0]:
# Materialise the accumulated records as a pandas-on-Spark DataFrame
products_per_sale_df = ps.DataFrame(data=rows)

Save DataFrames

In [0]:
# Run the decimal character correction on both output frames before saving
other_costs_df, products_per_sale_df = (
    replace_decimal_character(other_costs_df),
    replace_decimal_character(products_per_sale_df),
)

# Save each frame to the gold layer under its own category for this year/month
for frame_to_save, target_category in (
    (other_costs_df, ADLS_CATEGORY_OTHER_COSTS),
    (products_per_sale_df, ADLS_CATEGORY_PRODUCTS_PER_SALE),
):
    save_df(frame_to_save, ADLS_LAYER_GOLD, target_category, year, month)