# Notebook - Gold Analysis - Part 2
Notebook utilizado para realizar as análises dos dados da camada Gold - Parte 2 do Case.

No caso:
- Geração das métricas Taxa de retenção, Número médio de pedidos, Ticket médio e Receita média (com Segmentação)
- Execução dos testes T (indicadores de média) e Z (indicadores de proporção) para análise de impacto significativo (com Segmentação)

In [0]:
# Import das bibliotecas
from scipy.stats import ttest_ind
from statsmodels.stats.proportion import proportions_ztest

In [0]:
%sql
-- Retention rate, average number of orders, average ticket and average revenue
-- per delivery state and experiment arm (is_target).
with segment_totals as (
  -- One row per (state, arm) with the base aggregates.
  select
    delivery_address_state,
    is_target,
    sum(is_retained) / count(*) as retention_rate,
    avg(n_order) as avg_n_order,
    sum(sum_amount) / sum(n_order) as avg_amount_per_order,
    sum(sum_amount) / count(*) as avg_sum_amount,
    sum(sum_amount) as total_sum_amount,
    count(*) as n_consumers
  from gold.consumer_segmented_metrics
  group by delivery_address_state, is_target
),
segment_economics as (
  -- Campaign cost is charged only to the 'target' arm.
  -- NOTE(review): 20 looks like the per-consumer campaign cost and 0.175 like
  -- the profit margin applied to revenue -- confirm against the case definition.
  select
    t.*,
    case when t.is_target = 'target' then t.n_consumers * 20 else 0 end as campaign_cost,
    t.total_sum_amount * 0.175 as profit
  from segment_totals t
)
select
  e.*,
  -- ROI is reported as 0 for every arm other than 'target' (no campaign cost there).
  case
    when e.is_target = 'target' then (e.profit - e.campaign_cost) / e.campaign_cost
    else 0
  end as roi
from segment_economics e
order by e.delivery_address_state, e.is_target

In [0]:
%sql
-- Retention rate, average number of orders, average ticket and average revenue
-- per price range and experiment arm (is_target).
with segment_totals as (
  -- One row per (price range, arm) with the base aggregates.
  select
    price_range,
    is_target,
    sum(is_retained) / count(*) as retention_rate,
    avg(n_order) as avg_n_order,
    sum(sum_amount) / sum(n_order) as avg_amount_per_order,
    sum(sum_amount) / count(*) as avg_sum_amount,
    sum(sum_amount) as total_sum_amount,
    count(*) as n_consumers
  from gold.consumer_segmented_metrics
  group by price_range, is_target
),
segment_economics as (
  -- Campaign cost is charged only to the 'target' arm.
  -- NOTE(review): 20 looks like the per-consumer campaign cost and 0.175 like
  -- the profit margin applied to revenue -- confirm against the case definition.
  select
    t.*,
    case when t.is_target = 'target' then t.n_consumers * 20 else 0 end as campaign_cost,
    t.total_sum_amount * 0.175 as profit
  from segment_totals t
)
select
  e.*,
  -- ROI is reported as 0 for every arm other than 'target' (no campaign cost there).
  case
    when e.is_target = 'target' then (e.profit - e.campaign_cost) / e.campaign_cost
    else 0
  end as roi
from segment_economics e
order by e.price_range, e.is_target

In [0]:
# Load the Gold-layer tables through Spark SQL and materialise each one as a
# pandas DataFrame for the statistical tests further down.

# Consumer-level metrics.
df_metrics = spark.sql("select * from gold.consumer_segmented_metrics")
df_metrics_pandas = df_metrics.toPandas()

# Order-level metrics.
df_order_metrics = spark.sql("select * from gold.order_metrics")
df_order_metrics_pandas = df_order_metrics.toPandas()

In [0]:
# Segments to analyse: each entry names the column to filter on and the value
# that selects the segment (delivery states first, then price ranges).
groups = [
    *(
        {"column_name": "delivery_address_state", "value": state}
        for state in ("DF", "MS", "MT", "PI", "RJ", "SC", "SP")
    ),
    *(
        {"column_name": "price_range", "value": price_range}
        for price_range in (1, 5)
    ),
]

def _split_by_arm(df, column_name, value):
    """Return the (target, control) rows of ``df`` where ``column_name == value``.

    The segment filter is evaluated on ``df`` itself, so the boolean mask
    always shares the index of the frame it is applied to.
    """
    segment = df[df[column_name] == value]
    return (
        segment[segment["is_target"] == "target"],
        segment[segment["is_target"] == "control"],
    )


# Run the significance tests for every segment listed in `groups`:
# a Z-test for the retention proportion and Welch T-tests (equal_var=False)
# for the mean-based indicators.
for group in groups:
    column_name, value = group["column_name"], group["value"]
    print(f"Group: {column_name} = {value}")

    # Consumer-level rows of this segment, split by experiment arm.
    target_consumers, control_consumers = _split_by_arm(
        df_metrics_pandas, column_name, value
    )
    # Order-level rows of this segment. The mask must come from the order
    # table itself: a mask built from df_metrics_pandas has a different index
    # and would select orders by row position (or fail to align).
    # NOTE(review): assumes gold.order_metrics exposes the same segment columns
    # (delivery_address_state, price_range) -- confirm against the table schema.
    target_orders, control_orders = _split_by_arm(
        df_order_metrics_pandas, column_name, value
    )

    # Retention rate: number of retained consumers vs. consumers observed per arm.
    count = [
        int((target_consumers["is_retained"] == 1).sum()),
        int((control_consumers["is_retained"] == 1).sum()),
    ]
    nobs = [len(target_consumers), len(control_consumers)]

    # Z-test for two proportions
    stat, p_val = proportions_ztest(count, nobs)
    print(f"P-valor (Taxa de retenção): {p_val:.4f}")

    # Average number of orders per consumer (T-test)
    t_stat, p_val = ttest_ind(
        target_consumers["n_order"], control_consumers["n_order"], equal_var=False
    )
    print(f"P-valor (Número médio de pedidos): {p_val:.4f}")

    # Average ticket, measured per order (T-test)
    t_stat, p_val = ttest_ind(
        target_orders["order_total_amount"],
        control_orders["order_total_amount"],
        equal_var=False,
    )
    print(f"P-valor (Ticket médio): {p_val:.4f}")

    # Average revenue per consumer (T-test)
    t_stat, p_val = ttest_ind(
        target_consumers["sum_amount"], control_consumers["sum_amount"], equal_var=False
    )
    print(f"P-valor (Receita Média): {p_val:.4f}\n")