In [214]:
#import pandas as pd
import polars as pl
import numpy as np
import json
from datetime import datetime
import matplotlib.pyplot as plt
import os

# Label of this pipeline stage; it is printed in the start and end banners.
fase = '01_LecturaDatos'

In [215]:
# Load the shared run configuration (output folder and file names).
# The encoding is pinned to UTF-8 so the file is decoded the same way on every
# platform instead of depending on the locale's default encoding.
with open('gen_config.json', 'r', encoding='utf-8') as file:
    gen_config = json.load(file)

In [216]:
# Base folder for everything this stage writes.
folder = gen_config['folder']

# Inputs: none are taken from the configuration in this stage.

# Outputs: one configured file name per artefact, looked up in a fixed order.
(
    path_group,
    path_prod_stats,
    path_prod_data,
    path_stock_data,
    path_overall_prod_stats,
) = (
    gen_config[clave]
    for clave in (
        'path_group',
        'path_prod_stats',
        'path_prod_data',
        'path_stock_data',
        'path_overall_prod_stats',
    )
)

# Variables: none defined in this stage.

# Start banner: each label centred in a 100-character line of dashes.
for etiqueta in ('COMIENZA', fase):
    print(f"{etiqueta:-^100}")

----------------------------------------------COMIENZA----------------------------------------------
------------------------------------------01_LecturaDatos-------------------------------------------


In [217]:
# Read the four tab-separated source files from the parent directory, in order:
# sales (df), products to predict, product descriptions and stock data.
df, prod_a_predecir, prod_data, stock_data = (
    pl.read_csv(ruta, separator='\t')
    for ruta in (
        '../sell-in.txt',
        '../productos_a_predecir.txt',
        '../tb_productos_descripcion.txt',
        '../tb_stocks.txt',
    )
)

In [218]:
# Keep only the sales rows whose product appears in the to-predict list.
df = df.filter(
    pl.col('product_id').is_in(prod_a_predecir['product_id'])
)

In [219]:
# Total `tn` per (periodo, product_id, customer_id).
df_grouped_prodcust = df.group_by(
    ['periodo', 'product_id', 'customer_id']
).agg(pl.sum('tn'))

In [220]:
# Summary statistics of `tn` for every (product_id, customer_id) pair:
# first/last periodo, number of periods with data, and distribution measures.
prod_stats = (
    df_grouped_prodcust
    .group_by(['product_id', 'customer_id'])
    .agg(
        pl.col('periodo').min().alias('primer_periodo'),
        pl.col('periodo').max().alias('ultimo_periodo'),
        pl.len().alias('values'),
        pl.col('tn').sum().alias('total_tn'),
        pl.col('tn').min().alias('min_tn'),
        pl.col('tn').mean().alias('average_tn'),
        pl.col('tn').median().alias('median_tn'),
        pl.col('tn').std().alias('std_dev_tn'),
        # Interquartile range: 75th minus 25th percentile.
        (pl.col('tn').quantile(0.75) - pl.col('tn').quantile(0.25)).alias('iqr_tn'),
        pl.col('tn').max().alias('max_tn'),
    )
    .sort(['product_id', 'customer_id'])
)

# Turn both period bounds into dates by appending day "01" to the period value
# and parsing it as %Y%m%d; each column is replaced in place.
prod_stats = prod_stats.with_columns(
    [
        (pl.col(limite).cast(pl.Utf8) + "01").str.to_date("%Y%m%d")
        for limite in ("primer_periodo", "ultimo_periodo")
    ]
)

In [221]:
# Total `tn` per (periodo, product_id), i.e. summed over all customers.
df_grouped_prod = df.group_by(['periodo', 'product_id']).agg(pl.sum('tn'))

In [222]:
# Per-product weight figures computed only from periods 201901 onwards:
# total, mean and median of the per-period `tn`.
weight_stats = (
    df_grouped_prod
    .filter(pl.col('periodo') >= 201901)
    .group_by(['product_id'])
    .agg(
        pl.col('tn').sum().alias('tot_weight'),
        pl.col('tn').mean().alias('avg_weight'),
        pl.col('tn').median().alias('med_weight'),
    )
    .sort(['product_id'])
)

In [223]:
# Summary statistics of `tn` per product (all customers combined):
# first/last periodo, number of periods with data, and distribution measures.
overall_prod_stats = (
    df_grouped_prod
    .group_by(['product_id'])
    .agg(
        pl.col('periodo').min().alias('primer_periodo'),
        pl.col('periodo').max().alias('ultimo_periodo'),
        pl.len().alias('values'),
        pl.col('tn').sum().alias('total_tn'),
        pl.col('tn').min().alias('min_tn'),
        pl.col('tn').mean().alias('average_tn'),
        pl.col('tn').median().alias('median_tn'),
        pl.col('tn').std().alias('std_dev_tn'),
        # Interquartile range: 75th minus 25th percentile.
        (pl.col('tn').quantile(0.75) - pl.col('tn').quantile(0.25)).alias('iqr_tn'),
        pl.col('tn').max().alias('max_tn'),
    )
    .sort(['product_id'])
)

# Turn both period bounds into dates by appending day "01" to the period value
# and parsing it as %Y%m%d; each column is replaced in place.
overall_prod_stats = overall_prod_stats.with_columns(
    [
        (pl.col(limite).cast(pl.Utf8) + "01").str.to_date("%Y%m%d")
        for limite in ("primer_periodo", "ultimo_periodo")
    ]
)

# Attach the weight columns; products without rows in weight_stats get nulls.
overall_prod_stats = overall_prod_stats.join(
    weight_stats,
    on='product_id',
    how='left',
    coalesce=True,
)

In [224]:
# Distinct (product_id, customer_id) pairs present in the grouped sales, sorted.
combinaciones = (
    df_grouped_prodcust
    .select(["product_id", "customer_id"])
    .unique()
    .sort(["product_id", 'customer_id'])
)
# Distinct periods present in the grouped sales, sorted ascending.
todos_periodos = (
    df_grouped_prodcust
    .select(["periodo"])
    .unique()
    .sort("periodo")
)

In [225]:
# Build the dense (product, customer, periodo) grid with zero-filled tonnage.
df_completo = (
    combinaciones
    # Every pair crossed with every period.
    .join(todos_periodos, how="cross")
    # Bring in the product-level first/last period (as dates).
    .join(overall_prod_stats, on=["product_id"], how="left", coalesce=True)
    .select(["product_id", "customer_id", "periodo", "primer_periodo", "ultimo_periodo"])
    # Date version of periodo (day "01" appended) so it can be compared with the bounds.
    .with_columns(
        (pl.col("periodo").cast(pl.Utf8) + "01").str.to_date("%Y%m%d").alias("periodo_dt")
    )
    # Keep only periods inside the product's own first..last range (inclusive).
    .filter(
        (pl.col('periodo_dt') >= pl.col('primer_periodo'))
        & (pl.col('periodo_dt') <= pl.col('ultimo_periodo'))
    )
    # Attach the observed tonnage; combinations without sales become 0.
    .join(
        df_grouped_prodcust,
        on=['product_id', 'customer_id', 'periodo'],
        how='left',
        coalesce=True,
    )
    .with_columns(pl.col("tn").fill_null(0))
    # Helper columns are no longer needed once the grid is filtered.
    .drop(['primer_periodo', 'ultimo_periodo', 'periodo_dt'])
)

In [226]:
# Largest and smallest `sku_size` within each
# (cat1, cat2, cat3, brand, descripcion) group, joined back onto every product
# row as `max_skusize` / `min_skusize`.
#
# No `set_sorted` flag is applied before grouping: that call only *asserts*
# that the columns are already sorted (it neither checks nor sorts), so it can
# yield wrong results on unsorted data, and group_by does not need sorted input.
prod_data_skuinfo = (
    prod_data
    .select(['cat1', 'cat2', 'cat3', 'brand', 'descripcion', 'sku_size'])
    .group_by(['cat1', 'cat2', 'cat3', 'brand', 'descripcion'])
    .agg([
        pl.col("sku_size").max().alias("max_skusize"),
        pl.col("sku_size").min().alias("min_skusize"),
    ])
)
prod_data = prod_data.join(
    prod_data_skuinfo,
    on=['cat1', 'cat2', 'cat3', 'brand', 'descripcion'],
    how='left',
    coalesce=True,
)


In [227]:
# Whether a row's sku_size equals the largest / smallest size of its group.
_es_max_sku = pl.col('sku_size') == pl.col('max_skusize')
_es_min_sku = pl.col('sku_size') == pl.col('min_skusize')

# Size label relative to the group: both max and min -> "Unico", only max ->
# "Grande", only min -> "Chico", anything else -> "Medio".
conditional_expr = (
    pl.when(_es_max_sku & _es_min_sku).then(pl.lit("Unico"))
    .when(_es_max_sku).then(pl.lit("Grande"))
    .when(_es_min_sku).then(pl.lit("Chico"))
    .otherwise(pl.lit("Medio"))
)

# Store the label as a categorical column named `presentacion`.
prod_data = prod_data.with_columns(
    presentacion=conditional_expr.cast(pl.Categorical)
)

In [228]:
#df_completo.filter((pl.col('product_id') == 20667) & (pl.col('customer_id') == 10427))

In [229]:
# Report: for each threshold k = 1..12, how much tonnage and how many
# (product, customer) pairs have at most k periods with data, both overall and
# restricted to pairs whose last period is before 2019-06-01.
rep_tot = prod_stats['total_tn'].sum()
rep_tot_tn, rep_count, rep_tot_pre, rep_count_pre = [], [], [], []

for max_values in range(1, 13):
    # Pairs with at most `max_values` periods of data.
    tn_hasta = prod_stats.filter(pl.col('values') <= max_values)['total_tn']
    # Same, but only pairs whose last period falls before June 2019.
    tn_hasta_pre = prod_stats.filter(
        (pl.col('values') <= max_values)
        & (pl.col('ultimo_periodo') < pl.date(2019, 6, 1))
    )['total_tn']

    rep_tot_tn.append(tn_hasta.sum())
    rep_count.append(tn_hasta.count())
    rep_tot_pre.append(tn_hasta_pre.sum())
    rep_count_pre.append(tn_hasta_pre.count())

rep = pl.DataFrame(
    {
        'Datos presentes': range(1, 13),
        'Total tn': rep_tot_tn,
        'Cuenta ProdCust': rep_count,
        'Total tn (pre junio)': rep_tot_pre,
        'Cuenta ProdCust (pre junio)': rep_count_pre,
    }
).with_columns(
    # Shares are relative to the total tonnage of all pairs.
    (pl.col('Total tn') / rep_tot).alias('Total tn %'),
    (pl.col('Total tn (pre junio)') / rep_tot).alias('Total tn (pre junio) %'),
)

# Allow up to 20 rows so the 12-row report is printed without truncation.
pl.Config.set_tbl_rows(20)
print("Reporte de datos a descartar")
print(rep)


Reporte de datos a descartar
shape: (12, 7)
┌───────────┬───────────────┬──────────┬───────────────┬───────────────┬────────────┬──────────────┐
│ Datos     ┆ Total tn      ┆ Cuenta   ┆ Total tn (pre ┆ Cuenta        ┆ Total tn % ┆ Total tn     │
│ presentes ┆ ---           ┆ ProdCust ┆ junio)        ┆ ProdCust (pre ┆ ---        ┆ (pre junio)  │
│ ---       ┆ f64           ┆ ---      ┆ ---           ┆ junio)        ┆ f64        ┆ %            │
│ i64       ┆               ┆ i64      ┆ f64           ┆ ---           ┆            ┆ ---          │
│           ┆               ┆          ┆               ┆ i64           ┆            ┆ f64          │
╞═══════════╪═══════════════╪══════════╪═══════════════╪═══════════════╪════════════╪══════════════╡
│ 1         ┆ 2526.49537    ┆ 44518    ┆ 1245.37808    ┆ 29204         ┆ 0.002251   ┆ 0.001109     │
│ 2         ┆ 6815.40544    ┆ 73667    ┆ 3031.92855    ┆ 45345         ┆ 0.006071   ┆ 0.002701     │
│ 3         ┆ 12256.17277   ┆ 96002    ┆ 4985.7

In [230]:
# calculo las filas a descartar

# A pair is discarded when it has at most 12 periods with data AND its last
# period is before 2019-06-01.
condition1 = pl.col('values') <= 12
condition2 = pl.col('ultimo_periodo') < pl.date(2019, 6, 1)

# Keys of the (product, customer) pairs to discard.
drop_prodcusts = (
    prod_stats
    .filter(condition1 & condition2)
    .select(['product_id', 'customer_id'])
)

In [231]:
# Remove the discarded pairs from the dense grid (anti join keeps only rows
# with no match in drop_prodcusts) and report the resulting size.
print(f"en df_completo se descartaron {drop_prodcusts.height:_d} / {prod_stats.height:_d} combinaciones de ProdCusts")
df_completo = df_completo.join(
    drop_prodcusts,
    on=['product_id', 'customer_id'],
    how='anti',
    coalesce=True,
)
print(f"Shape df_completo: {df_completo.height:_d}, {df_completo.width:_d}, productos unicos: {df_completo['product_id'].n_unique()}")

# Same removal for the per-pair statistics table. The count is printed before
# prod_stats is reassigned, so it still shows the pre-filter total.
print(f"en prod_stats se descartaron {drop_prodcusts.height:_d} / {prod_stats.height:_d} combinaciones de ProdCusts")
prod_stats = prod_stats.join(
    drop_prodcusts,
    on=['product_id', 'customer_id'],
    how='anti',
    coalesce=True,
)
print(f"Shape prod_stats: {prod_stats.height:_d}, {prod_stats.width:_d}, productos unicos: {prod_stats['product_id'].n_unique()}")

en df_completo se descartaron 84_121 / 262_805 combinaciones de ProdCusts
Shape df_completo: 5_303_555, 4, productos unicos: 780
en prod_stats se descartaron 84_121 / 262_805 combinaciones de ProdCusts
Shape prod_stats: 178_684, 12, productos unicos: 780


In [232]:
# Persist every artefact of this stage as Parquet under `folder`, using the
# file names taken from the configuration. Order matches the original writes.
for tabla, nombre_archivo in (
    (df_completo, path_group),
    (prod_stats, path_prod_stats),
    (overall_prod_stats, path_overall_prod_stats),
    (prod_data, path_prod_data),
    (stock_data, path_stock_data),
):
    tabla.write_parquet(f'{folder}/{nombre_archivo}')

In [233]:
# End banner: stage label, then the closing marker, each centred in a
# 100-character line of dashes.
for etiqueta in (fase, 'FINALIZA'):
    print(f"{etiqueta:-^100}")

------------------------------------------01_LecturaDatos-------------------------------------------
----------------------------------------------FINALIZA----------------------------------------------
