In [41]:
#import pandas as pd
import polars as pl
import numpy as np
import json
from datetime import datetime
import matplotlib.pyplot as plt
import os

# Label of this pipeline stage; it is printed in the start and end banners.
fase = '01_LecturaDatos'

In [42]:
# Load the run configuration (output folder and output file names) from disk.
# The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's default locale encoding (e.g. cp1252 on Windows).
with open('gen_config.json', 'r', encoding='utf-8') as file:
    gen_config = json.load(file)

In [43]:
# Directory under which every output of this stage is written.
folder = gen_config['folder']

# outputs: file names for each artifact, taken from the configuration
(
    path_group,
    path_prod_stats,
    path_prod_data,
    path_stock_data,
    path_overall_prod_stats,
) = (
    gen_config['path_group'],
    gen_config['path_prod_stats'],
    gen_config['path_prod_data'],
    gen_config['path_stock_data'],
    gen_config['path_overall_prod_stats'],
)

# Start banner: two 100-character lines, centred and padded with dashes.
print(f"{'COMIENZA':-^100}", f"{fase:-^100}", sep="\n")

----------------------------------------------COMIENZA----------------------------------------------
------------------------------------------01_LecturaDatos-------------------------------------------


In [44]:
# Read the four tab-separated input files from the parent directory, in the
# same order as the names they are bound to.
df, prod_a_predecir, prod_data, stock_data = (
    pl.read_csv(tsv_path, separator='\t')
    for tsv_path in (
        '../sell-in.txt',
        '../productos_a_predecir.txt',
        '../tb_productos_descripcion.txt',
        '../tb_stocks.txt',
    )
)

In [45]:
# Keep only the rows whose product_id appears in the list of products to predict.
df = df.filter(
    pl.col('product_id').is_in(prod_a_predecir.get_column('product_id'))
)

In [46]:
# Total tn for every (periodo, product_id, customer_id) combination.
df_grouped_prodcust = df.group_by('periodo', 'product_id', 'customer_id').agg(
    pl.sum('tn')
)

In [47]:
# Summary statistics of tn for each (product_id, customer_id) pair: first and
# last periodo seen, number of rows, and sum / min / mean / median / std /
# inter-quartile range / max of tn. Sorted by the pair.
# The two periodo bounds are then turned into dates by appending "01" as the
# day and parsing the result as %Y%m%d.
prod_stats = (
    df_grouped_prodcust
    .group_by('product_id', 'customer_id')
    .agg(
        pl.col('periodo').min().alias('primer_periodo'),
        pl.col('periodo').max().alias('ultimo_periodo'),
        pl.len().alias('values'),
        pl.col('tn').sum().alias('total_tn'),
        pl.col('tn').min().alias('min_tn'),
        pl.col('tn').mean().alias('average_tn'),
        pl.col('tn').median().alias('median_tn'),
        pl.col('tn').std().alias('std_dev_tn'),
        (pl.col('tn').quantile(0.75) - pl.col('tn').quantile(0.25)).alias('iqr_tn'),
        pl.col('tn').max().alias('max_tn'),
    )
    .sort('product_id', 'customer_id')
    .with_columns(
        [
            (pl.col(bound).cast(pl.Utf8) + "01").str.to_date("%Y%m%d")
            for bound in ("primer_periodo", "ultimo_periodo")
        ]
    )
)

In [48]:
# Total tn for every (periodo, product_id) combination, across all customers.
df_grouped_prod = df.group_by('periodo', 'product_id').agg(pl.sum('tn'))

In [49]:
# Per-product sum, mean and median of tn, restricted to rows with
# periodo >= 201901, sorted by product_id.
weight_stats = (
    df_grouped_prod
    .filter(pl.col('periodo').ge(201901))
    .group_by('product_id')
    .agg(
        pl.col('tn').sum().alias('tot_weight'),
        pl.col('tn').mean().alias('avg_weight'),
        pl.col('tn').median().alias('med_weight'),
    )
    .sort('product_id')
)

In [50]:
# Product-level summary of tn over all periods: first and last periodo,
# number of rows, and sum / min / mean / median / std / inter-quartile range /
# max of tn, sorted by product_id. The periodo bounds are converted to dates
# (day fixed to "01") and the weight statistics are attached with a left join
# on product_id.
overall_prod_stats = (
    df_grouped_prod
    .group_by('product_id')
    .agg(
        pl.col('periodo').min().alias('primer_periodo'),
        pl.col('periodo').max().alias('ultimo_periodo'),
        pl.len().alias('values'),
        pl.col('tn').sum().alias('total_tn'),
        pl.col('tn').min().alias('min_tn'),
        pl.col('tn').mean().alias('average_tn'),
        pl.col('tn').median().alias('median_tn'),
        pl.col('tn').std().alias('std_dev_tn'),
        (pl.col('tn').quantile(0.75) - pl.col('tn').quantile(0.25)).alias('iqr_tn'),
        pl.col('tn').max().alias('max_tn'),
    )
    .sort('product_id')
    .with_columns(
        [
            (pl.col(bound).cast(pl.Utf8) + "01").str.to_date("%Y%m%d")
            for bound in ("primer_periodo", "ultimo_periodo")
        ]
    )
    .join(weight_stats, on='product_id', how='left', coalesce=True)
)

In [51]:
# Distinct (product_id, customer_id) pairs and distinct periods present in the
# grouped sales, each sorted ascending.
combinaciones = (
    df_grouped_prodcust
    .select("product_id", "customer_id")
    .unique()
    .sort("product_id", 'customer_id')
)
todos_periodos = (
    df_grouped_prodcust
    .select("periodo")
    .unique()
    .sort("periodo")
)

In [52]:
# Build the dense (product_id, customer_id, periodo) panel:
#   1. pair every product/customer combination with every period;
#   2. bring in the product-level first/last period dates;
#   3. keep only periods inside the product's [first, last] window;
#   4. attach the observed tn and treat missing combinations as 0;
#   5. drop the helper columns used for the window filter.
df_completo = (
    combinaciones
    .join(todos_periodos, how="cross")
    .join(overall_prod_stats, on=["product_id"], how="left", coalesce=True)
    .select(["product_id", "customer_id", "periodo", "primer_periodo", "ultimo_periodo"])
    .with_columns(
        [
            (pl.col("periodo").cast(pl.Utf8) + "01")
            .str.to_date("%Y%m%d")
            .alias("periodo_dt"),
        ]
    )
    .filter(
        (pl.col('periodo_dt') >= pl.col('primer_periodo'))
        & (pl.col('periodo_dt') <= pl.col('ultimo_periodo'))
    )
    .join(
        df_grouped_prodcust,
        on=['product_id', 'customer_id', 'periodo'],
        how='left',
        coalesce=True,
    )
    .with_columns(pl.col("tn").fill_null(0))
    .drop(['primer_periodo', 'ultimo_periodo', 'periodo_dt'])
)

In [53]:
# Largest and smallest sku_size within each
# (cat1, cat2, cat3, brand, descripcion) group of products.
#
# No `set_sorted` call is made: it only *declares* columns as sorted without
# checking or sorting them, and nothing in this notebook sorts the product
# table. A wrong sorted flag can make polars return incorrect results, and
# group_by does not need sorted input.
prod_data_skuinfo = (
    prod_data
    .select(['cat1', 'cat2', 'cat3', 'brand', 'descripcion', 'sku_size'])
    .group_by(['cat1', 'cat2', 'cat3', 'brand', 'descripcion'])
    .agg([
        pl.col("sku_size").max().alias("max_skusize"),
        pl.col("sku_size").min().alias("min_skusize"),
    ])
)

# Attach the group-level size range to every product row (left join keeps the
# original rows and their order).
# NOTE(review): re-running this cell on an already-joined prod_data will add
# suffixed duplicate columns; run it once per kernel session.
prod_data = prod_data.join(prod_data_skuinfo, on=['cat1', 'cat2', 'cat3', 'brand', 'descripcion'], how='left', coalesce=True)


cat1,cat2,cat3,brand,sku_size,product_id,descripcion,max_skusize,min_skusize
str,str,str,str,i64,i64,str,i64,i64
"""FOODS""","""ADEREZOS""","""Aji Picante""","""NATURA""",240,20609,"""Salsa Aji Picante""",240,240
"""FOODS""","""ADEREZOS""","""Barbacoa""","""NATURA""",250,20266,"""Salsa Barbacoa""",500,250
"""FOODS""","""ADEREZOS""","""Barbacoa""","""NATURA""",400,20325,"""Salsa Barbacoa""",500,250
"""FOODS""","""ADEREZOS""","""Barbacoa""","""NATURA""",500,20503,"""Salsa Barbacoa""",500,250
"""FOODS""","""ADEREZOS""","""Chimichurri""","""NATURA""",350,20797,"""Chimichurri""",350,350
…,…,…,…,…,…,…,…,…
"""REF""","""TE""","""Frutas""","""TWININGS""",20,21271,"""Frutas""",20,20
"""REF""","""TE""","""Hierbas""","""TWININGS""",20,21202,"""Manzanilla""",20,20
"""REF""","""TE""","""Hierbas""","""TWININGS""",20,21218,"""Menta""",20,20
"""REF""","""TE""","""Verde""","""TWININGS""",20,21192,"""Verde""",20,20


In [56]:
# Show the product table to inspect the joined max_skusize / min_skusize columns.
prod_data

cat1,cat2,cat3,brand,sku_size,product_id,descripcion,max_skusize,min_skusize
str,str,str,str,i64,i64,str,i64,i64
"""FOODS""","""ADEREZOS""","""Aji Picante""","""NATURA""",240,20609,"""Salsa Aji Picante""",240,240
"""FOODS""","""ADEREZOS""","""Barbacoa""","""NATURA""",250,20266,"""Salsa Barbacoa""",500,250
"""FOODS""","""ADEREZOS""","""Barbacoa""","""NATURA""",400,20325,"""Salsa Barbacoa""",500,250
"""FOODS""","""ADEREZOS""","""Barbacoa""","""NATURA""",500,20503,"""Salsa Barbacoa""",500,250
"""FOODS""","""ADEREZOS""","""Chimichurri""","""NATURA""",350,20797,"""Chimichurri""",350,350
…,…,…,…,…,…,…,…,…
"""REF""","""TE""","""Frutas""","""TWININGS""",20,21271,"""Frutas""",20,20
"""REF""","""TE""","""Hierbas""","""TWININGS""",20,21202,"""Manzanilla""",20,20
"""REF""","""TE""","""Hierbas""","""TWININGS""",20,21218,"""Menta""",20,20
"""REF""","""TE""","""Verde""","""TWININGS""",20,21192,"""Verde""",20,20


In [61]:
# Classify each product by where its sku_size sits within its group's range:
#   "Unico"  - it is both the largest and the smallest size (only one size);
#   "Grande" - it is the largest size;
#   "Chico"  - it is the smallest size;
#   "Medio"  - anything in between.
# Branch order matters: the "Unico" case must be tested first.
conditional_expr = (
    pl.when(
        pl.col('sku_size').eq(pl.col('max_skusize'))
        & pl.col('sku_size').eq(pl.col('min_skusize'))
    )
    .then(pl.lit("Unico"))
    .when(pl.col('sku_size').eq(pl.col('max_skusize')))
    .then(pl.lit("Grande"))
    .when(pl.col('sku_size').eq(pl.col('min_skusize')))
    .then(pl.lit("Chico"))
    .otherwise(pl.lit("Medio"))
)

# Store the label as a categorical column named 'presentacion'.
prod_data = prod_data.with_columns(
    conditional_expr.cast(pl.Categorical).alias('presentacion')
)

In [62]:
# Persist every artifact of this stage as parquet under the configured folder,
# in the same order as before.
for frame, destination in (
    (df_completo, f'{folder}/{path_group}'),
    (prod_stats, f'{folder}/{path_prod_stats}'),
    (overall_prod_stats, f'{folder}/{path_overall_prod_stats}'),
    (prod_data, f'{folder}/{path_prod_data}'),
    (stock_data, f'{folder}/{path_stock_data}'),
):
    frame.write_parquet(destination)

In [63]:
# End banner: stage name followed by the closing marker, each centred in a
# 100-character line padded with dashes.
print(f"{fase:-^100}", f"{'FINALIZA':-^100}", sep="\n")

------------------------------------------01_LecturaDatos-------------------------------------------
----------------------------------------------FINALIZA----------------------------------------------
