In [1]:
#import pandas as pd
import polars as pl
import numpy as np
import json
from datetime import datetime
import matplotlib.pyplot as plt
import os

# Name of this pipeline stage; it is echoed in the start and end banners.
fase = "01_LecturaDatos"

In [2]:
# Load the shared configuration (output folder and file names used by the
# cells below). The encoding is pinned to UTF-8 because JSON is UTF-8 by
# specification, while open() otherwise falls back to the platform locale
# and can mis-decode non-ASCII characters (e.g. on Windows).
with open('gen_config.json', 'r', encoding='utf-8') as file:
    gen_config = json.load(file)

In [3]:
# Base folder that every output file name below is written under.
folder = gen_config['folder']

# Outputs: one configured file name per artifact saved by this notebook.
(
    path_group,
    path_prod_stats,
    path_prod_data,
    path_stock_data,
    path_overall_prod_stats,
) = (
    gen_config[config_key]
    for config_key in (
        'path_group',
        'path_prod_stats',
        'path_prod_data',
        'path_stock_data',
        'path_overall_prod_stats',
    )
)

# Opening banner: each label is centred in a 100-character line of dashes.
print(f"{'COMIENZA':-^100}", f"{fase:-^100}", sep='\n')

----------------------------------------------COMIENZA----------------------------------------------
------------------------------------------01_LecturaDatos-------------------------------------------


In [4]:
# Read the four tab-separated input files from the parent directory, in the
# same order as the names on the left-hand side.
df, prod_a_predecir, prod_data, stock_data = (
    pl.read_csv(ruta_tsv, separator='\t')
    for ruta_tsv in (
        '../sell-in.txt',
        '../productos_a_predecir.txt',
        '../tb_productos_descripcion.txt',
        '../tb_stocks.txt',
    )
)

In [5]:
# Keep only the sell-in rows whose product_id appears in the list of
# products to predict.
productos_objetivo = prod_a_predecir['product_id']
df = df.filter(pl.col('product_id').is_in(productos_objetivo))

In [6]:
# Sum `tn` so that there is exactly one row per (periodo, product_id,
# customer_id) combination.
df_grouped_prodcust = df.group_by('periodo', 'product_id', 'customer_id').agg(pl.sum('tn'))

In [7]:
# Descriptive statistics of `tn` for every (product_id, customer_id) pair,
# computed over the per-period totals in df_grouped_prodcust and sorted by key.
# The first/last period columns are then turned into dates by appending "01"
# (first day of the month) to the period value and parsing it as %Y%m%d.
prod_stats = (
    df_grouped_prodcust
    .group_by(['product_id', 'customer_id'])
    .agg(
        [
            pl.col('periodo').min().alias('primer_periodo'),
            pl.col('periodo').max().alias('ultimo_periodo'),
            pl.len().alias('values'),  # number of periods with data for the pair
            pl.col('tn').sum().alias('total_tn'),
            pl.col('tn').min().alias('min_tn'),
            pl.col('tn').mean().alias('average_tn'),
            pl.col('tn').median().alias('median_tn'),
            pl.col('tn').std().alias('std_dev_tn'),
            # Interquartile range: 75th percentile minus 25th percentile.
            (pl.col('tn').quantile(0.75) - pl.col('tn').quantile(0.25)).alias('iqr_tn'),
            pl.col('tn').max().alias('max_tn'),
        ]
    )
    .sort(['product_id', 'customer_id'])
    .with_columns(
        [
            (pl.col(columna_limite).cast(pl.Utf8) + "01").str.to_date("%Y%m%d")
            for columna_limite in ("primer_periodo", "ultimo_periodo")
        ]
    )
)

In [8]:
# Sum `tn` across customers: one row per (periodo, product_id).
df_grouped_prod = df.group_by('periodo', 'product_id').agg(pl.sum('tn'))

In [9]:
# Per-product total, mean and median of `tn`, restricted to periods from
# 201901 onwards, sorted by product_id.
weight_stats = (
    df_grouped_prod
    .filter(pl.col('periodo') >= 201901)
    .group_by(['product_id'])
    .agg(
        [
            pl.col('tn').sum().alias('tot_weight'),
            pl.col('tn').mean().alias('avg_weight'),
            pl.col('tn').median().alias('med_weight'),
        ]
    )
    .sort(['product_id'])
)

In [10]:
# Product-level descriptive statistics of `tn` over the per-period totals in
# df_grouped_prod. The first/last period columns are converted to dates
# (period value + "01", parsed as %Y%m%d), and the weight_stats columns are
# attached with a left join so every product row is kept even when it has no
# matching row in weight_stats.
overall_prod_stats = (
    df_grouped_prod
    .group_by(['product_id'])
    .agg(
        [
            pl.col('periodo').min().alias('primer_periodo'),
            pl.col('periodo').max().alias('ultimo_periodo'),
            pl.len().alias('values'),  # number of periods with data for the product
            pl.col('tn').sum().alias('total_tn'),
            pl.col('tn').min().alias('min_tn'),
            pl.col('tn').mean().alias('average_tn'),
            pl.col('tn').median().alias('median_tn'),
            pl.col('tn').std().alias('std_dev_tn'),
            # Interquartile range: 75th percentile minus 25th percentile.
            (pl.col('tn').quantile(0.75) - pl.col('tn').quantile(0.25)).alias('iqr_tn'),
            pl.col('tn').max().alias('max_tn'),
        ]
    )
    .sort(['product_id'])
    .with_columns(
        [
            (pl.col(columna_limite).cast(pl.Utf8) + "01").str.to_date("%Y%m%d")
            for columna_limite in ("primer_periodo", "ultimo_periodo")
        ]
    )
    .join(weight_stats, on='product_id', how='left', coalesce=True)
)

In [11]:
# Distinct periods present in the grouped data, in ascending order.
todos_periodos = df_grouped_prodcust.select("periodo").unique().sort("periodo")

# Distinct (product_id, customer_id) pairs present in the grouped data,
# sorted by product and then customer.
combinaciones = (
    df_grouped_prodcust
    .select("product_id", "customer_id")
    .unique()
    .sort("product_id", 'customer_id')
)

In [12]:
# Build a dense panel: every (product_id, customer_id) pair crossed with every
# period, limited to the periods between the product's first and last period
# (taken from overall_prod_stats, i.e. at product level, not per customer).
# Periods without a row in df_grouped_prodcust get tn = 0.
df_completo = (
    combinaciones
    .join(todos_periodos, how="cross")
    .join(overall_prod_stats, on=["product_id"], how="left", coalesce=True)
    .select(["product_id", "customer_id", "periodo", "primer_periodo", "ultimo_periodo"])
    # periodo + "01" parsed as a date so it can be compared with the date-typed bounds.
    .with_columns(
        [
            (pl.col("periodo").cast(pl.Utf8) + "01").str.to_date("%Y%m%d").alias("periodo_dt"),
        ]
    )
    .filter(
        (pl.col('periodo_dt') >= pl.col('primer_periodo'))
        & (pl.col('periodo_dt') <= pl.col('ultimo_periodo'))
    )
    .join(
        df_grouped_prodcust,
        on=['product_id', 'customer_id', 'periodo'],
        how='left',
        coalesce=True,
    )
    .with_columns(pl.col("tn").fill_null(0))
    # The helper columns were only needed for the period-range filter.
    .drop(['primer_periodo', 'ultimo_periodo', 'periodo_dt'])
)

In [13]:
# Persist every artifact of this stage as Parquet under `folder`.
# The destination directory is created first: on a fresh checkout the output
# folder may not exist yet, and write_parquet would then fail before anything
# is saved. exist_ok=True makes this a no-op when the folder is already there.
for frame, file_name in (
    (df_completo, path_group),
    (prod_stats, path_prod_stats),
    (overall_prod_stats, path_overall_prod_stats),
    (prod_data, path_prod_data),
    (stock_data, path_stock_data),
):
    target = f'{folder}/{file_name}'
    # dirname is never empty here because `target` always contains a "/".
    os.makedirs(os.path.dirname(target), exist_ok=True)
    frame.write_parquet(target)

In [14]:
# Closing banner: stage name, then the end marker, each centred in a
# 100-character line of dashes.
print(f"{fase:-^100}", f"{'FINALIZA':-^100}", sep='\n')

------------------------------------------01_LecturaDatos-------------------------------------------
----------------------------------------------FINALIZA----------------------------------------------
