In [5]:
#import pandas as pd
import polars as pl
from datetime import datetime
import warnings

import json

# Silence every warning for the rest of the run (this also hides deprecation notices).
warnings.filterwarnings(action="ignore")

# Label of this pipeline stage; it is printed in the start and end banners.
fase = "03_features"

In [6]:
# Read the shared pipeline configuration (paths and feature settings) from JSON.
with open('gen_config.json', 'r') as config_fh:
    gen_config = json.loads(config_fh.read())

In [7]:
# Base directory that every configured path below is joined onto.
folder = gen_config['folder']

# Inputs
path_norm, path_prod_data, path_dtw = (
    gen_config[key] for key in ('path_norm', 'path_prod_data', 'path_dtw')
)
# Outputs
path_lag = gen_config['path_lag']

# Feature-generation settings
var_lags, var_cates_feat = (
    gen_config[key] for key in ('var_lags', 'var_cates_feat')
)

# Start-of-stage banner: each label centred in a 100-character rule of dashes.
print(format('COMIENZA', '-^100'))
print(format(fase, '-^100'))

----------------------------------------------COMIENZA----------------------------------------------
--------------------------------------------03_features---------------------------------------------


In [8]:
# Load the stage inputs; the parquet files live under the configured `folder`.
df = pl.read_parquet(f"{folder}/{path_norm}")
prod_data = pl.read_parquet(f"{folder}/{path_prod_data}")
# Forward slash instead of '..\productos...': '\p' is an invalid escape sequence
# (warned about by recent Python versions) and a backslash separator only
# resolves on Windows, while '/' works on every platform.
# NOTE(review): `prod_a_predecir` is not referenced again in the visible notebook.
prod_a_predecir = pl.read_csv('../productos_a_predecir.txt', separator='\t')
df_dtw = pl.read_parquet(f"{folder}/{path_dtw}")

## Pendientes
---
### operaciones sobre tn
* HECHO - razones con lags
* HECHO - marcar cero real
* HECHO - date features:
    - HECHO - mes
    - HECHO - quarter
    - HECHO - mes en el quarter
* HECHO - tiempos de monotonía de la función
* máximo salto
* expanding mean

### basadas en producto
* HECHO - info del producto
* HECHO - porcentual de venta
    - HECHO - del mes
    - HECHO - del quarter
    - HECHO - del año
* porcentual de categoria
* HECHO - stats de categoria1 y 2
* HECHO - stats de brand

## Generales

In [9]:
# Calendar features derived from `periodo`.
# `periodo` is cast to text, suffixed with "01" and parsed with "%Y%m%d",
# so a yyyymm value becomes the first day of its month.
df = (
    df
    .with_columns(
        (pl.col("periodo").cast(pl.Utf8) + "01").str.to_date("%Y%m%d").alias("periodo_dt"),
    )
    .with_columns(
        pl.col("periodo_dt").dt.month().alias("mes"),
        pl.col("periodo_dt").dt.year().alias("year"),
        pl.col("periodo_dt").dt.quarter().alias("quarter"),
        # Position of the month inside its quarter: 1, 2 or 3.
        ((pl.col("periodo_dt").dt.month() - 1) % 3 + 1).alias("month_in_quarter"),
    )
    .with_columns(
        # Text label of the form "<year>-Q<quarter>".
        (pl.col("year").cast(pl.Utf8) + "-Q" + pl.col("quarter").cast(pl.Utf8)).alias("yearquarter"),
    )
    # Keep each (product, customer) series contiguous and in chronological order.
    .sort(by=["product_id", "customer_id", "periodo_dt"])
)

## Operaciones sobre tn

In [10]:
# Flag rows whose `tn` is below one tenth of `median_tn`.
# NOTE(review): `median_tn` comes from the upstream dataset; presumably a
# per-series median — confirm against the normalisation stage.
df = df.with_columns(
    pl.col('tn').lt(pl.col('median_tn') / 10).alias('tn_cero')
)

In [11]:
# Lag / window sizes iterated by the feature loops, taken from the `var_lags` config entry.
lags = var_lags

In [12]:
# Order rows so every (product_id, customer_id) series is contiguous and chronological;
# the shift/diff/rolling window features rely on this row order.
df = df.sort(by=["product_id", "customer_id", "periodo"])
# After a multi-key sort only the leading key is globally ordered. Flagging
# `customer_id` and `periodo` as sorted would be false (they restart inside each
# product) and can make polars take sorted-data fast paths that return wrong
# results, so only `product_id` is marked.
df = df.set_sorted("product_id")

In [13]:
start_time = datetime.now()

# All window features are computed independently inside each (product, customer) series.
series_keys = ["product_id", "customer_id"]
# Repeated lag values would only rebuild the same columns, so keep the first
# occurrence of each; this lets every step be submitted as one batch of
# uniquely named expressions instead of one with_columns call per lag.
unique_lags = list(dict.fromkeys(lags))


def _log_step(label, since):
    """Print the seconds elapsed since `since` and the current time for one feature step."""
    print(f"{label} creados en: {(datetime.now() - since).total_seconds():.2f} a las {datetime.now().strftime('%H:%M:%S')}", end='\n')


# Plain lags: value of tn_norm `lag` rows earlier in the same series.
start_time2 = datetime.now()
df = df.with_columns([
    pl.col("tn_norm").shift(lag).over(series_keys).alias(f'tn_lag_{lag}')
    for lag in unique_lags
])
_log_step("Lags", start_time2)

# First difference: current tn_norm minus the value `lag` rows earlier.
start_time2 = datetime.now()
df = df.with_columns([
    pl.col("tn_norm").diff(lag).over(series_keys).alias(f'tn_diff_{lag}')
    for lag in unique_lags
])
_log_step("Diff", start_time2)

# Second difference: the same `lag`-step difference applied to the first difference.
start_time2 = datetime.now()
df = df.with_columns([
    pl.col(f'tn_diff_{lag}').diff(lag).over(series_keys).alias(f'tn_diff_2_{lag}')
    for lag in unique_lags
])
_log_step("Diff2", start_time2)

# Ratios: (lagged value - current value) / lagged value.
start_time2 = datetime.now()
df = df.with_columns([
    ((-pl.col(f'tn_diff_{lag}')) / pl.col(f'tn_lag_{lag}')).alias(f'ratio_{lag}')
    for lag in unique_lags
])
_log_step("Ratios", start_time2)

# Rolling means over a window of `lag` rows.
start_time2 = datetime.now()
df = df.with_columns([
    pl.col('tn_norm').rolling_mean(lag).over(series_keys).alias(f'avg_{lag}')
    for lag in unique_lags
])
_log_step("Promedios", start_time2)

# Maximum flags: True when the current value equals the rolling max of the last `lag` rows.
start_time2 = datetime.now()
df = df.with_columns([
    (pl.col('tn_norm').rolling_max(lag).over(series_keys) == pl.col('tn_norm')).alias(f'max_{lag}')
    for lag in unique_lags
])
_log_step("Maximos", start_time2)

# Minimum flags: True when the current value equals the rolling min of the last `lag` rows.
# These were previously aliased `max_{lag}`, which overwrote the maximum flags
# computed just above; they now get their own `min_{lag}` columns.
start_time2 = datetime.now()
df = df.with_columns([
    (pl.col('tn_norm').rolling_min(lag).over(series_keys) == pl.col('tn_norm')).alias(f'min_{lag}')
    for lag in unique_lags
])
_log_step("Minimos", start_time2)

# Increasing steps: True when the value `lag` rows back is greater than the one `lag + 1` rows back.
# Needs the columns tn_lag_1 .. tn_lag_12, so `lags` must contain every value from 1 to 12.
start_time2 = datetime.now()
df = df.with_columns([
    ((pl.col(f'tn_lag_{lag}') - pl.col(f'tn_lag_{lag+1}')) > 0).alias(f'crece_{lag+1}')
    for lag in range(1, 12)
])
_log_step("Monotonia_creciente", start_time2)

# Decreasing steps: True when the value `lag` rows back is smaller than the one `lag + 1` rows back.
# The comparison used to be `> 0`, which made these columns exact copies of `crece_*`.
start_time2 = datetime.now()
df = df.with_columns([
    ((pl.col(f'tn_lag_{lag}') - pl.col(f'tn_lag_{lag+1}')) < 0).alias(f'decrece_{lag+1}')
    for lag in range(1, 12)
])
_log_step("Monotonia_decreciente", start_time2)

print(f"\nFin de creacion de metricas {(datetime.now() - start_time).total_seconds():.2f} a las {datetime.now().strftime('%H:%M:%S')}", end='\n')

Lags creados en: 8.19 a las 15:01:22
Diff creados en: 11.17 a las 15:01:33
Diff2 creados en: 14.03 a las 15:01:47
Ratios creados en: 1.08 a las 15:01:48
Promedios creados en: 8.81 a las 15:01:57
Maximos creados en: 8.85 a las 15:02:06
Minimos creados en: 8.30 a las 15:02:14
Monotonia_creciente creados en: 0.32 a las 15:02:14
Monotonia_decreciente creados en: 0.26 a las 15:02:15

Fin de creacion de metricas 61.02 a las 15:02:15


## nivel producto

In [14]:
# Attach the columns of `prod_data` to every row by product; rows without a
# match are kept (left join) and the key column is coalesced into one.
df = df.join(
    prod_data,
    how='left',
    on='product_id',
    suffix='',
    coalesce=True,
)

In [15]:
# Collect every string-typed column (in frame order) and re-encode it as Categorical.
catcols = [nombre for nombre, tipo in df.schema.items() if tipo == pl.Utf8]

df = df.with_columns([pl.col(nombre).cast(pl.Categorical) for nombre in catcols])

print(f"Columnas cambiadas a categorical: {catcols}")

Columnas cambiadas a categorical: ['yearquarter', 'cat1', 'cat2', 'cat3', 'brand', 'descripcion']


In [16]:
# For each categorical attribute in `var_cates_feat`, add the total, median and
# maximum of `tn` over all rows that share the same value of that attribute.
# The aggregation used to be keyed on `product_id`, which produced the same
# per-product numbers under every `{variable}_*` name; it is now keyed on the
# attribute itself. `group_by` replaces the deprecated `groupby` spelling and
# matches the other cells of this notebook.
# Rows whose attribute is null receive null statistics, because null join keys
# do not match each other by default.
for variable in var_cates_feat:
    var_stats = df.group_by(variable).agg([
        pl.sum('tn').alias(f'{variable}_total'),
        pl.median('tn').alias(f'{variable}_median'),
        pl.max('tn').alias(f'{variable}_max'),
    ])
    df = df.join(var_stats, on=variable, how='left', coalesce=True, suffix='')

In [17]:
# Time grains at which the product totals are computed.
tiempos = ['periodo','yearquarter','year']

# Share of each row's `tn` in the product's total for the same period / quarter / year.
# A window sum replaces the aggregate + join + drop round trip, so no temporary
# total column is created and the `drop(columns=...)` keyword call (not part of
# the current `DataFrame.drop(*columns)` signature) is no longer needed.
df = df.with_columns([
    (pl.col('tn') / pl.col('tn').sum().over([tiempo, 'product_id'])).alias(f'prop_product_{tiempo}')
    for tiempo in tiempos
])

In [18]:
# Share of each row's `tn` in the total of its (time grain, product, attribute) group.
# NOTE(review): if every attribute in `var_cates_feat` is a product-level property,
# adding it to a key that already contains `product_id` does not change the groups,
# so these columns would repeat `prop_product_{tiempo}`; a share inside the category
# would group by [tiempo, variable] only — confirm the intent.
for tiempo in tiempos:
    for variable in var_cates_feat:
        total_col = f'tn_total_{tiempo}_{variable}'
        combi_sums = df.group_by([tiempo, 'product_id', variable]).agg([pl.sum('tn').alias(total_col)])
        df = df.join(combi_sums, on=[tiempo, 'product_id', variable], how='left', coalesce=True, suffix='')
        df = df.with_columns([
            (pl.col('tn') / pl.col(total_col)).alias(f'prop_product_{tiempo}_{variable}')
        ])
        # Positional argument: accepted by every polars release, unlike the
        # `columns=` keyword, which current `DataFrame.drop(*columns)` rejects.
        df = df.drop(total_col)

## DTW

In [19]:
# Attach the columns of `df_dtw` per (product_id, customer_id) pair; rows without
# a match are kept (left join).
df = df.join(
    df_dtw,
    how='left',
    on=['product_id', 'customer_id'],
    suffix='',
    coalesce=True,
)

## ajustes finales pre export

In [20]:
# Restore a deterministic row order before exporting: by product, then customer, then period.
df = df.sort(by=["product_id", "customer_id", "periodo"])
# Only the leading sort key is globally ordered after a multi-key sort, so only
# `product_id` is flagged; marking `customer_id` / `periodo` as sorted would be
# false and can lead polars to wrong results through sorted-data fast paths.
df = df.set_sorted("product_id")

In [21]:
# Persist the feature table as parquet at the configured output location
# (`folder` joined with `path_lag`).
ruta_salida = f'{folder}/{path_lag}'
df.write_parquet(ruta_salida)

In [22]:
# End-of-stage banner: stage label, then the closing rule followed by three blank lines.
print(format(fase, '-^100'))
print(format('FINALIZA', '-^100') + "\n\n\n")

--------------------------------------------03_features---------------------------------------------
----------------------------------------------FINALIZA----------------------------------------------



