In [1]:
#import pandas as pd
import polars as pl
from datetime import datetime
import warnings

import json

# Silence every warning for the remainder of the notebook run.
warnings.filterwarnings(action='ignore')

# Name of this pipeline stage; it is echoed in the start/end banners.
fase = '03_features'

In [2]:
# Load the shared pipeline configuration (paths and feature settings).
# The encoding is pinned to UTF-8 so the JSON file is decoded the same way on
# every platform instead of depending on the OS default code page.
with open('gen_config.json', 'r', encoding='utf-8') as file:
    gen_config = json.load(file)

In [3]:
# Base directory that every parquet path below is resolved against.
folder = gen_config['folder']

# Input artefacts (file names joined to `folder` when they are read).
(
    path_norm,
    path_prod_data,
    path_dtw,
    path_prod_stats,
    path_overall_prod_stats,
) = (
    gen_config[key]
    for key in (
        'path_norm',
        'path_prod_data',
        'path_dtw',
        'path_prod_stats',
        'path_overall_prod_stats',
    )
)

# Output artefact written at the end of this stage.
path_lag = gen_config['path_lag']

# Feature-engineering settings: lag sizes and the categorical columns to aggregate on.
var_lags, var_cates_feat = gen_config['var_lags'], gen_config['var_cates_feat']

# Start-of-stage banner (each label centred in a 100-character rule).
for banner in ('COMIENZA', fase):
    print(f"{banner:-^100}")

----------------------------------------------COMIENZA----------------------------------------------
--------------------------------------------03_features---------------------------------------------


In [4]:
# Load every input table for this stage from the configured folder.
df = pl.read_parquet(f"{folder}/{path_norm}")
prod_data = pl.read_parquet(f"{folder}/{path_prod_data}")
prod_stats = pl.read_parquet(f"{folder}/{path_prod_stats}")
overall_prod_stats = pl.read_parquet(f"{folder}/{path_overall_prod_stats}")
# Forward slash instead of '..\productos...': the backslash form relied on the
# invalid escape sequence '\p' (a warning on recent Python versions) and only
# resolved on Windows. '/' is accepted on Windows and POSIX alike.
prod_a_predecir = pl.read_csv('../productos_a_predecir.txt', separator='\t')
df_dtw = pl.read_parquet(f"{folder}/{path_dtw}")

## Pendientes
---
### operaciones sobre tn
* HECHO - razones con lags
* HECHO - marcar cero real
* HECHO - date features:
    - HECHO - mes
    - HECHO - quarter
    - HECHO - mes en el quarter
* HECHO - tiempos de monotonía de la función
* máximo salto
* expanding mean

* HECHO - basadas en producto
* HECHO - info del producto
* HECHO - porcentual de venta
    - HECHO del mes
    - HECHO del quarter
    - HECHO del año
* porcentual de categoria
* HECHO - stats de categoria1 y 2
* HECHO - stats de brand

## Generales

In [5]:
# Calendar features derived from `periodo`. Its string form is treated as
# YYYYMM: "01" is appended and the result parsed as a %Y%m%d date.
periodo_as_date = (pl.col("periodo").cast(pl.Utf8) + "01").str.to_date("%Y%m%d")
fecha = pl.col('periodo_dt')

df = (
    df
    # First day of the period as a proper date.
    .with_columns(periodo_as_date.alias("periodo_dt"))
    # Month, year, quarter and 1-based position of the month inside its quarter.
    .with_columns(
        fecha.dt.month().alias('mes'),
        fecha.dt.year().alias('year'),
        fecha.dt.quarter().alias('quarter'),
        ((fecha.dt.month() - 1) % 3 + 1).alias('month_in_quarter'),
    )
    # Label such as "2019-Q3", used later as a grouping key.
    .with_columns(
        (pl.col("year").cast(pl.Utf8) + "-Q" + pl.col("quarter").cast(pl.Utf8)).alias("yearquarter"),
    )
    .sort(by=['product_id', 'customer_id', 'periodo_dt'])
)

In [6]:
# Attach each product's first/last period taken from the overall product stats.
# coalesce=False keeps the right-hand join key as a separate column; clashing
# right-hand names receive the '_overall' suffix.
periodos = overall_prod_stats.select('product_id', 'primer_periodo', 'ultimo_periodo')
df = df.join(
    periodos,
    on='product_id',
    how='left',
    coalesce=False,
    suffix='_overall',
)

In [7]:
# Age of the product at each period, in days since its first period.
# `dt.total_days()` replaces the deprecated `dt.days()` accessor.
df = df.with_columns(
    (pl.col('periodo_dt') - pl.col('primer_periodo')).dt.total_days().alias('edad_dias'),
)

# Life-stage bucket derived from the age:
#   * first period equal to 2017-01-01 -> always "Adulto", whatever the age
#   * age < 35 days   -> "Infante"
#   * age < 190 days  -> "Joven"
#   * age >= 190 days -> "Adulto"
#   * anything else (e.g. a null age) -> "Otro"
# `primer_periodo` is a date column (see the displayed check frame), so it is
# compared against a date literal rather than a datetime.
conditional_expr = (
    pl.when(pl.col('primer_periodo') == pl.date(2017, 1, 1))
    .then(pl.lit("Adulto"))
    .when(pl.col('edad_dias') < 35)
    .then(pl.lit("Infante"))
    .when(pl.col('edad_dias') < 190)
    .then(pl.lit("Joven"))
    .when(pl.col('edad_dias') >= 190)
    .then(pl.lit("Adulto"))
    .otherwise(pl.lit("Otro"))
)

df = df.with_columns(conditional_expr.cast(pl.Categorical).alias('edad'))

# Quick visual sanity check of the bucketing.
check = df.select(['periodo_dt', 'primer_periodo', 'edad_dias', 'edad'])
check

periodo_dt,primer_periodo,edad_dias,edad
date,date,i64,cat
2017-01-01,2017-01-01,0,"""Adulto"""
2017-02-01,2017-01-01,31,"""Adulto"""
2017-03-01,2017-01-01,59,"""Adulto"""
2017-04-01,2017-01-01,90,"""Adulto"""
2017-05-01,2017-01-01,120,"""Adulto"""
…,…,…,…
2019-08-01,2019-03-01,153,"""Joven"""
2019-09-01,2019-03-01,184,"""Joven"""
2019-10-01,2019-03-01,214,"""Adulto"""
2019-11-01,2019-03-01,245,"""Adulto"""


In [8]:
# df.filter((pl.col('product_id')==20001) & (pl.col('customer_id')==10001))['tn'].sum()

In [9]:
# df.filter(pl.col('product_id')==20001)['tn'].sum()

## Operaciones sobre tn

In [10]:
# Bring in the statistics keyed by (product, customer); clashing column names
# coming from prod_stats get the '_prodcust' suffix.
df = df.join(
    prod_stats,
    on=['product_id', 'customer_id'],
    how='left',
    coalesce=True,
    suffix='_prodcust',
)

In [11]:
# Flag "zero" sales: rows whose tn falls below one tenth of `median_tn`
# (presumably the per product/customer median joined from prod_stats - verify).
umbral_cero = pl.col('median_tn') / 10
df = df.with_columns((pl.col('tn') < umbral_cero).alias('tn_cero'))

In [12]:
# Lag sizes taken from the configuration; the metric loops iterate over this list.
lags = var_lags

In [13]:
# Order rows so that, inside every (product_id, customer_id) group, periods are
# consecutive and ascending; the shift/diff/rolling features rely on this.
# No set_sorted() call here: flagging all three keys as sorted would tell polars
# that customer_id and periodo are globally ascending, which a lexicographic
# sort does not give (they generally restart inside every product). A false
# sorted flag can make fast paths return wrong results.
df = df.sort(by=["product_id", "customer_id", "periodo"])

In [14]:
# Wall-clock start for the whole metrics cell (read by the final summary print).
start_time = datetime.now()

# Plain lags: tn_norm shifted by `lag` rows inside each product/customer series.
start_time2 = datetime.now()
for lag in lags:
    desplazado = pl.col("tn_norm").shift(lag).over(["product_id", "customer_id"])
    df = df.with_columns(desplazado.alias(f'tn_lag_{lag}'))
print(f"Lags creados en: {(datetime.now() - start_time2).total_seconds():.2f} a las {datetime.now().strftime('%H:%M:%S')}", end='\n')

# First difference: tn_norm minus its value `lag` rows earlier, per product/customer.
start_time2 = datetime.now()
for lag in lags:
    delta = pl.col("tn_norm").diff(lag).over(["product_id", "customer_id"])
    df = df.with_columns(delta.alias(f'tn_diff_{lag}'))
print(f"Diff creados en: {(datetime.now() - start_time2).total_seconds():.2f} a las {datetime.now().strftime('%H:%M:%S')}", end='\n')

# Second difference: the first-difference columns differenced again with the
# same step. Only steps 1-3 are built, so tn_diff_1..3 must already exist.
start_time2 = datetime.now()
df = df.with_columns([
    pl.col(f'tn_diff_{paso}').diff(paso).over(["product_id", "customer_id"]).alias(f'tn_diff_2_{paso}')
    for paso in (1, 2, 3)
])
print(f"Diff2 creados en: {(datetime.now() - start_time2).total_seconds():.2f} a las {datetime.now().strftime('%H:%M:%S')}", end='\n')

# Ratios: negated first difference divided by the lagged value,
# i.e. (tn_lag - tn_norm) / tn_lag for each configured lag.
start_time2 = datetime.now()
for lag in lags:
    delta, base = pl.col(f'tn_diff_{lag}'), pl.col(f'tn_lag_{lag}')
    df = df.with_columns(((-delta) / base).alias(f'ratio_{lag}'))
print(f"Ratios creados en: {(datetime.now() - start_time2).total_seconds():.2f} a las {datetime.now().strftime('%H:%M:%S')}", end='\n')

# Rolling means of tn_norm with a window of `lag` rows, per product/customer.
start_time2 = datetime.now()
for lag in lags:
    media_movil = pl.col('tn_norm').rolling_mean(lag).over(["product_id", "customer_id"])
    df = df.with_columns(media_movil.alias(f'avg_{lag}'))
print(f"Promedios creados en: {(datetime.now() - start_time2).total_seconds():.2f} a las {datetime.now().strftime('%H:%M:%S')}", end='\n')

# Maximum flags: True when the current tn_norm equals the rolling maximum of
# the last `lag` rows of its product/customer series.
start_time2 = datetime.now()
for lag in lags:
    tope = pl.col('tn_norm').rolling_max(lag).over(["product_id", "customer_id"])
    df = df.with_columns((tope == pl.col('tn_norm')).alias(f'max_{lag}'))
print(f"Maximos creados en: {(datetime.now() - start_time2).total_seconds():.2f} a las {datetime.now().strftime('%H:%M:%S')}", end='\n')

# Minimum flags: True when the current tn_norm equals the rolling minimum of
# the last `lag` rows of its product/customer series.
# The columns are named min_{lag}. They were previously aliased max_{lag},
# which silently overwrote the maximum flags built just before.
start_time2 = datetime.now()
for lag in lags:
    df = df.with_columns([
        (pl.col('tn_norm').rolling_min(lag).over(["product_id", "customer_id"]) == pl.col('tn_norm')).alias(f'min_{lag}')
    ])
print(f"Minimos creados en: {(datetime.now() - start_time2).total_seconds():.2f} a las {datetime.now().strftime('%H:%M:%S')}", end='\n')

# Increasing-monotony flags: crece_{k+1} is True when tn_lag_{k} is strictly
# greater than tn_lag_{k+1}, for k = 1..11 (so tn_lag_1..12 must exist).
start_time2 = datetime.now()
df = df.with_columns([
    ((pl.col(f'tn_lag_{k}') - pl.col(f'tn_lag_{k + 1}')) > 0).alias(f'crece_{k + 1}')
    for k in range(1, 12)
])
print(f"Monotonia_creciente creados en: {(datetime.now() - start_time2).total_seconds():.2f} a las {datetime.now().strftime('%H:%M:%S')}", end='\n')

# Decreasing-monotony flags: decrece_{lag+1} is True when tn_lag_{lag} is
# strictly smaller than tn_lag_{lag+1}, for lag = 1..11.
# The comparison is `< 0`. It was `> 0`, which made every decrece_* column an
# exact copy of the matching crece_* column.
start_time2 = datetime.now()
for lag in range(1, 12):
    df = df.with_columns([
        ((pl.col(f'tn_lag_{lag}') - pl.col(f'tn_lag_{lag+1}')) < 0).alias(f'decrece_{lag+1}')
    ])
print(f"Monotonia_decreciente creados en: {(datetime.now() - start_time2).total_seconds():.2f} a las {datetime.now().strftime('%H:%M:%S')}", end='\n')

# Total elapsed time for the whole metrics cell.
print(f"\nFin de creacion de metricas {(datetime.now() - start_time).total_seconds():.2f} a las {datetime.now().strftime('%H:%M:%S')}", end='\n')

Lags creados en: 4.13 a las 20:45:00
Diff creados en: 6.03 a las 20:45:06
Diff2 creados en: 1.65 a las 20:45:07
Ratios creados en: 0.58 a las 20:45:08
Promedios creados en: 4.78 a las 20:45:13
Maximos creados en: 4.66 a las 20:45:17
Minimos creados en: 4.33 a las 20:45:22
Monotonia_creciente creados en: 0.23 a las 20:45:22
Monotonia_decreciente creados en: 0.17 a las 20:45:22

Fin de creacion de metricas 26.57 a las 20:45:22


## nivel producto

In [15]:
# Add the product attributes. The empty suffix means a column-name clash with
# prod_data is not renamed away.
df = df.join(
    prod_data,
    on='product_id',
    how='left',
    coalesce=True,
    suffix='',
)

In [16]:
# Convert every string column to Categorical.
catcols = [nombre for nombre, tipo in df.schema.items() if tipo == pl.Utf8]

df = df.with_columns([pl.col(nombre).cast(pl.Categorical) for nombre in catcols])

print(f"Columnas cambiadas a categorical: {catcols}")

Columnas cambiadas a categorical: ['yearquarter', 'cat1', 'cat2', 'cat3', 'brand', 'descripcion']


In [17]:
# For every configured categorical variable, compute total / median / max of tn
# per (product_id, variable) group and join the three columns back onto df.
# Uses `group_by`, the current spelling also used by the other cells; the
# deprecated `groupby` alias is gone in newer polars releases. The unused
# `var_stats = []` initialiser was dropped.
for variable in var_cates_feat:
    var_stats = df.group_by(['product_id', variable]).agg([
        pl.sum('tn').alias(f'{variable}_total'),
        pl.median('tn').alias(f'{variable}_median'),
        pl.max('tn').alias(f'{variable}_max'),
    ])
    df = df.join(var_stats, on=['product_id', variable], how='left', coalesce=True, suffix='')

In [18]:
# Time granularities at which the sales share is computed.
tiempos = ['periodo', 'yearquarter', 'year']

# Share of each row's tn within its product total for the period / quarter / year.
for tiempo in tiempos:
    tiempo_sums = df.group_by([tiempo, 'product_id']).agg([pl.sum('tn').alias(f'tn_total_{tiempo}')])
    df = df.join(tiempo_sums, on=[tiempo, 'product_id'], how='left', coalesce=True, suffix='')
    df = df.with_columns([
        (pl.col('tn') / pl.col(f'tn_total_{tiempo}')).alias(f'prop_product_{tiempo}')
    ])
    # The helper total is only needed for the division. It is dropped
    # positionally because the `columns=` keyword is not accepted by newer
    # polars releases.
    df = df.drop(f'tn_total_{tiempo}')

In [19]:
# Same share as the per-time proportions, with each categorical variable added
# to the grouping key: tn divided by the (time, product, variable) total.
for tiempo in tiempos:
    for variable in var_cates_feat:
        combi_sums = df.group_by([tiempo, 'product_id', variable]).agg([pl.sum('tn').alias(f'tn_total_{tiempo}_{variable}')])
        df = df.join(combi_sums, on=[tiempo, 'product_id', variable], how='left', coalesce=True, suffix='')
        df = df.with_columns([
            (pl.col('tn') / pl.col(f'tn_total_{tiempo}_{variable}')).alias(f'prop_product_{tiempo}_{variable}')
        ])
        # Drop the helper total positionally; the `columns=` keyword is not
        # accepted by newer polars releases.
        df = df.drop(f'tn_total_{tiempo}_{variable}')

In [20]:
# props = df.columns
# props = [prop for prop in props if prop.startswith("prop_")]

# start_time = datetime.now()

# # lags normales
# start_time2 = datetime.now()



# print(f"Lags creados en: {(datetime.now() - start_time2).total_seconds():.2f} a las {datetime.now().strftime('%H:%M:%S')}", end='\n')

# print(f"\nFin de creacion de metricas {(datetime.now() - start_time).total_seconds():.2f} a las {datetime.now().strftime('%H:%M:%S')}", end='\n')

## DTW

In [21]:
# Attach the DTW table, keyed by (product, customer).
df = df.join(
    df_dtw,
    on=['product_id', 'customer_id'],
    how='left',
    coalesce=True,
    suffix='',
)

## ajustes finales pre export

In [22]:
# Final deterministic row order before export.
# No set_sorted() call here: flagging customer_id and periodo as sorted would
# be a false claim after a lexicographic sort (they generally restart inside
# every product), and a false sorted flag can make polars fast paths return
# wrong results.
df = df.sort(by=["product_id", "customer_id", "periodo"])

In [23]:
# Persist the enriched feature table to the configured output path.
destino = f'{folder}/{path_lag}'
df.write_parquet(destino)

In [24]:
# End-of-stage banner: stage name, then the closing marker followed by blank lines.
print(format(fase, '-^100'))
print(format('FINALIZA', '-^100') + '\n\n\n')

--------------------------------------------03_features---------------------------------------------
----------------------------------------------FINALIZA----------------------------------------------



