In [22]:
import pandas as pd
from datetime import datetime
import warnings

import json

# Silence every Python warning for the rest of the run.
# NOTE(review): this also hides pandas deprecation/future warnings — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Name of this pipeline stage; it is printed in the start and end banners.
fase = '03_features'

In [23]:
# Read the shared pipeline configuration (folder, file names and feature settings) from JSON.
with open('gen_config.json', 'r') as config_file:
    gen_config = json.load(config_file)

In [24]:
# Base folder that holds every intermediate file of the pipeline.
folder = gen_config['folder']

# File names (relative to `folder`) read or written by this stage.
path_norm, path_prod_data, path_lag, path_dtw = (
    gen_config[key] for key in ('path_norm', 'path_prod_data', 'path_lag', 'path_dtw')
)

# Feature settings: lag values and the categorical columns to aggregate on.
var_lags, var_cates_feat = (gen_config[key] for key in ('var_lags', 'var_cates_feat'))

# Start banner, centred in a 100-character line of dashes.
for banner in ('COMIENZA', fase):
    print(f"{banner:-^100}")

----------------------------------------------COMIENZA----------------------------------------------
--------------------------------------------03_features---------------------------------------------


In [25]:
# Load the inputs of this stage from the configured folder.
df = pd.read_parquet(f"{folder}/{path_norm}")
prod_data = pd.read_parquet(f"{folder}/{path_prod_data}")
# Forward slash instead of '..\p...': '\p' is an invalid escape sequence and a
# backslash is only a path separator on Windows, while '/' works on every platform.
# NOTE(review): prod_a_predecir is not used by the cells visible here — confirm it is still needed.
prod_a_predecir = pd.read_csv('../productos_a_predecir.txt', sep='\t')
df_dtw = pd.read_parquet(f"{folder}/{path_dtw}")

## Pendientes
---
### operaciones sobre tn
* HECHO - razones con lags
* HECHO - marcar cero real
* HECHO - date features:
    - HECHO - mes
    - HECHO - quarter
    - HECHO - mes en el quarter
* HECHO - tiempos de monotonía de la función
* máximo salto
* expanding mean

### basadas en producto
* HECHO - info del producto
* HECHO - porcentual de venta
    - HECHO - del mes
    - HECHO - del quarter
    - HECHO - del año
* porcentual de categoria
* HECHO - stats de categoria1 y 2
* HECHO - stats de brand

## Generales

In [26]:
# Calendar features derived from the YYYYMM `periodo` column.
df = (
    df.assign(periodo_dt=lambda d: pd.to_datetime(d['periodo'], format='%Y%m'))
      # order rows by product and then chronologically
      .sort_values(by=['product_id', 'periodo_dt'])
      .assign(
          mes=lambda d: d['periodo_dt'].dt.month,
          quarter=lambda d: d['periodo_dt'].dt.quarter,
          # position of the month inside its quarter (1, 2 or 3)
          month_in_quarter=lambda d: (d['periodo_dt'].dt.month - 1) % 3 + 1,
          year=lambda d: d['periodo_dt'].dt.year,
          # quarter label stored as a categorical string
          yearquarter=lambda d: d['periodo_dt'].dt.to_period('Q').astype('str').astype('category'),
      )
)

## Operaciones sobre tn

In [27]:
# Flag rows whose `tn` is below one tenth of `median_tn` as effective zeros.
# NOTE(review): `median_tn` is presumably the per-product median computed upstream — confirm.
umbral_cero = df['median_tn'] / 10
df['tn_cero'] = df['tn'].lt(umbral_cero)

In [28]:
# Lag values from the configuration; the cells below use them as rolling-window
# sizes and as shift/diff offsets.
lags = var_lags
#print(f"Lags/Variables: {lags}")

In [29]:
# Lag features on `tn_norm`.
# Every window, shift and difference is evaluated inside its product_id group.
# Running rolling()/diff() on the whole frame would mix the last rows of one
# product with the first rows of the next and make `tn_diff_*` inconsistent with
# the grouped `tn_lag_*` used in the ratio.
tn_norm_by_product = df.groupby('product_id')['tn_norm']

for lag in lags:
    # `w=lag` binds the current window size to each lambda.
    rolling_max = tn_norm_by_product.transform(lambda s, w=lag: s.rolling(w, min_periods=1).max())
    rolling_min = tn_norm_by_product.transform(lambda s, w=lag: s.rolling(w, min_periods=1).min())

    # True when the current value is the maximum / minimum of its trailing window
    df[f'max_{lag}'] = rolling_max == df['tn_norm']
    df[f'min_{lag}'] = rolling_min == df['tn_norm']
    # trailing mean over the window
    df[f'avg_{lag}'] = tn_norm_by_product.transform(lambda s, w=lag: s.rolling(w, min_periods=1).mean())
    # value observed `lag` rows earlier for the same product
    df[f'tn_lag_{lag}'] = tn_norm_by_product.shift(lag)
    # first difference / delta against that lagged value
    df[f'tn_diff_{lag}'] = tn_norm_by_product.diff(lag)
    # relative change with respect to the lagged value: (lagged - current) / lagged
    # NOTE(review): a lagged value of 0 yields +/-inf or NaN here.
    df[f'ratio_{lag}'] = (-df[f'tn_diff_{lag}']) / df[f'tn_lag_{lag}']

In [30]:
# Monotonicity: for each lag i, did tn_norm grow between the observations
# i+1 and i rows back (within the same product)?
tn_norm_by_product = df.groupby('product_id')['tn_norm']

crece_cols = []      # explicit list, so re-running the cell never picks up crece_sum/decrece_sum
comparaciones = 0    # per-row count of comparisons that had both observations available

for i in lags:
    delta = tn_norm_by_product.shift(i) - tn_norm_by_product.shift(i + 1)
    col = f'crece_{i+1}'
    df[col] = delta > 0
    crece_cols.append(col)
    comparaciones = comparaciones + delta.notna().astype(int)

# Number of steps with growth.
df['crece_sum'] = df[crece_cols].sum(axis=1)
# Number of steps without growth, counted only over comparisons that exist:
# the first rows of a product no longer report missing history as decreases,
# and the total follows len(lags) instead of a hard-coded 12.
df['decrece_sum'] = comparaciones - df['crece_sum']

## nivel producto

In [31]:
# Attach the product attributes to every row (left join keeps all rows of df).
# With both suffixes empty, pandas raises if df and prod_data share any column
# other than the key.
df = df.merge(prod_data, how='left', on='product_id', suffixes=('', ''))

In [32]:
# Convert every object-typed column to the pandas categorical dtype.
columnas_objeto = df.select_dtypes(include=['object']).columns
df = df.astype({nombre: 'category' for nombre in columnas_objeto})

In [33]:
# Per-category statistics of `tn`: one frame per column in var_cates_feat with
# the columns [<variable>, <variable>_total, <variable>_median, <variable>_max].
# Named aggregation replaces groupby.apply with a Series-building lambda: it
# gives the same values through the built-in reductions instead of one Python
# call per group.
var_stats = [
    df.groupby(variable)['tn']
      .agg(**{
          f'{variable}_total': 'sum',
          f'{variable}_median': 'median',
          f'{variable}_max': 'max',
      })
      .reset_index()
    for variable in var_cates_feat
]

In [34]:
# Join each per-category statistics frame back onto df through its own key column.
for variable, estadisticas in zip(var_cates_feat, var_stats):
    df = df.merge(estadisticas, how='left', on=variable)

In [35]:
# Time granularities over which the sales share is computed.
tiempos = ['periodo', 'yearquarter', 'year']

# Share (in %) of each row's `tn` over the total `tn` of its time bucket.
# transform('sum') broadcasts the bucket total to every row, so no temporary
# column or merge is needed and no row can be dropped by an inner join.
for tiempo in tiempos:
    tn_total = df.groupby(tiempo, observed=True)['tn'].transform('sum')
    df[f'prop_product_{tiempo}'] = (df['tn'] / tn_total) * 100

In [36]:
# Share (in %) of each row's `tn` over the total `tn` of its (time bucket, category) pair.
# transform('sum') keeps every row of df: rows whose category value is missing get
# NaN instead of being silently removed, which is what the previous inner merge
# against the grouped totals did. The duplicated groupby line is gone as well.
for tiempo in tiempos:
    for variable in var_cates_feat:
        tn_total = df.groupby([tiempo, variable], observed=True)['tn'].transform('sum')
        df[f'prop_product_{tiempo}_{variable}'] = (df['tn'] / tn_total) * 100

# DTW

In [41]:
# Every DTW column except the first one is stored as a categorical.
columnas_dtw = df_dtw.columns[1:]
df_dtw = df_dtw.astype({nombre: 'category' for nombre in columnas_dtw})

In [17]:
# Attach the DTW columns to every row of the same product (left join keeps all rows of df).
df = pd.merge(df, df_dtw, how='left', on='product_id')

## ajustes finales pre export

In [19]:
# Order the rows by product and period before exporting.
df = df.sort_values(by=['product_id', 'periodo'])

In [20]:
# Write the feature table to the configured output file, without the DataFrame index.
#df.to_parquet('sell_in_lag.parquet', index=False)
df.to_parquet(f'{folder}/{path_lag}', index=False)

In [21]:
# Closing banner: stage name followed by the end marker and three blank lines.
print(f"{fase:-^100}")
print(f"{'FINALIZA':-^100}\n\n\n")

--------------------------------------------03_features---------------------------------------------
----------------------------------------------FINALIZA----------------------------------------------

