In [44]:
import pandas as pd

import warnings

warnings.filterwarnings('ignore')

In [45]:
df = pd.read_parquet("sell_in_norm.parquet")

In [46]:
df['periodo_dt'] = pd.to_datetime(df['periodo'], format='%Y%m')
df = df.sort_values(by=['product_id', 'periodo_dt'])

In [47]:
# date features

df['mes'] = df['periodo_dt'].dt.month
df['quarter'] = df['periodo_dt'].dt.quarter
df['month_in_quarter'] = (df['periodo_dt'].dt.month - 1) % 3 + 1
df['year'] = df['periodo_dt'].dt.year

In [48]:
# marcar ceros
df['tn_cero'] = df['tn'] < df['median_tn'] / 10

In [49]:
# lag features
for lag in range(1, 13):
    # rolling max
    df[f'max_{lag}'] = df['tn_norm'].rolling(lag, min_periods=1).max() == df['tn_norm']
    # rolling min
    df[f'min_{lag}'] = df['tn_norm'].rolling(lag, min_periods=1).min() == df['tn_norm']
    # rolling avg
    df[f'avg_{lag}'] = df['tn_norm'].rolling(lag, min_periods=1).mean()
    # lags
    df[f'tn_lag_{lag}'] = df.groupby('product_id')['tn_norm'].shift(lag)
    # primer derivada / delta
    df[f'tn_diff_{lag}'] = df['tn_norm'].diff(lag)
    # ratios con lags
    df[f'ratio_{lag}'] = (-df[f'tn_diff_{lag}'])/df[f'tn_lag_{lag}']

for deriv in range(1, 13):
    # segunda derivada
    df[f'tn_diff2_{deriv}'] = df[f'tn_diff_{deriv}'].diff()

In [50]:
for i in range(13):
    df[f'crece_{i+1}'] = (df.groupby('product_id')['tn_norm'].shift(i) - df.groupby('product_id')['tn_norm'].shift(i+1)) > 0

crece_columns = df.filter(like='crece_')

df['crece_sum'] = crece_columns.sum(axis=1)
df['decrece_sum'] = 12 - crece_columns.sum(axis=1) #corregir para los primeros

In [51]:
#df[['periodo', 'tn_norm', 'crece_1', 'crece_2', 'crece_3', 'crece_4', 'crece_5','crece_sum','decrece_sum']]

# Pendientes
---
* razones con lags  ----------HECHO
* marcar cero real  ----------HECHO
* date features:    ----------HECHO
    - mes           ----------HECHO
    - quarter       ----------HECHO
    - mes en el quarter ------HECHO
* porcentual de venta
    - del mes
    - del quarter
* porcentual de la categoria en total
* tiempos de monotonia de la funcion ------HECHO
* maximo salto

In [52]:
df = df.sort_values(by=['product_id', 'periodo'])

In [53]:
df.head(15)

Unnamed: 0,product_id,periodo,tn,primer_periodo,ultimo_periodo,values,total_tn,min_tn,average_tn,median_tn,...,crece_6,crece_7,crece_8,crece_9,crece_10,crece_11,crece_12,crece_13,crece_sum,decrece_sum
0,20001,201701,934.77222,2017-01-01,2019-12-01,36.0,50340.39558,798.0162,1398.344322,1418.02343,...,False,False,False,False,False,False,False,False,0,12
1,20001,201702,798.0162,2017-01-01,2019-12-01,36.0,50340.39558,798.0162,1398.344322,1418.02343,...,False,False,False,False,False,False,False,False,0,12
2,20001,201703,1303.35771,2017-01-01,2019-12-01,36.0,50340.39558,798.0162,1398.344322,1418.02343,...,False,False,False,False,False,False,False,False,1,11
3,20001,201704,1069.9613,2017-01-01,2019-12-01,36.0,50340.39558,798.0162,1398.344322,1418.02343,...,False,False,False,False,False,False,False,False,1,11
4,20001,201705,1502.20132,2017-01-01,2019-12-01,36.0,50340.39558,798.0162,1398.344322,1418.02343,...,False,False,False,False,False,False,False,False,2,10
5,20001,201706,1520.06539,2017-01-01,2019-12-01,36.0,50340.39558,798.0162,1398.344322,1418.02343,...,False,False,False,False,False,False,False,False,3,9
6,20001,201707,1030.67391,2017-01-01,2019-12-01,36.0,50340.39558,798.0162,1398.344322,1418.02343,...,False,False,False,False,False,False,False,False,3,9
7,20001,201708,1267.39462,2017-01-01,2019-12-01,36.0,50340.39558,798.0162,1398.344322,1418.02343,...,True,False,False,False,False,False,False,False,4,8
8,20001,201709,1316.94604,2017-01-01,2019-12-01,36.0,50340.39558,798.0162,1398.344322,1418.02343,...,False,True,False,False,False,False,False,False,5,7
9,20001,201710,1439.75563,2017-01-01,2019-12-01,36.0,50340.39558,798.0162,1398.344322,1418.02343,...,True,False,True,False,False,False,False,False,6,6


In [54]:
df.to_parquet('sell_in_lag.parquet', index=False)