In [199]:
import pandas as pd
import warnings

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation/performance warnings raised by the cells below.
warnings.filterwarnings('ignore')

In [200]:
# Load the three input files used by this notebook.
df = pd.read_parquet("sell_in_norm.parquet")
# Forward slash instead of '..\p...': '\p' is an invalid escape sequence in a
# normal string literal, and a backslash separator only resolves on Windows.
prod_a_predecir = pd.read_csv('../productos_a_predecir.txt', sep='\t')
prod_data = pd.read_parquet('prod_data.parquet')

## Pendientes
---
### operaciones sobre tn
* HECHO - razones con lags
* HECHO - marcar cero real
* HECHO - date features:
    - HECHO - mes
    - HECHO - quarter
    - HECHO - mes en el quarter
* HECHO - tiempos de monotonía de la función
* máximo salto
* expanding mean

### basadas en producto
* HECHO - info del producto
* porcentual de venta
    - del mes
    - del quarter
* porcentual de categoría
* HECHO - stats de categoría 1 y 2
* HECHO - stats de brand

## Generales

In [201]:
# Calendar features derived from the YYYYMM `periodo` column.
periodo_dt = pd.to_datetime(df['periodo'], format='%Y%m')
df = (
    df.assign(periodo_dt=periodo_dt)
      .sort_values(by=['product_id', 'periodo_dt'])
)

fechas = df['periodo_dt'].dt
df = df.assign(
    mes=fechas.month,
    quarter=fechas.quarter,
    # position of the month inside its quarter: 1, 2 or 3
    month_in_quarter=(fechas.month - 1) % 3 + 1,
    year=fechas.year,
    yearquarter=fechas.to_period('Q').astype('category'),
)

In [202]:
# Display the frame to inspect the date columns added above.
df

Unnamed: 0,product_id,periodo,tn,primer_periodo,ultimo_periodo,values,total_tn,min_tn,average_tn,median_tn,std_dev_tn,iqr_tn,max_tn,tn_norm,periodo_dt,mes,quarter,month_in_quarter,year,yearquarter
0,20001,201701,934.77222,2017-01-01,2019-12-01,36.0,50340.39558,798.01620,1398.344322,1418.02343,298.145460,335.515348,2295.19832,-1.554852,2017-01-01,1,1,1,2017,2017Q1
1,20001,201702,798.01620,2017-01-01,2019-12-01,36.0,50340.39558,798.01620,1398.344322,1418.02343,298.145460,335.515348,2295.19832,-2.013541,2017-02-01,2,1,2,2017,2017Q1
2,20001,201703,1303.35771,2017-01-01,2019-12-01,36.0,50340.39558,798.01620,1398.344322,1418.02343,298.145460,335.515348,2295.19832,-0.318592,2017-03-01,3,1,3,2017,2017Q1
3,20001,201704,1069.96130,2017-01-01,2019-12-01,36.0,50340.39558,798.01620,1398.344322,1418.02343,298.145460,335.515348,2295.19832,-1.101419,2017-04-01,4,2,1,2017,2017Q2
4,20001,201705,1502.20132,2017-01-01,2019-12-01,36.0,50340.39558,798.01620,1398.344322,1418.02343,298.145460,335.515348,2295.19832,0.348343,2017-05-01,5,2,2,2017,2017Q2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22344,21276,201908,0.01265,2019-03-01,2019-12-01,10.0,0.45447,0.00223,0.045447,0.02710,0.043618,0.065710,0.12249,-0.751911,2019-08-01,8,3,2,2019,2019Q3
22345,21276,201909,0.01856,2019-03-01,2019-12-01,10.0,0.45447,0.00223,0.045447,0.02710,0.043618,0.065710,0.12249,-0.616417,2019-09-01,9,3,3,2019,2019Q3
22346,21276,201910,0.02079,2019-03-01,2019-12-01,10.0,0.45447,0.00223,0.045447,0.02710,0.043618,0.065710,0.12249,-0.565291,2019-10-01,10,4,1,2019,2019Q4
22347,21276,201911,0.03341,2019-03-01,2019-12-01,10.0,0.45447,0.00223,0.045447,0.02710,0.043618,0.065710,0.12249,-0.275963,2019-11-01,11,4,2,2019,2019Q4


## Operaciones sobre tn

In [203]:
# Flag near-zero sales: rows whose tn is below one tenth of median_tn.
umbral_cero = df['median_tn'].div(10)
df['tn_cero'] = df['tn'].lt(umbral_cero)

In [204]:
# lag features
# Every windowed statistic is computed per product. The frame stacks all product
# series one after another, so an ungrouped rolling()/diff() would mix the last
# periods of one product into the first periods of the next one (and would be
# inconsistent with tn_lag_*, which was already grouped).
tn_by_product = df.groupby('product_id')['tn_norm']

for lag in range(1, 13):
    window = tn_by_product.rolling(lag, min_periods=1)
    # groupby().rolling() prepends product_id to the index: drop that level and
    # realign to df's rows before comparing/assigning.
    roll_max = window.max().droplevel(0).reindex(df.index)
    roll_min = window.min().droplevel(0).reindex(df.index)
    roll_avg = window.mean().droplevel(0).reindex(df.index)

    # is the current value the max / min of its trailing window?
    df[f'max_{lag}'] = roll_max == df['tn_norm']
    df[f'min_{lag}'] = roll_min == df['tn_norm']
    # trailing mean
    df[f'avg_{lag}'] = roll_avg
    # lagged value
    df[f'tn_lag_{lag}'] = tn_by_product.shift(lag)
    # first difference / delta against the value `lag` periods back
    df[f'tn_diff_{lag}'] = tn_by_product.diff(lag)
    # ratio against the lag: (lagged - current) / lagged
    df[f'ratio_{lag}'] = (-df[f'tn_diff_{lag}']) / df[f'tn_lag_{lag}']

for deriv in range(1, 13):
    # second difference, also restricted to each product's own series
    df[f'tn_diff2_{deriv}'] = df.groupby('product_id')[f'tn_diff_{deriv}'].diff()

In [205]:
# monotonicity
# crece_k is True when tn_norm increased between k periods back and k-1 periods
# back within the same product (k = 1 compares the current row with the previous one).
n_steps = 13
tn_by_product = df.groupby('product_id')['tn_norm']
crece_cols = [f'crece_{step}' for step in range(1, n_steps + 1)]
# number of comparisons that actually had both values available on each row
valid_steps = pd.Series(0, index=df.index)

for i, col in enumerate(crece_cols):
    delta = tn_by_product.shift(i) - tn_by_product.shift(i + 1)
    df[col] = delta > 0
    valid_steps = valid_steps + delta.notna().astype(int)

# Explicit column list instead of filter(like='crece_'): that pattern also matches
# 'crece_sum' and 'decrece_sum', which breaks the sums when the cell is re-run.
crece_columns = df[crece_cols]

df['crece_sum'] = crece_columns.sum(axis=1)
# Non-increasing steps among the comparisons that exist. The previous hard-coded
# "12 - sum" disagreed with the 13 columns built above and counted the missing
# history of a product's first periods as decreases.
df['decrece_sum'] = valid_steps - df['crece_sum']

## nivel producto

In [206]:
# Attach the product attributes to every sales row (left join keeps all rows of df).
# NOTE(review): with both suffixes empty, pandas cannot disambiguate overlapping
# non-key columns, so this presumably relies on product_id being the only shared
# column — confirm.
df = df.merge(prod_data, how='left', on='product_id', suffixes=('', ''))

In [207]:
# Store every object-dtype column as a pandas categorical.
object_cols = df.select_dtypes(include=['object']).columns
df = df.astype({name: 'category' for name in object_cols})

In [208]:
# Aggregate tn statistics (total, median, max) for each grouping variable.
# var_stats[i] holds the statistics frame for variables[i]; each frame has the
# grouping column plus '<variable>_total', '<variable>_median' and '<variable>_max'.
variables = ['cat1', 'cat2', 'brand']
var_stats = []

for variable in variables:
    # Named aggregation replaces groupby.apply with a per-group pd.Series: same
    # columns, but vectorised and without applying a lambda to every group.
    # Further statistics (min, mean, std, IQR) can be added to this mapping.
    var_stat = (
        df.groupby(variable, observed=True)['tn']
          .agg(**{
              f'{variable}_total': 'sum',
              f'{variable}_median': 'median',
              f'{variable}_max': 'max',
          })
          .reset_index()
    )
    var_stats.append(var_stat)

In [209]:
# Attach each grouping variable's statistics frame to df, keyed on that variable.
for key_col, stats_frame in zip(variables, var_stats):
    df = df.merge(stats_frame, how='left', on=key_col)

In [213]:
# Share (in %) of each row's tn within the total tn of its time bucket.
tiempos = ['periodo', 'yearquarter', 'year']

for tiempo in tiempos:
    # transform('sum') broadcasts the bucket total back onto every row, so no
    # merge, temporary total column or in-place drop is needed.
    tn_total = df.groupby(tiempo, observed=True)['tn'].transform('sum')
    df[f'prop_product_{tiempo}'] = (df['tn'] / tn_total) * 100

In [214]:
# Share (in %) of each row's tn within the total tn of its (time bucket, group) pair.
variables = ['cat1', 'cat2', 'brand']

for tiempo in tiempos:
    for variable in variables:
        # The previous merge used the default inner join, which silently dropped
        # every row whose `variable` is missing (e.g. a product with no match in
        # the product table). transform('sum') keeps all rows; rows with a
        # missing key simply get a NaN share.
        tn_total = df.groupby([tiempo, variable], observed=True)['tn'].transform('sum')
        df[f'prop_product_{tiempo}_{variable}'] = (df['tn'] / tn_total) * 100

## ajustes finales pre export

In [210]:
# Order the rows by product and then by period before exporting.
sort_keys = ['product_id', 'periodo']
df = df.sort_values(by=sort_keys)

In [211]:
# Preview the first 15 rows of the final feature table.
df.head(15)

Unnamed: 0,product_id,periodo,tn,primer_periodo,ultimo_periodo,values,total_tn,min_tn,average_tn,median_tn,...,sku_size,cat1_total,cat1_median,cat1_max,cat2_total,cat2_median,cat2_max,brand_total,brand_median,brand_max
0,20001,201701,934.77222,2017-01-01,2019-12-01,36.0,50340.39558,798.0162,1398.344322,1418.02343,...,3000,739462.55101,50.57541,2295.19832,416986.2853,174.456945,2295.19832,66195.75601,66.56284,2295.19832
1,20001,201702,798.0162,2017-01-01,2019-12-01,36.0,50340.39558,798.0162,1398.344322,1418.02343,...,3000,739462.55101,50.57541,2295.19832,416986.2853,174.456945,2295.19832,66195.75601,66.56284,2295.19832
2,20001,201703,1303.35771,2017-01-01,2019-12-01,36.0,50340.39558,798.0162,1398.344322,1418.02343,...,3000,739462.55101,50.57541,2295.19832,416986.2853,174.456945,2295.19832,66195.75601,66.56284,2295.19832
3,20001,201704,1069.9613,2017-01-01,2019-12-01,36.0,50340.39558,798.0162,1398.344322,1418.02343,...,3000,739462.55101,50.57541,2295.19832,416986.2853,174.456945,2295.19832,66195.75601,66.56284,2295.19832
4,20001,201705,1502.20132,2017-01-01,2019-12-01,36.0,50340.39558,798.0162,1398.344322,1418.02343,...,3000,739462.55101,50.57541,2295.19832,416986.2853,174.456945,2295.19832,66195.75601,66.56284,2295.19832
5,20001,201706,1520.06539,2017-01-01,2019-12-01,36.0,50340.39558,798.0162,1398.344322,1418.02343,...,3000,739462.55101,50.57541,2295.19832,416986.2853,174.456945,2295.19832,66195.75601,66.56284,2295.19832
6,20001,201707,1030.67391,2017-01-01,2019-12-01,36.0,50340.39558,798.0162,1398.344322,1418.02343,...,3000,739462.55101,50.57541,2295.19832,416986.2853,174.456945,2295.19832,66195.75601,66.56284,2295.19832
7,20001,201708,1267.39462,2017-01-01,2019-12-01,36.0,50340.39558,798.0162,1398.344322,1418.02343,...,3000,739462.55101,50.57541,2295.19832,416986.2853,174.456945,2295.19832,66195.75601,66.56284,2295.19832
8,20001,201709,1316.94604,2017-01-01,2019-12-01,36.0,50340.39558,798.0162,1398.344322,1418.02343,...,3000,739462.55101,50.57541,2295.19832,416986.2853,174.456945,2295.19832,66195.75601,66.56284,2295.19832
9,20001,201710,1439.75563,2017-01-01,2019-12-01,36.0,50340.39558,798.0162,1398.344322,1418.02343,...,3000,739462.55101,50.57541,2295.19832,416986.2853,174.456945,2295.19832,66195.75601,66.56284,2295.19832


In [212]:
# Write the feature table to parquet; index=False leaves the index out of the file.
df.to_parquet('sell_in_lag.parquet', index=False)