In [2]:
import pandas as pd
from datetime import datetime
import warnings

import json

# Phase label printed in the start/end banners of this notebook.
fase = '03_features'

# Silence every warning category for the rest of the session.
warnings.simplefilter('ignore')

In [3]:
# Read the shared pipeline configuration (paths and variable lists) from the
# JSON file that sits next to this notebook.
with open('gen_config.json') as config_file:
    gen_config = json.load(config_file)

In [4]:
# Base directory under which the parquet inputs and the output are stored.
folder = gen_config['folder']

# File names used by this phase: three inputs (norm, prod_data, dtw) and the
# output target (lag). A missing key raises KeyError, as with direct indexing.
path_norm, path_prod_data, path_lag, path_dtw = (
    gen_config[clave]
    for clave in ('path_norm', 'path_prod_data', 'path_lag', 'path_dtw')
)

# Lag/window sizes and the categorical variables used for group statistics.
var_lags, var_cates_feat = gen_config['var_lags'], gen_config['var_cates_feat']

# Start-of-phase banner, centred in a 100-character rule.
print(f"{'COMIENZA':-^100}")
print(f"{fase:-^100}")

----------------------------------------------COMIENZA----------------------------------------------
--------------------------------------------03_features---------------------------------------------


In [5]:
# Load the inputs of this phase. The parquet files live under `folder`; the
# product list is a tab-separated text file one directory above the notebook.
df = pd.read_parquet(f"{folder}/{path_norm}")
prod_data = pd.read_parquet(f"{folder}/{path_prod_data}")
# Forward slash instead of a backslash: '\p' is an invalid escape sequence
# (a SyntaxWarning on recent Python versions) and the backslash separator only
# resolves on Windows, whereas '/' works on every platform.
prod_a_predecir = pd.read_csv('../productos_a_predecir.txt', sep='\t')
df_dtw = pd.read_parquet(f"{folder}/{path_dtw}")

## Pendientes
---
### operaciones sobre tn
* HECHO - razones con lags
* HECHO - marcar cero real
* HECHO - date features:
    - HECHO - mes
    - HECHO - quarter
    - HECHO - mes en el quarter
* HECHO - tiempos de monotonía de la función
* máximo salto
* expanding mean

### basadas en producto
* HECHO - info del producto
* HECHO - porcentual de venta
    - HECHO - del mes
    - HECHO - del quarter
    - HECHO - del año
* porcentual de categoria
* HECHO - stats de categoria1 y 2
* HECHO - stats de brand

## Generales

In [6]:
# Date features derived from the YYYYMM `periodo` column.
df['periodo_dt'] = pd.to_datetime(df['periodo'], format='%Y%m')

# Order rows chronologically inside each (product, customer) series.
df = df.sort_values(by=['product_id', 'customer_id', 'periodo_dt'])

fecha = df['periodo_dt'].dt
df = df.assign(
    mes=fecha.month,
    quarter=fecha.quarter,
    # Position of the month inside its quarter: 1, 2 or 3.
    month_in_quarter=(fecha.month - 1) % 3 + 1,
    year=fecha.year,
    # Quarter label as text, stored as a categorical column.
    yearquarter=fecha.to_period('Q').astype('str').astype('category'),
)

## Operaciones sobre tn

In [7]:
# Flag "zero" rows: tn below one tenth of the row's median_tn value.
df['tn_cero'] = df['tn'].lt(df['median_tn'].div(10))

In [8]:
# Lag / window sizes iterated by the feature loops below; taken from the
# 'var_lags' entry of the configuration.
lags = var_lags
#print(f"Lags/Variables: {lags}")

In [68]:
# Lag features over tn_norm, computed per (product_id, customer_id) series.
# Column creation order matches the original: max_, min_, avg_, tn_lag_,
# tn_diff_, ratio_, tn_diff2_.

def _log_creado(nombre, inicio):
    """Overwrite the current output line with the creation time of a column.

    nombre: name of the column that was just created.
    inicio: datetime captured right before the column was computed.
    """
    ahora = datetime.now()
    print(f"Creado {nombre} en {(ahora - inicio).total_seconds():.2f} a las {ahora.strftime('%H:%M:%S')}", end='\r')


# Build the groupings once: the key columns do not change while the feature
# columns are being added, so every loop can reuse them.
claves_serie = ['product_id', 'customer_id']
tn_por_serie = df.groupby(claves_serie)['tn_norm']
# as_index=False variant used for the rolling windows, exactly as before, so
# the rolling result can be compared with / assigned to df row by row.
tn_por_serie_plano = df.groupby(claves_serie, as_index=False)['tn_norm']

for lag in lags:
    # rolling max flag: True when the current value equals the window maximum
    start_time = datetime.now()
    df[f'max_{lag}'] = tn_por_serie_plano.rolling(lag).max()['tn_norm'] == df['tn_norm']
    _log_creado(f'max_{lag}', start_time)
print('')

for lag in lags:
    # rolling min flag: True when the current value equals the window minimum
    start_time = datetime.now()
    df[f'min_{lag}'] = tn_por_serie_plano.rolling(lag).min()['tn_norm'] == df['tn_norm']
    _log_creado(f'min_{lag}', start_time)
print('')

for lag in lags:
    # rolling average over the last `lag` rows of the series
    start_time = datetime.now()
    df[f'avg_{lag}'] = tn_por_serie_plano.rolling(lag).mean()['tn_norm']
    _log_creado(f'avg_{lag}', start_time)
print('')

for lag in lags:
    # value of the series `lag` rows earlier
    start_time = datetime.now()
    df[f'tn_lag_{lag}'] = tn_por_serie.shift(lag)
    _log_creado(f'tn_lag_{lag}', start_time)
print('')

for lag in lags:
    # first derivative / delta: current value minus the value `lag` rows earlier
    start_time = datetime.now()
    df[f'tn_diff_{lag}'] = tn_por_serie.diff(lag)
    _log_creado(f'tn_diff_{lag}', start_time)
print('')

for lag in lags:
    # ratio against the lagged value: (lagged - current) / lagged
    start_time = datetime.now()
    df[f'ratio_{lag}'] = (-df[f'tn_diff_{lag}'])/df[f'tn_lag_{lag}']
    _log_creado(f'ratio_{lag}', start_time)
print('')

for deriv in range(1, 13):
    # Second derivative for orders 1..12. This needs tn_diff_1..tn_diff_12, so
    # `lags` must contain 1..12 (KeyError otherwise). The diff is taken over
    # the whole column; with rows sorted by series, the leading NaN of every
    # series' tn_diff keeps values from leaking across series.
    start_time = datetime.now()
    df[f'tn_diff2_{deriv}'] = df[f'tn_diff_{deriv}'].diff()
    # Fixed: the message used to print `lag` (the last lag of the previous
    # loop) instead of the order actually being computed.
    _log_creado(f'tn_diff2_{deriv}', start_time)
print('')


Creado max_36 en 3.72 a las 20:56:51
Creado min_36 en 3.64 a las 20:57:56
Creado avg_36 en 3.67 a las 20:59:02
Creado tn_lag_36 en 0.16 a las 20:59:04
Creado tn_diff_36 en 0.17 a las 20:59:07
Creado ratio_36 en 0.03 a las 20:59:08
Creado tn_diff2_36 en 0.02 a las 20:59:08


In [69]:
# Monotonicity: for every i in lags, crece_{i+1} is True when the series value
# i rows back is greater than the value i+1 rows back.

# Both shifts must stay inside the same (product_id, customer_id) series. The
# original grouped the second shift by product_id only, which compared values
# belonging to different customers.
tn_por_serie = df.groupby(['product_id', 'customer_id'])['tn_norm']

for i in lags:
    df[f'crece_{i+1}'] = (tn_por_serie.shift(i) - tn_por_serie.shift(i + 1)) > 0

# Explicit column list instead of filter(like='crece_'): on a re-run the
# substring filter would also pick up 'crece_sum' and 'decrece_sum'.
crece_cols = [f'crece_{i+1}' for i in lags]

df['crece_sum'] = df[crece_cols].sum(axis=1)
# Complement over the number of flags actually built (was a hard-coded 12).
# TODO (from the original note): rows at the start of a series have NaN
# comparisons, which evaluate to False and are therefore counted here.
df['decrece_sum'] = len(crece_cols) - df['crece_sum']

## nivel producto

In [9]:
# Attach the product attributes to every row (left join on product_id).
# The empty suffix pair is kept as in the original call.
df = df.merge(prod_data, how='left', on='product_id', suffixes=('', ''))

In [10]:
# Convert every object-dtype column to the categorical dtype.
columnas_texto = df.select_dtypes(include=['object']).columns
df = df.astype({columna: 'category' for columna in columnas_texto})

In [11]:
# Per-category statistics of tn: one frame per variable in var_cates_feat,
# with columns <variable>, <variable>_total, <variable>_median, <variable>_max.
var_stats = []

for variable in var_cates_feat:
    # Built-in aggregations instead of groupby.apply with a Python lambda:
    # same statistics for every group present in df, computed in vectorized
    # form. More statistics (e.g. 'min', 'mean', 'std') can be enabled by
    # adding them to the list together with a matching rename entry.
    var_stat = (
        df.groupby(variable)['tn']
        .agg(['sum', 'median', 'max'])
        .rename(columns={
            'sum': f'{variable}_total',
            'median': f'{variable}_median',
            'max': f'{variable}_max',
        })
        .reset_index()
    )
    var_stats.append(var_stat)

In [12]:
# Join each per-category statistics frame back onto df through its variable.
for variable, estadisticas in zip(var_cates_feat, var_stats):
    df = df.merge(estadisticas, how='left', on=variable)

In [13]:
# Time granularities over which shares are computed (also used by the next cell).
tiempos = ['periodo', 'yearquarter', 'year']

for tiempo in tiempos:
    claves = [tiempo, 'customer_id']
    col_total = f'tn_total_{tiempo}'

    # Total tn per customer for the time bucket.
    totales = df.groupby(claves)['tn'].sum().reset_index(name=col_total)

    # Row's tn as a percentage of that total; the helper total column is
    # dropped once the share has been computed.
    df = df.merge(totales, on=claves)
    df[f'prop_product_{tiempo}'] = df['tn'].div(df[col_total]).mul(100)
    df = df.drop(columns=col_total)

In [14]:
# Share of each row's tn inside its (time bucket, customer, category) total,
# for every combination of time granularity and categorical variable.
for tiempo in tiempos:
    for variable in var_cates_feat:
        claves = [tiempo, 'customer_id', variable]
        col_total = f'tn_total_{tiempo}_{variable}'

        # Aggregate once; the original ran this identical groupby twice in a
        # row and discarded the first result.
        combi_sums = df.groupby(claves)['tn'].sum().reset_index()
        combi_sums = combi_sums.rename(columns={'tn': col_total})

        df = df.merge(combi_sums, on=claves)
        df[f'prop_product_{tiempo}_{variable}'] = (df['tn'] / df[col_total]) * 100
        # The total is only a helper for the percentage above.
        df = df.drop(columns=col_total)

## DTW

In [15]:
# Store every df_dtw column after the first one as categorical.
# NOTE(review): 'customer_id' is a merge key in the next cell; if it is not
# the first column it gets converted here too. Confirm the column layout.
columnas_dtw = df_dtw.columns[1:]
for columna in columnas_dtw:
    df_dtw[columna] = df_dtw[columna].astype('category')

In [16]:
# Attach the DTW columns to each (product, customer) series; rows without a
# match are kept (left join).
df = pd.merge(df, df_dtw, how='left', on=['product_id', 'customer_id'])

## ajustes finales pre export

In [17]:
# Restore a deterministic row order before exporting: the merges above do not
# keep the original index.
orden_final = ['product_id', 'customer_id', 'periodo']
df = df.sort_values(orden_final)

In [18]:
# Write the feature table to the configured output path, without the index.
ruta_salida = f'{folder}/{path_lag}'
df.to_parquet(ruta_salida, index=False)

In [19]:
# End-of-phase banner: phase label first, then the closing rule followed by
# three blank lines.
for linea in (f"{fase:-^100}", f"{'FINALIZA':-^100}\n\n\n"):
    print(linea)

--------------------------------------------03_features---------------------------------------------
----------------------------------------------FINALIZA----------------------------------------------



