In [38]:
import pandas as pd
import polars as pl
import lightgbm as lgb
from datetime import datetime
import matplotlib.pyplot as plt
import json
import numpy as np
from pprint import pprint

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas warnings (e.g. SettingWithCopyWarning)
# raised by later cells - confirm that this is intended.
warnings.filterwarnings('ignore')

# Stage label printed in the start/end banners of this notebook.
fase = '05_lightgbm (un intento)'

In [39]:
# Read the JSON configuration file that drives this notebook into a dict.
with open('gen_config.json', 'r') as config_file:
    gen_config = json.loads(config_file.read())

In [40]:
# Directory prefix used when building the parquet paths in the cells below.
folder = gen_config['folder']

# Inputs
(path_train,
 path_test,
 path_futuro,
 path_prod_stats,
 path_transform_stats) = (
    gen_config[key]
    for key in ('path_train', 'path_test', 'path_futuro',
                'path_prod_stats', 'path_transform_stats')
)

# Outputs
path_pred_test, path_pred_futuro = (
    gen_config[key] for key in ('path_pred_test', 'path_pred_futuro')
)

# Variables (model parameters and run options)
(lgbm_params,
 exclusiones,
 dibujar_pesos,
 var_num_boost_round,
 clusters) = (
    gen_config[key]
    for key in ('var_lgbm_params', 'var_exclusiones', 'var_dibujar_pesos',
                'var_num_boost_round', 'var_clusters')
)

# Start banner, centred in a 100-character rule.
print(f"{'COMIENZA':-^100}")
print(f"{fase:-^100}")

----------------------------------------------COMIENZA----------------------------------------------
--------------------------------------05_lightgbm (un intento)--------------------------------------


In [4]:
# Train / test / future splits, read from parquet files under `folder`.
df_train, df_test, df_futuro = (
    pd.read_parquet(f"{folder}/{split_path}")
    for split_path in (path_train, path_test, path_futuro)
)

# Statistics keyed by product_id / customer_id, restricted to the listed columns.
prod_stats = pd.read_parquet(f"{folder}/{path_prod_stats}").loc[
    :, ['product_id','customer_id', 'average_tn', 'std_dev_tn', 'total_tn', 'iqr_tn', 'median_tn']
]
transform_stats = pd.read_parquet(f"{folder}/{path_transform_stats}")

# One dot-padded line per split with its (rows, columns) shape.
print("\n".join(
    f"{label:.<25}: {frame.shape}"
    for label, frame in (
        ('Shape df_train', df_train),
        ('Shape df_test', df_test),
        ('Shape df_futuro', df_futuro),
    )
))

Shape df_train...........: (4767503, 171)
Shape df_test............: (178684, 171)
Shape df_futuro..........: (178684, 171)


In [5]:
# Report how many test rows lack the target, then keep only the rows that have it.
n_missing_target = df_test['tn_futuro'].isna().sum()
print(f"Nulos en tn_futuro de Test: {n_missing_target}")
# Disabled alternative: df_test['tn_futuro'].fillna(0) instead of dropping the rows.
df_test = df_test.dropna(subset=['tn_futuro'])
print(f"{'Shape df_test dropna':.<25}: {df_test.shape}")

Nulos en tn_futuro de Test: 0
Shape df_test dropna.....: (178684, 171)


In [6]:
# Pretty-print the LightGBM parameter dict read from gen_config['var_lgbm_params'].
pprint(lgbm_params)

{'bagging_fraction': 0.9,
 'bagging_freq': 1,
 'boosting_type': 'gbdt',
 'early_stopping_rounds': 10,
 'feature_fraction': 0.9,
 'force_col_wise': True,
 'learning_rate': 0.01,
 'max_bin': 1023,
 'max_depth': -1,
 'metric': ['l2', 'rmse'],
 'num_leaves': 40,
 'num_threads': 8,
 'objective': 'regression',
 'verbose': 1,
 'weight_column': 'avg_weight'}


In [7]:
# Row count and share of rows per cluster label, for each of the three splits.
cluster_col = f'cluster_dtw_{clusters[0]}'
distribution_report = pd.DataFrame(range(0, clusters[0]), columns=['cluster'])

for split_name, split_df in (('train', df_train), ('test', df_test), ('futuro', df_futuro)):
    # Non-null 'periodo' values per cluster; assignment aligns on the cluster label.
    distribution_report[split_name] = split_df[[cluster_col, 'periodo']].groupby(cluster_col)['periodo'].count()
    split_total = distribution_report[split_name].sum()
    distribution_report[f'{split_name}_prop'] = distribution_report[split_name] / split_total

distribution_report = distribution_report.set_index('cluster')

print(f"DISTRIBUCION DE DATOS EN CLUSTERS:\n{distribution_report.head(clusters[0])}")

DISTRIBUCION DE DATOS EN CLUSTERS:
           train  train_prop    test  test_prop  futuro  futuro_prop
cluster                                                             
0         396428    0.083152   13031   0.072928   13031     0.072928
1          13209    0.002771     407   0.002278     407     0.002278
2         373202    0.078280   11702   0.065490   11702     0.065490
3         248142    0.052049    7583   0.042438    7583     0.042438
4         152397    0.031966    4743   0.026544    4743     0.026544
5          87936    0.018445    4699   0.026298    4699     0.026298
6         148173    0.031080    4860   0.027199    4860     0.027199
7          46238    0.009699    2298   0.012861    2298     0.012861
8           5997    0.001258     192   0.001075     192     0.001075
9           1836    0.000385      97   0.000543      97     0.000543
10       2814318    0.590313  114275   0.639537  114275     0.639537
11           290    0.000061      10   0.000056      10     0.000056

In [8]:
# Distinct cluster labels present in the training data; the training loop
# below iterates over them and fits one model per label.
clusters_lgbm = df_train[f'cluster_dtw_{clusters[0]}'].unique()

In [9]:
# Replace every pandas 'category' column with its integer codes.
# The codes of test/futuro are computed against the *training* categories so
# that a given label always maps to the same integer in the three frames.
# Taking `.cat.codes` independently per frame only agrees when the frames
# happen to share identical category lists. Labels unseen in training
# become -1, which is pandas' code for a missing category.
categorical_features = df_train.select_dtypes(['category']).columns.tolist()
for col in categorical_features:
    train_dtype = df_train[col].dtype  # captured before the column is overwritten
    df_test[col] = df_test[col].astype(train_dtype).cat.codes
    df_futuro[col] = df_futuro[col].astype(train_dtype).cat.codes
    df_train[col] = df_train[col].cat.codes
print(f"Convertidas a categorical: {categorical_features}")

Convertidas a categorical: ['yearquarter', 'edad', 'cat1', 'cat2', 'cat3', 'brand', 'descripcion', 'presentacion']


In [10]:
# Cast every object-dtype column (as detected on the training frame) to bool
# in the three frames.
# NOTE(review): astype('bool') maps NaN and any non-empty string to True -
# confirm these columns only hold real True/False values.
categorical_features = df_train.select_dtypes(['object']).columns.tolist()
for frame in (df_train, df_test, df_futuro):
    for col in categorical_features:
        frame[col] = frame[col].astype('bool')
print(f"Convertidas a Boolean: {categorical_features}")

Convertidas a Boolean: ['primer_periodo_overall', 'ultimo_periodo_overall', 'max_2', 'max_3', 'max_4', 'max_5', 'max_6', 'max_7', 'max_8', 'max_9', 'max_10', 'max_11', 'max_12', 'max_13', 'max_15', 'max_18', 'crece_2', 'crece_3', 'crece_4', 'crece_5', 'crece_6', 'crece_7', 'crece_8', 'crece_9', 'crece_10', 'crece_11', 'crece_12', 'decrece_2', 'decrece_3', 'decrece_4', 'decrece_5', 'decrece_6', 'decrece_7', 'decrece_8', 'decrece_9', 'decrece_10', 'decrece_11', 'decrece_12']


In [11]:
def separar_cluster_ttf(df_train, df_test, df_futuro, cluster_col, cluster):
    """Select one cluster from each frame and split it into features and target.

    For every frame, the rows where ``cluster_col`` equals ``cluster`` are
    kept; the last column is returned as the target and all preceding
    columns as the feature frame. A summary of the resulting shapes is
    printed.

    Args:
        df_train, df_test, df_futuro: DataFrames whose last column is the target.
        cluster_col: Name of the column holding the cluster label.
        cluster: Cluster label to select.

    Returns:
        Tuple ``(X_train, X_test, X_futuro, y_train, y_test, y_futuro)``.
    """
    feature_parts = []
    target_parts = []
    for frame in (df_train, df_test, df_futuro):
        subset = frame.loc[frame[cluster_col] == cluster]
        feature_parts.append(subset.iloc[:, :-1])
        target_parts.append(subset.iloc[:, -1])

    X_train, X_test, X_futuro = feature_parts
    y_train, y_test, y_futuro = target_parts

    summary = (
        ('Cluster Column', cluster_col),
        ('Cluster', cluster),
        ('Shape X_train', X_train.shape),
        ('Shape X_test', X_test.shape),
        ('Shape X_futuro', X_futuro.shape),
        ('Shape y_train', y_train.shape),
        ('Shape y_test', y_test.shape),
        ('Shape y_futuro', y_futuro.shape),
    )
    for label, value in summary:
        print(f"{label:.<25}: {value}")
    print("\n")

    return X_train, X_test, X_futuro, y_train, y_test, y_futuro

In [12]:
def train_cluster(X_train, X_test, X_futuro, y_train, y_test, y_futuro):
    """Fit one LightGBM model and predict the test and future feature sets.

    The columns listed in the module-level ``exclusiones`` are dropped from
    every feature frame before it reaches LightGBM. Training uses the
    module-level ``lgbm_params`` and ``var_num_boost_round``; both the test
    set and the training set are passed as validation sets.

    Args:
        X_train, X_test, X_futuro: Feature frames (still containing the
            excluded columns).
        y_train, y_test: Targets for the train and test frames.
        y_futuro: Accepted for signature symmetry; not used.

    Returns:
        Tuple ``(model, y_pred, y_pred_futuro)`` with the trained booster
        and its predictions for ``X_test`` and ``X_futuro``.
    """
    # NOTE(review): the printed config contains 'weight_column'. LightGBM
    # documents that parameter as applying only when data is loaded from a
    # file, so confirm whether sample weights should instead be passed via
    # lgb.Dataset(weight=...).
    feats_train, feats_test, feats_futuro = (
        frame.drop(columns=exclusiones) for frame in (X_train, X_test, X_futuro)
    )

    dtrain = lgb.Dataset(feats_train, label=y_train)
    dvalid = lgb.Dataset(feats_test, label=y_test)

    model = lgb.train(
        lgbm_params,
        dtrain,
        num_boost_round=var_num_boost_round,
        valid_sets=[dvalid, dtrain],
    )

    # Predict with the iteration the booster reports as best.
    best_iter = model.best_iteration
    y_pred = model.predict(feats_test, num_iteration=best_iter)
    y_pred_futuro = model.predict(feats_futuro, num_iteration=best_iter)

    return model, y_pred, y_pred_futuro
    

In [13]:
# Train one model per cluster and collect the test / future predictions.
modelos = []
pred_parts = []
pred_futuro_parts = []

cluster_col = f'cluster_dtw_{clusters[0]}'
id_cols = ['periodo','product_id','customer_id','tn_norm']

# NOTE(review): only clusters present in df_train are iterated; rows of
# test/futuro that belong to any other cluster get no prediction.
for cluster in clusters_lgbm:
    X_train, X_test, X_futuro, y_train, y_test, y_futuro = separar_cluster_ttf(df_train, df_test, df_futuro, cluster_col, cluster)
    model, y_pred, y_pred_futuro = train_cluster(X_train, X_test, X_futuro, y_train, y_test, y_futuro)

    modelos.append(model)

    # .assign works on a fresh copy of the selection, so no column is ever
    # written onto a slice of X_test / X_futuro (the previous pattern raised
    # SettingWithCopyWarning, hidden by the global warnings filter).
    pred = X_test[id_cols].assign(cluster=cluster, tn_futuro=y_test, tn_prediccion=y_pred)
    pred_parts.append(pred)

    pred_futuro = X_futuro[id_cols].assign(cluster=cluster, tn_futuro=y_futuro, tn_prediccion=y_pred_futuro)
    pred_futuro_parts.append(pred_futuro)

# Concatenate once at the end instead of re-copying the accumulated frame on
# every iteration; fall back to an empty frame when nothing was trained.
pred_final = pd.concat(pred_parts, ignore_index=True, axis=0) if pred_parts else pd.DataFrame()
pred_final_futuro = pd.concat(pred_futuro_parts, ignore_index=True, axis=0) if pred_futuro_parts else pd.DataFrame()


Cluster Column...........: cluster_dtw_15
Cluster..................: 6
Shape X_train............: (148173, 170)
Shape X_test.............: (4860, 170)
Shape X_futuro...........: (4860, 170)
Shape y_train............: (148173,)
Shape y_test.............: (4860,)
Shape y_futuro...........: (4860,)


[LightGBM] [Info] Total Bins 87632
[LightGBM] [Info] Number of data points in the train set: 148173, number of used features: 154
[LightGBM] [Info] Start training from score 0.072894
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[181]	training's l2: 9.69007	training's rmse: 3.11289	valid_0's l2: 11.4173	valid_0's rmse: 3.37894
Cluster Column...........: cluster_dtw_15
Cluster..................: 0
Shape X_train............: (396428, 170)
Shape X_test.............: (13031, 170)
Shape X_futuro...........: (13031, 170)
Shape y_train............: (396428,)
Shape y_test.............: (13031,)
Shape y_futuro...........: (13031,)


[LightGBM] [Info] T

In [14]:
def power_inverse_transform(x_trans, lambda_, mean_, var_):
    """Undo a standardised power transform and return values on the original scale.

    The input is first de-standardised (``x * sqrt(var_) + mean_``) and then
    mapped through the piecewise inverse below, whose branches have the form
    of the inverse Yeo-Johnson transform.

    Args:
        x_trans: Transformed values; a pandas object, array-like or scalar.
        lambda_: Power parameter of the transform.
        mean_: Mean removed during standardisation.
        var_: Variance used during standardisation.

    Returns:
        1-D ``numpy.ndarray`` with one value per input element (a scalar
        input yields a single-element array).
    """
    # np.asarray accepts pandas objects, sequences and scalars alike, so no
    # catch-all ``except`` fallback is needed; reshape(-1) flattens the input
    # and turns a scalar into a one-element array.
    x = np.asarray(x_trans).reshape(-1)

    # Undo the standardisation.
    x = x * var_ ** 0.5 + mean_

    x_inv = np.zeros_like(x)
    pos = x >= 0

    # Branch for x >= 0.
    if abs(lambda_) < np.spacing(1.0):  # lambda_ == 0 up to float precision
        x_inv[pos] = np.exp(x[pos]) - 1
    else:  # lambda_ != 0
        x_inv[pos] = np.power(x[pos] * lambda_ + 1, 1 / lambda_) - 1

    # Branch for x < 0.
    if abs(lambda_ - 2) > np.spacing(1.0):  # lambda_ != 2
        x_inv[~pos] = 1 - np.power(-(2 - lambda_) * x[~pos] + 1, 1 / (2 - lambda_))
    else:  # lambda_ == 2
        x_inv[~pos] = 1 - np.exp(-x[~pos])

    return x_inv

In [44]:
# Left-join the statistics in prod_stats onto the test predictions, matching
# on product_id and customer_id.
final = pred_final.merge(prod_stats, how='left', on=['product_id','customer_id'])
# Disabled alternative: join against transform_stats instead.
#final = pred_final.merge(transform_stats, how='left', on=['product_id','customer_id'])

In [29]:
# #power transform
# final['tn_futuro_real'] = final.apply(lambda row: power_inverse_transform((row['tn_norm'] + row['tn_futuro']), row['pwr_lambda'], row['pwr_mean'], row['pwr_var']), axis=1)
# final['tn_futuro_real'] = [tn_inv[0] for tn_inv in final['tn_futuro_real']]
# final['tn_prediccion_real'] = final.apply(lambda row: power_inverse_transform((row['tn_norm'] + row['tn_prediccion']), row['pwr_lambda'], row['pwr_mean'], row['pwr_var']), axis=1)
# final['tn_prediccion_real'] = [tn_inv[0] for tn_inv in final['tn_prediccion_real']]

In [45]:
# Robust scaling inverse: (tn_norm + value) * iqr_tn + median_tn, applied to
# both the observed target and the model prediction.
# NOTE(review): the previous inline comment mentioned multiplying by two
# because means accumulate; the expression has no such factor - confirm.
final = final.assign(
    tn_futuro_real=lambda d: (d['tn_norm'] + d['tn_futuro']) * d['iqr_tn'] + d['median_tn'],
    tn_prediccion_real=lambda d: (d['tn_norm'] + d['tn_prediccion']) * d['iqr_tn'] + d['median_tn'],
)

In [46]:
# Left-join the statistics in prod_stats onto the *future* predictions.
# This previously merged `pred_final` (the test-set predictions), so the file
# written to path_pred_futuro contained test data instead of the forecast.
final_futuro = pred_final_futuro.merge(prod_stats, how='left', on=['product_id','customer_id'])
# Disabled alternative: join against transform_stats instead.
#final_futuro = pred_final_futuro.merge(transform_stats, how='left', on=['product_id','customer_id'])

In [47]:
# #power transform
# final_futuro['tn_futuro_real'] = final_futuro.apply(lambda row: power_inverse_transform((row['tn_norm'] + row['tn_futuro']), row['pwr_lambda'], row['pwr_mean'], row['pwr_var']), axis=1)
# final_futuro['tn_futuro_real'] = [tn_inv[0] for tn_inv in final_futuro['tn_futuro_real']]
# final_futuro['tn_prediccion_real'] = final_futuro.apply(lambda row: power_inverse_transform((row['tn_norm'] + row['tn_prediccion']), row['pwr_lambda'], row['pwr_mean'], row['pwr_var']), axis=1)
# final_futuro['tn_prediccion_real'] = [tn_inv[0] for tn_inv in final_futuro['tn_prediccion_real']]

In [48]:
# Robust scaling inverse: (tn_norm + value) * iqr_tn + median_tn, applied to
# both the target column and the model prediction of the future frame.
# NOTE(review): the previous inline comment mentioned multiplying by two
# because means accumulate; the expression has no such factor - confirm.
final_futuro = final_futuro.assign(
    tn_futuro_real=lambda d: (d['tn_norm'] + d['tn_futuro']) * d['iqr_tn'] + d['median_tn'],
    tn_prediccion_real=lambda d: (d['tn_norm'] + d['tn_prediccion']) * d['iqr_tn'] + d['median_tn'],
)

In [49]:
# final = pred_final.merge(prod_stats, how='left', on=['product_id','customer_id'])
# final['tn_futuro_real'] = (final['tn_norm'] + final['tn_futuro']) * final['std_dev_tn'] + final['average_tn'] # por dos porque esta normalizado y al hacer sumas y restas se acumulan medias
# final['tn_prediccion_real'] = (final['tn_norm'] + final['tn_prediccion']) * final['std_dev_tn'] + final['average_tn']
# Write the test-set predictions to parquet without the DataFrame index.
final.to_parquet(f'{folder}/{path_pred_test}', index=False)

In [50]:
# final_futuro = pred_final_futuro.merge(prod_stats, how='left', on=['product_id','customer_id'])
# final_futuro['tn_futuro_real'] = (final_futuro['tn_norm'] + final_futuro['tn_futuro']) * final_futuro['std_dev_tn'] + final_futuro['average_tn'] # por dos porque esta normalizado y al hacer sumas y restas se acumulan medias
# final_futuro['tn_prediccion_real'] = (final_futuro['tn_norm'] + final_futuro['tn_prediccion']) * final_futuro['std_dev_tn'] + final_futuro['average_tn']
# Write the future-period predictions to parquet without the DataFrame index.
final_futuro.to_parquet(f'{folder}/{path_pred_futuro}', index=False)

In [51]:
#estado_control = f"05_lightgbm Terminado - {nombrefile} - {datetime.now()}"

In [52]:
# lgb.plot_importance(model, max_num_features=20, figsize=(10,10))
# plt.show()

In [53]:
# importance_df = (
#     pd.DataFrame({
#         'feature_name': model.feature_name(),
#         'importance_gain': model.feature_importance(importance_type='gain'),
#         'importance_split': model.feature_importance(importance_type='split'),
#     })
#     .sort_values('importance_gain', ascending=False)
#     .reset_index(drop=True)
# )
# importance_df.sort_values('importance_split', ascending=False, inplace=True)
# feat_dibujar = importance_df[0:20]['feature_name'].reset_index(drop=True)

In [54]:
# importance_df

In [55]:
# Optional diagnostic: split-value histograms for the most-used features.
# `model` is whatever the training loop assigned last, i.e. the model of the
# final cluster processed.
if dibujar_pesos==True:
    # Derive the feature list here; it was previously expected from a cell
    # that is commented out, which made this block fail with NameError.
    split_importance = pd.Series(
        model.feature_importance(importance_type='split'),
        index=model.feature_name(),
    )
    # Keep only features that were actually used in a split: LightGBM cannot
    # draw a split-value histogram for a feature that never splits.
    feat_dibujar = (
        split_importance[split_importance > 0]
        .sort_values(ascending=False)
        .head(20)
        .index.tolist()
    )

    fig, axs = plt.subplots(5, 4, figsize=(20, 25))
    # Column-major order keeps the original layout (fill each column top-down).
    axes_col_major = axs.T.ravel()
    for ax, feature in zip(axes_col_major, feat_dibujar):
        lgb.plot_split_value_histogram(model,
                        feature=feature,
                        bins="auto",
                        ax=ax,
                        title=f"Feat: {feature}")
    # Hide the panels left over when fewer than 20 features qualify.
    for ax in axes_col_major[len(feat_dibujar):]:
        ax.set_visible(False)
    plt.show()

In [56]:
# End banner: stage label and 'FINALIZA', each centred in a 100-character
# rule, followed by blank lines.
print(f"{fase:-^100}")
print(f"{'FINALIZA':-^100}\n\n\n")

--------------------------------------05_lightgbm (un intento)--------------------------------------
----------------------------------------------FINALIZA----------------------------------------------



