In [57]:
import pandas as pd
import polars as pl
import lightgbm as lgb
from datetime import datetime
import matplotlib.pyplot as plt
import json
import numpy as np
from pprint import pprint

import warnings
warnings.filterwarnings('ignore')

fase = '05_lightgbm (un intento)'

In [58]:
with open('gen_config.json', 'r') as file:
    gen_config =json.load(file)

In [59]:
folder = gen_config['folder']

path_pred_test = gen_config['path_pred_test']
path_pred_futuro = gen_config['path_pred_futuro']
path_prod_stats = gen_config['path_prod_stats']

path_train = gen_config['path_train']
path_test = gen_config['path_test']
path_futuro = gen_config['path_futuro']

lgbm_params = gen_config['var_lgbm_params']
exclusiones = gen_config['var_exclusiones']
dibujar_pesos = gen_config['var_dibujar_pesos']
var_num_boost_round = gen_config['var_num_boost_round']
clusters = gen_config['var_clusters']

print(f"{'COMIENZA':-^100}")
print(f"{fase:-^100}")

----------------------------------------------COMIENZA----------------------------------------------
--------------------------------------05_lightgbm (un intento)--------------------------------------


In [60]:
df_train = pd.read_parquet(f"{folder}/{path_train}")
df_test = pd.read_parquet(f"{folder}/{path_test}")
df_futuro = pd.read_parquet(f"{folder}/{path_futuro}")

prod_stats = pd.read_parquet(f"{folder}/{path_prod_stats}")
prod_stats = prod_stats[['product_id','customer_id', 'average_tn', 'std_dev_tn', 'total_tn']]

print(f"{'Shape df_train':.<25}: {df_train.shape}")
print(f"{'Shape df_test':.<25}: {df_test.shape}")
print(f"{'Shape df_futuro':.<25}: {df_futuro.shape}")

Shape df_train...........: (2105042, 190)
Shape df_test............: (68823, 190)
Shape df_futuro..........: (53008, 190)


In [61]:
print(f"Nulos en tn_futuro de Test: {df_test['tn_futuro'].isna().sum()}")
#df_test['tn_futuro'] = df_test['tn_futuro'].fillna(0)
df_test.dropna(subset=['tn_futuro'], inplace=True)
print(f"{'Shape df_test dropna':.<25}: {df_test.shape}")

Nulos en tn_futuro de Test: 51082
Shape df_test dropna.....: (17741, 190)


In [62]:
pprint(lgbm_params)

{'bagging_fraction': 0.9,
 'bagging_freq': 1,
 'boosting_type': 'gbdt',
 'early_stopping_rounds': 10,
 'feature_fraction': 0.9,
 'force_col_wise': True,
 'learning_rate': 0.01,
 'max_bin': 1023,
 'max_depth': -1,
 'metric': ['l2', 'rmse'],
 'num_leaves': 40,
 'num_threads': 8,
 'objective': 'regression',
 'verbose': 1,
 'weight_column': 'avg_weight'}


In [63]:
distribution_report = pd.DataFrame(range(0,clusters[0]), columns=['cluster'])
distribution_report['train'] = df_train[[f'cluster_dtw_{clusters[0]}','periodo']].groupby(f'cluster_dtw_{clusters[0]}').count()
distribution_report['train_prop'] = distribution_report['train'] / distribution_report['train'].sum()
distribution_report['test'] = df_test[[f'cluster_dtw_{clusters[0]}','periodo']].groupby(f'cluster_dtw_{clusters[0]}').count()
distribution_report['test_prop'] = distribution_report['test'] / distribution_report['test'].sum()
distribution_report['futuro'] = df_futuro[[f'cluster_dtw_{clusters[0]}','periodo']].groupby(f'cluster_dtw_{clusters[0]}').count()
distribution_report['futuro_prop'] = distribution_report['futuro'] / distribution_report['futuro'].sum()
distribution_report.set_index('cluster', inplace=True)

print(f"DISTRIBUCION DE DATOS EN CLUSTERS:\n{distribution_report.head(clusters[0])}")

DISTRIBUCION DE DATOS EN CLUSTERS:
          train  train_prop  test  test_prop  futuro  futuro_prop
cluster                                                          
0         35809    0.020373   212   0.011950    1107     0.022224
1         51551    0.029329   951   0.053605    1360     0.027304
2         66039    0.037572   494   0.027845    1580     0.031721
3         60514    0.034428   428   0.024125    3328     0.066814
4         66069    0.037589   652   0.036751    1488     0.029874
5         54427    0.030965   654   0.036864    1354     0.027183
6         58743    0.033421   381   0.021476    1425     0.028609
7         75021    0.042682   303   0.017079    1738     0.034893
8         48484    0.027584   979   0.055183    1318     0.026461
9         50936    0.028979   385   0.021701    2242     0.045011
10        49795    0.028330   505   0.028465    1094     0.021963
11        58565    0.033319   298   0.016797    1331     0.026722
12       107370    0.061086   793   0.044

In [64]:
categorical_features = df_train.select_dtypes(['category']).columns.tolist()
for col in categorical_features:
    df_train[col] = df_train[col].cat.codes
    df_test[col] = df_test[col].cat.codes
    df_futuro[col] = df_futuro[col].cat.codes
print(f"Convertidas a categorical: {categorical_features}")

Convertidas a categorical: ['yearquarter', 'cat1', 'cat2', 'cat3', 'brand', 'descripcion']


In [65]:
categorical_features = df_train.select_dtypes(['object']).columns.tolist()
for col in categorical_features:
    df_train[col] = df_train[col].astype('bool')
    df_test[col] = df_test[col].astype('bool')
    df_futuro[col] = df_futuro[col].astype('bool')
print(f"Convertidas a Boolean: {categorical_features}")

Convertidas a Boolean: ['max_1', 'max_2', 'max_3', 'max_4', 'max_5', 'max_6', 'max_7', 'max_8', 'max_9', 'max_10', 'max_11', 'max_12', 'max_15', 'max_18', 'max_21', 'max_24', 'max_30', 'max_36', 'crece_2', 'crece_3', 'crece_4', 'crece_5', 'crece_6', 'crece_7', 'crece_8', 'crece_9', 'crece_10', 'crece_11', 'crece_12', 'decrece_2', 'decrece_3', 'decrece_4', 'decrece_5', 'decrece_6', 'decrece_7', 'decrece_8', 'decrece_9', 'decrece_10', 'decrece_11', 'decrece_12']


In [66]:
def separar_cluster_ttf(df_train, df_test, df_futuro, cluster_col, cluster):
    X_train = df_train[df_train[cluster_col] == cluster].iloc[:,:-1]
    X_test = df_test[df_test[cluster_col] == cluster].iloc[:,:-1]
    X_futuro = df_futuro[df_futuro[cluster_col] == cluster].iloc[:,:-1]

    y_train = df_train[df_train[cluster_col] == cluster].iloc[:,-1]
    y_test = df_test[df_test[cluster_col] == cluster].iloc[:,-1]
    y_futuro = df_futuro[df_futuro[cluster_col] == cluster].iloc[:,-1]

    print(f"{'Cluster Column':.<25}: {cluster_col}")
    print(f"{'Cluster':.<25}: {cluster}")
    print(f"{'Shape X_train':.<25}: {X_train.shape}")
    print(f"{'Shape X_test':.<25}: {X_test.shape}")
    print(f"{'Shape X_futuro':.<25}: {X_futuro.shape}")

    print(f"{'Shape y_train':.<25}: {y_train.shape}")
    print(f"{'Shape y_test':.<25}: {y_test.shape}")
    print(f"{'Shape y_futuro':.<25}: {y_futuro.shape}")
    print(f"\n")

    return X_train, X_test, X_futuro, y_train, y_test, y_futuro

In [67]:
def train_cluster(X_train, X_test, X_futuro, y_train, y_test, y_futuro):

    train_data = lgb.Dataset(X_train.drop(columns=exclusiones), label=y_train)
    test_data = lgb.Dataset(X_test.drop(columns=exclusiones), label=y_test)
    #futuro_data = lgb.Dataset(X_futuro.drop(columns=exclusiones), label=y_futuro)

    params = lgbm_params

    model = lgb.train(params,
                    train_data,
                    num_boost_round=var_num_boost_round,
                    valid_sets=[test_data, train_data],
                    )

    y_pred = model.predict(X_test.drop(columns=exclusiones), num_iteration=model.best_iteration)
    y_pred_futuro = model.predict(X_futuro.drop(columns=exclusiones), num_iteration=model.best_iteration)

    return model, y_pred, y_pred_futuro
    

In [68]:
modelos = []
pred_final = pd.DataFrame()
pred_final_futuro = pd.DataFrame()

for cluster in range(0,clusters[0]):
    X_train, X_test, X_futuro, y_train, y_test, y_futuro = separar_cluster_ttf(df_train, df_test, df_futuro, f'cluster_dtw_{clusters[0]}', cluster)
    model, y_pred, y_pred_futuro = train_cluster(X_train, X_test, X_futuro, y_train, y_test, y_futuro)

    modelos.append(model)

    pred = X_test[['periodo','product_id','customer_id','tn_norm']]
    pred['cluster'] = cluster
    pred['tn_futuro'] = y_test
    pred['tn_prediccion'] = y_pred
    pred_final = pd.concat([pred_final, pred], ignore_index=True, axis=0)

    pred_futuro =X_futuro[['periodo','product_id','customer_id','tn_norm']]
    pred_futuro['cluster'] = cluster
    pred_futuro['tn_futuro'] = y_futuro
    pred_futuro['tn_prediccion'] = y_pred_futuro
    pred_final_futuro = pd.concat([pred_final_futuro, pred_futuro], ignore_index=True, axis=0)


Cluster Column...........: cluster_dtw_30
Cluster..................: 0
Shape X_train............: (35809, 189)
Shape X_test.............: (212, 189)
Shape X_futuro...........: (1107, 189)
Shape y_train............: (35809,)
Shape y_test.............: (212,)
Shape y_futuro...........: (1107,)


[LightGBM] [Info] Total Bins 75292
[LightGBM] [Info] Number of data points in the train set: 35809, number of used features: 148
[LightGBM] [Info] Start training from score -0.134999
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[452]	training's l2: 0.688927	training's rmse: 0.830016	valid_0's l2: 0.748274	valid_0's rmse: 0.865028
Cluster Column...........: cluster_dtw_30
Cluster..................: 1
Shape X_train............: (51551, 189)
Shape X_test.............: (951, 189)
Shape X_futuro...........: (1360, 189)
Shape y_train............: (51551,)
Shape y_test.............: (951,)
Shape y_futuro...........: (1360,)


[LightGBM] [Info] Total Bin

In [69]:
final = pred_final.merge(prod_stats, how='left', on=['product_id','customer_id'])
final['tn_futuro_real'] = (final['tn_norm'] + final['tn_futuro']) * final['std_dev_tn'] + final['average_tn'] # por dos porque esta normalizado y al hacer sumas y restas se acumulan medias
final['tn_prediccion_real'] = (final['tn_norm'] + final['tn_prediccion']) * final['std_dev_tn'] + final['average_tn']
final.to_parquet(f'{folder}/{path_pred_test}', index=False)

In [70]:
final_futuro = pred_final_futuro.merge(prod_stats, how='left', on=['product_id','customer_id'])
final_futuro['tn_futuro_real'] = (final_futuro['tn_norm'] + final_futuro['tn_futuro']) * final_futuro['std_dev_tn'] + final_futuro['average_tn'] # por dos porque esta normalizado y al hacer sumas y restas se acumulan medias
final_futuro['tn_prediccion_real'] = (final_futuro['tn_norm'] + final_futuro['tn_prediccion']) * final_futuro['std_dev_tn'] + final_futuro['average_tn']
final_futuro.to_parquet(f'{folder}/{path_pred_futuro}', index=False)

In [71]:
#estado_control = f"05_lightgbm Terminado - {nombrefile} - {datetime.now()}"

In [72]:
# lgb.plot_importance(model, max_num_features=20, figsize=(10,10))
# plt.show()

In [73]:
# importance_df = (
#     pd.DataFrame({
#         'feature_name': model.feature_name(),
#         'importance_gain': model.feature_importance(importance_type='gain'),
#         'importance_split': model.feature_importance(importance_type='split'),
#     })
#     .sort_values('importance_gain', ascending=False)
#     .reset_index(drop=True)
# )
# importance_df.sort_values('importance_split', ascending=False, inplace=True)
# feat_dibujar = importance_df[0:20]['feature_name'].reset_index(drop=True)

In [74]:
# importance_df

In [75]:
if dibujar_pesos==True:
    fig, axs = plt.subplots(5, 4, figsize=(20, 25))
    d = -1
    for i in range(4):
        for j in range(5):
            d+=1
            lgb.plot_split_value_histogram(model,
                            feature=feat_dibujar[d],
                            bins="auto",
                            ax=axs[j, i]
                            ,title=f"Feat: {feat_dibujar[d]}")
    plt.show()

In [76]:
print(f"{fase:-^100}")
print(f"{'FINALIZA':-^100}\n\n\n")

--------------------------------------05_lightgbm (un intento)--------------------------------------
----------------------------------------------FINALIZA----------------------------------------------



