In [138]:
import pandas as pd
import lightgbm as lgb
from datetime import datetime
import matplotlib.pyplot as plt
import json
import numpy as np

import warnings
warnings.filterwarnings('ignore')

# Label of this pipeline stage; it is printed inside the start and end banners of the notebook.
fase = '05_lightgbm (un intento)'

In [139]:
# Load the shared pipeline configuration.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) so that paths or
# labels containing non-ASCII characters are decoded the same way on every platform
# instead of depending on the machine's locale default.
with open('gen_config.json', 'r', encoding='utf-8') as config_file:
    gen_config = json.load(config_file)

In [140]:
# Root directory under which every parquet path below is resolved.
folder = gen_config['folder']

# File names for the two prediction outputs and for the per-series statistics table.
path_pred_test, path_pred_futuro, path_prod_stats = (
    gen_config[clave]
    for clave in ('path_pred_test', 'path_pred_futuro', 'path_prod_stats')
)

# File names of the train / test / future modelling tables.
path_train, path_test, path_futuro = (
    gen_config[clave]
    for clave in ('path_train', 'path_test', 'path_futuro')
)

# Modelling settings: LightGBM parameters, columns dropped before fitting,
# the plotting switch and the number of boosting rounds.
lgbm_params, exclusiones, dibujar_pesos, var_num_boost_round = (
    gen_config[clave]
    for clave in ('var_lgbm_params', 'var_exclusiones', 'var_dibujar_pesos', 'var_num_boost_round')
)

# Start-of-stage banner.
print(f"{'COMIENZA':-^100}")
print(f"{fase:-^100}")

----------------------------------------------COMIENZA----------------------------------------------
--------------------------------------05_lightgbm (un intento)--------------------------------------


In [141]:
# Statistics table: only the join keys and the mean / standard-deviation columns
# are kept, since those are the only ones used later to undo the normalisation.
prod_stats = pd.read_parquet(f"{folder}/{path_prod_stats}")[
    ['product_id', 'customer_id', 'average_tn', 'std_dev_tn']
]

# Modelling tables for the three splits.
df_train = pd.read_parquet(f"{folder}/{path_train}")
df_test = pd.read_parquet(f"{folder}/{path_test}")
df_futuro = pd.read_parquet(f"{folder}/{path_futuro}")

# Quick sanity check on the loaded shapes.
print(f"{'Shape df_train':.<25}: {df_train.shape}")
print(f"{'Shape df_test':.<25}: {df_test.shape}")
print(f"{'Shape df_futuro':.<25}: {df_futuro.shape}")

Shape df_train...........: (2173865, 188)
Shape df_test............: (68823, 188)
Shape df_futuro..........: (53008, 188)


In [142]:
# Report how many test rows have no target before discarding them.
print(f"Nulos en tn_futuro de Test: {df_test['tn_futuro'].isna().sum()}")

# Rows without a target are removed from the test split.
# Alternative that was tried and left disabled: df_test['tn_futuro'].fillna(0)
df_test = df_test.dropna(subset=['tn_futuro'])

print(f"{'Shape df_test dropna':.<25}: {df_test.shape}")

Nulos en tn_futuro de Test: 51079
Shape df_test dropna.....: (17744, 188)


In [143]:
# One row per cluster id (0..9); the per-split counts are aligned onto it by index.
distribution_report = pd.DataFrame(range(0,10), columns=['cluster'])

# For every split: number of rows per cluster and the share of the split it represents.
for col_conteo, col_prop, df_split in (
    ('train', 'train_prop', df_train),
    ('test', 'test_prop', df_test),
    ('futuro', 'futuro_prop', df_futuro),
):
    distribution_report[col_conteo] = df_split[['cluster_dtw_10','periodo']].groupby('cluster_dtw_10').count()
    distribution_report[col_prop] = distribution_report[col_conteo] / distribution_report[col_conteo].sum()

distribution_report.set_index('cluster', inplace=True)

print(f"DISTRIBUCION DE DATOS EN CLUSTERS:\n{distribution_report.head(10)}")

DISTRIBUCION DE DATOS EN CLUSTERS:
          train  train_prop  test  test_prop  futuro  futuro_prop
cluster                                                          
0        179653    0.082642  2212   0.124662    4356     0.082176
1        174193    0.080131   396   0.022317    5424     0.102324
2        250628    0.115291  2002   0.112827    5358     0.101079
3        317325    0.145973  3928   0.221371    7276     0.137262
4        291646    0.134160  1159   0.065318    6279     0.118454
5        185434    0.085302  1543   0.086959    5119     0.096570
6        136073    0.062595  1818   0.102457    3082     0.058142
7        148670    0.068390   586   0.033025    4580     0.086402
8        186891    0.085972  2907   0.163830    4402     0.083044
9        303352    0.139545  1193   0.067234    7132     0.134546


In [144]:
# Replace every categorical column by integer codes.
#
# The codes must mean the same thing in the three splits. Taking `.cat.codes`
# independently on each frame only guarantees that when the frames carry identical
# category lists; otherwise the same label silently maps to different integers in
# train and in test/futuro. A shared vocabulary is therefore built per column:
# the train categories come first (so train codes are exactly what `.cat.codes`
# produced before), followed by any category that only exists in test or futuro.
categorical_features = df_train.select_dtypes(['category']).columns.tolist()
for col in categorical_features:
    vocabulario = df_train[col].cat.categories
    for df_otro in (df_test, df_futuro):
        solo_en_otro = df_otro[col].cat.categories.difference(vocabulario, sort=False)
        vocabulario = vocabulario.append(solo_en_otro)

    for df_split in (df_train, df_test, df_futuro):
        # Missing values keep the conventional code -1.
        df_split[col] = pd.Categorical(df_split[col], categories=vocabulario).codes
print(f"Convertidas a categorical: {categorical_features}")

Convertidas a categorical: ['yearquarter', 'cat1', 'cat2', 'cat3', 'brand', 'descripcion', 'cluster_dtw_10']


In [145]:
def separar_cluster_ttf(df_train, df_test, df_futuro, cluster_col, cluster):
    """Select one cluster from each split and separate features from target.

    For each of the three frames, the rows whose ``cluster_col`` equals
    ``cluster`` are kept. The last column (by position) is returned as the
    target and every other column as the feature matrix. The resulting shapes
    are printed.

    Parameters
    ----------
    df_train, df_test, df_futuro : pandas.DataFrame
        Input splits; the target must be the last column of each.
    cluster_col : str
        Name of the column holding the cluster id.
    cluster : scalar
        Cluster id to select.

    Returns
    -------
    tuple
        ``(X_train, X_test, X_futuro, y_train, y_test, y_futuro)``.
    """
    def _filtrar_y_partir(df):
        # Filter once, then split positionally: all-but-last columns vs. last column.
        seleccion = df[df[cluster_col] == cluster]
        return seleccion.iloc[:, :-1], seleccion.iloc[:, -1]

    X_train, y_train = _filtrar_y_partir(df_train)
    X_test, y_test = _filtrar_y_partir(df_test)
    X_futuro, y_futuro = _filtrar_y_partir(df_futuro)

    print(f"{'Cluster Column':.<25}: {cluster_col}")
    print(f"{'Cluster':.<25}: {cluster}")
    print(f"{'Shape X_train':.<25}: {X_train.shape}")
    print(f"{'Shape X_test':.<25}: {X_test.shape}")
    print(f"{'Shape X_futuro':.<25}: {X_futuro.shape}")

    print(f"{'Shape y_train':.<25}: {y_train.shape}")
    print(f"{'Shape y_test':.<25}: {y_test.shape}")
    print(f"{'Shape y_futuro':.<25}: {y_futuro.shape}")
    print(f"\n")

    return X_train, X_test, X_futuro, y_train, y_test, y_futuro

In [146]:
def train_cluster(X_train, X_test, X_futuro, y_train, y_test, y_futuro):
    """Fit one LightGBM model and predict on the test and future splits.

    The columns listed in the notebook-level ``exclusiones`` are removed from
    all three feature frames before use. Training uses the notebook-level
    ``lgbm_params`` and ``var_num_boost_round``, with the train and test sets
    passed as validation sets.

    Parameters
    ----------
    X_train, X_test, X_futuro : pandas.DataFrame
        Feature frames (still containing the excluded columns).
    y_train, y_test : array-like
        Targets for the train and test frames.
    y_futuro : array-like
        Accepted for signature symmetry; not used.

    Returns
    -------
    tuple
        ``(model, y_pred, y_pred_futuro)``: the trained booster and its
        predictions for ``X_test`` and ``X_futuro``.
    """
    # Build the model inputs once and reuse them for both training and prediction.
    feats_train = X_train.drop(columns=exclusiones)
    feats_test = X_test.drop(columns=exclusiones)
    feats_futuro = X_futuro.drop(columns=exclusiones)

    train_data = lgb.Dataset(feats_train, label=y_train)
    test_data = lgb.Dataset(feats_test, label=y_test)

    model = lgb.train(
        lgbm_params,
        train_data,
        num_boost_round=var_num_boost_round,
        valid_sets=[train_data, test_data],
    )

    # Predict with the iteration the booster reports as best.
    mejor_iteracion = model.best_iteration
    y_pred = model.predict(feats_test, num_iteration=mejor_iteracion)
    y_pred_futuro = model.predict(feats_futuro, num_iteration=mejor_iteracion)

    return model, y_pred, y_pred_futuro
    

In [165]:
# Train one model per cluster and collect its test / future predictions.
modelos = []
predicciones_test = []
predicciones_futuro = []

# Identifier columns carried along with every prediction row.
columnas_id = ['periodo','product_id','customer_id','tn_norm']

for cluster in range(0,10):
    X_train, X_test, X_futuro, y_train, y_test, y_futuro = separar_cluster_ttf(df_train, df_test, df_futuro, 'cluster_dtw_10', cluster)
    model, y_pred, y_pred_futuro = train_cluster(X_train, X_test, X_futuro, y_train, y_test, y_futuro)

    modelos.append(model)

    # `.assign` returns a new frame, so no column is ever written onto a slice
    # of X_test / X_futuro (the source of SettingWithCopy warnings).
    pred = X_test[columnas_id].assign(
        cluster=cluster,
        tn_futuro=y_test,
        tn_prediccion=y_pred,
    )
    predicciones_test.append(pred)

    pred_futuro = X_futuro[columnas_id].assign(
        cluster=cluster,
        tn_futuro=y_futuro,
        tn_prediccion=y_pred_futuro,
    )
    predicciones_futuro.append(pred_futuro)

# Concatenate once at the end: growing a DataFrame inside the loop re-copies all
# previous rows on every iteration, and concatenating onto an empty frame is deprecated.
pred_final = pd.concat(predicciones_test, ignore_index=True, axis=0)
pred_final_futuro = pd.concat(predicciones_futuro, ignore_index=True, axis=0)


Cluster Column...........: cluster_dtw_10
Cluster..................: 0
Shape X_train............: (179653, 187)
Shape X_test.............: (2212, 187)
Shape X_futuro...........: (4356, 187)
Shape y_train............: (179653,)
Shape y_test.............: (2212,)
Shape y_futuro...........: (4356,)
[LightGBM] [Info] Total Bins 100301
[LightGBM] [Info] Number of data points in the train set: 179653, number of used features: 177
[LightGBM] [Info] Start training from score -0.106002
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[1000]	training's l2: 0.679935	training's rmse: 0.824582	valid_1's l2: 0.526368	valid_1's rmse: 0.725512
Cluster Column...........: cluster_dtw_10
Cluster..................: 1
Shape X_train............: (174193, 187)
Shape X_test.............: (396, 187)
Shape X_futuro...........: (5424, 187)
Shape y_train............: (174193,)
Shape y_test.............: (396,)
Shape y_futuro...........: (5424,)
[LightGBM

In [166]:
# Attach the mean / standard deviation of each (product_id, customer_id) pair.
final = pred_final.merge(prod_stats, on=['product_id','customer_id'], how='left')

# Map target and prediction back to the original scale:
# (tn_norm + value) * std_dev_tn + average_tn.
# NOTE(review): the comment that used to sit on this line said "times two, because it
# is normalised and the means accumulate when adding and subtracting", but the
# expression contains no factor of two. Confirm which one is intended.
final = final.assign(
    tn_futuro_real=(final['tn_norm'] + final['tn_futuro']) * final['std_dev_tn'] + final['average_tn'],
    tn_prediccion_real=(final['tn_norm'] + final['tn_prediccion']) * final['std_dev_tn'] + final['average_tn'],
)

final.to_parquet(f'{folder}/{path_pred_test}', index=False)

In [168]:
# Attach the mean / standard deviation of each (product_id, customer_id) pair.
final_futuro = pred_final_futuro.merge(prod_stats, on=['product_id','customer_id'], how='left')

# Map target and prediction back to the original scale:
# (tn_norm + value) * std_dev_tn + average_tn.
# NOTE(review): the comment that used to sit on this line said "times two, because it
# is normalised and the means accumulate when adding and subtracting", but the
# expression contains no factor of two. Confirm which one is intended.
final_futuro = final_futuro.assign(
    tn_futuro_real=(final_futuro['tn_norm'] + final_futuro['tn_futuro']) * final_futuro['std_dev_tn'] + final_futuro['average_tn'],
    tn_prediccion_real=(final_futuro['tn_norm'] + final_futuro['tn_prediccion']) * final_futuro['std_dev_tn'] + final_futuro['average_tn'],
)

final_futuro.to_parquet(f'{folder}/{path_pred_futuro}', index=False)

In [169]:
#estado_control = f"05_lightgbm Terminado - {nombrefile} - {datetime.now()}"

In [170]:
# lgb.plot_importance(model, max_num_features=20, figsize=(10,10))
# plt.show()

In [171]:
# importance_df = (
#     pd.DataFrame({
#         'feature_name': model.feature_name(),
#         'importance_gain': model.feature_importance(importance_type='gain'),
#         'importance_split': model.feature_importance(importance_type='split'),
#     })
#     .sort_values('importance_gain', ascending=False)
#     .reset_index(drop=True)
# )
# importance_df.sort_values('importance_split', ascending=False, inplace=True)
# feat_dibujar = importance_df[0:20]['feature_name'].reset_index(drop=True)

In [172]:
# importance_df

In [173]:
# Optional diagnostic: split-value histograms for the most frequently split features.
# The explicit `== True` is kept so that a non-boolean config value (for example the
# string "false") does not switch the plots on.
if dibujar_pesos == True:
    # `model` is the booster left by the last iteration of the training loop.
    # The feature list is derived here so the cell runs on a fresh kernel;
    # previously `feat_dibujar` was not defined by any executed cell.
    importancia_split = pd.Series(
        model.feature_importance(importance_type='split'),
        index=model.feature_name(),
    )
    # Features never used in a split cannot be plotted, so they are filtered out
    # before taking the top 20 by split count.
    feat_dibujar = (
        importancia_split[importancia_split > 0]
        .sort_values(ascending=False)
        .head(20)
        .index
        .tolist()
    )

    fig, axs = plt.subplots(5, 4, figsize=(20, 25))
    # Fill the 5x4 grid column by column (same layout as before: rows vary fastest).
    for d, ax in enumerate(axs.T.ravel()):
        if d >= len(feat_dibujar):
            # Fewer than 20 usable features: leave the remaining panels blank.
            ax.axis('off')
            continue
        lgb.plot_split_value_histogram(model,
                        feature=feat_dibujar[d],
                        bins="auto",
                        ax=ax,
                        title=f"Feat: {feat_dibujar[d]}")
    plt.show()

In [174]:
# End-of-stage banner: stage label, closing line, then three blank lines.
print(
    f"{fase:-^100}",
    f"{'FINALIZA':-^100}\n\n\n",
    sep="\n",
)

--------------------------------------05_lightgbm (un intento)--------------------------------------
----------------------------------------------FINALIZA----------------------------------------------



