In [15]:
#import pandas as pd
import polars as pl
import pandas as pd
import numpy as np
from datetime import datetime
import json
import os
import matplotlib.pyplot as plt
import pickle

from tslearn.utils import to_time_series_dataset
from tslearn.clustering import TimeSeriesKMeans

fase = '02b_DTW'

In [16]:
with open('gen_config.json', 'r') as file:
    gen_config =json.load(file)

In [17]:
folder = gen_config['folder']

#entradas
path_norm = gen_config['path_norm']
path_prod_stats = gen_config['path_prod_stats']
path_overall_prod_stats = gen_config['path_overall_prod_stats']
#salidas
path_dtw = gen_config['path_dtw']
path_traindtw = gen_config['path_traindtw']
path_fitdtw = gen_config['path_fitdtw']
#variables
clusters = gen_config['var_clusters']
path_dtwmodel = gen_config['path_dtw_model']
leer_pickle_dtw = gen_config['var_leer_pickle_dtw']
ejecutar_dtw = gen_config['var_ejecutar_dtw']


print(f"{'COMIENZA':-^100}")
print(f"{fase:-^100}")

----------------------------------------------COMIENZA----------------------------------------------
----------------------------------------------02b_DTW-----------------------------------------------


In [18]:
df_norm = pl.read_parquet(f"{folder}/{path_norm}")
prod_stats = pl.read_parquet(f'{folder}/{path_prod_stats}')
overall_prod_stats = pl.read_parquet(f'{folder}/{path_overall_prod_stats}')

print(f"df_norm shape:   ({df_norm.shape[0]:>9_d},{df_norm.shape[1]:_d})")

df_norm shape:   (8_042_749,19)


In [19]:
df_norm = df_norm.with_columns([
    ((pl.col("periodo").cast(pl.Utf8) + "01").str.to_date("%Y%m%d")).alias("periodo_dt"),  
])

In [20]:
primer_periodo = df_norm['periodo_dt'].min()
df_norm['periodo','periodo_dt']

df_norm = df_norm.with_columns(
    ((pl.col('periodo_dt').dt.year() - primer_periodo.year) * 12 +
    (pl.col('periodo_dt').dt.month() - primer_periodo.month)).alias('mes_indice')
)

In [21]:
productos_fijos = overall_prod_stats.select(['product_id', 'total_tn']).sort('total_tn',descending = True)[:150][:,0]
productos_sample = overall_prod_stats.select(['product_id', 'total_tn']).sort('total_tn',descending = True)[150:][:,0].sample(150, seed=42)
productos_otros = overall_prod_stats['product_id'].filter(~overall_prod_stats['product_id'].is_in(pl.concat([productos_fijos, productos_sample])))

In [22]:
df_norm = df_norm.sort(by=['product_id', 'customer_id', 'periodo'])

df_train = df_norm.filter(df_norm['product_id'].is_in(pl.concat([productos_fijos, productos_sample])))
df_fit = df_norm.filter(df_norm['product_id'].is_in(productos_otros))

print(f"df_norm shape:   ({df_norm.shape[0]:>9_d},{df_norm.shape[1]:_d})")
print(f"df_train shape:  ({df_train.shape[0]:>9_d},{df_train.shape[1]:_d})")
print(f"df_fit shape:    ({df_fit.shape[0]:>9_d},{df_fit.shape[1]:_d})")

df_norm shape:   (8_042_749,19)
df_train shape:  (3_589_457,19)
df_fit shape:    (4_453_292,19)


In [23]:
prod_stats = prod_stats.to_pandas()

In [24]:
df_train = df_train.to_pandas()
df_fit = df_fit.to_pandas()

In [25]:
prodcust_train = df_train[['product_id', 'customer_id']].drop_duplicates().reset_index(drop=True)
prodcust_fit = df_fit[['product_id', 'customer_id']].drop_duplicates().reset_index(drop=True)

print(f"prodcust_train series:  ({prodcust_train.shape[0]:>9_d},{df_train.shape[1]:_d}, productos unicos: {df_train['product_id'].nunique()})")
print(f"prodcust_fit series:    ({prodcust_fit.shape[0]:>9_d},{df_fit.shape[1]:_d}, productos unicos: {df_fit['product_id'].nunique()})")

prodcust_train series:  (  110_136,19, productos unicos: 300)
prodcust_fit series:    (  152_669,19, productos unicos: 480)


In [26]:
series_train = df_train.pivot(index = ['product_id', 'customer_id'], columns='mes_indice', values='tn').reset_index()
series_train = series_train.drop(columns=['product_id', 'customer_id'], axis=1)
X_train = to_time_series_dataset(series_train)
print(f"X_train timeseries:  ({X_train.shape[0]:>9_d},{X_train.shape[1]:_d},{X_train.shape[2]:_d})")

X_train timeseries:  (  110_136,36,1)


In [27]:
series_fit = df_fit.pivot(index = ['product_id', 'customer_id'], columns='mes_indice', values='tn').reset_index()
series_fit = series_fit.drop(columns=['product_id', 'customer_id'], axis=1)
X_fit = to_time_series_dataset(series_fit)
print(f"X_fit timeseries:  ({X_fit.shape[0]:>9_d},{X_fit.shape[1]:_d},{X_fit.shape[2]:_d})")

X_fit timeseries:  (  152_669,36,1)


In [28]:
if ejecutar_dtw:
    if leer_pickle_dtw:
        x_clusters_dtw = np.full((X_train.shape[0], len(clusters)), np.nan)
        model_dtw = []
        model_dtw.append(TimeSeriesKMeans.from_pickle(f"{folder}/{path_dtwmodel}"))
    else:
        x_clusters_dtw = np.full((X_train.shape[0], len(clusters)), np.nan)
        model_dtw = []

        for i in range(len(clusters)):
            start_time = datetime.now()
            model = TimeSeriesKMeans(n_clusters=clusters[i], metric="dtw",
                                    max_iter=50, random_state=42, max_iter_barycenter=50,
                                    n_jobs = -1, verbose = True)
            x_clusters_dtw[:,i] = model.fit_predict(X_train)
            print(f"DTW Corrida {i}, clusters: {model.n_clusters}, inertia: {model.inertia_:.2f}, time: {(datetime.now()-start_time).total_seconds():.2f}")
            model_dtw.append(model)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 28 concurrent workers.
[Parallel(n_jobs=-1)]: Done 144 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 394 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 744 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 1194 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 1744 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 2394 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 3144 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 3994 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 4944 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 5994 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 7144 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 8394 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done 9744 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done 11194 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done 12744 task

In [None]:
path_dtw_model = f"02b_dtw_model.pkl"
model_dtw[0].to_pickle(f"{folder}/{path_dtw_model}")

In [None]:
if ejecutar_dtw:
    df_clusters = pd.concat([prodcust_train.reset_index(drop=True), pd.DataFrame(x_clusters_dtw)], axis=1)
    cols = ['product_id', 'customer_id']
    for method in ['cluster_dtw']:
        for cluster in clusters:
            cols.append(f"{method}_{cluster:0>2}")

    df_clusters.columns = cols

In [None]:
#np.nan_to_num(X_fit, nan=0, copy=False)
print(f"X_fit timeseries:    ({X_fit.shape[0]:>9_d},{X_fit.shape[1]:_d},{X_fit.shape[2]:_d})")

In [None]:
if ejecutar_dtw:
    x_clusters_dtw_fit = np.full((X_fit.shape[0], len(clusters)), np.nan)
    x_clusters_dtw_fit[:,0] = model_dtw[0].predict(X_fit)

In [None]:
if ejecutar_dtw:
    df_clusters_fit = pd.concat([prodcust_fit.reset_index(drop=True), pd.DataFrame(x_clusters_dtw_fit)], axis=1)
    cols = ['product_id', 'customer_id']
    for method in ['cluster_dtw']:
        for cluster in clusters:
            cols.append(f"{method}_{cluster:0>2}")

    df_clusters_fit.columns = cols

In [None]:
if ejecutar_dtw:
    df_clusters_full = pd.concat([df_clusters, df_clusters_fit], axis=0)
    df_clusters_full.to_parquet(f"{folder}/{path_dtw}", index=False)

In [None]:
if dibujar_dtw:
    fig, axs = plt.subplots(max(clusters), len(clusters), figsize=(24, 40))

    for i in range(len(clusters)):
        y_pred = x_clusters_dtw[:,i]

        for j in range(clusters[i]):
            for xx in X_train[y_pred == j]:
                axs[j, i].plot(xx.ravel(), "k-", alpha=.1)
            axs[j, i].plot(model_dtw[i].cluster_centers_[j].ravel(), color='green')
            axs[j, i].set_title(f"Cluster {j} de {clusters[i]} DTW")
            
            # for xx in X_train[y_pred == j]:
            #     axs[j, (i+4)].plot(xx.ravel(), "k-", alpha=.1)
            # axs[j, (i+4)].plot(model_softdtw[i].cluster_centers_[j].ravel(), color='blue')
            # axs[j, (i+4)].set_title(f"Cluster {j} de {clusters[i]} Soft_DTW")

    plt.tight_layout()

    plt.show()

In [None]:
print(f"{fase:-^100}")
print(f"{'FINALIZA':-^100}\n\n\n")