In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
import json
import os
import matplotlib.pyplot as plt
import pickle

from tslearn.utils import to_time_series_dataset
from tslearn.clustering import TimeSeriesKMeans

# Stage label for this notebook; it is printed in the start and end banners.
fase = '02b_DTW'

In [None]:
# Read the JSON configuration that supplies the paths and switches used below.
with open('gen_config.json', 'r') as config_file:
    gen_config = json.load(config_file)

In [None]:
# Base directory; every path below is later joined to it as f"{folder}/{path}".
folder = gen_config['folder']

# Input / output locations.
path_norm = gen_config['path_norm']
path_prod_stats = gen_config['path_prod_stats']
path_dtw = gen_config['path_dtw']
path_traindtw = gen_config['path_traindtw']
path_fitdtw = gen_config['path_fitdtw']
path_dtwmodel = gen_config['path_dtw_model']

# Clustering parameters.
clusters = gen_config['var_clusters']              # cluster counts; one model is trained per entry
corte_prod_dtw = gen_config['var_corte_prod_dtw']  # product_id threshold: <= goes to train, > goes to fit

# Execution switches.
leer_pickle_dtw = gen_config['var_leer_pickle_dtw']  # reload saved arrays/model instead of rebuilding them
ejecutar_dtw = gen_config['var_ejecutar_dtw']        # run the clustering and write its output
dibujar_dtw = gen_config['var_dibujar_dtw']          # draw the per-cluster plots

# Start-of-stage banner.
for titulo in ('COMIENZA', fase):
    print(f"{titulo:-^100}")

In [None]:
# df_norm holds the rows with the tn_norm values to cluster; prod_stats holds
# one row of statistics per (product_id, customer_id) pair, used for filtering.
df_norm, prod_stats = (
    pd.read_parquet(f"{folder}/{ruta}") for ruta in (path_norm, path_prod_stats)
)

In [None]:
# Order rows so that each (product_id, customer_id) series is contiguous and
# chronologically sorted, then split the products at the configured threshold.
df_norm = df_norm.sort_values(by=['product_id', 'customer_id', 'periodo'])

en_train = df_norm['product_id'] <= corte_prod_dtw
en_fit = df_norm['product_id'] > corte_prod_dtw
df_train = df_norm[en_train]
df_fit = df_norm[en_fit]

for etiqueta, marco in (
    ("df_norm shape:", df_norm),
    ("df_train shape:", df_train),
    ("df_fit shape:", df_fit),
):
    filas, columnas = marco.shape
    # Labels are padded to 17 characters so the opening parentheses line up.
    print(f"{etiqueta:<17}({filas:>9_d},{columnas:_d})")

In [None]:
# A (product, customer) pair is kept only if it is still present on or after
# the cut-off period and has at least the minimum number of values.
corte_fecha_prodcust = '2019-06-01'
minimo_values_series = 3

# Each condition is evaluated once and reused for the report and the filter.
presente_post_corte = prod_stats['ultimo_periodo'] >= corte_fecha_prodcust
con_datos_suficientes = prod_stats['values'] >= minimo_values_series
cumple_ambas = presente_post_corte & con_datos_suficientes

print(f"ProdCust presentes post {corte_fecha_prodcust} {prod_stats[presente_post_corte].shape}")
print(f"ProdCust con mas de {minimo_values_series} datos {prod_stats[con_datos_suficientes].shape}")

print(f"ProdCust VALIDOS (ambas condiciones) {prod_stats[cumple_ambas].shape}")

prodcust_validos = prod_stats.loc[cumple_ambas, ['product_id', 'customer_id']]

In [None]:
# Distinct (product_id, customer_id) pairs present in each split, before filtering.
claves_serie = ['product_id', 'customer_id']
prodcust_train = df_train.loc[:, claves_serie].drop_duplicates()
prodcust_fit = df_fit.loc[:, claves_serie].drop_duplicates()

# NOTE(review): the second figure printed is the column count of df_train /
# df_fit, not of the pair frames — kept exactly as the notebook reported it.
print("ProdCust sin filtrar:")
print(f"prodcust_train series:  ({len(prodcust_train):>9_d},{df_train.shape[1]:_d})")
print(f"prodcust_fit series:    ({len(prodcust_fit):>9_d},{df_fit.shape[1]:_d})")

In [None]:
# Keep only the pairs that also appear in prodcust_validos (inner join on both keys).
claves_union = ['product_id', 'customer_id']
prodcust_train = pd.merge(prodcust_train, prodcust_validos, how='inner', on=claves_union)
prodcust_fit = pd.merge(prodcust_fit, prodcust_validos, how='inner', on=claves_union)

productos_train = prodcust_train['product_id'].nunique()
productos_fit = prodcust_fit['product_id'].nunique()

print("ProdCust aplicando filtros:")
print(f"prodcust_train series:  ({len(prodcust_train):>9_d},{df_train.shape[1]:_d})")
print(f"prodcust_fit series:    ({len(prodcust_fit):>9_d},{df_fit.shape[1]:_d})")
print(f"Productos presentes:    {productos_train + productos_fit}")

In [None]:
# Build (or reload) X_train: one time series of tn_norm values per
# (product_id, customer_id) pair listed in prodcust_train, in that order.
if leer_pickle_dtw:
    print(f"Cargando series DTW: {path_traindtw}")
    X_train = np.load(f"{folder}/{path_traindtw}")
    print(f"series_train shape: {X_train.shape}")

else:
    print(f"Creando series DTW: {path_traindtw}")
    # Group df_train once. Filtering the full frame with a boolean mask for
    # every pair costs (rows x series) comparisons; a single groupby gives the
    # same per-pair rows with one pass over the data.
    tn_por_serie = df_train.groupby(['product_id', 'customer_id'], sort=False)['tn_norm']
    total_series = prodcust_train.shape[0]
    pares_train = prodcust_train[['product_id', 'customer_id']].itertuples(index=False, name=None)

    series_train = []
    for i, (producto, cliente) in enumerate(pares_train, start=1):
        print(f"Timeseries {i} de {total_series}: {producto}, {cliente}", end="\r")
        # get_group keeps the rows in df_train order (already sorted by periodo);
        # reshape to (n_obs, 1) to match the single-column frame used before.
        serie = tn_por_serie.get_group((producto, cliente)).to_numpy().reshape(-1, 1)
        series_train.append(serie)

    X_train = to_time_series_dataset(series_train)
    # NOTE(review): np.save appends ".npy" when the name lacks it while np.load
    # does not — path_traindtw presumably already ends in ".npy"; confirm.
    np.save(f"{folder}/{path_traindtw}", X_train)

In [None]:
# Build (or reload) X_fit: one time series of tn_norm values per
# (product_id, customer_id) pair listed in prodcust_fit, in that order.
if leer_pickle_dtw:
    X_fit = np.load(f"{folder}/{path_fitdtw}")
    print(f"series_fit shape: {X_fit.shape}")

else:
    # Group df_fit once. Filtering the full frame with a boolean mask for
    # every pair costs (rows x series) comparisons; a single groupby gives the
    # same per-pair rows with one pass over the data.
    tn_por_serie_fit = df_fit.groupby(['product_id', 'customer_id'], sort=False)['tn_norm']
    total_series_fit = prodcust_fit.shape[0]
    pares_fit = prodcust_fit[['product_id', 'customer_id']].itertuples(index=False, name=None)

    series_fit = []
    for i, (producto, cliente) in enumerate(pares_fit, start=1):
        print(f"Timeseries {i} de {total_series_fit}: {producto}, {cliente}", end="\r")
        # get_group keeps the rows in df_fit order (already sorted by periodo);
        # reshape to (n_obs, 1) to match the single-column frame used before.
        serie = tn_por_serie_fit.get_group((producto, cliente)).to_numpy().reshape(-1, 1)
        series_fit.append(serie)

    X_fit = to_time_series_dataset(series_fit)
    # NOTE(review): np.save appends ".npy" when the name lacks it while np.load
    # does not — path_fitdtw presumably already ends in ".npy"; confirm.
    np.save(f"{folder}/{path_fitdtw}", X_fit)

In [None]:
# Overwrite NaN entries of X_train with 0 in place (copy=False). These are
# presumably the padding of shorter series — confirm. nan_to_num also maps
# +/-inf to large finite values.
X_train = np.nan_to_num(X_train, copy=False, nan=0)
dims_train = X_train.shape
print(f"X_train timeseries:  ({dims_train[0]:>9_d},{dims_train[1]:_d},{dims_train[2]:_d})")

In [None]:
# Obtain the DTW k-means model(s) and the cluster label of every training series.
#   x_clusters_dtw[:, k] -> labels of X_train under the model for clusters[k]
#   model_dtw[k]         -> that model
if ejecutar_dtw:
    x_clusters_dtw = np.full((X_train.shape[0], len(clusters)), np.nan)
    model_dtw = []

    if leer_pickle_dtw:
        # Only one model is stored on disk, so only column 0 can be filled.
        modelo_cargado = TimeSeriesKMeans.from_pickle(f"{folder}/{path_dtwmodel}")
        # Label the training series with the restored model; without this the
        # column stays NaN and the training rows are written out unlabelled.
        x_clusters_dtw[:, 0] = modelo_cargado.predict(X_train)
        model_dtw.append(modelo_cargado)
    else:
        for i, n_clusters in enumerate(clusters):
            start_time = datetime.now()
            # random_state is fixed so repeated runs give the same clustering.
            model = TimeSeriesKMeans(n_clusters=n_clusters, metric="dtw",
                                     max_iter=50, random_state=42, max_iter_barycenter=50,
                                     n_jobs=-1, verbose=True)
            x_clusters_dtw[:, i] = model.fit_predict(X_train)
            print(f"DTW Corrida {i}, clusters: {model.n_clusters}, inertia: {model.inertia_:.2f}, time: {(datetime.now()-start_time).total_seconds():.2f}")
            model_dtw.append(model)

In [None]:
# Persist the first model (model_dtw[0]) to the configured path, which is the
# same location the load branch restores it from via from_pickle.
# Guarded because model_dtw only exists when ejecutar_dtw is set, and a model
# that was just loaded from disk does not need to be written back.
if ejecutar_dtw and not leer_pickle_dtw:
    model_dtw[0].to_pickle(f"{folder}/{path_dtwmodel}")

In [None]:
# Attach the training labels to their (product_id, customer_id) pairs, with
# one zero-padded column name per configured cluster count.
if ejecutar_dtw:
    etiquetas_train = pd.DataFrame(x_clusters_dtw)
    pares_train_idx = prodcust_train.reset_index(drop=True)
    df_clusters = pd.concat([pares_train_idx, etiquetas_train], axis=1)

    method = 'cluster_dtw'
    cols = ['product_id', 'customer_id'] + [f"{method}_{cluster:0>2}" for cluster in clusters]
    df_clusters.columns = cols

In [None]:
# Overwrite NaN entries of X_fit with 0 in place (copy=False). These are
# presumably the padding of shorter series — confirm. nan_to_num also maps
# +/-inf to large finite values.
X_fit = np.nan_to_num(X_fit, copy=False, nan=0)
dims_fit = X_fit.shape
print(f"X_fit timeseries:    ({dims_fit[0]:>9_d},{dims_fit[1]:_d},{dims_fit[2]:_d})")

In [None]:
# Label the held-out series with every available model. Column k matches
# clusters[k]; columns with no model (e.g. a single model restored from disk)
# stay NaN. Filling only column 0 left the fit rows without labels for the
# remaining cluster counts even though the training rows had them.
if ejecutar_dtw:
    x_clusters_dtw_fit = np.full((X_fit.shape[0], len(clusters)), np.nan)
    for idx_modelo, modelo in enumerate(model_dtw):
        x_clusters_dtw_fit[:, idx_modelo] = modelo.predict(X_fit)

In [None]:
# Attach the held-out labels to their (product_id, customer_id) pairs, using
# the same column naming as the training frame.
if ejecutar_dtw:
    etiquetas_fit = pd.DataFrame(x_clusters_dtw_fit)
    pares_fit_idx = prodcust_fit.reset_index(drop=True)
    df_clusters_fit = pd.concat([pares_fit_idx, etiquetas_fit], axis=1)

    method = 'cluster_dtw'
    cols = ['product_id', 'customer_id'] + [f"{method}_{cluster:0>2}" for cluster in clusters]
    df_clusters_fit.columns = cols

In [None]:
# Stack training and held-out assignments and write them to the stage output.
if ejecutar_dtw:
    df_clusters_full = pd.concat([df_clusters, df_clusters_fit])
    ruta_salida = f"{folder}/{path_dtw}"
    # The row index is not written; the key columns identify each series.
    df_clusters_full.to_parquet(ruta_salida, index=False)

In [None]:
# Draw one panel per (cluster, model): member series in translucent black and
# the cluster centre in green. Columns follow model_dtw, rows follow the
# cluster index.
# x_clusters_dtw and model_dtw only exist when ejecutar_dtw is set, so the
# plot is skipped otherwise instead of failing with a NameError.
if dibujar_dtw and ejecutar_dtw:
    # squeeze=False keeps axs two-dimensional even with a single row or
    # column, so axs[j, i] is always valid.
    fig, axs = plt.subplots(max(clusters), len(clusters), figsize=(24, 40), squeeze=False)

    # Iterate over the models actually available: after loading from disk
    # there is one model even if several cluster counts are configured.
    for i, modelo in enumerate(model_dtw):
        y_pred = x_clusters_dtw[:, i]

        for j in range(clusters[i]):
            ax = axs[j, i]
            for xx in X_train[y_pred == j]:
                ax.plot(xx.ravel(), "k-", alpha=.1)
            ax.plot(modelo.cluster_centers_[j].ravel(), color='green')
            ax.set_title(f"Cluster {j} de {clusters[i]} DTW")

    plt.tight_layout()

    plt.show()

In [None]:
# End-of-stage banner: stage label, then the closing marker and blank lines.
banner_final = f"{fase:-^100}\n{'FINALIZA':-^100}\n\n\n"
print(banner_final)