In [128]:
#import pandas as pd
import polars as pl
import numpy as np
from datetime import datetime
import json
import os
import math
import pandas as pd
from sklearn.preprocessing import PowerTransformer, StandardScaler, RobustScaler
import time

import warnings

import pandas as pd
import matplotlib.pyplot as plt

# Suppress every warning for the rest of the session.
# NOTE(review): this presumably also hides deprecation notices from pandas /
# scikit-learn (e.g. around groupby.apply) - confirm that is intended.
warnings.filterwarnings('ignore')

# Stage label; only used by the start/end banner prints of this notebook.
fase = '02_normaliza'

In [129]:
# Load the pipeline configuration (paths and scaling options) from the
# JSON file that sits next to this notebook.
with open('gen_config.json', mode='r') as config_handle:
    gen_config = json.load(config_handle)

In [130]:
# Base folder that every configured relative path is joined to.
folder = gen_config['folder']

# Inputs: the grouped sales table and the per-series statistics table.
path_group, path_prod_stats = (
    gen_config[name] for name in ('path_group', 'path_prod_stats')
)

# Output: where the normalised table is written at the end of the stage.
path_norm = gen_config['path_norm']

# Scaling options:
#   var_escalado - name of the column copied into 'tn_norm'
#   var_withmean - forwarded to StandardScaler(with_mean=...) and
#                  RobustScaler(with_centering=...)
var_escalado, var_withmean = (
    gen_config[name] for name in ('var_escalado', 'var_withmean')
)

# Start-of-stage banner: each label centred in a 100-character dashed rule.
for banner_text in ('COMIENZA', fase):
    print('{:-^100}'.format(banner_text))

----------------------------------------------COMIENZA----------------------------------------------
--------------------------------------------02_normaliza--------------------------------------------


In [131]:
# Read both input tables (grouped sales and per-series statistics) with polars.
df_norm, prod_stats = (
    pl.read_parquet(f"{folder}/{rel_path}")
    for rel_path in (path_group, path_prod_stats)
)

In [132]:
# The scaling step below relies on pandas groupby/apply, so both polars
# frames are converted to pandas here.
df_norm, prod_stats = df_norm.to_pandas(), prod_stats.to_pandas()

In [133]:
# One group per (product_id, customer_id) pair; each group is scaled on its own.
grouped = df_norm.groupby(
    by=['product_id', 'customer_id'],
)

# Standard Scaler

In [60]:
def scale_standard(group):
    """Standard-scale the ``tn`` column of a single (product_id, customer_id) group.

    A fresh ``StandardScaler`` is fitted on the group's ``tn`` values, the
    scaled values are stored in a new ``tn_standard`` column, and the fitted
    scaler is registered in the module-level ``standard_scalers`` dict under
    the ``(product_id, customer_id)`` key read from the group's first row.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one group; must contain ``tn``, ``product_id`` and
        ``customer_id``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``group`` with the additional ``tn_standard`` column.
    """
    # Work on a copy: pandas documents that functions passed to
    # groupby.apply must not mutate the object they receive.
    group = group.copy()
    scaler = StandardScaler(with_mean=var_withmean)
    # fit_transform returns an (n, 1) array; flatten it so a plain 1-D
    # column is assigned.
    group['tn_standard'] = scaler.fit_transform(group[['tn']]).ravel()
    # Keep the fitted scaler so its parameters can be exported afterwards.
    standard_scalers[(group['product_id'].iloc[0], group['customer_id'].iloc[0])] = scaler
    return group

In [16]:
# Registry filled by scale_standard: (product_id, customer_id) -> fitted scaler.
standard_scalers = dict()

# Scale every group and flatten the group-keyed index back to a plain
# positional one.
df_standard = grouped.apply(scale_standard).reset_index(drop=True)

In [17]:
# One record per fitted StandardScaler, exposing its mean and scale for the
# single feature it was fitted on.
scaler_data = [
    {
        'product_id': key_product,
        'customer_id': key_customer,
        'standard_scaler_mean': fitted.mean_[0],
        'standard_scaler_scale': fitted.scale_[0],
    }
    for (key_product, key_customer), fitted in standard_scalers.items()
]

scalers_standard_stats = pd.DataFrame(scaler_data)

# Robust Scaler

In [61]:
def scale_robust(group):
    """Robust-scale the ``tn`` column of a single (product_id, customer_id) group.

    A fresh ``RobustScaler`` is fitted on the group's ``tn`` values, the
    scaled values are stored in a new ``tn_robust`` column, and the fitted
    scaler is registered in the module-level ``robust_scalers`` dict under
    the ``(product_id, customer_id)`` key read from the group's first row.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one group; must contain ``tn``, ``product_id`` and
        ``customer_id``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``group`` with the additional ``tn_robust`` column.
    """
    # Work on a copy: pandas documents that functions passed to
    # groupby.apply must not mutate the object they receive.
    group = group.copy()
    scaler = RobustScaler(with_centering=var_withmean)
    # fit_transform returns an (n, 1) array; flatten it so a plain 1-D
    # column is assigned.
    group['tn_robust'] = scaler.fit_transform(group[['tn']]).ravel()
    # Keep the fitted scaler so its parameters can be exported afterwards.
    robust_scalers[(group['product_id'].iloc[0], group['customer_id'].iloc[0])] = scaler
    return group

In [19]:
# Registry filled by scale_robust: (product_id, customer_id) -> fitted scaler.
robust_scalers = dict()

# Scale every group and flatten the group-keyed index back to a plain
# positional one.
df_robust = grouped.apply(scale_robust).reset_index(drop=True)

In [20]:
# One record per fitted RobustScaler. The centre is only exported when
# centring was requested (var_withmean); the scale is always exported.
scaler_data = []
for (key_product, key_customer), fitted in robust_scalers.items():
    record = {
        'product_id': key_product,
        'customer_id': key_customer,
    }
    if var_withmean:
        record['robust_scaler_center'] = fitted.center_[0]
    # Added last so the column order matches: keys, [center], scale.
    record['robust_scaler_scale'] = fitted.scale_[0]
    scaler_data.append(record)

scalers_robust_stats = pd.DataFrame(scaler_data)

# DF finales

In [134]:
# Attach both scaled columns to the original rows, matching on the series
# keys plus the period.
merge_keys = ['product_id', 'customer_id', 'periodo']
for scaled_frame, scaled_col in ((df_standard, 'tn_standard'), (df_robust, 'tn_robust')):
    df_norm = df_norm.merge(
        scaled_frame[merge_keys + [scaled_col]],
        on=merge_keys,
        how='left',
        suffixes=('', ''),
    )

# 'tn_norm' is a copy of whichever column the configuration selected.
df_norm['tn_norm'] = df_norm[var_escalado]

# Back to polars for the remaining cells.
df_norm = pl.from_pandas(df_norm)

In [135]:
# Combine the per-series parameters of both scalers into one table keyed by
# (product_id, customer_id).
series_keys = ['product_id', 'customer_id']
transform_stats = scalers_standard_stats.merge(
    scalers_robust_stats, on=series_keys, how='left', suffixes=('', '')
)

# prod_stats is written back to the same file it was read from, so when this
# stage is re-run the table may already carry scaler columns from a previous
# run. Drop them first: with both suffixes empty, pandas raises
# "columns overlap but no suffix specified" on any non-key column clash.
stale_cols = [
    col for col in transform_stats.columns
    if col not in series_keys and col in prod_stats.columns
]
prod_stats = prod_stats.drop(columns=stale_cols)

prod_stats = prod_stats.merge(transform_stats, on=series_keys, how='left', suffixes=('', ''))

# errors='ignore': 'total_tn_right' is gone once this stage has rewritten the
# stats file, and a plain drop would then raise KeyError.
prod_stats = prod_stats.drop(columns=['total_tn_right'], errors='ignore')

# Back to polars for writing.
prod_stats = pl.from_pandas(prod_stats)

In [148]:
# Record which scaling method applies to each row.
if var_escalado == 'tn_trans':
    # Every row is labelled 'tn_standard'; no 'new_tn_column' is created
    # in this branch.
    method_exprs = [pl.lit('tn_standard').alias('metodo_escalado')]
else:
    # Rows whose robust-scaled value is below 10 keep the robust value; all
    # other rows fall back to the standard-scaled value.
    # NOTE(review): the threshold 10 is hard-coded - confirm its rationale.
    robust_below_limit = pl.col("tn_robust") < 10
    method_exprs = [
        pl.when(robust_below_limit)
        .then(pl.col("tn_robust"))
        .otherwise(pl.col("tn_standard"))
        .alias("new_tn_column"),
        pl.when(robust_below_limit)
        .then(pl.lit("tn_robust"))
        .otherwise(pl.lit('tn_standard'))
        .alias("metodo_escalado"),
    ]

# Add the column(s), then store the method label as a categorical.
df_norm = (
    df_norm
    .with_columns(method_exprs)
    .with_columns(pl.col('metodo_escalado').cast(pl.Categorical))
)


In [149]:
# grouped = df_scaled.groupby(['product_id', 'customer_id'])
# df_unscaled = grouped.apply(inverse_scale_sales)
# df_unscaled = df_unscaled.reset_index(drop=True)

# df_unscaled

In [150]:
# def inverse_scale_standard(group):
#     scaler = standard_scalers[(group['product_id'].iloc[0], group['customer_id'].iloc[0])]
#     group['tn_inverse'] = scaler.inverse_transform(group[['tn_standard']])
#     return group

In [151]:
# def inverse_scale_robust(group):
#     scaler = robust_scalers[(group['product_id'].iloc[0], group['customer_id'].iloc[0])]
#     group['tn_inverse'] = scaler.inverse_transform(group[['tn_standard']])
#     return group

In [152]:
# def create_scaler_from_row(row):
#     scaler = StandardScaler(with_mean=var_withmean)
#     scaler.mean_ = [row['scaler_mean']]
#     scaler.scale_ = [row['scaler_scale']]
#     scaler.var_ = [row['scaler_scale']**2]
#     return scaler

In [153]:
# Sanity report: for each scaled column, the sum of 'tn' over the rows whose
# scaled value is +inf, -inf or NaN (0.0 when no such row exists).
# NaN is detected with Expr.is_nan() rather than `== np.NaN`: the `np.NaN`
# alias was removed in NumPy 2.0 and raises AttributeError there.
# NOTE(review): pl.from_pandas converts pandas NaN to null by default, so
# missing values coming from the pandas step would presumably show up as
# nulls, not NaN - confirm whether a null check is also wanted here.
print(f"""
Datos con inf en tn_standard: {df_norm.filter((pl.col('tn_standard') == np.inf))['tn'].sum()}
Datos con -inf en tn_standard: {df_norm.filter((pl.col('tn_standard') == -np.inf))['tn'].sum()}
Datos con NaN en tn_standard: {df_norm.filter(pl.col('tn_standard').is_nan())['tn'].sum()}
Datos con inf en tn_robust: {df_norm.filter((pl.col('tn_robust') == np.inf))['tn'].sum()}
Datos con -inf en tn_robust: {df_norm.filter((pl.col('tn_robust') == -np.inf))['tn'].sum()}
Datos con NaN en tn_robust: {df_norm.filter(pl.col('tn_robust').is_nan())['tn'].sum()}
""")

# Summary statistics, then the 100 rows with the largest robust-scaled value
# (table display widened to 100 rows so they are all printed).
print(df_norm.describe())
pl.Config.set_tbl_rows(100)
print(df_norm.sort('tn_robust', descending=True).head(100))


Datos con inf en tn_standard: 0.0
Datos con -inf en tn_standard: 0.0
Datos con NaN en tn_standard: 0.0
Datos con inf en tn_robust: 0.0
Datos con -inf en tn_robust: 0.0
Datos con NaN en tn_robust: 0.0

shape: (9, 10)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ statistic ┆ product_i ┆ customer_ ┆ periodo   ┆ … ┆ tn_robust ┆ tn_norm   ┆ new_tn_co ┆ metodo_e │
│ ---       ┆ d         ┆ id        ┆ ---       ┆   ┆ ---       ┆ ---       ┆ lumn      ┆ scalado  │
│ str       ┆ ---       ┆ ---       ┆ f64       ┆   ┆ f64       ┆ f64       ┆ ---       ┆ ---      │
│           ┆ f64       ┆ f64       ┆           ┆   ┆           ┆           ┆ f64       ┆ str      │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ count     ┆ 5.303555e ┆ 5.303555e ┆ 5.303555e ┆ … ┆ 5.303555e ┆ 5.303555e ┆ 5.303555e ┆ 5303555  │
│           ┆ 6         ┆ 6         ┆ 6         ┆   ┆ 6         ┆ 6         

In [154]:
# Persist the results. Note that prod_stats goes back to the same path it was
# loaded from, while df_norm goes to its own output path.
for frame, rel_path in ((prod_stats, path_prod_stats), (df_norm, path_norm)):
    frame.write_parquet(f'{folder}/{rel_path}')

In [155]:
# End-of-stage banner: stage label, then 'FINALIZA', each centred in a
# 100-character dashed rule, followed by three extra blank lines.
print('{:-^100}'.format(fase))
print('{:-^100}'.format('FINALIZA') + '\n\n\n')

--------------------------------------------02_normaliza--------------------------------------------
----------------------------------------------FINALIZA----------------------------------------------



