# Análisis de modelos para predicción de demanda

In [92]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xgb 
from sklearn.compose import ColumnTransformer
from lightgbm import LGBMRegressor


In [93]:
df = pd.read_csv('DataCoSupplyChainDataset.csv', encoding='latin1')

In [94]:
df_clean = df.loc[:,["Type", "Sales","Category Id","Category Name", "Customer Id","Customer Country","Customer City","Customer Segment", "Department Name","Product Price","Product Name","Order Item Total","Order Item Quantity", "order date (DateOrders)", "Benefit per order","Order Profit Per Order","Order Item Discount", "Order Item Discount Rate"]]

In [95]:
df_clean["order date (DateOrders)"] = pd.to_datetime(df_clean["order date (DateOrders)"])
df_clean["order date (DateOrders)"] = df_clean["order date (DateOrders)"].dt.date  # Solo la fecha, sin hora

In [96]:
order_days = pd.DataFrame(df_clean["order date (DateOrders)"].unique(), columns=["order date (DateOrders)"])
departments = pd.DataFrame(df_clean["Department Name"].unique(), columns=["Department Name"])

all_combinations = order_days.merge(departments, how="cross")

In [97]:
demand = df_clean.groupby(["order date (DateOrders)", "Department Name"]).agg({
    "Order Item Quantity": "sum",
    "Sales": "sum",
    "Order Profit Per Order": "sum"
}).reset_index()

In [98]:
df_full = all_combinations.merge(demand, on=["order date (DateOrders)", "Department Name"], how="left")

# Rellenar vacíos con 0 si no hubo ventas ese día-departamento
df_full[["Order Item Quantity", "Sales", "Order Profit Per Order"]] = df_full[["Order Item Quantity", "Sales", "Order Profit Per Order"]].fillna(0)


In [99]:
# one-hot
df_full = pd.get_dummies(df_full, columns=["Department Name"])

# Features de fecha
df_full["order date (DateOrders)"] = pd.to_datetime(df_full["order date (DateOrders)"])
df_full["Year"] = df_full["order date (DateOrders)"].dt.year
df_full["Month"] = df_full["order date (DateOrders)"].dt.month
df_full["Day_of_Week"] = df_full["order date (DateOrders)"].dt.dayofweek

df_full = df_full.drop(columns=["order date (DateOrders)"])


In [100]:
num_cols = ['Sales', 'Order Profit Per Order', 'Year', 'Month', 'Day_of_Week']
scaler = StandardScaler()
df_full[num_cols] = scaler.fit_transform(df_full[num_cols])

X = df_full.drop(columns=["Order Item Quantity"])
y = df_full["Order Item Quantity"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [101]:
model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test,y_pred)

print(f"R² score: {r2:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")


R² score: 0.8659
RMSE: 15.6766
MAE: 8.8674


## Prueba de modelos

### Random Forest

In [102]:
pipeline_rf = Pipeline([
    ('regressor', RandomForestRegressor(random_state=42))
])

param_grid_rf = {
    'regressor__n_estimators': [300, 400, 500], 
    'regressor__max_features': [0.4, 0.6, 0.8], 
    'regressor__min_samples_split': [2, 5] 
}

grid_search_rf = GridSearchCV(estimator = pipeline_rf, 
                              param_grid = param_grid_rf, 
                              cv=3,  
                              n_jobs=-1)

grid_search_rf.fit(X_train, y_train)

In [103]:
print(f"Mejores parámetros RF: {grid_search_rf.best_params_}")
print(f"Mejor R² en RF: {grid_search_rf.best_score_:.4f}")

Mejores parámetros RF: {'regressor__max_features': 0.6, 'regressor__min_samples_split': 2, 'regressor__n_estimators': 500}
Mejor R² en RF: 0.9849


In [104]:
y_pred_rf = grid_search_rf.predict(X_test)
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
mae_rf = mean_absolute_error(y_test, y_pred_rf)
print(f"MAE RF: {mae_rf:.2f}")
print(f"RMSE RF en Test: {rmse_rf:.4f}")

MAE RF: 2.07
RMSE RF en Test: 4.6292


In [105]:
print("Media:", y.mean())
print("Máximo:", y.max())
print("Mínimo:", y.min())
print("Desviación estándar:", y.std())

Media: 30.981608453658144
Máximo: 176.0
Mínimo: 0.0
Desviación estándar: 42.63873755023687


### XGB

In [106]:
pipeline_xgb = Pipeline([
    ('regressor', xgb.XGBRegressor(random_state=42))
])

param_grid_xgb = {
    'regressor__n_estimators': [200, 300, 400],
    'regressor__learning_rate': [0.01, 0.02, 0.05, 0.1],
    'regressor__max_depth': [3, 5, 8, 10]
}

grid_search_xgb = GridSearchCV(estimator = pipeline_xgb, 
                               param_grid = param_grid_xgb, 
                               cv=3,
                               n_jobs=-1)

grid_search_xgb.fit(X_train, y_train)

In [107]:
print(f"Mejores parámetros XGB: {grid_search_xgb.best_params_}")
print(f"Mejor R² XGB: {grid_search_xgb.best_score_:.4f}")

Mejores parámetros XGB: {'regressor__learning_rate': 0.02, 'regressor__max_depth': 8, 'regressor__n_estimators': 300}
Mejor R² XGB: 0.9858


In [108]:
y_pred_xgb = np.round(grid_search_xgb.predict(X_test))
rmse_xgb = np.sqrt(mean_squared_error(y_test, y_pred_xgb))
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
print(f"RMSE XGB en Test: {rmse_xgb:.4f}")
print(f"MAE XGB: {mae_xgb:.2f}")

RMSE XGB en Test: 4.6840
MAE XGB: 2.11


### LightGBM

In [109]:
pipeline_lgb = Pipeline([
    ('regressor', LGBMRegressor(random_state=42))
])

param_grid_lgb = {
    'regressor__n_estimators': [150, 200],
    'regressor__learning_rate': [0.01, 0.05],
    'regressor__num_leaves': [31, 60],
    'regressor__min_child_samples': [20, 40]
}

grid_search_lgb = GridSearchCV(estimator=pipeline_lgb,
                               param_grid=param_grid_lgb,
                               cv=2,
                               n_jobs=-1)

grid_search_lgb.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001710 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002043 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002052 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 557
[LightGBM] [Info] Total Bins 557
[LightGBM] [Info] Total Bins 557
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002107 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 557
[LightGBM] [Info] Number of data points in the train set: 4958, number of used features: 16
[LightGBM] [Info] Number of data points in the train set: 4959, number of used features: 16[LightGBM] [Info] Number of data poin

In [110]:
print(f"Mejores parámetros LGBM: {grid_search_lgb.best_params_}")
print(f"Mejor R² LGBM: {grid_search_lgb.best_score_:.4f}")


Mejores parámetros LGBM: {'regressor__learning_rate': 0.05, 'regressor__min_child_samples': 20, 'regressor__n_estimators': 150, 'regressor__num_leaves': 31}
Mejor R² LGBM: 0.9817


In [111]:
y_pred_lgb = grid_search_lgb.predict(X_test)
rmse_lgb = np.sqrt(mean_squared_error(y_test, y_pred_lgb))
mae_lgb = mean_absolute_error(y_test, y_pred_lgb)
print(f"RMSE LGBM en Test: {rmse_lgb:.4f}")
print(f"MAE LGBM: {mae_lgb:.2f}")

RMSE LGBM en Test: 4.8513
MAE LGBM: 2.20


In [113]:
print(df_clean.groupby("order date (DateOrders)")["Order Item Quantity"].sum().head(10))

order date (DateOrders)
2015-01-01    355
2015-01-02    354
2015-01-03    392
2015-01-04    410
2015-01-05    373
2015-01-06    368
2015-01-07    350
2015-01-08    387
2015-01-09    308
2015-01-10    384
Name: Order Item Quantity, dtype: int64


#### Guardado de datos

In [114]:
import joblib

# Guardar modelos
joblib.dump(grid_search_rf.best_estimator_, 'model_rf.pkl')
joblib.dump(grid_search_xgb.best_estimator_, 'model_xgb.pkl')
joblib.dump(grid_search_lgb.best_estimator_, 'model_lgb.pkl')

# Guardar scaler
joblib.dump(scaler, 'scaler.pkl')

# Guardar columnas
joblib.dump(X.columns, 'model_columns.pkl')

# Guardar métricas
metrics = {
    'Random Forest': {'RMSE': rmse_rf, 'MAE': mae_rf, 'R2': grid_search_rf.best_score_},
    'XGBoost': {'RMSE': rmse_xgb, 'MAE': mae_xgb, 'R2': grid_search_xgb.best_score_},
    'LightGBM': {'RMSE': rmse_lgb, 'MAE': mae_lgb, 'R2': grid_search_lgb.best_score_}
}
joblib.dump(metrics, 'metrics.pkl')


['metrics.pkl']