# Fase 2 del proyecto

La segunda fase consiste en implementar todos los modelos de regresión, ajustar sus parámetros y calcular las métricas de rendimiento para cada modelo.

## Carga y preparación de datos

In [4]:
import pandas as pd
import numpy as np


# Load the cleaned, unified dataset from disk.
df = pd.read_csv("../data_clean/olist_dataset_unificado_clean.csv")

# Parse the two date columns the target is derived from.
for date_col in ('order_estimated_delivery_date', 'order_delivered_customer_date'):
    df[date_col] = pd.to_datetime(df[date_col])

# Target variable: whole days between the actual and the estimated delivery
# (delivered minus estimated, so a positive value means the order arrived late).
delivery_gap = df['order_delivered_customer_date'] - df['order_estimated_delivery_date']
df['delivery_diff_days'] = delivery_gap.dt.days

# Discard rows where the target is missing.
df = df.dropna(subset=['delivery_diff_days'])

# Columns excluded from the feature set. Note that both date columns used to
# build the target are in this list, so they cannot be used as predictors.
cols_to_drop = [
    'order_id',
    'customer_id',
    'customer_unique_id',
    'order_purchase_timestamp',
    'order_approved_at',
    'order_delivered_carrier_date',
    'order_delivered_customer_date',
    'order_estimated_delivery_date',
    'product_id',
    'seller_id',
    'shipping_limit_date',
    'review_id',
    'review_comment_title',
    'review_comment_message',
    'review_creation_date',
    'review_answer_timestamp',
    'product_category_name',
    'product_category_name_english',
]
df = df.drop(columns=cols_to_drop)

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Separate the predictors from the target.
X = df.drop(columns=['delivery_diff_days'])
y = df['delivery_diff_days']

# One-hot encode the categorical predictors.
X = pd.get_dummies(X)

# Split BEFORE scaling. Fitting the scaler on the full dataset would let the
# test rows influence the mean/variance used to transform the training rows
# (data leakage) and make the test metrics optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit the scaler on the training partition only, then apply the same
# transformation to the held-out partition.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix scaled with the train-fitted scaler; kept under its
# original name in case another cell refers to `X_scaled`.
X_scaled = scaler.transform(X)


In [7]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Regressors to compare, keyed by the label printed in the report.
models = {
    "Árbol de Decisión": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Regresión Lineal": LinearRegression(),
}

# Fit every model on the training split and report MAE, RMSE and R² on the
# held-out split.
for name, model in models.items():
    # scikit-learn's fit() returns the estimator itself, so it can be chained.
    y_pred = model.fit(X_train, y_train).predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)  # RMSE is the square root of the MSE
    r2 = r2_score(y_test, y_pred)

    print(f"\nResultados para: {name}")
    for label, value in (("MAE:", mae), ("RMSE:", rmse), ("R²:", r2)):
        print(label, value)


Resultados para: Árbol de Decisión
MAE: 7.471370143149284
RMSE: 11.982276568145258
R²: -0.37435008432111516
