In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Load the cleaned dataset (UTF-8 encoded CSV, path relative to the notebook).
csv_path = '../data/csv/cleaned_final_csv.csv'
df = pd.read_csv(csv_path, encoding='utf-8')

#### df info

In [None]:
# Print a concise summary of the loaded frame (columns, non-null counts, dtypes).
df.info()

#### ML

In [None]:
# Columns kept for modelling: the predictors plus the target ('Precio').
cols = ['Brand', 'Tipo', 'Sucursal', 'Year', 'Km', 'Caja', 'Precio']
# Categorical columns that are expanded into indicator (dummy) columns.
# 'Brand' is deliberately not in this list, so it is left as-is here.
one_hot_cols = ['Caja', 'Sucursal', 'Tipo']

# Work on an independent copy so `df` is left untouched. drop_first=True omits
# the first level of every encoded column.
df_ml = pd.get_dummies(df.loc[:, cols].copy(), columns=one_hot_cols, drop_first=True)

In [None]:
# Separate the target ('Precio') from the predictor columns.
target_col = 'Precio'
y = df_ml[target_col]
X = df_ml.drop(columns=[target_col])

In [None]:
### Split the data into train and test sets
# 20% of the rows are held out for testing; the fixed random_state makes the
# partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report how many rows ended up in each split.
n_train, n_test = len(X_train), len(X_test)
print(f"Entranamiento: {n_train} autos")
print(f"Examen (Test): {n_test} autos")

In [None]:
### Target encoding of 'Brand'

# Attach the target to a scratch copy of the training features so the per-brand
# mean price can be computed. Only the training split is used, so test prices
# never influence the encoding.
train_temp = X_train.assign(Precio_Real=y_train)
brand_map = train_temp.groupby('Brand')['Precio_Real'].mean()  # Series: brand -> mean training price
# Overall training mean, used as the fallback wherever a brand has no entry in brand_map.
global_mean = y_train.mean()

# Swap 'Brand' for its encoded value in both splits.
# - assign/drop build new frames rather than writing a column into the frames
#   returned by train_test_split, which can trigger SettingWithCopyWarning.
# - fillna covers test brands that never appear in the training split and, in
#   the training split itself, rows whose 'Brand' is missing (groupby skips NaN
#   keys, so those rows would otherwise stay NaN).
X_train = (
    X_train
    .assign(Brand_Encoded=X_train['Brand'].map(brand_map).fillna(global_mean))
    .drop(columns=['Brand'])
)
X_test = (
    X_test
    .assign(Brand_Encoded=X_test['Brand'].map(brand_map).fillna(global_mean))
    .drop(columns=['Brand'])
)

In [None]:
# Show the column labels currently held by the training feature frame.
print(X_train.columns)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import numpy as np

In [None]:
### Training
# Hyperparameters are gathered in one mapping; the fixed random_state keeps the
# fitted forest reproducible.
rf_params = {'n_estimators': 100, 'random_state': 42}
rf_model = RandomForestRegressor(**rf_params)
rf_model.fit(X_train, y_train)

In [None]:
### Prediction
# Predict the target for every row of the test features with the fitted model.
y_pred = rf_model.predict(X_test)

In [None]:
### Evaluation
# Score the test predictions: R² and the mean absolute error (MAE, expressed
# in the same units as the target).
r2 = r2_score(y_true=y_test, y_pred=y_pred)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)

In [None]:
# Mean Absolute Percentage Error (MAPE), in percent: the absolute error of each
# prediction relative to its true value, averaged over the test set.
errores = (y_test - y_pred).abs()
mape = (errores / y_test).mean() * 100

In [None]:
# Assemble the metric report line by line and emit it with a single print.
report_lines = [
    "\n--- RESULTADOS DEL MODELO ---",
    f"MAE (Error Promedio en Pesos):  ${mae:,.2f} MXN",
    f"MAPE (Error Promedio %):        {mape:.2f}%",
    f"R2 (Precisión General):         {r2:.2f}",
]
print("\n".join(report_lines))

In [None]:
# Worked example: compare the true and predicted value for the first row of
# the test set.
precio_real, precio_predicho = y_test.iloc[0], y_pred[0]
diferencia = precio_predicho - precio_real  # positive when the model over-predicts

print(
    "\n--- EJEMPLO REAL ---",
    "Auto de prueba #1:",
    f"Precio Real Kavak:   ${precio_real:,.2f}",
    f"El Modelo predice:   ${precio_predicho:,.2f}",
    f"Diferencia:          ${diferencia:,.2f}",
    sep="\n",
)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Pair every feature name with the importance the fitted forest assigned to it.
importances = rf_model.feature_importances_
feature_names = X_train.columns

# Tabulate the pairs, most important feature first.
df_imp = (
    pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    .sort_values('Importance', ascending=False)
)

# Print the five most important features.
print(df_imp.head(5))

# Horizontal bar chart of the ten most important features.
top_10 = df_imp.head(10)
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_10['Feature'], top_10['Importance'], color='salmon')
ax.set_xlabel('Importancia (0 a 1)')
ax.set_title('¿Quién le sopló la respuesta al modelo?')
ax.invert_yaxis()  # draw the highest-ranked feature at the top
plt.show()