In [None]:
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

In [None]:
# Read the processed CSV (path is relative to the notebook's working directory).
ruta_csv = '../data/processed/csv/cleaned_final_csv_scrap_completo.csv'
df = pd.read_csv(ruta_csv, encoding='utf-8')

# Leave the column index as the cell's last expression so it is displayed.
df.columns

In [None]:
# Build one row per ID_Auto: select the working columns, order the rows by
# Year, Km and Precio, drop rows with any missing value, keep the first row of
# each ID_Auto, and finally discard the identifier column.
columnas_base = ['ID_Auto', 'Brand', 'Model', 'Precio', 'Km', 'Year', 'Interes_%', 'Total_a_Pagar']
registros_unicos = (
    df[columnas_base]
    .copy()
    .sort_values(by=['Year', 'Km', 'Precio'], na_position='last')
    .dropna()
    .drop_duplicates(subset=['ID_Auto'], keep='first')
    .drop(columns=['ID_Auto'])  # only needed for the de-duplication step above
)

In [None]:
# Features used for clustering, plus their column-wise mean and standard
# deviation (pandas' default sample standard deviation).
X = registros_unicos[['Precio', 'Km', 'Year']].copy()
X_mean, X_std = X.mean(), X.std()

# Compute the z-score of every feature in one vectorised step, then append the
# standardised columns next to the raw ones.
z_scores = (X - X_mean) / X_std
X = X.assign(
    Precio_z=z_scores['Precio'],
    Km_z=z_scores['Km'],
    Year_z=z_scores['Year'],
)

In [None]:
# Elbow method: fit KMeans for K = 1..10 on the standardised features and
# collect the inertia of each fit.
X_scaled = X[['Precio_z', 'Km_z', 'Year_z']].copy()
K_range = range(1, 11)
inertia = [
    KMeans(n_clusters=k, random_state=97, n_init=10).fit(X_scaled).inertia_
    for k in K_range
]

# Plot inertia against K to look for the elbow.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(K_range, inertia, marker='o', linestyle='--', color='blue')
ax.set_xlabel('Numero de Clusters (K)')
ax.set_ylabel('Inercia (Distancia intra-cluster)')
ax.set_title('Metodo del codo: Buscando el K optimo')
ax.set_xticks(K_range)
ax.grid(True)
plt.show()

In [None]:
# Fit the final 3-cluster model on the standardised features.
k_means = KMeans(n_clusters=3, random_state=97, n_init=10).fit(X_scaled)

# Attach the cluster label of every row to a copy of the de-duplicated records.
# The labels are a plain array, so they are matched to rows by position.
resultados_df = registros_unicos.assign(Cluster=k_means.labels_)
print(resultados_df.sample(5))

In [None]:
# Per-cluster profile: mean Precio, Km and Year of each cluster, ordered by
# mean Precio.
profiles = (
    resultados_df[['Precio', 'Km', 'Year', 'Cluster']]
    .groupby('Cluster')
    .mean()
    .sort_values('Precio')
)
# value_counts() is indexed by cluster label, so this assignment aligns on the
# Cluster index of `profiles` regardless of the sort order above.
profiles['Cantidad_Autos'] = resultados_df['Cluster'].value_counts()

print("PERFILES DE LOS 3 GRUPOS DE AUTOS EN KAVAK")
# Apply the thousands-separated, zero-decimal float format to this table only.
# Setting it globally would also change how every other cell prints floats
# (including earlier cells when they are re-run).
with pd.option_context('display.float_format', '{:,.0f}'.format):
    print(profiles)

In [None]:
# Scatter of Interes_% against Total_a_Pagar, coloured by cluster label.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=resultados_df,
    x='Interes_%',
    y='Total_a_Pagar',
    hue='Cluster',
    palette='viridis',
    alpha=0.6,
    ax=ax,
)
ax.set_title('Market Segmentation: 3 Tipos de Autos')
ax.grid(True, alpha=0.3)
plt.show()

In [None]:
# Scatter of Year against Precio, coloured by cluster label.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=resultados_df,
    x='Year',
    y='Precio',
    hue='Cluster',
    palette='viridis',
    alpha=0.6,
    ax=ax,
)
ax.set_title('Market Segmentation: 3 Tipos de Autos')
ax.grid(True, alpha=0.3)
plt.show()

In [None]:
def cluster_brands_analysis(df, cluster_id, top_n=5):
    """Print the most frequent brands and models of one cluster.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns 'Cluster', 'Brand' and 'Model'.
    cluster_id
        Value of the 'Cluster' column selecting the rows to summarise.
    top_n : int, default 5
        Number of most frequent brands / models to show.

    Returns
    -------
    None
        The rankings are written to standard output only.
    """
    members = df.loc[df['Cluster'] == cluster_id]

    # value_counts() orders by descending frequency; head() keeps the top rows.
    brand_ranking = members['Brand'].value_counts().head(top_n)
    model_ranking = members['Model'].value_counts().head(top_n)

    print(f"--- Top Marcas en Cluster {cluster_id} ---", brand_ranking, sep="\n")
    print(f"\n--- Top Modelos en Cluster {cluster_id} ---", model_ranking, sep="\n")
    # Visual separator between consecutive cluster reports.
    print("\n" + "="*40 + "\n")

In [None]:
# Run the brand/model report for each of the 3 clusters.
# Segment names noted by the original author (colours refer to the scatter plots):
#   0 -> nearly new (purple)
#   1 -> high mileage (green)
#   2 -> premium / expensive (yellow)
# NOTE(review): these names are not derived from the code; confirm them against
# the cluster profile table whenever the data or the model changes.
for cluster_id in (0, 1, 2):
    cluster_brands_analysis(resultados_df, cluster_id)

In [None]:
# Scatter of Interes_% against Km for the rows whose Model is 'Mazda_3',
# coloured by cluster label, with a horizontal reference line at Km = 70000.
solo_mazda_3 = resultados_df.loc[resultados_df['Model'].eq('Mazda_3')]

fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=solo_mazda_3,
    x='Interes_%',
    y='Km',
    hue='Cluster',
    palette='viridis',
    alpha=0.6,
    ax=ax,
)
ax.axhline(y=70000)
ax.set_title('Market Segmentation: Mazda 3')
ax.grid(True, alpha=0.3)
plt.show()

In [None]:
# Switch seaborn to the 'ticks' theme, then draw pairwise relationships of the
# clustered records with one colour per cluster label.
sns.set_theme(style='ticks')
sns.pairplot(data=resultados_df, hue='Cluster')