In [None]:
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import plotly.express as px

# Render floats with thousands separators and no decimals (e.g. 1,234,568).
# NOTE(review): this applies to every float pandas displays, so any fractional
# column (presumably 'Interes_%') will also be shown rounded to an integer —
# confirm that is intended.
pd.options.display.float_format = '{:,.0f}'.format

#### Preparación de datos
* Columnas tomadas: 'Brand', 'Model', 'Precio', 'Km', 'Year', 'Interes_%'
* Cálculo de Z-Scores

In [None]:
# Load the source CSV; the path is relative to the notebook's working directory.
csv_path = '../data/processed/csv/cleaned_final_csv_scrap_completo.csv'
df = pd.read_csv(csv_path, encoding='utf-8')

In [None]:
# Plan cleanup: keep one financing term and leave out rows whose branch is 'Aliado'.
term = 12

condition_clean_term_cars = (df['Plazo'] == term) & (df['Sucursal'] != 'Aliado')

selected_columns = [
    'ID_Auto', 'Brand', 'Model',
    'Precio', 'Km', 'Year', 'Interes_%',
    'Version', 'Caja', 'Tipo', 'Total_a_Pagar', 'Plazo',
]

clean_term_cars = (
    df.loc[condition_clean_term_cars, selected_columns]
    # Order rows first so that "keep='first'" below is deterministic.
    .sort_values(by=['Year', 'Km', 'Precio'], na_position='last')
    # Discard any row with a missing value in the selected columns.
    .dropna()
    # One row per car; ID_Auto is only needed for de-duplication.
    .drop_duplicates(subset=['ID_Auto'], keep='first')
    .drop(columns=['ID_Auto'])
)

In [None]:
# Sample that feeds the algorithm.
features = ['Precio', 'Km', 'Year', 'Interes_%']
X = clean_term_cars[features].copy()

# Per-column mean and (sample) standard deviation.
X_mean = X.mean()
X_std = X.std()

# Add a '<column>_z' z-score column for every feature.
# Note: 'Interes_%_z' is computed here, but the elbow/clustering cells only
# select 'Precio_z', 'Km_z' and 'Year_z'.
for col in features:
    X[f'{col}_z'] = (X[col] - X_mean[col]) / X_std[col]

#### Búsqueda del número óptimo de agrupamientos (método del codo)

In [None]:
# Standardised features used for clustering (the interest-rate z-score is not included).
X_scaled = X[['Precio_z', 'Km_z', 'Year_z']].copy()
K_range = range(1, 11)
inertia = []

# Fit one model per candidate K and record its inertia_.
for k in K_range:
    K_means = KMeans(n_clusters=k, random_state=97, n_init=10).fit(X_scaled)
    inertia.append(K_means.inertia_)

# Plot inertia against K to look for the "elbow".
fig, ax = plt.subplots(figsize=(10,6))
ax.plot(K_range, inertia, marker='o', linestyle='--', color='blue')
ax.set_xlabel('Numero de Clusters (K)')
ax.set_ylabel('Inercia (Distancia intra-cluster)')
ax.set_title('Metodo del codo: Buscando el K optimo')
ax.set_xticks(K_range)
ax.grid(True)
plt.show()

#### Clustering con K = 3

In [None]:
# Fit the final 3-cluster model on the standardised features.
k_means = KMeans(n_clusters=3, random_state=97, n_init=10).fit(X_scaled)

# Attach each row's cluster id to a copy of the cleaned data.
df_results = clean_term_cars.assign(Cluster=k_means.labels_)

# Mean price per cluster, cheapest first.
means = df_results.groupby('Cluster')['Precio'].mean().sort_values()

# Name clusters by ascending mean price.
# NOTE(review): the cheapest cluster is labelled 'Alto Kilometraje' on the
# assumption that it is also the high-mileage one — confirm against the profiles.
segment_labels = ('Alto Kilometraje', 'Standard', 'Premium')
cluster_names = {
    means.index[rank]: label
    for rank, label in enumerate(segment_labels)
}

In [None]:
# Translate numeric cluster ids into the human-readable segment names.
df_results['Segment'] = df_results['Cluster'].map(cluster_names)

# Per-segment averages (cheapest segment first) plus the number of cars in each.
profile_columns = ['Precio', 'Km', 'Year', 'Interes_%']
segment_sizes = df_results['Segment'].value_counts()
profiles = (
    df_results.groupby('Segment')[profile_columns]
    .mean()
    .sort_values('Precio')
    .assign(Amount_Cars=segment_sizes)  # aligned on the segment name
)

print("PERFILES DE LOS 3 GRUPOS DE AUTOS EN KAVAK")
print(profiles)

In [None]:
# Price vs. mileage, coloured by segment.
fig, ax = plt.subplots(figsize=(10,6))
sns.scatterplot(
    data=df_results,
    x='Km',
    y='Precio',
    hue='Segment',
    palette='viridis_r',
    alpha=0.6,
    ax=ax,
)
# Reference lines at Precio = 350000 and Km = 70000.
# NOTE(review): these thresholds are hard-coded; presumably read off the plot — confirm.
ax.axhline(y=350000)
ax.axvline(x=70000)
ax.set_title('Market Segmentation: 3 Tipos de Autos')
ax.grid(True, alpha=0.3)
plt.show()

#### Insights

In [None]:
def interactive_model_exploration(df, model, x_axis='Precio', y_axis='Km', color_label='Caja', symbol_label='Segment'):
    """Show an interactive Plotly scatter plot for the rows of a single model.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing a 'Model' column, the columns named by ``x_axis``,
        ``y_axis``, ``color_label`` and ``symbol_label``, and the hover columns
        'Precio', 'Year', 'Version', 'Total_a_Pagar' and 'Plazo'.
    model : str
        Value of the 'Model' column to keep.
    x_axis, y_axis : str
        Columns plotted on the x and y axes.
    color_label : str
        Column that drives the marker colour.
    symbol_label : str
        Column that drives the marker symbol.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``. When no row matches
        ``model`` a message is printed and nothing is plotted.
    """
    data = df[df['Model'] == model]

    # Without this guard a missing or misspelled model silently renders an empty
    # figure, which is easy to misread as "no data points in range".
    if data.empty:
        print(f"No hay registros para el modelo '{model}'; no se genera la grafica.")
        return

    fig = px.scatter(
        data,
        x=x_axis,
        y=y_axis,
        color=color_label,
        symbol=symbol_label,
        hover_data=['Precio', 'Year', 'Version', 'Total_a_Pagar', 'Plazo'],
        title=f'Analisis Profundo: {model} por {color_label} y {symbol_label}',
        template='plotly_dark',
        height=700
    )

    # Larger markers with a thin outline so overlapping points stay distinguishable.
    fig.update_traces(marker=dict(size=10, line=dict(width=1, color='DarkSlateGrey')))
    fig.show()

In [None]:
# Explore a single model ('Cx_5'), colouring the points by segment.
# symbol_label keeps its default ('Segment'), so colour and marker shape
# encode the same column in this call.
interactive_model_exploration(
    df_results,
    model='Cx_5',
    color_label='Segment',
)

In [None]:
def cluster_brands_analysis(df, Segment, top_n=5):
    """Print the most frequent brands and models within one segment.

    Parameters
    ----------
    df : pandas.DataFrame
        Data with 'Segment', 'Brand' and 'Model' columns.
    Segment : str
        Value of the 'Segment' column to filter on.
    top_n : int, default 5
        How many of the most frequent values to print per ranking.

    Returns
    -------
    None
        The rankings are written to standard output.
    """
    subset = df.loc[df['Segment'] == Segment]

    # Frequency ranking (most common first) for each column of interest.
    ranking = {
        column: subset[column].value_counts().head(top_n)
        for column in ('Brand', 'Model')
    }

    print(f"--- Top Marcas en Cluster {Segment} ---")
    print(ranking['Brand'])
    print(f"\n--- Top Modelos en Cluster {Segment} ---")
    print(ranking['Model'])

    # Visual separator between consecutive reports.
    separator = "=" * 40
    print("\n" + separator + "\n")

In [None]:
# Run the brand/model report for each of the 3 segments, in the order the
# names were assigned (ascending mean price).
for segment_name in ('Alto Kilometraje', 'Standard', 'Premium'):
    cluster_brands_analysis(df_results, segment_name)

In [None]:
# Switch seaborn to the 'ticks' theme (this affects every plot drawn afterwards).
sns.set_theme(style='ticks')
# Pairwise relationships coloured by segment.
# NOTE(review): no `vars` are given, so seaborn picks the columns itself —
# presumably this includes 'Cluster' and the constant 'Plazo'; confirm whether
# the grid should be limited to the clustering features.
sns.pairplot(data=df_results, hue='Segment')