In [80]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from pathlib import Path
import warnings

# Suppress FutureWarning messages to keep the notebook output clean.
# NOTE(review): no `module=` argument is passed, so this filter applies to every
# FutureWarning raised in the session, not only scikit-learn's — confirm that is intended.
warnings.filterwarnings('ignore', category=FutureWarning)

In [81]:
# Load the master dataset produced by the ETL step
file_path = Path('../data/analytics/master_dataset.csv')
df = pd.read_csv(file_path)

# Final null handling before modelling.
# Business decision: customers without an NPS answer are treated as disengaged,
# so their score is filled with a low value.
# The result is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on the column selection: that chained in-place form
# is deprecated in pandas and does not update `df` when Copy-on-Write is enabled.
df['nps_last_research'] = df['nps_last_research'].fillna(5)

# Any other remaining null is filled with 0 (e.g. in usage features).
# NOTE(review): this also writes 0 into non-numeric columns such as
# `interaction_date` / `interaction_type` — confirm that is acceptable downstream.
df = df.fillna(0)

print("Dataset mestre carregado e preparado.")
print(f"Shape do dataset: {df.shape}")
df.head()

Dataset mestre carregado e preparado.
Shape do dataset: (1000, 26)


Unnamed: 0,customer_id,mrr,plan,contracting_date,segment,interaction_date,interaction_type,nps_last_research,event_date,logins_last_week,...,Automations,Comments,Dashboards,Permission Control,Project Creation,SSO,Task Creation,weeks_since_last_interaction,tenure_in_years,weeks_since_last_usage_extraction
0,1,185.47,Pro,2021-05-18,Mid-Market,2022-05-07,Suporte Técnico,5.0,2023-12-09,20.5,...,0.0,1.0,1.0,1.0,1.0,0.0,1.0,238.0,5.539936,155.0
1,2,1558.07,Enterprise,2023-06-24,Enterprise,0,0,5.0,2026-06-03,22.0,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,3.438741,25.0
2,3,1072.58,Enterprise,2023-03-05,Enterprise,2025-07-13,Reunião QBR,10.0,2023-08-22,36.5,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,72.0,3.742642,171.0
3,4,218.26,Pro,2020-10-31,Mid-Market,2023-06-19,Email de Acompanhamento,5.0,2023-09-21,22.25,...,1.0,1.0,1.0,0.0,1.0,0.0,1.0,180.0,6.08291,166.0
4,5,417.91,Pro,2023-10-14,Enterprise,0,0,5.0,2026-06-13,32.75,...,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,3.132101,24.0


In [82]:
# Build one independent copy of the master DataFrame per plan that will be modelled
df_enterprise = df.loc[df['plan'].eq('Enterprise')].copy()
df_pro = df.loc[df['plan'].eq('Pro')].copy()

# Report how many customers ended up in each subset
print("Segmentação concluída:")
print(f"Temos {len(df_enterprise)} clientes Enterprise para analisar.")
print(f"Temos {len(df_pro)} clientes Pro para analisar.")

Segmentação concluída:
Temos 326 clientes Enterprise para analisar.
Temos 330 clientes Pro para analisar.


In [83]:
# Columns that feed each plan's clustering model, chosen from the earlier analysis
features_enterprise = [
    'mrr',
    'nps_last_research',
    'num_opened_tickets',
    'weeks_since_last_interaction',
    'active_users',
    # Key Enterprise product features
    'API Access',
    'Advanced Reports',
    'SSO',
]

features_pro = [
    'mrr',
    'nps_last_research',
    'logins_last_week',
    'weeks_since_last_interaction',
    'finished_tasks',
    # Key Pro product features
    'Dashboards',
    'Automations',
    'Permission Control',
]

# Helper that scales the features, fits K-Means and returns the DataFrame with cluster labels
def train_kmeans(dataframe, feature_list, num_clusters=3, random_state=42, n_init=10):
    """Standardize the selected features, fit K-Means and label each row with its cluster.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Rows to cluster. Must contain every column named in ``feature_list``.
        The frame is modified in place: a ``health_cluster`` column is added
        (or overwritten) and the very same object is returned.
    feature_list : list of str
        Columns passed to the scaler and then to the model.
    num_clusters : int, default 3
        Forwarded to ``KMeans`` as ``n_clusters``.
    random_state : int, default 42
        Forwarded to ``KMeans``.
    n_init : int, default 10
        Forwarded to ``KMeans``.

    Returns
    -------
    pandas.DataFrame
        ``dataframe`` itself, now carrying the ``health_cluster`` column.

    Raises
    ------
    KeyError
        If any column in ``feature_list`` is missing from ``dataframe``.
    ValueError
        If ``dataframe`` is empty or has fewer rows than ``num_clusters``.
    """
    # Fail early with an explicit message instead of a failure deep inside pandas/sklearn
    missing = [col for col in feature_list if col not in dataframe.columns]
    if missing:
        raise KeyError(f"Features ausentes no DataFrame: {missing}")
    if dataframe.empty or len(dataframe) < num_clusters:
        raise ValueError(
            f"São necessárias pelo menos {num_clusters} linhas para formar "
            f"{num_clusters} clusters; recebidas {len(dataframe)}."
        )

    # The plan name is only used for the progress message, so do not require the column
    plan_label = dataframe['plan'].iloc[0] if 'plan' in dataframe.columns else 'N/A'
    print(f"\nIniciando modelagem para o plano: {plan_label}...")

    # Select and scale the features before fitting K-Means
    scaler = StandardScaler()
    df_scaled = scaler.fit_transform(dataframe[feature_list])

    # Instantiate the model with the requested configuration
    kmeans = KMeans(n_clusters=num_clusters, random_state=random_state, n_init=n_init)

    # Fit the model and store each row's cluster label in a new column
    dataframe.loc[:, 'health_cluster'] = kmeans.fit_predict(df_scaled)

    print("Modelo treinado e clusters atribuídos.")
    return dataframe

# Status message confirming that the feature lists and the modelling helper are defined
print("Célula 4: Features definidas e função de modelagem pronta.")

Célula 4: Features definidas e função de modelagem pronta.


In [84]:
# Print the column names, non-null counts and dtypes of the Enterprise subset
df_enterprise.info()

<class 'pandas.core.frame.DataFrame'>
Index: 326 entries, 1 to 999
Data columns (total 26 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   customer_id                        326 non-null    int64  
 1   mrr                                326 non-null    float64
 2   plan                               326 non-null    object 
 3   contracting_date                   326 non-null    object 
 4   segment                            326 non-null    object 
 5   interaction_date                   326 non-null    object 
 6   interaction_type                   326 non-null    object 
 7   nps_last_research                  326 non-null    float64
 8   event_date                         326 non-null    object 
 9   logins_last_week                   326 non-null    float64
 10  finished_tasks                     326 non-null    float64
 11  active_users                       326 non-null    float64
 12 

In [85]:
# Run the clustering helper once per plan subset
df_enterprise_clustered = train_kmeans(
    dataframe=df_enterprise,
    feature_list=features_enterprise,
)
df_pro_clustered = train_kmeans(
    dataframe=df_pro,
    feature_list=features_pro,
)

print("\n--- Modelagem Concluída para ambos os segmentos. ---")


Iniciando modelagem para o plano: Enterprise...
Modelo treinado e clusters atribuídos.

Iniciando modelagem para o plano: Pro...
Modelo treinado e clusters atribuídos.

--- Modelagem Concluída para ambos os segmentos. ---


In [86]:
# Helper for interpreting what each cluster represents
def analyze_clusters(dataframe, feature_list):
    """Summarize each cluster by the mean of every feature plus its customer count.

    Prints a header naming the plan found in the first row of ``dataframe``,
    then returns a table indexed by ``health_cluster`` with one column per
    entry of ``feature_list`` (means rounded to 2 decimals) and a final
    ``n_clientes`` column holding the number of rows in each cluster.
    """
    plan_name = dataframe['plan'].iloc[0]
    print(f"\n--- Análise dos Clusters do Plano: {plan_name} ---")

    # A single groupby object serves both the per-feature means and the sizes
    by_cluster = dataframe.groupby('health_cluster')
    summary = by_cluster[feature_list].mean().round(2)

    # Number of customers that fell into each cluster
    summary['n_clientes'] = by_cluster.size()

    return summary

# Build the per-cluster summary table for each plan
analysis_enterprise = analyze_clusters(
    dataframe=df_enterprise_clustered,
    feature_list=features_enterprise,
)
analysis_pro = analyze_clusters(
    dataframe=df_pro_clustered,
    feature_list=features_pro,
)

# Render both summary tables, each under its own heading
print("\n--- Resultado Enterprise ---")
display(analysis_enterprise)

print("\n--- Resultado Pro ---")
display(analysis_pro)


--- Análise dos Clusters do Plano: Enterprise ---

--- Análise dos Clusters do Plano: Pro ---

--- Resultado Enterprise ---


Unnamed: 0_level_0,mrr,nps_last_research,num_opened_tickets,weeks_since_last_interaction,active_users,API Access,Advanced Reports,SSO,n_clientes
health_cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,1250.3,5.02,1.21,141.52,10.57,1.0,0.95,1.0,272
1,1273.09,4.13,1.27,167.58,10.28,0.9,0.81,0.0,31
2,1286.37,5.35,1.17,144.04,11.81,0.0,0.96,0.96,23



--- Resultado Pro ---


Unnamed: 0_level_0,mrr,nps_last_research,logins_last_week,weeks_since_last_interaction,finished_tasks,Dashboards,Automations,Permission Control,n_clientes
health_cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,394.73,4.8,26.14,126.22,47.57,1.0,0.95,0.84,152
1,205.64,4.99,24.83,132.01,51.96,1.0,0.9,0.97,154
2,288.72,4.96,22.82,103.42,42.04,0.0,0.75,0.75,24
