In [1]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

In [2]:
# Load the processed modelling dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('../data/processed/hair_salon_model_dataset.csv')
# Print column names, non-null counts and dtypes as a quick schema check.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1952 entries, 0 to 1951
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   book_tod             1952 non-null   object 
 1   book_dow             1952 non-null   object 
 2   recency              1952 non-null   int64  
 3   last_noshow          1952 non-null   int64  
 4   last_cumnoshow       1952 non-null   int64  
 5   last_cumbook         1952 non-null   int64  
 6   last_cumcancel       1952 non-null   int64  
 7   receipt_per_service  1952 non-null   float64
 8   noshow_rate          1952 non-null   float64
 9   book_category        1952 non-null   object 
 10  last_category        1952 non-null   object 
 11  noshow               1952 non-null   int64  
dtypes: float64(2), int64(6), object(4)
memory usage: 183.1+ KB


In [3]:
# Separate the binary target (`noshow`) from the feature matrix (every other column).
y = df['noshow']
X = df.drop(columns=['noshow'])

In [4]:
# Numeric columns (int64/float64 in the loaded frame). Order is preserved because it
# determines the column order produced by the preprocessing step.
num_features = [
    'recency',
    'last_noshow',
    'last_cumnoshow',
    'last_cumbook',
    'last_cumcancel',
    'noshow_rate',
    'receipt_per_service',
]

# Categorical columns (object dtype in the loaded frame).
cat_features = [
    'book_tod',
    'book_dow',
    'book_category',
    'last_category',
]

In [5]:
# Column-wise preprocessing used by every model pipeline:
#   - 'num': standardise the numeric columns with StandardScaler
#   - 'cat': one-hot encode the categorical columns; categories not seen during
#            fit are ignored at transform time instead of raising
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features),
])


In [6]:

# Hold out 20% of the rows for evaluation. Stratifying on `y` keeps the no-show
# ratio the same in both splits; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


In [7]:
# SMOTE oversampler for the minority class. With a float `sampling_strategy`, the
# minority class is resampled up to 0.8x the size of the majority class.
# Inside an imblearn pipeline the sampler is applied during `fit` only, so
# prediction on the test split is never resampled.
smote = SMOTE(random_state=42, k_neighbors=5, sampling_strategy=0.8)

# NOTE(review): all four pipelines reference the same `preprocessor` object rather
# than independent copies. That is presumably fine here because every pipeline is
# fitted on the same training split; revisit if any of them is refit on other data.

# Logistic Regression: the only pipeline that oversamples with SMOTE.
# NOTE(review): `class_weight='balanced'` is applied on top of the SMOTE-resampled
# data; confirm that correcting for imbalance twice is intended.
pipe_lr = ImbPipeline([
    ('preprocessing', preprocessor),
    ('smote', smote),
    ('classifier', LogisticRegression(max_iter=1000, class_weight='balanced'))
])

# Random Forest: no resampling; imbalance handled through class weights.
pipe_rf = ImbPipeline([
    ('preprocessing', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=200, class_weight='balanced', random_state=42))
])

# Decision Tree: no resampling and no class weighting.
# The step name was previously misspelled 'preprocessin'; it now matches the other
# pipelines so `named_steps['preprocessing']` works uniformly across all models.
pipe_dtc = ImbPipeline([
    ('preprocessing', preprocessor),
    ('classifier', DecisionTreeClassifier(max_depth=16, random_state=42))
])

# Gradient Boosting: no resampling; exponential loss.
pipe_gb = ImbPipeline([
    ('preprocessing', preprocessor),
    ('classifier', GradientBoostingClassifier(n_estimators=200, learning_rate=0.1, loss='exponential', random_state=42))
])


In [8]:
# Candidate models, keyed by the label shown in each report header.
pipelines = {
    'Logistic Regression': pipe_lr,
    'Decision Tree Classifier': pipe_dtc,
    'Random Forest': pipe_rf,
    'Gradient Boosting': pipe_gb,
}

# Fit every pipeline on the training split and print per-class precision/recall/F1
# on the held-out split. `name`, `model` and `y_pred` remain notebook globals bound
# to the last entry once the loop finishes, so their names are kept unchanged.
for name, model in pipelines.items():
    # `fit` returns the fitted pipeline itself, so predict can be chained onto it.
    y_pred = model.fit(X_train, y_train).predict(X_test)
    print(f"\n=== {name} ===")
    print(classification_report(y_test, y_pred))



=== Logistic Regression ===
              precision    recall  f1-score   support

           0       0.96      0.86      0.91       346
           1       0.42      0.76      0.54        45

    accuracy                           0.85       391
   macro avg       0.69      0.81      0.73       391
weighted avg       0.90      0.85      0.87       391


=== Decision Tree Classifier ===
              precision    recall  f1-score   support

           0       0.94      0.95      0.95       346
           1       0.58      0.56      0.57        45

    accuracy                           0.90       391
   macro avg       0.76      0.75      0.76       391
weighted avg       0.90      0.90      0.90       391


=== Random Forest ===
              precision    recall  f1-score   support

           0       0.95      0.95      0.95       346
           1       0.60      0.60      0.60        45

    accuracy                           0.91       391
   macro avg       0.77      0.77      0.7

Gradient Boosting Classifier foi escolhido como modelo final por apresentar:

- Melhor equilíbrio entre precisão e recall na classe no-show

- Maior F1-score para o evento de interesse

- Melhor performance geral sem necessidade de SMOTE

- Maior robustez a ruído e desbalanceamento

Impacto financeiro

In [12]:
# Load the raw (wrangled) appointments file. Unlike the modelling dataset, it still
# carries `last_receipt_tot`, which the revenue estimate below relies on.
# NOTE(review): the preview shows `Unnamed: 0.1` / `Unnamed: 0` columns, which look
# like index columns written out by earlier saves; confirm and consider dropping them.
df_hair_salon = pd.read_csv('../data/raw/hair_salon_no_show_wrangled_df.csv')
# Preview the first rows.
df_hair_salon.head()


Unnamed: 0.1,Unnamed: 0,book_tod,book_dow,book_category,book_staff,last_category,last_staff,last_day_services,last_receipt_tot,last_dow,...,last_prod_flag,last_cumrev,last_cumbook,last_cumstyle,last_cumcolor,last_cumprod,last_cumcancel,last_cumnoshow,noshow,recency
0,0,afternoon,Wednesday,STYLE,JJ,,,0,0.0,,...,0,0,0,0,0,0,0,0,0,0
1,1,morning,Wednesday,COLOR,JOANNE,,,0,0.0,,...,0,0,0,0,0,0,0,0,0,0
2,2,,Wednesday,STYLE,JJ,COLOR,JOANNE,2,262.0,Wednesday,...,1,262,2,1,1,2,0,0,0,35
3,3,afternoon,Thursday,STYLE,KELLY,,,0,0.0,,...,0,0,0,0,0,0,0,0,0,0
4,4,evening,Saturday,STYLE,KELLY,STYLE,KELLY,1,35.0,Thursday,...,0,35,1,1,0,0,0,0,0,9


In [36]:
# Revenue at risk: total of `last_receipt_tot` over every appointment flagged as a
# no-show. The client's last receipt is used as a rough proxy for what the missed
# appointment would have billed.
noshow_mask = df_hair_salon['noshow'] == 1
estimated_revenue_at_risk = df_hair_salon.loc[noshow_mask, 'last_receipt_tot'].sum()

print(
    f"Receita potencial em risco sem ação preventiva: "
    f"$ {estimated_revenue_at_risk:,.2f}"
)

# Upper bound on the revenue the model could help recover: the share of no-shows it
# flags in advance (its recall on the no-show class) times the revenue at risk.
# This makes explicit the computation behind the figure quoted in the conclusions.
# NOTE(review): 0.64 is the approximate Gradient Boosting recall quoted in the
# narrative; it is hard-coded here. TODO: derive it from the fitted pipeline.
GB_NOSHOW_RECALL = 0.64
estimated_recoverable_revenue = estimated_revenue_at_risk * GB_NOSHOW_RECALL
estimated_recoverable_revenue

Receita potencial em risco sem ação preventiva: $ 8,866.98


5674.8672

Sem qualquer modelo preditivo, os no-shows representam uma perda potencial de $ 8,866.98 no período analisado. Esse total é uma estimativa: utiliza o valor do último serviço de cada cliente como aproximação da receita esperada do agendamento perdido.

Com a aplicação do modelo de Gradient Boosting, que alcançou um recall de aproximadamente 64% para a classe no-show, o salão seria capaz de identificar preventivamente cerca de dois terços dos clientes com maior risco de ausência. Isso possibilita ações como confirmações proativas, remarcações estratégicas ou realocação planejada de horários, potencialmente recuperando uma parcela relevante da receita que seria perdida: no máximo $ 5,674.87 (64% de $ 8,866.98).