In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.tree import DecisionTreeClassifier
import joblib
import sys, os

sys.path.append(os.path.abspath(".."))
from app.pipelines_transf import DataFramePreparer

In [11]:
# Cargar dataset limpio
df = pd.read_csv("../booking_clean.csv")
df.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,reserved_room_type,assigned_room_type,booking_changes,deposit_type,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,C,C,3,No Deposit,0,Transient,0.0,0,0,Check-Out
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,C,C,4,No Deposit,0,Transient,0.0,0,0,Check-Out
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,A,C,0,No Deposit,0,Transient,75.0,0,0,Check-Out
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,A,A,0,No Deposit,0,Transient,75.0,0,0,Check-Out
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,A,A,0,No Deposit,0,Transient,98.0,0,1,Check-Out


In [12]:
print("Shape original:", df.shape)
print("Columnas originales:")
print(df.columns.tolist())

Shape original: (118898, 29)
Columnas originales:
['hotel', 'is_canceled', 'lead_time', 'arrival_date_year', 'arrival_date_month', 'arrival_date_week_number', 'arrival_date_day_of_month', 'stays_in_weekend_nights', 'stays_in_week_nights', 'adults', 'children', 'babies', 'meal', 'country', 'market_segment', 'distribution_channel', 'is_repeated_guest', 'previous_cancellations', 'previous_bookings_not_canceled', 'reserved_room_type', 'assigned_room_type', 'booking_changes', 'deposit_type', 'days_in_waiting_list', 'customer_type', 'adr', 'required_car_parking_spaces', 'total_of_special_requests', 'reservation_status']


In [13]:
target = "is_canceled"

X = df.drop(columns=[target])
y = df[target]

In [14]:
# Particionar train/val/test
def train_val_test_split(df, rstate=42, shuffle=True, stratify=None):
    strat = df[stratify] if stratify else None
    train_set, test_set = train_test_split(
        df, test_size=0.4, random_state=rstate, shuffle=shuffle, stratify=strat)
    strat = test_set[stratify] if stratify else None
    val_set, test_set = train_test_split(
        test_set, test_size=0.5, random_state=rstate, shuffle=shuffle, stratify=strat)
    return (train_set, val_set, test_set)

train_set, val_set, test_set = train_val_test_split(df, stratify=target)

X_train, y_train = train_set.drop(columns=[target]), train_set[target]
X_val, y_val = val_set.drop(columns=[target]), val_set[target]
X_test, y_test = test_set.drop(columns=[target]), test_set[target]

In [15]:
# Preparar datos con pipelines
prep = DataFramePreparer()
prep.fit(X_train)

X_train_prep = prep.transform(X_train)
X_val_prep = prep.transform(X_val)
X_test_prep = prep.transform(X_test)

print("Original shape:", X_train.shape)
print("Transformed shape:", X_train_prep.shape)

  fechas = pd.to_datetime(s, errors="coerce")


Original shape: (71338, 28)
Transformed shape: (71338, 236)


In [16]:
# Entrenar Decision Tree con hiperparámetros para evitar overfitting
clf = DecisionTreeClassifier(
    criterion="gini",
    max_depth=10,            # controla la profundidad
    min_samples_split=20,    # mínimo para dividir un nodo
    min_samples_leaf=10,     # mínimo en hojas
    random_state=42
)

clf.fit(X_train_prep, y_train)

In [17]:
# Evaluar modelo
def evaluar(model, X, y, nombre=""):
    y_pred = model.predict(X)
    print(f"\nResultados en {nombre}:")
    print("Accuracy:", accuracy_score(y, y_pred))
    print("Precision:", precision_score(y, y_pred, average="weighted"))
    print("Recall:", recall_score(y, y_pred, average="weighted"))
    print("F1:", f1_score(y, y_pred, average="weighted"))
    print(classification_report(y, y_pred))

evaluar(clf, X_train_prep, y_train, "TRAIN")
evaluar(clf, X_val_prep, y_val, "VAL")
evaluar(clf, X_test_prep, y_test, "TEST")


Resultados en TRAIN:
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     44846
           1       1.00      1.00      1.00     26492

    accuracy                           1.00     71338
   macro avg       1.00      1.00      1.00     71338
weighted avg       1.00      1.00      1.00     71338


Resultados en VAL:
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     14949
           1       1.00      1.00      1.00      8831

    accuracy                           1.00     23780
   macro avg       1.00      1.00      1.00     23780
weighted avg       1.00      1.00      1.00     23780


Resultados en TEST:
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     14950
           1       1.00    

In [18]:
# Guardar modelo + preparador
joblib.dump({"pipeline": prep, "model": clf}, "../models/decision_tree_pipeline.pkl")
print("Modelo guardado en models/decision_tree_pipeline.pkl")

Modelo guardado en models/decision_tree_pipeline.pkl
