In [56]:
import numpy as np

import pandas as pd

from catboost import CatBoostClassifier, Pool

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score

In [57]:
# Generate Synthetic Data with Categorical Features

np.random.seed(42)

data_size = 5000

In [58]:
df = pd.DataFrame(
    
{

    'City': np.random.choice(['New York', 'Los Angeles', 'Chicago', 'Houston', 'Miami'], data_size),
    'Product': np.random.choice(['Laptop', 'Phone', 'Tablet', 'TV'], data_size),
    'Customer_Type': np.random.choice(['Regular', 'VIP', 'Wholesale'], data_size),
    'Age': np.random.randint(18,70, data_size),
    'Spending_Score': np.random.randint(1, 100, data_size),
    'Churn': np.random.choice([0, 1], data_size)  # Target variable

}
 )

In [59]:
# Split Data into Train & Test

X = df.drop(columns=['Churn'])

y = df['Churn']

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state= 42)

In [60]:
# Identify Categorical Features (CatBoost Handles Them Natively)

categorical_features = ['City', 'Product' , 'Customer_Type']

In [61]:
#  Convert Data to CatBoost Pool (Recommended for Efficient Training)

train_pool = Pool(X_train, label=y_train, cat_features=categorical_features)


In [62]:
#  Train CatBoost with Categorical Features (NO NEED FOR MANUAL ENCODING)

catboost_model = CatBoostClassifier(
                iterations=500,
                depth=6,
                learning_rate=0.01,
                loss_function='MultiClass',
                verbose=100,
                random_state=42
                
            )

In [63]:
# Fitting the model

catboost_model.fit(train_pool, eval_set= (X_test, y_test), early_stopping_rounds=50)

0:	learn: 0.6931279	test: 0.6931485	best: 0.6931485 (0)	total: 2.06ms	remaining: 1.02s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.6931200591
bestIteration = 14

Shrink model to first 15 iterations.


<catboost.core.CatBoostClassifier at 0x16335c4f0>

In [64]:
# Predictions & Accuracy

y_pred = catboost_model.predict(X_test)

accuracy = accuracy_score(y_test,y_pred)

print(f"Model Accuracy : {accuracy: .4f}")

Model Accuracy :  0.5080


In [65]:
# Compare with Label Encoding (Manual Encoding)

X_encoded = X.copy()

for col in categorical_features:

    X_encoded[col] = X_encoded[col].astype('category').cat.codes

In [66]:
# Train with Manual Encoded Data

X_train_enc,X_test_enc,y_train,y_test = train_test_split(X_encoded, y, test_size=0.2,random_state=42)

In [67]:
# Train Again with Encoded Data model

catboost_model_enc = CatBoostClassifier(

    iterations= 100,
    depth=6,
    learning_rate=0.1,
    verbose=100
    
)

In [68]:
catboost_model_enc.fit(X_train_enc,y_train , eval_set=[(X_test_enc,y_test)],early_stopping_rounds=50)

0:	learn: 0.6924016	test: 0.6927442	best: 0.6927442 (0)	total: 582us	remaining: 57.7ms
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.6925654382
bestIteration = 10

Shrink model to first 11 iterations.


<catboost.core.CatBoostClassifier at 0x16335edd0>

In [69]:
y_pred_enc = catboost_model_enc.predict(X_test_enc)

accuracy_enc = accuracy_score(y_test, y_pred_enc)

print(f"Accuracy with Manual Encoding: {accuracy_enc:.4f}")


Accuracy with Manual Encoding: 0.5150


In [70]:
#  Compare Performance

print(f"Native CatBoost Encoding Accuracy: {accuracy:.4f}")
print(f"Label Encoding Accuracy: {accuracy_enc:.4f}")

Native CatBoost Encoding Accuracy: 0.5080
Label Encoding Accuracy: 0.5150
