# CatBoost incremental
## 🧪 Training

Author: https://github.com/deburky

### Load dataset metadata
- `ParquetDataset` encapsulates details of reading a complete Parquet dataset possibly consisting of multiple files and partitions in subdirectories.
- PyArrow also provides `pyarrow.dataset`, a higher-level API for working with datasets; this notebook uses it (`ds.dataset`) to load the data.

In [None]:
# Editable install (via uv) of the project located one directory above this notebook.
# NOTE(review): presumably this is what makes `catboost_incremental` importable below — confirm.
!uv pip install -e ../.

### Run incremental training

In [None]:
import time

import pyarrow.dataset as ds

from catboost_incremental import DataLoader, CatBoostTrainer

# Name of the target column; passed to both the loader and the trainer.
label = "target"
dataset_path = "../data/"

# Materialise the entire dataset as a pandas DataFrame. It is used for the
# evaluation at the end of this cell (and by the standard-training cell).
dataset = ds.dataset(dataset_path)
full_df = dataset.to_table().to_pandas()

# The loader is pointed at the same directory as `full_df`.
# NOTE(review): presumably it feeds the trainer in chunks of `chunk_size`
# rows, grouped by `partition_id` — confirm against DataLoader.
data_loader = DataLoader(
    dataset_path,
    chunk_size=1000,
    partition_id_col="partition_id",
    label_col=label,
)

trainer = CatBoostTrainer(
    data_loader=data_loader,
    label_col=label,
    model_config=dict(
        iterations=10,
        learning_rate=0.01,
        verbose=0,
        allow_writing_files=False,
    ),
    # metric_fn=log_loss,  # optional custom metric, currently disabled
)

# Time only the call to train(); loading `full_df` above is excluded.
start_time = time.perf_counter()
model = trainer.train()
end_time = time.perf_counter()

incremental_training_time = end_time - start_time
print(f"Incremental training time: {incremental_training_time:.2f} seconds")

# NOTE(review): `full_df` is read from the same `dataset_path` the loader
# trains on, so this looks like an in-sample score — confirm before comparing
# it with a held-out metric.
score = trainer.evaluate(full_df)
print(f"Accuracy: {score:.4f}")

### Run standard training

In [None]:
import time

import catboost as cb
from sklearn.model_selection import train_test_split

# Baseline: fit a single CatBoostClassifier on an in-memory train split and
# score it on the held-out split, for comparison with the incremental run.

# A fixed random_state makes the split (and therefore the reported accuracy)
# reproducible across notebook runs.
# NOTE(review): every column except `label` is used as a feature; if
# `full_df` contains a `partition_id` column it is included too — confirm
# that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    full_df.drop(columns=[label]),
    full_df[label],
    random_state=42,
)

train_pool = cb.Pool(
    data=X_train,
    label=y_train,
)
test_pool = cb.Pool(
    data=X_test,
    label=y_test,
)

model = cb.CatBoostClassifier(
    iterations=500,
    learning_rate=0.1,
    verbose=0,
    allow_writing_files=False,
)

# Train model; only the fit() call is timed.
start_time = time.perf_counter()
model.fit(
    train_pool,
    eval_set=test_pool,
    verbose=0,
)
end_time = time.perf_counter()

normal_training_time = end_time - start_time
# This is the standard (non-incremental) run, so label the timing accordingly.
print(f"Standard training time: {normal_training_time:.2f} seconds")

# Evaluate on the held-out split.
score = model.score(test_pool)
print(f"Accuracy: {score:.4f}")