In [1]:
import catboost
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [5]:
random_seed = 42

In [2]:
df = pd.read_csv("dataset/data.csv")

In [3]:
df = df.iloc[:, :-1]

In [6]:
train, test = train_test_split(df, test_size=0.2, random_state=random_seed)

In [7]:
def split_dataset_to_gt_target(dataframe):
    return dataframe.iloc[:, 2:], dataframe["diagnosis"].str.lower()

In [8]:
train_x, train_y = split_dataset_to_gt_target(train)
test_x, test_y = split_dataset_to_gt_target(test)

In [9]:
def ml_pipeline(*args, **kwargs) -> tuple:
    scaled_datasets = tuple(
        StandardScaler().fit(dataset).transform(dataset)
        for dataset in args
    )
    return scaled_datasets

In [10]:
train_x, test_x = ml_pipeline(train_x, test_x)

In [11]:
grid = {'learning_rate': 0.1,
        'depth': 6,
        'l2_leaf_reg': 3}

model = catboost.CatBoostClassifier(**grid)
model.fit(train_x, train_y)


0:	learn: 0.5467548	total: 80.2ms	remaining: 1m 20s
1:	learn: 0.4361379	total: 96.7ms	remaining: 48.3s
2:	learn: 0.3477356	total: 109ms	remaining: 36.2s
3:	learn: 0.2938147	total: 118ms	remaining: 29.3s
4:	learn: 0.2429936	total: 136ms	remaining: 27.1s
5:	learn: 0.2114263	total: 150ms	remaining: 24.9s
6:	learn: 0.1841959	total: 164ms	remaining: 23.2s
7:	learn: 0.1692548	total: 174ms	remaining: 21.6s
8:	learn: 0.1486323	total: 186ms	remaining: 20.5s
9:	learn: 0.1330336	total: 208ms	remaining: 20.6s
10:	learn: 0.1183434	total: 216ms	remaining: 19.5s
11:	learn: 0.1069319	total: 242ms	remaining: 19.9s
12:	learn: 0.0975795	total: 257ms	remaining: 19.5s
13:	learn: 0.0897062	total: 266ms	remaining: 18.7s
14:	learn: 0.0821389	total: 274ms	remaining: 18s
15:	learn: 0.0774367	total: 286ms	remaining: 17.6s
16:	learn: 0.0722332	total: 291ms	remaining: 16.8s
17:	learn: 0.0690186	total: 299ms	remaining: 16.3s
18:	learn: 0.0651800	total: 306ms	remaining: 15.8s
19:	learn: 0.0605604	total: 313ms	remain

<catboost.core.CatBoostClassifier at 0x7f3b88d26b20>

In [21]:
from sklearn.metrics import classification_report
import json

In [13]:
preds = model.predict(test_x)

In [28]:
eval_results = classification_report(test_y, preds, output_dict=True)

In [29]:
with open("results/catboost_results.json", "w") as f:
    json.dump(eval_results, f)

{'b': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 114},
 'accuracy': 1.0,
 'macro avg': {'precision': 1.0,
  'recall': 1.0,
  'f1-score': 1.0,
  'support': 114},
 'weighted avg': {'precision': 1.0,
  'recall': 1.0,
  'f1-score': 1.0,
  'support': 114}}