# Breast Cancer prediction + Boosting
Построим модель для предсказания наличия у пациента рака молочной железы.
Датасет взят с Kaggle: https://www.kaggle.com/datasets/yasserh/breast-cancer-dataset

In [33]:
import pandas as pd
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostClassifier

## Загрузим датасет и подготовим данные

In [3]:
data = pd.read_csv('./data/breast-cancer.csv')
data.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


Посмотрим на распределение целевой переменной.

In [4]:
data = data.drop(columns = 'id')
data['diagnosis'].value_counts()

diagnosis
B    357
M    212
Name: count, dtype: int64

Здесь M - злокачественная опухоль, B - доброкачественная. Из распределения видно, что количество диагнозов со злокачественной опухолью в полтора раза меньше, чем с доброкачественной.

In [5]:
data.loc[data['diagnosis']=='M', 'diagnosis'] = 1
data.loc[data['diagnosis']=='B', 'diagnosis'] = 0

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   diagnosis                569 non-null    object 
 1   radius_mean              569 non-null    float64
 2   texture_mean             569 non-null    float64
 3   perimeter_mean           569 non-null    float64
 4   area_mean                569 non-null    float64
 5   smoothness_mean          569 non-null    float64
 6   compactness_mean         569 non-null    float64
 7   concavity_mean           569 non-null    float64
 8   concave points_mean      569 non-null    float64
 9   symmetry_mean            569 non-null    float64
 10  fractal_dimension_mean   569 non-null    float64
 11  radius_se                569 non-null    float64
 12  texture_se               569 non-null    float64
 13  perimeter_se             569 non-null    float64
 14  area_se                  5

In [7]:
data['diagnosis'] = pd.to_numeric(data['diagnosis'])
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   diagnosis                569 non-null    int64  
 1   radius_mean              569 non-null    float64
 2   texture_mean             569 non-null    float64
 3   perimeter_mean           569 non-null    float64
 4   area_mean                569 non-null    float64
 5   smoothness_mean          569 non-null    float64
 6   compactness_mean         569 non-null    float64
 7   concavity_mean           569 non-null    float64
 8   concave points_mean      569 non-null    float64
 9   symmetry_mean            569 non-null    float64
 10  fractal_dimension_mean   569 non-null    float64
 11  radius_se                569 non-null    float64
 12  texture_se               569 non-null    float64
 13  perimeter_se             569 non-null    float64
 14  area_se                  5

In [8]:
data['diagnosis'].value_counts()

diagnosis
0    357
1    212
Name: count, dtype: int64

Разделим и отнормируем данные.

In [9]:
X_train, X_test, y_train, y_test = train_test_split(data.drop(columns='diagnosis'), data['diagnosis'], test_size=0.2)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Обучение модели

Обучим модель при помощи градиентного бустинга

### CatBoost

In [34]:
model = CatBoostClassifier(iterations=10000,
                          learning_rate=0.005,
                          depth=7,
                          random_state=42)
model.fit(X_train, y_train, verbose=1000)
y_true = y_test
y_pred = model.predict(X_test)
print(f"F1-оценка: {f1_score(y_true, y_pred)}")
print(f"accuracy: {accuracy_score(y_true, y_pred)}")
print(f"precision: {precision_score(y_true, y_pred)}")
print(f"recall: {recall_score(y_true, y_pred)}")

0:	learn: 0.6851196	total: 2.01ms	remaining: 20.1s
1000:	learn: 0.0138517	total: 1.17s	remaining: 10.5s
2000:	learn: 0.0058526	total: 2.32s	remaining: 9.27s
3000:	learn: 0.0033008	total: 3.43s	remaining: 8s
4000:	learn: 0.0022709	total: 4.57s	remaining: 6.85s
5000:	learn: 0.0018213	total: 5.71s	remaining: 5.71s
6000:	learn: 0.0014738	total: 6.83s	remaining: 4.55s
7000:	learn: 0.0011934	total: 7.96s	remaining: 3.41s
8000:	learn: 0.0010644	total: 9.08s	remaining: 2.27s
9000:	learn: 0.0009529	total: 10.2s	remaining: 1.14s
9999:	learn: 0.0008543	total: 11.4s	remaining: 0us
F1-оценка: 0.9487179487179487
accuracy: 0.9649122807017544
precision: 0.9487179487179487
recall: 0.9487179487179487


### LightGBM

In [35]:
model = lgb.LGBMClassifier(num_leaves=31,
                          learning_rate=0.05,
                          n_estimators=100,
                          random_state=42)
model.fit(X_train, y_train)
y_true = y_test
y_pred = model.predict(X_test)
print(f"F1-оценка: {f1_score(y_true, y_pred)}")
print(f"accuracy: {accuracy_score(y_true, y_pred)}")
print(f"precision: {precision_score(y_true, y_pred)}")
print(f"recall: {recall_score(y_true, y_pred)}")

F1-оценка: 0.9620253164556962
accuracy: 0.9736842105263158
precision: 0.95
recall: 0.9743589743589743


### XGBoost

In [36]:
model = xgb.XGBRFClassifier()
model.fit(X_train, y_train, verbose=1000)
y_true = y_test
y_pred = model.predict(X_test)
print(f"F1-оценка: {f1_score(y_true, y_pred)}")
print(f"accuracy: {accuracy_score(y_true, y_pred)}")
print(f"precision: {precision_score(y_true, y_pred)}")
print(f"recall: {recall_score(y_true, y_pred)}")

F1-оценка: 0.9249999999999999
accuracy: 0.9473684210526315
precision: 0.9024390243902439
recall: 0.9487179487179487


## Результат

В качестве метрики была использована метрика F1-score, потому что нам было важно правильно предсказать именно диагноз со злокачественной опухолью, а не максимизировать общее количество правильных предсказаний по всем классам.

Лучше всего себя показал LightGBM, т.к. значения метрик больше всех остальных.