# Home Task
## Breast cancer dataset

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn.datasets import load_iris
from sklearn import tree
from sklearn.model_selection import train_test_split, GridSearchCV
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

from sklearn.datasets import load_breast_cancer


In [None]:
# Load the breast cancer dataset bundled with scikit-learn.
cancer = load_breast_cancer()

X, y, labels, features = cancer.data, cancer.target, cancer.target_names, cancer.feature_names
print("Labels:", labels)
print("Features:", features)

# Hold out 30% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.30,
    random_state=0
)

# Sanity checks: the split must neither drop nor duplicate samples, and the
# feature/target arrays of each subset must stay aligned.
assert len(X_train) + len(X_test) == len(X)
assert len(y_train) == len(X_train) and len(y_test) == len(X_test)

print("="*70)
# Keep each label next to its array so the printed name always matches the variable.
splits = {"X_train": X_train, "y_train": y_train, "X_test": X_test, "y_test": y_test}
for name, arr in splits.items():
    print(f"{name}: {arr.shape}")


Labels: ['malignant' 'benign']
Features: ['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']
X_train: (398, 30)
y_train: (398,)
x_test: (171, 30)
y_test: (171,)


In [None]:
def run_classifier(clf, param_grid, X_train, X_test, y_train, y_test):
    """Grid-search ``clf`` inside a scaling pipeline and print an evaluation report.

    The estimator is wrapped in a two-step pipeline (``"scaler"`` followed by
    ``"clf"``), so grid keys that tune the estimator carry the ``clf__`` prefix.

    Parameters
    ----------
    clf
        Estimator used as the final pipeline step.
    param_grid
        Parameter grid handed to ``GridSearchCV``.
    X_train, y_train
        Data the 5-fold grid search is fitted on.
    X_test, y_test
        Held-out data used for the reported test score, confusion matrix
        and classification report.

    Returns
    -------
    None
        All results are printed.
    """
    steps = [("scaler", StandardScaler()), ("clf", clf)]
    search = GridSearchCV(Pipeline(steps), param_grid, cv=5, verbose=1)
    search.fit(X_train, y_train)

    # Predictions are taken once and reused for both reports below.
    test_predictions = search.predict(X_test)
    winner = search.best_estimator_

    model_name = clf.__class__.__name__
    print(f"\n=== {model_name} ===")

    train_score = winner.score(X_train, y_train)
    print("Train score:", round(train_score, 4))

    test_score = winner.score(X_test, y_test)
    print("Test score:", round(test_score, 4))

    print("Best params:", search.best_params_)

    matrix = confusion_matrix(y_test, test_predictions)
    print("Confusion Matrix:\n", matrix)

    report = classification_report(y_test, test_predictions)
    print("Classification Report:\n", report)


## DecisionTree

In [None]:
# Tune tree depth and the minimum number of samples required to split a node.
# The estimator is seeded (same seed as the train/test split) so that
# re-running the notebook reproduces the same fit and scores.
run_classifier(
    clf=DecisionTreeClassifier(random_state=0),
    param_grid={
        "clf__max_depth": [3, 5, 7],
        "clf__min_samples_split": [2, 4, 6]
    },
    X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test
)


Fitting 5 folds for each of 9 candidates, totalling 45 fits

=== DecisionTreeClassifier ===
Train score: 0.9874
Test score: 0.9357
Best params: {'clf__max_depth': 5, 'clf__min_samples_split': 2}
Confusion Matrix:
 [[ 60   3]
 [  8 100]]
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.95      0.92        63
           1       0.97      0.93      0.95       108

    accuracy                           0.94       171
   macro avg       0.93      0.94      0.93       171
weighted avg       0.94      0.94      0.94       171



## RandomForest


In [None]:
# Tune forest size, tree depth and the minimum split size.
# The estimator is seeded (same seed as the train/test split) so that
# re-running the notebook reproduces the same fit and scores.
run_classifier(
    clf=RandomForestClassifier(random_state=0),
    param_grid={
        "clf__n_estimators": [100, 200, 300],
        "clf__max_depth": [None, 5, 10, 20],
        "clf__min_samples_split": [2, 5, 10],
    },
    X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test
)


Fitting 5 folds for each of 36 candidates, totalling 180 fits

=== RandomForestClassifier ===
Train score: 0.9925
Test score: 0.9591
Best params: {'clf__max_depth': 10, 'clf__min_samples_split': 10, 'clf__n_estimators': 200}
Confusion Matrix:
 [[ 60   3]
 [  4 104]]
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.95      0.94        63
           1       0.97      0.96      0.97       108

    accuracy                           0.96       171
   macro avg       0.95      0.96      0.96       171
weighted avg       0.96      0.96      0.96       171



## Gradient Boosting Decision Trees (GBDT)

In [None]:
# Tune the number of boosting stages, the learning rate and the tree depth.
# The estimator is seeded (same seed as the train/test split) so that
# re-running the notebook reproduces the same fit and scores.
run_classifier(
    clf=GradientBoostingClassifier(random_state=0),
    param_grid={
        "clf__n_estimators": [100, 200],
        "clf__learning_rate": [0.01, 0.1, 0.2],
        "clf__max_depth": [3, 5, 7],
    },
    X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test
)


Fitting 5 folds for each of 18 candidates, totalling 90 fits

=== GradientBoostingClassifier ===
Train score: 1.0
Test score: 0.9883
Best params: {'clf__learning_rate': 0.2, 'clf__max_depth': 3, 'clf__n_estimators': 200}
Confusion Matrix:
 [[ 61   2]
 [  0 108]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.97      0.98        63
           1       0.98      1.00      0.99       108

    accuracy                           0.99       171
   macro avg       0.99      0.98      0.99       171
weighted avg       0.99      0.99      0.99       171



In [None]:
# Tune the XGBoost classifier. The grid samples rows (subsample) and columns
# (colsample_bytree) at rates below 1, so the seed is pinned explicitly to keep
# the run reproducible.
run_classifier(
    clf=XGBClassifier(random_state=0),
    param_grid={
        "clf__n_estimators": [50, 100],
        "clf__learning_rate": [0.05, 0.1],
        "clf__max_depth": [3, 5],
        "clf__min_child_weight": [1, 2],
        "clf__subsample": [0.8, 0.9],
        "clf__colsample_bytree": [0.8, 0.9],
    },
    X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test
)


Fitting 5 folds for each of 64 candidates, totalling 320 fits

=== XGBClassifier ===
Train score: 1.0
Test score: 0.9708
Best params: {'clf__colsample_bytree': 0.9, 'clf__learning_rate': 0.1, 'clf__max_depth': 3, 'clf__min_child_weight': 1, 'clf__n_estimators': 100, 'clf__subsample': 0.9}
Confusion Matrix:
 [[ 61   2]
 [  3 105]]
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.97      0.96        63
           1       0.98      0.97      0.98       108

    accuracy                           0.97       171
   macro avg       0.97      0.97      0.97       171
weighted avg       0.97      0.97      0.97       171

