Step-by-Step Example: Classification with the Breast Cancer Wisconsin Dataset

**1. Import Required Libraries**

In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import warnings
# Suppress every Python warning for the rest of the session so cell output
# stays uncluttered.
# NOTE(review): this is a blanket filter, so it also hides warnings raised by
# pandas / scikit-learn in later cells; consider narrowing it by category or
# message if any of those should remain visible.
warnings.filterwarnings('ignore')


**2. Load and Explore the Dataset**

In [2]:
# Fetch the bundled dataset and assemble one frame: a column per feature,
# followed by the label in a trailing 'target' column.
data = load_breast_cancer()
df = pd.DataFrame(data.data, columns=data.feature_names).assign(target=data.target)

# Quick look: overall size, first few column names, label names, class balance
print("Dataset shape:", df.shape)
print("Feature columns:", df.columns[:5].tolist(), "...")
print("Target classes:", data.target_names)
class_counts = df['target'].value_counts()  # label encoding: 0 = malignant, 1 = benign
print(class_counts)
print(df.head())


Dataset shape: (569, 31)
Feature columns: ['mean radius', 'mean texture', 'mean perimeter', 'mean area', 'mean smoothness'] ...
Target classes: ['malignant' 'benign']
target
1    357
0    212
Name: count, dtype: int64
   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0        17.99         10.38          122.80     1001.0          0.11840   
1        20.57         17.77          132.90     1326.0          0.08474   
2        19.69         21.25          130.00     1203.0          0.10960   
3        11.42         20.38           77.58      386.1          0.14250   
4        20.29         14.34          135.10     1297.0          0.10030   

   mean compactness  mean concavity  mean concave points  mean symmetry  \
0           0.27760          0.3001              0.14710         0.2419   
1           0.07864          0.0869              0.07017         0.1812   
2           0.15990          0.1974              0.12790         0.2069   
3           0.28390      

**3. Data Preparation: Separate Features and Target**

In [3]:
# Separate the label vector from the feature matrix
y = df['target']
X = df.drop(columns='target')

# Total number of missing cells across all feature columns (0 means none)
missing_total = X.isna().sum().sum()
print(missing_total)

0


**4. Split Data into Train and Test Sets**

In [4]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split
# repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("Training samples:", len(X_train), "Testing samples:", len(X_test))


Training samples: 455 Testing samples: 114


**5. Feature Scaling**

In [6]:
# Learn the scaling statistics from the training rows only, then apply the
# same fitted transform to both partitions so no test-set information is
# used when fitting.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

**6. Train the Classifier**

In [7]:
# Random forest of 100 trees, each limited to depth 10; the fixed seed keeps
# the fitted model reproducible between runs.
clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42,
)
# Fit on the scaled training features; as the cell's last expression, the
# fitted estimator is what the notebook displays.
clf.fit(X_train_scaled, y_train)

**7. Make Predictions**

In [8]:
# Predict class labels for the held-out rows, using the scaled test features
# so the input matches the representation the classifier was trained on.
y_pred = clf.predict(X_test_scaled)

**8. Evaluate Model Performance**

In [9]:
# Compute all three evaluation artefacts up front, then print them in order.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=data.target_names)
matrix = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy:.3f} ({accuracy*100:.1f}%)")
# Each section is a heading line followed by its body.
for heading, body in (
    ("\nClassification Report:", report),
    ("\nConfusion Matrix:", matrix),
):
    print(heading)
    print(body)

Accuracy: 0.956 (95.6%)

Classification Report:
              precision    recall  f1-score   support

   malignant       0.95      0.93      0.94        42
      benign       0.96      0.97      0.97        72

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114


Confusion Matrix:
[[39  3]
 [ 2 70]]


**9. Feature Importance**

In [10]:
# List every feature with the importance the fitted forest assigned to it.
# The numbering follows the original column order (it is not a ranking).
importances = clf.feature_importances_
for position, (name, importance) in enumerate(zip(X.columns, importances), start=1):
    print(f"{position}. {name}: {importance:.3f}")

1. mean radius: 0.069
2. mean texture: 0.010
3. mean perimeter: 0.070
4. mean area: 0.049
5. mean smoothness: 0.006
6. mean compactness: 0.012
7. mean concavity: 0.058
8. mean concave points: 0.091
9. mean symmetry: 0.005
10. mean fractal dimension: 0.004
11. radius error: 0.017
12. texture error: 0.005
13. perimeter error: 0.010
14. area error: 0.033
15. smoothness error: 0.003
16. compactness error: 0.006
17. concavity error: 0.004
18. concave points error: 0.004
19. symmetry error: 0.004
20. fractal dimension error: 0.004
21. worst radius: 0.098
22. worst texture: 0.019
23. worst perimeter: 0.072
24. worst area: 0.140
25. worst smoothness: 0.012
26. worst compactness: 0.019
27. worst concavity: 0.034
28. worst concave points: 0.130
29. worst symmetry: 0.008
30. worst fractal dimension: 0.005


**10. Prediction on New Data**

In [11]:
# One hypothetical observation: 30 values listed in the same order as the
# training feature columns (X.columns).
new_sample = np.array([[15.2, 13.2, 98.7, 750, 0.10, 0.18, 0.10, 0.08, 0.19, 0.07,
                        0.35, 1.23, 2.45, 45.3, 0.006, 0.025, 0.03, 0.012, 0.02, 0.003,
                        17.0, 20.5, 112.0, 900, 0.14, 0.29, 0.21, 0.12, 0.24, 0.09]])

# The scaler was fitted on a DataFrame, so it recorded the feature names.
# Passing a bare ndarray makes scikit-learn warn that the input has no valid
# feature names (hidden in this notebook by the blanket warnings filter) and
# skips the column-name consistency check. Labelling the sample with the
# training columns keeps the input consistent with what the scaler saw, and
# raises immediately if the number of values does not match.
new_sample_df = pd.DataFrame(new_sample, columns=X.columns)

# Apply the already-fitted scaler (no refitting), then classify.
new_sample_scaled = scaler.transform(new_sample_df)
prediction = clf.predict(new_sample_scaled)
print(f"Prediction: {data.target_names[prediction[0]]}")

Prediction: malignant
