In [1]:
pip install ucimlrepo


Note: you may need to restart the kernel to use updated packages.


In [2]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
breast_cancer = fetch_ucirepo(id=14) 
  
# data (as pandas dataframes) 
X = breast_cancer.data.features 
y = breast_cancer.data.targets 
  
# metadata 
print(breast_cancer.metadata) 
  
# variable information 
print(breast_cancer.variables) 


{'uci_id': 14, 'name': 'Breast Cancer', 'repository_url': 'https://archive.ics.uci.edu/dataset/14/breast+cancer', 'data_url': 'https://archive.ics.uci.edu/static/public/14/data.csv', 'abstract': 'This breast cancer domain was obtained from the University Medical Centre, Institute of Oncology, Ljubljana, Yugoslavia. This is one of three domains provided by the Oncology Institute that has repeatedly appeared in the machine learning literature. (See also lymphography and primary-tumor.)', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 286, 'num_features': 9, 'feature_types': ['Categorical'], 'demographics': ['Age'], 'target_col': ['Class'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1988, 'last_updated': 'Thu Mar 07 2024', 'dataset_doi': '10.24432/C51P4M', 'creators': ['Matjaz Zwitter', 'Milan Soklic'], 'intro_paper': None, 'additional_info': {'summary': 'Thi

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


In [4]:
X.head()

Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,40-49,premeno,20-24,0-2,no,2,left,left_low,no
3,60-69,ge40,15-19,0-2,no,2,right,left_up,no
4,40-49,premeno,0-4,0-2,no,2,right,right_low,no


In [5]:
X.head().T

Unnamed: 0,0,1,2,3,4
age,30-39,40-49,40-49,60-69,40-49
menopause,premeno,premeno,premeno,ge40,premeno
tumor-size,30-34,20-24,20-24,15-19,0-4
inv-nodes,0-2,0-2,0-2,0-2,0-2
node-caps,no,no,no,no,no
deg-malig,3,2,2,2,2
breast,left,right,left,right,right
breast-quad,left_low,right_up,left_low,left_up,right_low
irradiat,no,no,no,no,no


In [6]:
X.head(90).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,80,81,82,83,84,85,86,87,88,89
age,30-39,40-49,40-49,60-69,40-49,60-69,50-59,60-69,40-49,40-49,...,50-59,60-69,60-69,50-59,40-49,50-59,60-69,70-79,50-59,40-49
menopause,premeno,premeno,premeno,ge40,premeno,ge40,premeno,ge40,premeno,premeno,...,ge40,ge40,ge40,ge40,premeno,ge40,ge40,ge40,ge40,premeno
tumor-size,30-34,20-24,20-24,15-19,0-4,15-19,25-29,20-24,50-54,20-24,...,14-Oct,14-Oct,15-19,15-19,20-24,35-39,25-29,0-4,20-24,40-44
inv-nodes,0-2,0-2,0-2,0-2,0-2,0-2,0-2,0-2,0-2,0-2,...,0-2,0-2,0-2,0-2,0-2,0-2,0-2,0-2,0-2,0-2
node-caps,no,no,no,no,no,no,no,no,no,no,...,no,no,no,no,no,no,no,no,no,no
deg-malig,3,2,2,2,2,2,2,1,2,2,...,2,1,2,2,1,3,2,1,3,1
breast,left,right,left,right,right,left,left,left,left,right,...,right,left,right,right,left,left,right,left,right,right
breast-quad,left_low,right_up,left_low,left_up,right_low,left_low,left_low,left_low,left_low,left_up,...,left_low,left_up,left_low,left_low,right_low,left_up,left_low,right_low,left_up,left_up
irradiat,no,no,no,no,no,no,no,no,no,no,...,no,no,no,no,no,no,no,no,no,no


In [7]:
y.head()

Unnamed: 0,Class
0,no-recurrence-events
1,no-recurrence-events
2,no-recurrence-events
3,no-recurrence-events
4,no-recurrence-events


In [8]:
# Encode target labels (e.g., 'no-recurrence-events' -> 0, 'recurrence-events' -> 1)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

  y = column_or_1d(y, warn=True)


In [9]:
# All features are categorical
categorical_features = X.columns.tolist()

# Create a ColumnTransformer to one-hot encode all categorical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Create a pipeline that includes preprocessing and the classifier
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])


In [10]:
# Split dataset into training and testing sets (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the pipeline
pipeline.fit(X_train, y_train)


In [11]:
# Predict on test data
y_pred = pipeline.predict(X_test)

# Evaluate performance
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=label_encoder.classes_))


Accuracy: 0.6896551724137931

Classification Report:
                       precision    recall  f1-score   support

no-recurrence-events       0.68      0.97      0.80        37
   recurrence-events       0.80      0.19      0.31        21

            accuracy                           0.69        58
           macro avg       0.74      0.58      0.55        58
        weighted avg       0.72      0.69      0.62        58



In [12]:
# import joblib

# # Save the model pipeline to a file
# joblib.dump(pipeline, 'breast_cancer_rf_model.pkl')

# # To load it later:
# # model = joblib.load('breast_cancer_rf_model.pkl')
# #

In [13]:
from sklearn.model_selection import GridSearchCV

# Efficient parameter grid (lightweight for faster tuning)
param_grid = {
    'classifier__n_estimators': [50, 100],          # Number of trees
    'classifier__max_depth': [None, 5, 10],         # Tree depth
    'classifier__min_samples_split': [2, 5]         # Minimum samples to split a node
}

# Setup GridSearchCV with 3-fold cross-validation
grid_search = GridSearchCV(
    pipeline,             # Pipeline includes preprocessing + classifier
    param_grid,
    cv=3,                 # 3-fold cross-validation (light and fast)
    n_jobs=-1,            # Use all CPU cores
    verbose=1             # Progress output
)

# Fit the grid search model
grid_search.fit(X_train, y_train)

# Print the best score and parameters
print("✅ Best Cross-Validation Score: {:.2f}%".format(grid_search.best_score_ * 100))
print("📌 Best Parameters Found:", grid_search.best_params_)

# Evaluate on the test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
print("\n🎯 Test Accuracy:", accuracy_score(y_test, y_pred))
print("\n📋 Classification Report:\n", classification_report(y_test, y_pred, target_names=label_encoder.classes_))


Fitting 3 folds for each of 12 candidates, totalling 36 fits
✅ Best Cross-Validation Score: 75.44%
📌 Best Parameters Found: {'classifier__max_depth': 5, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 100}

🎯 Test Accuracy: 0.6724137931034483

📋 Classification Report:
                       precision    recall  f1-score   support

no-recurrence-events       0.66      1.00      0.80        37
   recurrence-events       1.00      0.10      0.17        21

            accuracy                           0.67        58
           macro avg       0.83      0.55      0.48        58
        weighted avg       0.78      0.67      0.57        58

